# Select the multiword-expression tokenize callable for this pass.
if mwe_pass == 'first':
    # First pass: tokenizer constructed without any phrases.
    mwe_tokenize = MWETokenizer().tokenize
else:
    # Any other pass: load the phrase table and order it by phrasePWMI,
    # highest first, before handing the phrases to the tokenizer.
    mwe_dataset = pd.read_feather(
        f'/gscratch/comdata/users/nathante/reddit_multiword_expressions.feather'
    ).sort_values(['phrasePWMI'], ascending=False)

    # Each phrase becomes a tuple of its single-space-separated parts.
    mwe_phrases = [tuple(phrase.split(' ')) for phrase in mwe_dataset.phrase]

    mwe_tokenizer = MWETokenizer(mwe_phrases)
    mwe_tokenize = mwe_tokenizer.tokenize