# On any pass other than 'first', build a multiword-expression tokenizer
# from the 1000 phrases with the highest phrasePWMI in the n-gram dataset.
if mwe_pass != 'first':
    # NOTE(review): assumes `ds` is pyarrow.dataset -- confirm against the
    # file's imports. A pyarrow Dataset is materialized via to_table();
    # the column projection is applied there, before converting to pandas.
    mwe_dataset = ds.dataset(
        '/gscratch/comdata/users/nathante/reddit_comment_ngrams_pwmi.parquet',
        format='parquet',
    )
    mwe_dataset = mwe_dataset.to_table(
        columns=['phrase', 'phraseCount', 'phrasePWMI']
    ).to_pandas()
    mwe_dataset = mwe_dataset.sort_values(['phrasePWMI'], ascending=False)

    # Positional slice: the first 1000 rows in sorted order, regardless of
    # the index labels left over from before the sort.
    top_phrases = mwe_dataset.phrase.iloc[:1000]

    # NOTE(review): assumes MWETokenizer is nltk.tokenize.MWETokenizer and
    # that `phrase` holds whitespace-delimited strings -- confirm. That
    # tokenizer takes each expression as a sequence of tokens; a bare string
    # would be consumed one character at a time and never match word tokens.
    # Non-string entries are assumed to already be token sequences.
    mwe_phrases = [
        tuple(phrase.split()) if isinstance(phrase, str) else tuple(phrase)
        for phrase in top_phrases
    ]

    mwe_tokenize = MWETokenizer(mwe_phrases).tokenize
+