-# choosing phrases occurring at least 3500 times in the 10% sample (35000 times) and then with a PWMI of at least 3 yeids about 65000 expressions.
-#
-df = df.filter(f.col('phraseCount') > 3500).filter(f.col("phrasePWMI")>3)
-df = df.toPandas()
-df.to_feather("/gscratch/comdata/users/nathante/reddit_multiword_expressions.feather")
-df.to_csv("/gscratch/comdata/users/nathante/reddit_multiword_expressions.csv")
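+ # Load the per-phrase statistics (count, log-probability, PWMI) written by the upstream PWMI step.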
+ df = spark.read.parquet(str(pwmi_dir))
+ df = df.select('phrase','phraseCount','phraseLogProb','phrasePWMI')
+
+ # Choosing phrases occurring at least 3500 times in the 10% sample (35000 times in the full dataset) and then requiring a PWMI of at least 3 yields about 65000 expressions.
+ #
+ df = df.filter(f.col('phraseCount') > 3500).filter(f.col('phrasePWMI') > 3)
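+ # Collect the surviving ~65k phrases to the driver; a set this small fits comfortably in pandas.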
+ df = df.toPandas()
+ df.to_feather(ngram_dir / "multiword_expressions.feather")
+ df.to_csv(ngram_dir / "multiword_expressions.csv")
+
+if __name__ == '__main__':
+ fire.Fire(main)
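
This hunk assumes `spark`, `f` (`pyspark.sql.functions`), `fire`, `pwmi_dir`, and `ngram_dir` are defined earlier in the file. For intuition about the PWMI > 3 cutoff, here is a minimal sketch, assuming `phrasePWMI` is the standard pointwise mutual information of a phrase against the independence baseline of its constituent words, measured in bits (the script's own formula is not shown in this hunk):

import math

def phrase_pwmi(phrase_prob, word_probs):
    # PMI(phrase) = log2 P(phrase) - sum_i log2 P(word_i):
    # how many bits more likely the phrase is than if its
    # words co-occurred independently.
    return math.log2(phrase_prob) - sum(math.log2(p) for p in word_probs)

# A bigram of two 1-in-1000 words seen once per 100k tokens:
print(phrase_pwmi(1e-5, [1e-3, 1e-3]))  # ~3.32, just over the PWMI > 3 cutoff

Under this reading, a phrase passing the threshold occurs at least 2^3 = 8 times more often than independent co-occurrence of its words would predict.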