refactor clustering in object-oriented style

[cdsc_reddit.git] / similarities / tfidf.py
diff --git a/similarities/tfidf.py b/similarities/tfidf.py

index f0b5d6471898045ce5559f7d41868331c671c784..002e89f785b37fd9df3c903775ab6f71846909d4 100644 (file)
--- a/similarities/tfidf.py
+++ b/similarities/tfidf.py
@@ -15,17 +15,16 @@ def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_
      else:
          include_subs = select_topN_subreddits(topN)
  
-    df = func(df, include_subs, term_colname)
-
-    df.write.parquet(outpath,mode='overwrite',compression='snappy')
+    dfwriter = func(df, include_subs, term_colname)
  
+    dfwriter.parquet(outpath,mode='overwrite',compression='snappy')
      spark.stop()
  
  def tfidf(inpath, outpath, topN, term_colname, exclude, included_subreddits):
      return _tfidf_wrapper(build_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits)
  
-def tfidf_weekly(inpath, outpath, topN, term_colname, exclude):
-    return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, included_subreddits)
+def tfidf_weekly(inpath, outpath, topN, term_colname, exclude, included_subreddits):
+    return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits)
  
  def tfidf_authors(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
                    topN=25000):