similarities/tfidf.py

   1 import fire
   2 from pyspark.sql import SparkSession
   3 from pyspark.sql import functions as f
   4 from similarities_helper import tfidf_dataset, build_weekly_tfidf_dataset, select_topN_subreddits
   5 from functools import partial
   6
# Module-level input path for the comment-author tf-idf parquet.
# NOTE(review): no function visible in this file reads this global -- every
# function below takes its own `inpath` parameter, which shadows it. Confirm
# whether anything outside this file imports it before removing.
inpath = '/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_authors_compex.parquet'
# included_terms is a path to a parquet file that contains a `term_colname` column; only rows whose term appears in that column are kept.
   9 def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits, included_terms=None, min_df=None, max_df=None):
  10     spark = SparkSession.builder.getOrCreate()
  11
  12     df = spark.read.parquet(inpath)
  13
  14     df = df.filter(~ f.col(term_colname).isin(exclude))
  15
  16     if included_subreddits is not None:
  17         include_subs = set(map(str.strip,open(included_subreddits)))
  18     else:
  19         include_subs = select_topN_subreddits(topN)
  20
  21     include_subs = spark.sparkContext.broadcast(include_subs)
  22
  23     #    term_id = term_colname + "_id"
  24
  25     if included_terms is not None:
  26         terms_df = spark.read.parquet(included_terms)
  27         terms_df = terms_df.select(term_colname).distinct()
  28         df = df.join(terms_df, on=term_colname, how='left_semi')
  29
  30     dfwriter = func(df, include_subs.value, term_colname)
  31
  32     dfwriter.parquet(outpath,mode='overwrite',compression='snappy')
  33     spark.stop()
  34
  35 def tfidf(inpath, outpath, topN, term_colname, exclude, included_subreddits, min_df, max_df):
  36     tfidf_func = partial(tfidf_dataset, max_df=max_df, min_df=min_df)
  37     return _tfidf_wrapper(tfidf_func, inpath, outpath, topN, term_colname, exclude, included_subreddits)
  38
  39 def tfidf_weekly(inpath, outpath, static_tfidf_path, topN, term_colname, exclude, included_subreddits):
  40     return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits, included_terms=static_tfidf_path)
  41
  42
  43 def tfidf_authors(inpath="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
  44                   outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
  45                   topN=None,
  46                   included_subreddits=None,
  47                   min_df=None,
  48                   max_df=None):
  49
  50     return tfidf(inpath,
  51                  outpath,
  52                  topN,
  53                  'author',
  54                  ['[deleted]','AutoModerator'],
  55                  included_subreddits=included_subreddits,
  56                  min_df=min_df,
  57                  max_df=max_df
  58                  )
  59
  60 def tfidf_terms(inpath="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
  61                 outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet',
  62                 topN=None,
  63                 included_subreddits=None,
  64                 min_df=None,
  65                 max_df=None):
  66
  67     return tfidf(inpath,
  68                  outpath,
  69                  topN,
  70                  'term',
  71                  [],
  72                  included_subreddits=included_subreddits,
  73                  min_df=min_df,
  74                  max_df=max_df
  75                  )
  76
  77 def tfidf_authors_weekly(inpath="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
  78                          static_tfidf_path="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet",
  79                          outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet',
  80                          topN=None,
  81                          included_subreddits=None):
  82
  83     return tfidf_weekly(inpath,
  84                         outpath,
  85                         static_tfidf_path,
  86                         topN,
  87                         'author',
  88                         ['[deleted]','AutoModerator'],
  89                         included_subreddits=included_subreddits
  90                         )
  91
  92 def tfidf_terms_weekly(inpath="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
  93                        static_tfidf_path="/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet",
  94                        outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
  95                        topN=None,
  96                        included_subreddits=None):
  97
  98
  99     return tfidf_weekly(inpath,
 100                         outpath,
 101                         static_tfidf_path,
 102                         topN,
 103                         'term',
 104                         [],
 105                         included_subreddits=included_subreddits
 106                         )
 107
 108
 109 if __name__ == "__main__":
 110     fire.Fire({'authors':tfidf_authors,
 111                'terms':tfidf_terms,
 112                'authors_weekly':tfidf_authors_weekly,
 113                'terms_weekly':tfidf_terms_weekly})