-def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, week=None, from_date=None, to_date=None, rescale_idf=True, tf_family=tf_weight.MaxTF):
- print("loading tfidf", flush=True)
- tfidf_ds = ds.dataset(infile)
+
+# reindex_tfidf loads the tfidf and reindexes it; pull_tfidf (below) does the same load, but without reindexing.
+def reindex_tfidf(*args, **kwargs):
+    """Load the tf-idf data with reindexing and attach subreddit names.
+
+    All positional and keyword arguments are forwarded unchanged to
+    ``_pull_or_reindex_tfidf``, which is called with ``reindex=True``.
+
+    Returns:
+        A ``(df, subreddit_names)`` tuple. ``df`` is the first value returned
+        by ``_pull_or_reindex_tfidf``. ``subreddit_names`` is built from the
+        ``subreddit`` / ``subreddit_id`` columns of the dataset, joined to the
+        ``subreddit_id_new`` values found in ``df``, with ``subreddit_id``
+        dropped and rows sorted by ``subreddit_id_new``.
+    """
+    df, tfidf_ds, ds_filter = _pull_or_reindex_tfidf(*args, **kwargs, reindex=True)
+
+    print("assigning names")
+    subreddit_names = tfidf_ds.to_table(filter=ds_filter, columns=['subreddit', 'subreddit_id'])
+    batches = subreddit_names.to_batches()
+
+    # imap_unordered is lazy, so the chunks must be consumed while the pool
+    # is still open.
+    # NOTE(review): pull_names presumably turns each record batch into a
+    # DataFrame with 'subreddit' and 'subreddit_id' columns -- confirm.
+    with Pool(cpu_count()) as pool:
+        chunks = pool.imap_unordered(pull_names, batches)
+        subreddit_names = pd.concat(chunks, copy=False).drop_duplicates()
+    subreddit_names = subreddit_names.set_index("subreddit_id")
+
+    # Map each original subreddit_id to its new id as assigned in df.
+    new_ids = df.loc[:, ['subreddit_id', 'subreddit_id_new']].drop_duplicates()
+    new_ids = new_ids.set_index('subreddit_id')
+    subreddit_names = subreddit_names.join(new_ids, on='subreddit_id').reset_index()
+    # Use the keyword form: passing axis positionally (drop("subreddit_id", 1))
+    # was deprecated in pandas 1.3 and removed in pandas 2.0.
+    subreddit_names = subreddit_names.drop(columns="subreddit_id")
+    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
+    return df, subreddit_names
+
+def pull_tfidf(*args, **kwargs):
+    """Load the tf-idf data without reindexing and return only the frame.
+
+    All arguments are forwarded unchanged to ``_pull_or_reindex_tfidf``,
+    which is called with ``reindex=False``; the dataset and filter it also
+    returns are discarded.
+    """
+    result = _pull_or_reindex_tfidf(*args, **kwargs, reindex=False)
+    frame, _dataset, _filter = result
+    return frame
+
+def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, week=None, from_date=None, to_date=None, rescale_idf=True, tf_family=tf_weight.MaxTF, reindex=True):
+ print(f"loading tfidf {infile}", flush=True)
+ if week is not None:
+ tfidf_ds = ds.dataset(infile, partitioning='hive')
+ else:
+ tfidf_ds = ds.dataset(infile)