X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/65deba5e4e4ad9e3f23e82573491f7d6b190e644..9345f9de9437d5965ad4ee5874bc24199e077d48:/similarities/similarities_helper.py

diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py
index 202220c..6925a15 100644
--- a/similarities/similarities_helper.py
+++ b/similarities/similarities_helper.py
@@ -95,6 +95,7 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu
     projection = {
         'subreddit_id':ds.field('subreddit_id'),
         term_id:ds.field(term_id),
+        'relative_tf':ds.field('relative_tf').cast('float32'),
         'tf_idf':ds.field('tf_idf').cast('float32')}
 
     print(projection, flush=True)
@@ -102,7 +103,7 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu
     df = tfidf_ds.to_table(filter=ds_filter,columns=projection)
 
     df = df.to_pandas(split_blocks=True,self_destruct=True)
-
+    print("assigning indexes",flush=True)
     if reindex:
         print("assigning indexes",flush=True)
         df['subreddit_id_new'] = df.groupby("subreddit_id").ngroup() + 1
@@ -127,17 +128,6 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu
 
     return (df, tfidf_ds, ds_filter)
 
-    # with Pool(cpu_count()) as pool:
-    #     chunks = pool.imap_unordered(pull_names,batches)
-    #     subreddit_names = pd.concat(chunks,copy=False).drop_duplicates()
-
-    # subreddit_names = subreddit_names.set_index("subreddit_id")
-    # new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates()
-    # new_ids = new_ids.set_index('subreddit_id')
-    # subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index()
-    # subreddit_names = subreddit_names.drop("subreddit_id",1)
-    # subreddit_names = subreddit_names.sort_values("subreddit_id_new")
-    # return(df, subreddit_names)
 
 def pull_names(batch):
     return(batch.to_pandas().drop_duplicates())
@@ -239,8 +229,7 @@ def test_lsi_sims():
 
 # if n_components is a list we'll return a list of similarities with different latent dimensionalities
 # if algorithm is 'randomized' instead of 'arpack' then n_iter gives the number of iterations.
 # this function takes the svd and then the column similarities of it
-# lsi_model_load = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_terms_compex_LSI/1000_term_LSIMOD.pkl"
-def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=1968,algorithm='randomized',lsi_model_save=None,lsi_model=None):
+def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=1968,algorithm='randomized',lsi_model_save=None,lsi_model_load=None):
     # first compute the lsi of the matrix
     # then take the column similarities
@@ -251,21 +240,24 @@ def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=196
 
     svd_components = n_components[0]
 
-    if lsi_model is None:
+    if lsi_model_load is not None and Path(lsi_model_load).exists():
+        print("loading LSI")
+        mod = pickle.load(open(lsi_model_load ,'rb'))
+        lsi_model_save = lsi_model_load
+
+    else:
         print("running LSI",flush=True)
         svd = TruncatedSVD(n_components=svd_components,random_state=random_state,algorithm=algorithm,n_iter=n_iter)
         mod = svd.fit(tfidfmat.T)
-    else:
-        mod = lsi_model
-    lsimat = mod.transform(tfidfmat.T)
 
     if lsi_model_save is not None:
-        Path(lsi_model_save).parent.mkdir(exist_ok=True,parents=True)
+        Path(lsi_model_save).parent.mkdir(exist_ok=True, parents=True)
         pickle.dump(mod, open(lsi_model_save,'wb'))
 
-    print(n_components)
+    print(n_components, flush=True)
+    lsimat = mod.transform(tfidfmat.T)
     for n_dims in n_components:
-        print("computing similarities")
+        print("computing similarities", flush=True)
         sims = column_similarities(lsimat[:,np.arange(n_dims)])
         yield (sims, n_dims)
 
@@ -381,7 +373,7 @@ def _calc_tfidf(df, term_colname, tf_family, min_df=None, max_df=None):
 
 def tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05, min_df=None, max_df=None):
     term = term_colname
     term_id = term + '_id'
-
+    # aggregate counts by week. now subreddit-term is distinct
     df = df.filter(df.subreddit.isin(include_subs))
     df = df.groupBy(['subreddit',term]).agg(f.sum('tf').alias('tf'))
@@ -390,7 +382,7 @@ def tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05, mi
     dfwriter = df.write
     return dfwriter
 
-def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"):
+def select_topN_subreddits(topN, path="../../data/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"):
    rankdf = pd.read_csv(path)
    included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values)
    return included_subreddits
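
A minimal usage sketch of the revised lsi_column_similarities signature, not part of the patch: the path "lsi_mod.pkl" and the dimensionalities are placeholder values, and tfidfmat is assumed to be the sparse tf-idf matrix produced elsewhere in this module. Per the change above, the function is a generator: if a pickled model exists at lsi_model_load it is reused (and lsi_model_save is pointed at the same path), otherwise a TruncatedSVD is fit and saved to lsi_model_save, and one (similarities, n_dims) pair is yielded per requested dimensionality.

    # sketch only; the model path and dimensionalities are hypothetical
    sims_by_dim = {}
    for sims, n_dims in lsi_column_similarities(tfidfmat,
                                                n_components=[200, 100],
                                                lsi_model_save="lsi_mod.pkl",
                                                lsi_model_load="lsi_mod.pkl"):
        # collect one similarity matrix per latent dimensionality
        sims_by_dim[n_dims] = sims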