From: Nathan TeBlunthuis Date: Tue, 3 Aug 2021 22:13:21 +0000 (-0700) Subject: Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/commitdiff_plain/ce549c6c97058325ac6f1b9dab20406af1dbb2af?ds=sidebyside;hp=--cc Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex --- ce549c6c97058325ac6f1b9dab20406af1dbb2af diff --cc similarities/cosine_similarities.py index 98f1454,cc681b1..b9bab17 --- a/similarities/cosine_similarities.py +++ b/similarities/cosine_similarities.py @@@ -2,16 -2,15 +2,18 @@@ import pandas as p import fire from pathlib import Path from similarities_helper import similarities, column_similarities +from functools import partial - def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'): + def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'): - return similarities(inpath=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) + return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) +# change so that these take in an input as an optional argument (for speed, but also for idf). +def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): - return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', + def term_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): + + return cosine_similarities(infile, 'term', outfile, min_df, diff --cc similarities/similarities_helper.py index a4983b3,1492983..13845d1 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@@ -97,6 -68,8 +97,7 @@@ def _pull_or_reindex_tfidf(infile, term 'relative_tf':ds.field('relative_tf').cast('float32'), 'tf_idf':ds.field('tf_idf').cast('float32')} - tfidf_ds = ds.dataset(infile) + df = tfidf_ds.to_table(filter=ds_filter,columns=projection) df = df.to_pandas(split_blocks=True,self_destruct=True) @@@ -122,8 -88,21 +123,19 @@@ else: # tf_fam = tf_weight.Norm05 df["tf_idf"] = (0.5 + 0.5 * df.relative_tf) * df.idf - print("assigning names") - subreddit_names = tfidf_ds.to_table(filter=ds_filter,columns=['subreddit','subreddit_id']) - batches = subreddit_names.to_batches() + return (df, tfidf_ds, ds_filter) + with Pool(cpu_count()) as pool: + chunks = pool.imap_unordered(pull_names,batches) + subreddit_names = pd.concat(chunks,copy=False).drop_duplicates() + + subreddit_names = subreddit_names.set_index("subreddit_id") + new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates() + new_ids = new_ids.set_index('subreddit_id') + subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index() + subreddit_names = subreddit_names.drop("subreddit_id",1) + subreddit_names = subreddit_names.sort_values("subreddit_id_new") + return(df, subreddit_names) def pull_names(batch): return(batch.to_pandas().drop_duplicates()) @@@ -287,21 -254,20 +297,20 @@@ def build_weekly_tfidf_dataset(df, incl idf = idf.withColumn('idf',f.log(idf.subreddits_in_week) / (1+f.col('count'))+1) # collect the dictionary to make a pydict of terms to indexes - terms = idf.select([term,'week']).distinct() # terms are distinct + terms = idf.select([term]).distinct() # terms are distinct - terms = terms.withColumn(term_id,f.row_number().over(Window.partitionBy('week').orderBy(term))) # term ids are distinct + terms = terms.withColumn(term_id,f.row_number().over(Window.orderBy(term))) # term ids are distinct # make subreddit ids - subreddits = df.select(['subreddit','week']).distinct() - subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.partitionBy("week").orderBy("subreddit"))) + subreddits = df.select(['subreddit']).distinct() + subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.orderBy("subreddit"))) - # df = df.cache() - df = df.join(subreddits,on=['subreddit','week']) + df = df.join(subreddits,on=['subreddit']) # map terms to indexes in the tfs and the idfs - df = df.join(terms,on=[term,'week']) # subreddit-term-id is unique + df = df.join(terms,on=[term]) # subreddit-term-id is unique - idf = idf.join(terms,on=[term,'week']) + idf = idf.join(terms,on=[term]) # join on subreddit/term to create tf/dfs indexed by term df = df.join(idf, on=[term_id, term,'week']) diff --cc similarities/tfidf.py index 94dcbf5,110536e..19d3013 --- a/similarities/tfidf.py +++ b/similarities/tfidf.py @@@ -51,8 -51,8 +51,8 @@@ def tfidf_terms(outpath='/gscratch/comd ) def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet', - topN=25000, + topN=None, - include_subreddits=None): + included_subreddits=None): return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", outpath, @@@ -63,7 -63,8 +63,8 @@@ ) def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', - topN=25000): - topN=25000, ++ topN=None, + included_subreddits=None): return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",