From: Nathan TeBlunthuis Date: Tue, 3 Aug 2021 22:02:08 +0000 (-0700) Subject: Merge branch 'master' of code:cdsc_reddit into excise_reindex X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/commitdiff_plain/2d21ff1137dfaf83c5a51fdcd8900503c50a06ab?ds=sidebyside Merge branch 'master' of code:cdsc_reddit into excise_reindex --- 2d21ff1137dfaf83c5a51fdcd8900503c50a06ab diff --cc similarities/similarities_helper.py index e59563e,3ace8f2..1492983 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@@ -32,82 -60,23 +32,82 @@@ def reindex_tfidf(infile, term_colname if included_subreddits is None: included_subreddits = select_topN_subreddits(topN) else: - included_subreddits = set(open(included_subreddits)) + included_subreddits = set(map(str.strip,map(str.lower,open(included_subreddits)))) - if exclude_phrases == True: - tfidf = tfidf.filter(~f.col(term_colname).contains("_")) + ds_filter = ds.field("subreddit").isin(included_subreddits) + + if min_df is not None: + ds_filter &= ds.field("count") >= min_df + + if max_df is not None: + ds_filter &= ds.field("count") <= max_df + + if week is not None: + ds_filter &= ds.field("week") == week + + if from_date is not None: + ds_filter &= ds.field("week") >= from_date - print("creating temporary parquet with matrix indicies") - tempdir = prep_tfidf_entries(tfidf, term_colname, min_df, max_df, included_subreddits) + if to_date is not None: + ds_filter &= ds.field("week") <= to_date - tfidf = spark.read.parquet(tempdir.name) - subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas() + term = term_colname + term_id = term + '_id' + term_id_new = term + '_id_new' + + projection = { + 'subreddit_id':ds.field('subreddit_id'), + term_id:ds.field(term_id), + 'relative_tf':ds.field("relative_tf").cast('float32') + } + + if not rescale_idf: + projection = { + 'subreddit_id':ds.field('subreddit_id'), + term_id:ds.field(term_id), + 'relative_tf':ds.field('relative_tf').cast('float32'), + 'tf_idf':ds.field('tf_idf').cast('float32')} + + tfidf_ds = ds.dataset(infile) + + df = tfidf_ds.to_table(filter=ds_filter,columns=projection) + + df = df.to_pandas(split_blocks=True,self_destruct=True) + print("assigning indexes",flush=True) + df['subreddit_id_new'] = df.groupby("subreddit_id").ngroup() + grouped = df.groupby(term_id) + df[term_id_new] = grouped.ngroup() + + if rescale_idf: + print("computing idf", flush=True) + df['new_count'] = grouped[term_id].transform('count') + N_docs = df.subreddit_id_new.max() + 1 + df['idf'] = np.log(N_docs/(1+df.new_count),dtype='float32') + 1 + if tf_family == tf_weight.MaxTF: + df["tf_idf"] = df.relative_tf * df.idf + else: # tf_fam = tf_weight.Norm05 + df["tf_idf"] = (0.5 + 0.5 * df.relative_tf) * df.idf + + print("assigning names") + subreddit_names = tfidf_ds.to_table(filter=ds_filter,columns=['subreddit','subreddit_id']) + batches = subreddit_names.to_batches() + + with Pool(cpu_count()) as pool: + chunks = pool.imap_unordered(pull_names,batches) + subreddit_names = pd.concat(chunks,copy=False).drop_duplicates() + + subreddit_names = subreddit_names.set_index("subreddit_id") + new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates() + new_ids = new_ids.set_index('subreddit_id') + subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index() + subreddit_names = subreddit_names.drop("subreddit_id",1) subreddit_names = subreddit_names.sort_values("subreddit_id_new") - subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1 - spark.stop() - return (tempdir, subreddit_names) + return(df, subreddit_names) +def pull_names(batch): + return(batch.to_pandas().drop_duplicates()) -def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'): +def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'): ''' tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities. '''