From: Nate E TeBlunthuis Date: Tue, 23 Feb 2021 00:03:48 +0000 (-0800) Subject: Changes from hyak. X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/commitdiff_plain/4dc949de5fb8d3eac04bae125c819100002c9522?ds=inline;hp=-c Changes from hyak. --- 4dc949de5fb8d3eac04bae125c819100002c9522 diff --git a/clustering/Makefile b/clustering/Makefile index 115b218..20d7808 100644 --- a/clustering/Makefile +++ b/clustering/Makefile @@ -1,10 +1,32 @@ #srun_cdsc='srun -p comdata-int -A comdata --time=300:00:00 --time-min=00:15:00 --mem=100G --ntasks=1 --cpus-per-task=28' -all:/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather +all:/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather /gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather +#all:/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather /gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather /gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather /gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather # $srun_cdsc python3 - ./clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.85 --damping=0.85 + start_spark_and_run.sh 1 clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.85 --damping=0.85 /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather # $srun_cdsc python3 - ./clustering.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather ---max_iter=1000 --convergence_iter=15 --preference_quantile=0.9 --damping=0.5 + start_spark_and_run.sh 1 clustering.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather ---max_iter=1000 --convergence_iter=15 --preference_quantile=0.9 --damping=0.5 + +/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet +# $srun_cdsc + start_spark_and_run.sh 1 clustering.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet /gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.5 --damping=0.85 + +# it's pretty difficult to get a result that isn't one huge megacluster. A sign that it's bullcrap +# /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather +# ./clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.9 --damping=0.85 + +/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet + + start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet --output=/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather + + +# /gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather + +# python3 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather --output=/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather + +/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather +# $srun_cdsc python3 + start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --output=/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather diff --git a/clustering/fit_tsne.py b/clustering/fit_tsne.py index 28b0fd3..c9f45f6 100644 --- a/clustering/fit_tsne.py +++ b/clustering/fit_tsne.py @@ -5,7 +5,7 @@ from numpy import random import numpy as np from sklearn.manifold import TSNE -similarities = "term_similarities_10000.feather" +similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=10000, early_exaggeration=20): ''' diff --git a/density/Makefile b/density/Makefile index 43075a4..d223399 100644 --- a/density/Makefile +++ b/density/Makefile @@ -1,7 +1,10 @@ -all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscratch/comdata/output/reddit_density/comment_authors_10000.feather +all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscratch/comdata/output/reddit_density/comment_authors_10000.feather /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_density/comment_terms_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather - python3 overlap_density.py terms --inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather" --agg=pd.DataFrame.sum + start_spark_and_run.sh 1 overlap_density.py terms --inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather" --agg=pd.DataFrame.sum /gscratch/comdata/output/reddit_density/comment_authors_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather - python3 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather" --agg=pd.DataFrame.sum + start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather" --agg=pd.DataFrame.sum + +/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet + start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum diff --git a/density/job_script.sh b/density/job_script.sh index 553d0a1..7dfac14 100755 --- a/density/job_script.sh +++ b/density/job_script.sh @@ -1,4 +1,4 @@ #!/usr/bin/bash start_spark_cluster.sh -spark-submit --master spark://$(hostname):18899 overlap_density.py wang_overlaps --inpath=/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet --to_date=2020-04-13 +spark-submit --master spark://$(hostname):18899 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --outpath=/gscratch/comdata/output/reddit_density/comment_authors_10000.feather --agg=pd.DataFrame.sum stop-all.sh diff --git a/density/overlap_density.py b/density/overlap_density.py index a1e9f6d..5a8e91a 100644 --- a/density/overlap_density.py +++ b/density/overlap_density.py @@ -5,7 +5,7 @@ import numpy as np import sys sys.path.append("..") sys.path.append("../similarities") -from similarities.similarities_helper import read_tfidf_matrix, reindex_tfidf, reindex_tfidf_time_interval +from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval # this is the mean of the ratio of the overlap to the focal size. # mean shared membership per focal community member @@ -72,5 +72,5 @@ if __name__ == "__main__": fire.Fire({'authors':author_overlap_density, 'terms':term_overlap_density, 'author_weekly':author_overlap_density_weekly, - 'term_weekly':term_overlap_density_weekly, - 'wang_overlaps':wang_overlap_density}) + 'term_weekly':term_overlap_density_weekly}) + diff --git a/similarities/Makefile b/similarities/Makefile index 51fd0fa..0ec0342 100644 --- a/similarities/Makefile +++ b/similarities/Makefile @@ -1,13 +1,25 @@ -all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet +all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms.parquet -/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet - start_spark_and_run.sh 1 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.feather +# all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet -/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet - start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.feather -/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet - start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.feather +# /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet +# start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.feather -/gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet - start_spark_and_run.sh 1 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10000_weely.parquet +/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv + start_spark_and_run.sh 1 tfidf.py terms --topN=10000 + +/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv + start_spark_and_run.sh 1 tfidf.py authors --topN=10000 + +/gscratch/comdata/output/reddit_similarity/comment_authors_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet + start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather + +/gscratch/comdata/output/reddit_similarity/comment_terms.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet + start_spark_and_run.sh 1 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather + +# /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet +# start_spark_and_run.sh 1 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10000_weely.parquet + +/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet + start_spark_and_run.sh 1 cosine_similarities.py author-tf --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet diff --git a/similarities/__pycache__/similarities_helper.cpython-37.pyc b/similarities/__pycache__/similarities_helper.cpython-37.pyc index e5e4965..eb607f3 100644 Binary files a/similarities/__pycache__/similarities_helper.cpython-37.pyc and b/similarities/__pycache__/similarities_helper.cpython-37.pyc differ diff --git a/similarities/cosine_similarities.py b/similarities/cosine_similarities.py index 609e477..c1e99c9 100644 --- a/similarities/cosine_similarities.py +++ b/similarities/cosine_similarities.py @@ -1,30 +1,53 @@ import pandas as pd import fire from pathlib import Path -from similarities_helper import similarities +from similarities_helper import similarities, column_similarities -def cosine_similarities(infile, term_colname, outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False,from_date=None, to_date=None): - return similiarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date) +def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'): -def term_cosine_similarities(outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): + return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) + + +def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet', 'term', outfile, min_df, + max_df, + included_subreddits, + topN, + exclude_phrases, + from_date, + to_date) + +def author_cosine_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None): + return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet', + 'author', + outfile, + min_df, + max_df, included_subreddits, topN, - exclude_phrasesby.) + exclude_phrases=False, + from_date=from_date, + to_date=to_date) -def author_cosine_similarities(outfile, min_df=2, included_subreddits=None, topN=10000, from_date=None, to_date=None): +def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None): return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet', 'author', outfile, min_df, + max_df, included_subreddits, topN, - exclude_phrases=False) + exclude_phrases=False, + from_date=from_date, + to_date=to_date, + tfidf_colname='relative_tf') + if __name__ == "__main__": fire.Fire({'term':term_cosine_similarities, - 'author':author_cosine_similarities}) + 'author':author_cosine_similarities, + 'author-tf':author_tf_similarities}) diff --git a/similarities/job_script.sh b/similarities/job_script.sh index e79a061..03e77de 100755 --- a/similarities/job_script.sh +++ b/similarities/job_script.sh @@ -1,4 +1,4 @@ #!/usr/bin/bash start_spark_cluster.sh -spark-submit --master spark://$(hostname):18899 wang_similarity.py --infile=/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet --max_df=10 --outfile=/gscratch/comdata/output/reddit_similarity/wang_similarity_1000_max10.feather +spark-submit --master spark://$(hostname):18899 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather stop-all.sh diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py index 69516a6..9e33c9d 100644 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@ -75,17 +75,20 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre spark.stop() return (tempdir, subreddit_names) -def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): +def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'): + ''' + tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities. + ''' if from_date is not None or to_date is not None: - tempdir, subreddit_names = reindex_tfidf_time_interval(infile, term_colname='author', min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False, from_date=from_date, to_date=to_date) + tempdir, subreddit_names = reindex_tfidf_time_interval(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False, from_date=from_date, to_date=to_date) else: - tempdir, subreddit_names = reindex_tfidf(infile, term_colname='author', min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False) + tempdir, subreddit_names = reindex_tfidf(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False) print("loading matrix") # mat = read_tfidf_matrix("term_tfidf_entries7ejhvnvl.parquet", term_colname) - mat = read_tfidf_matrix(tempdir.name, term_colname) + mat = read_tfidf_matrix(tempdir.name, term_colname, tfidf_colname) print('computing similarities') sims = simfunc(mat) del mat @@ -108,14 +111,24 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non sims.to_feather(outfile) tempdir.cleanup() -def read_tfidf_matrix_weekly(path, term_colname, week): +def read_tfidf_matrix_weekly(path, term_colname, week, tfidf_colname='tf_idf'): term = term_colname term_id = term + '_id' term_id_new = term + '_id_new' dataset = ds.dataset(path,format='parquet') - entries = dataset.to_table(columns=['tf_idf','subreddit_id_new',term_id_new],filter=ds.field('week')==week).to_pandas() - return(csr_matrix((entries.tf_idf,(entries[term_id_new]-1, entries.subreddit_id_new-1)))) + entries = dataset.to_table(columns=[tfidf_colname,'subreddit_id_new', term_id_new],filter=ds.field('week')==week).to_pandas() + return(csr_matrix((entries[tfidf_colname], (entries[term_id_new]-1, entries.subreddit_id_new-1)))) + +def read_tfidf_matrix(path, term_colname, tfidf_colname='tf_idf'): + term = term_colname + term_id = term + '_id' + term_id_new = term + '_id_new' + dataset = ds.dataset(path,format='parquet') + print(f"tfidf_colname:{tfidf_colname}") + entries = dataset.to_table(columns=[tfidf_colname, 'subreddit_id_new',term_id_new]).to_pandas() + return(csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1)))) + def write_weekly_similarities(path, sims, week, names): sims['week'] = week @@ -127,15 +140,6 @@ def write_weekly_similarities(path, sims, week, names): sims = sims.melt(id_vars=['subreddit','week'],value_vars=names.subreddit.values) sims.to_parquet(p / week.isoformat()) -def read_tfidf_matrix(path,term_colname): - term = term_colname - term_id = term + '_id' - term_id_new = term + '_id_new' - - dataset = ds.dataset(path,format='parquet') - entries = dataset.to_table(columns=['tf_idf','subreddit_id_new',term_id_new]).to_pandas() - return(csr_matrix((entries.tf_idf,(entries[term_id_new]-1, entries.subreddit_id_new-1)))) - def column_overlaps(mat): non_zeros = (mat != 0).astype('double') @@ -383,7 +387,7 @@ def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm return df -def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv"): +def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonswf.csv"): rankdf = pd.read_csv(path) included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values) return included_subreddits diff --git a/similarities/tfidf.py b/similarities/tfidf.py index b7b4e63..885dae2 100644 --- a/similarities/tfidf.py +++ b/similarities/tfidf.py @@ -58,12 +58,13 @@ def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfi def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', topN=25000): + return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet", - outpath, - topN, - 'term', - [] - ) + outpath, + topN, + 'term', + [] + ) if __name__ == "__main__": diff --git a/similarities/wang_similarity.py b/similarities/wang_similarity.py index 99dc3cb..452e07a 100644 --- a/similarities/wang_similarity.py +++ b/similarities/wang_similarity.py @@ -12,7 +12,7 @@ infile="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet def wang_overlaps(infile, outfile="/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather", min_df=1, max_df=None, included_subreddits=None, topN=10000, exclude_phrases=False, from_date=None, to_date=None): - return similarities(infile=infile, simfunc=wang_similarity, term_colname='author', outfile=outfile, min_df=min_df, max_df=None, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases, from_date=from_date, to_date=to_date) + return similarities(infile=infile, simfunc=wang_similarity, term_colname='author', outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases, from_date=from_date, to_date=to_date) if __name__ == "__main__": fire.Fire(wang_overlaps) diff --git a/visualization/subreddit_author_tf_similarities_10000.html b/visualization/subreddit_author_tf_similarities_10000.html index 722f5b0..eac12c5 100644 --- a/visualization/subreddit_author_tf_similarities_10000.html +++ b/visualization/subreddit_author_tf_similarities_10000.html @@ -14,7 +14,7 @@