From: Nathan TeBlunthuis Date: Tue, 3 Aug 2021 22:13:21 +0000 (-0700) Subject: Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/commitdiff_plain/ce549c6c97058325ac6f1b9dab20406af1dbb2af?hp=6e43294a41e030e557d7e612f1e6ddb063482689 Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex --- diff --git a/clustering/Makefile b/clustering/Makefile index 69c6c15..9643f52 100644 --- a/clustering/Makefile +++ b/clustering/Makefile @@ -2,9 +2,9 @@ srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh similarity_data=/gscratch/comdata/output/reddit_similarity clustering_data=/gscratch/comdata/output/reddit_clustering -kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]" -hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf" -affinity_selection_grid="--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]" +kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000] +hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] +affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15] authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI @@ -91,7 +91,11 @@ ${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_inpu ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py $(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid) +${terms_10k_output_lsi}/best_hdbscan.feather:${terms_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py + $(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2 +${authors_tf_10k_output_lsi}/best_hdbscan.feather:${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py + $(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2 clean_affinity: rm -f ${authors_10k_output}/affinity/selection_data.csv diff --git a/clustering/grid_sweep.py b/clustering/grid_sweep.py index 636dcbc..c0365d0 100644 --- a/clustering/grid_sweep.py +++ b/clustering/grid_sweep.py @@ -7,6 +7,7 @@ class grid_sweep: def __init__(self, jobtype, inpath, outpath, namer, *args): self.jobtype = jobtype self.namer = namer + print(*args) grid = list(product(*args)) inpath = Path(inpath) outpath = Path(outpath) diff --git a/clustering/hdbscan_clustering_lsi.py b/clustering/hdbscan_clustering_lsi.py index 73b5276..cbd44bd 100644 --- a/clustering/hdbscan_clustering_lsi.py +++ b/clustering/hdbscan_clustering_lsi.py @@ -59,7 +59,7 @@ class _hdbscan_lsi_grid_sweep(grid_sweep): self.lsi_dim = lsi_dim self.jobtype = hdbscan_lsi_job - super().__init__(self.jobtype, inpath, outpath, self.namer, 
self.lsi_dim, *args, **kwargs)
+        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)
 
 
     def namer(self, *args, **kwargs):
@@ -87,9 +87,9 @@ def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2]
     obj = hdbscan_lsi_grid_sweep(inpath,
                                  lsi_dimensions,
                                  outpath,
-                                 map(int,min_cluster_sizes),
-                                 map(int,min_samples),
-                                 map(float,cluster_selection_epsilons),
+                                 list(map(int,min_cluster_sizes)),
+                                 list(map(int,min_samples)),
+                                 list(map(float,cluster_selection_epsilons)),
                                  cluster_selection_methods
                                  )
 
diff --git a/clustering/kmeans_clustering_lsi.py b/clustering/kmeans_clustering_lsi.py
index 20d582b..bb006f3 100644
--- a/clustering/kmeans_clustering_lsi.py
+++ b/clustering/kmeans_clustering_lsi.py
@@ -34,7 +34,7 @@ class _kmeans_lsi_grid_sweep(grid_sweep):
         print(kwargs)
         self.lsi_dim = lsi_dim
         self.jobtype = kmeans_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
+        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)
 
     def namer(self, *args, **kwargs):
         s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs)
diff --git a/clustering/pick_best_clustering.py b/clustering/pick_best_clustering.py
index 91c443e..c541d23 100644
--- a/clustering/pick_best_clustering.py
+++ b/clustering/pick_best_clustering.py
@@ -2,15 +2,15 @@ import fire
 import pandas as pd
 from pathlib import Path
 import shutil
-
-selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/affinity/selection_data.csv"
+selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv"
 
 outpath = 'test_best.feather'
+min_clusters=50; max_isolates=5000; min_cluster_size=2
 
 # pick the best clustering according to silhouette score subject to constraints
-def pick_best_clustering(selection_data, output, min_clusters, max_isolates):
+def pick_best_clustering(selection_data, output, min_clusters, max_isolates, min_cluster_size):
     df = pd.read_csv(selection_data,index_col=0)
-    df = df.sort_values("silhouette_score")
+    df = df.sort_values("silhouette_score",ascending=False)
 
     # not sure whether I fully fixed the underlying bug.
     df['n_isolates_str'] = df.n_isolates.str.strip("[]")
@@ -18,11 +18,10 @@ def pick_best_clustering(selection_data, output, min_clusters, max_isolates):
     df.loc[df.n_isolates_0,'n_isolates'] = 0
     df.loc[~df.n_isolates_0,'n_isolates'] = df.loc[~df.n_isolates_0].n_isolates_str.apply(lambda l: int(l))
 
-    best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)].iloc[df.shape[1]]
+    best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)&(df.min_cluster_size==min_cluster_size)].iloc[df.shape[1]]
 
     print(best_cluster.to_dict())
     best_path = Path(best_cluster.outpath) / (str(best_cluster['name']) + ".feather")
-
     shutil.copy(best_path,output)
 
 if __name__ == "__main__":
diff --git a/clustering/selection.py b/clustering/selection.py
index d2fa6de..81641db 100644
--- a/clustering/selection.py
+++ b/clustering/selection.py
@@ -1,7 +1,38 @@
-import fire
-from select_affinity import select_affinity_clustering
-from select_kmeans import select_kmeans_clustering
+import pandas as pd
+import plotnine as pn
+from pathlib import Path
+from clustering.fit_tsne import fit_tsne
+from visualization.tsne_vis import build_visualization
+
+df = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv",index_col=0)
+
+# plot silhouette_score as a function of isolates
+df = df.sort_values("silhouette_score")
+
+df['n_isolates'] = df.n_isolates.str.split("\n0").apply(lambda rg: int(rg[1]))
+p = pn.ggplot(df,pn.aes(x='n_isolates',y='silhouette_score')) + pn.geom_point()
+p.save("isolates_x_score.png")
+
+p = pn.ggplot(df,pn.aes(y='n_clusters',x='n_isolates',color='silhouette_score')) + pn.geom_point()
+p.save("clusters_x_isolates.png")
+
+# the best result for hdbscan seems like this one: it has a decent number of clusters
+# I think I prefer the 'eom' cluster selection method because larger clusters are less likely to suffer from omitted variables
+best_eom = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='eom')&(df.min_cluster_size==2)].iloc[df.shape[1]]
+
+best_leaf = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='leaf')&(df.min_cluster_size==2)].iloc[df.shape[1]]
+
+tsne_data = Path("./clustering/authors-tf_lsi850_tsne.feather")
+
+if not tsne_data.exists():
+    fit_tsne("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather",
+             tsne_data)
+
+build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
+                    Path(best_eom.outpath)/(best_eom['name']+'.feather'),
+                    "./authors-tf_lsi850_best_eom.html")
+
+build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
+                    Path(best_leaf.outpath)/(best_leaf['name']+'.feather'),
+                    "./authors-tf_lsi850_best_leaf.html")
 
-if __name__ == "__main__":
-    fire.Fire({"kmeans":select_kmeans_clustering,
-               "affinity":select_affinity_clustering})
diff --git a/density/Makefile b/density/Makefile
index d223399..90eba82 100644
--- a/density/Makefile
+++ b/density/Makefile
@@ -8,3 +8,9 @@ all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscrat
 
 /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
 	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet"
--outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum + +/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather + start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather" --agg=pd.DataFrame.sum + +/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather + start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather" --agg=pd.DataFrame.sum diff --git a/density/job_script.sh b/density/job_script.sh index 7dfac14..e411ba7 100755 --- a/density/job_script.sh +++ b/density/job_script.sh @@ -1,4 +1,4 @@ #!/usr/bin/bash start_spark_cluster.sh -spark-submit --master spark://$(hostname):18899 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --outpath=/gscratch/comdata/output/reddit_density/comment_authors_10000.feather --agg=pd.DataFrame.sum -stop-all.sh +singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather --outpath=/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather --agg=pd.DataFrame.sum +singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh diff --git a/density/overlap_density.py b/density/overlap_density.py index 5a8e91a..2036824 100644 --- a/density/overlap_density.py +++ b/density/overlap_density.py @@ -1,11 +1,12 @@ import pandas as pd from pandas.core.groupby import DataFrameGroupBy as GroupBy +from pathlib import Path import fire import numpy as np import sys sys.path.append("..") sys.path.append("../similarities") -from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval +from similarities.similarities_helper import reindex_tfidf # this is the mean of the ratio of the overlap to the focal size. 
# mean shared membership per focal community member @@ -13,10 +14,12 @@ from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_i def overlap_density(inpath, outpath, agg = pd.DataFrame.sum): df = pd.read_feather(inpath) - df = df.drop('subreddit',1) + df = df.drop('_subreddit',1) np.fill_diagonal(df.values,0) df = agg(df, 0).reset_index() df = df.rename({0:'overlap_density'},axis='columns') + outpath = Path(outpath) + outpath.parent.mkdir(parents=True, exist_ok = True) df.to_feather(outpath) return df @@ -25,6 +28,8 @@ def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum): # exclude the diagonal df = df.loc[df.subreddit != df.variable] res = agg(df.groupby(['subreddit','week'])).reset_index() + outpath = Path(outpath) + outpath.parent.mkdir(parents=True, exist_ok = True) res.to_feather(outpath) return res diff --git a/dumps/pull_pushshift_comments.sh b/dumps/pull_pushshift_comments.sh index 3f6d2c9..40d82d8 100755 --- a/dumps/pull_pushshift_comments.sh +++ b/dumps/pull_pushshift_comments.sh @@ -8,7 +8,5 @@ wget -r --no-parent -A 'RC_201*.bz2' -U $user_agent -P $output_dir -nd -nc $base wget -r --no-parent -A 'RC_201*.xz' -U $user_agent -P $output_dir -nd -nc $base_url wget -r --no-parent -A 'RC_201*.zst' -U $user_agent -P $output_dir -nd -nc $base_url -# starting in 2020 we use daily dumps not monthly dumps -wget -r --no-parent -A 'RC_202*.gz' -U $user_agent -P $output_dir -nd -nc $base_url/daily/ ./check_comments_shas.py diff --git a/similarities/cosine_similarities.py b/similarities/cosine_similarities.py index 98f1454..b9bab17 100644 --- a/similarities/cosine_similarities.py +++ b/similarities/cosine_similarities.py @@ -4,44 +4,49 @@ from pathlib import Path from similarities_helper import similarities, column_similarities from functools import partial -def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'): +def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'): - return similarities(inpath=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) + return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) # change so that these take in an input as an optional argument (for speed, but also for idf). 
def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): - return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', +def term_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): + + return cosine_similarities(infile, 'term', outfile, min_df, max_df, included_subreddits, topN, + exclude_phrases, from_date, to_date ) -def author_cosine_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None): - return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', +def author_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None): + return cosine_similarities(infile, 'author', outfile, min_df, max_df, included_subreddits, topN, + exclude_phrases=False, from_date=from_date, to_date=to_date ) -def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None): - return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', +def author_tf_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None): + return cosine_similarities(infile, 'author', outfile, min_df, max_df, included_subreddits, topN, + exclude_phrases=False, from_date=from_date, to_date=to_date, tfidf_colname='relative_tf' diff --git a/similarities/job_script.sh b/similarities/job_script.sh index 1f363cd..0c37103 100755 --- a/similarities/job_script.sh +++ b/similarities/job_script.sh @@ -1,4 +1,4 @@ #!/usr/bin/bash start_spark_cluster.sh -singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 lsi_similarities.py author --outfile=/gscratch/comdata/output//reddit_similarity/subreddit_comment_authors_10k_LSI.feather --topN=10000 +singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname):7077 top_subreddits_by_comments.py singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py index a4983b3..13845d1 100644 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@ -97,6 +97,7 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu 'relative_tf':ds.field('relative_tf').cast('float32'), 'tf_idf':ds.field('tf_idf').cast('float32')} + df = tfidf_ds.to_table(filter=ds_filter,columns=projection) df = df.to_pandas(split_blocks=True,self_destruct=True) @@ -124,6 +125,17 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu return (df, tfidf_ds, ds_filter) + with Pool(cpu_count()) as pool: + chunks = pool.imap_unordered(pull_names,batches) + subreddit_names = pd.concat(chunks,copy=False).drop_duplicates() + + subreddit_names = subreddit_names.set_index("subreddit_id") + new_ids = 
df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates() + new_ids = new_ids.set_index('subreddit_id') + subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index() + subreddit_names = subreddit_names.drop("subreddit_id",1) + subreddit_names = subreddit_names.sort_values("subreddit_id_new") + return(df, subreddit_names) def pull_names(batch): return(batch.to_pandas().drop_duplicates()) @@ -165,7 +177,6 @@ def similarities(inpath, simfunc, term_colname, outfile, min_df=None, max_df=Non print(f'computing similarities on mat. mat.shape:{mat.shape}') print(f"size of mat is:{mat.data.nbytes}",flush=True) - # transform this to debug term tfidf sims = simfunc(mat) del mat @@ -256,13 +267,12 @@ def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=196 yield (sims, n_dims) else: return sims + def column_similarities(mat): return 1 - pairwise_distances(mat,metric='cosine') -# need to rewrite this so that subreddit ids and term ids are fixed over the whole thing. -# this affords taking the LSI similarities. -# fill all 0s if we don't have it. + def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05): term = term_colname term_id = term + '_id' @@ -295,7 +305,6 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig subreddits = df.select(['subreddit']).distinct() subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.orderBy("subreddit"))) - # df = df.cache() df = df.join(subreddits,on=['subreddit']) # map terms to indexes in the tfs and the idfs diff --git a/similarities/tfidf.py b/similarities/tfidf.py index 94dcbf5..19d3013 100644 --- a/similarities/tfidf.py +++ b/similarities/tfidf.py @@ -52,7 +52,7 @@ def tfidf_terms(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/commen def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet', topN=None, - include_subreddits=None): + included_subreddits=None): return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", outpath, @@ -63,7 +63,8 @@ def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfi ) def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', - topN=25000): + topN=None, + included_subreddits=None): return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet", @@ -71,7 +72,7 @@ def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf topN, 'term', [], - included_subreddits=None + included_subreddits=included_subreddits ) diff --git a/similarities/top_subreddits_by_comments.py b/similarities/top_subreddits_by_comments.py index 1197b51..ff9293c 100644 --- a/similarities/top_subreddits_by_comments.py +++ b/similarities/top_subreddits_by_comments.py @@ -17,7 +17,7 @@ df = df.filter(~df.subreddit.like("u_%")) df = df.groupBy('subreddit').agg(f.count('id').alias("n_comments")) df = df.join(prop_nsfw,on='subreddit') -df = df.filter(df.prop_nsfw < 0.5) +#df = df.filter(df.prop_nsfw < 0.5) win = Window.orderBy(f.col('n_comments').desc()) df = df.withColumn('comments_rank', f.rank().over(win)) @@ -26,4 +26,4 @@ df = df.toPandas() df = df.sort_values("n_comments") -df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv', index=False) +df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nsfw.csv', index=False)
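
As a toy illustration (not part of the commit), the selection rule that pick_best_clustering.py implements above -- "pick the best clustering according to silhouette score subject to constraints" -- can be sketched in a few lines. The column names mirror selection_data.csv as it appears in this diff, the rows are invented, and taking .iloc[0] after the descending sort is an assumption about which row counts as "best" (the committed script itself indexes with df.shape[1]).

import pandas as pd

# Toy stand-in for selection_data.csv; column names follow the diff, values are made up.
df = pd.DataFrame({
    "name": ["run_a", "run_b", "run_c"],
    "outpath": ["/tmp/a", "/tmp/b", "/tmp/c"],
    "silhouette_score": [0.45, 0.52, 0.38],
    "n_clusters": [120, 60, 40],
    "n_isolates": [800, 4200, 9000],
    "min_cluster_size": [2, 2, 3],
})

# Rank candidate clusterings by silhouette score, best first.
df = df.sort_values("silhouette_score", ascending=False)

# Constraints matching the Makefile flags above: --min_clusters=50 --max_isolates=5000 --min_cluster_size=2
ok = (df.n_isolates <= 5000) & (df.n_clusters >= 50) & (df.min_cluster_size == 2)

best = df[ok].iloc[0]  # assumed: top-scoring row that satisfies every constraint (run_b here)
print(best.to_dict())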
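
Similarly, a minimal sketch (again not part of the commit) of what overlap_density() in density/overlap_density.py computes: zero the self-overlap diagonal of a square subreddit-by-subreddit matrix, then aggregate each column (pd.DataFrame.sum by default, per the --agg flag above) into one density score per subreddit. The matrix and subreddit names below are invented for illustration.

import numpy as np
import pandas as pd

# Toy square overlap matrix; the real pipeline reads this from a feather file.
mat = pd.DataFrame(
    [[1.0, 0.2, 0.1],
     [0.2, 1.0, 0.4],
     [0.1, 0.4, 1.0]],
    columns=["askreddit", "science", "politics"],
)

np.fill_diagonal(mat.values, 0)          # drop self-overlap, as overlap_density() does
density = mat.sum(axis=0).reset_index()  # one aggregated score per focal subreddit
density.columns = ["subreddit", "overlap_density"]
print(density)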