]> code.communitydata.science - cdsc_reddit.git/blobdiff - density/overlap_density.py
Merge branch 'master' of code:cdsc_reddit into excise_reindex
[cdsc_reddit.git] / density / overlap_density.py
index 2bddb8bb97f3d1983a3a9ef2a1db12d82fed0667..20368249cd72c210a91e5d639213ce6edba6feef 100644 (file)
@@ -1,14 +1,25 @@
 import pandas as pd
 from pandas.core.groupby import DataFrameGroupBy as GroupBy
+from pathlib import Path
 import fire
 import numpy as np
+import sys
+sys.path.append("..")
+sys.path.append("../similarities")
+from similarities.similarities_helper import reindex_tfidf
+
+# Overlap density is the mean of the ratio of the overlap to the focal size,
+# i.e. the mean shared membership per focal community member.
+# The input is the author tf-idf matrix.
 
 def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
     df = pd.read_feather(inpath)
-    df = df.drop('subreddit',1)
+    df = df.drop('_subreddit',1)
     np.fill_diagonal(df.values,0)
     df = agg(df, 0).reset_index()
     df = df.rename({0:'overlap_density'},axis='columns')
+    outpath = Path(outpath)
+    outpath.parent.mkdir(parents=True, exist_ok = True)
     df.to_feather(outpath)
     return df
 
@@ -17,9 +28,21 @@ def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
     # exclude the diagonal
     df = df.loc[df.subreddit != df.variable]
     res = agg(df.groupby(['subreddit','week'])).reset_index()
+    outpath = Path(outpath)
+    outpath.parent.mkdir(parents=True, exist_ok = True)
     res.to_feather(outpath)
     return res
 
+
+# inpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet";
+# min_df=1;
+# included_subreddits=None;
+# topN=10000;
+# outpath="/gscratch/comdata/output/reddit_density/wang_overlaps_10000.feather"
+
+# to_date=2019-10-28
+
+
 def author_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather",
                            outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", agg=pd.DataFrame.sum):
     if type(agg) == str:
@@ -55,3 +78,4 @@ if __name__ == "__main__":
                'terms':term_overlap_density,
                'author_weekly':author_overlap_density_weekly,
                'term_weekly':term_overlap_density_weekly})
+

Community Data Science Collective || Want to submit a patch?