]> code.communitydata.science - cdsc_reddit.git/blob - density/overlap_density.py
Merge branch 'master' of code:cdsc_reddit into excise_reindex
[cdsc_reddit.git] / density / overlap_density.py
1 import pandas as pd
2 from pandas.core.groupby import DataFrameGroupBy as GroupBy
3 from pathlib import Path
4 import fire
5 import numpy as np
6 import sys
7 sys.path.append("..")
8 sys.path.append("../similarities")
9 from similarities.similarities_helper import reindex_tfidf
10
11 # this is the mean of the ratio of the overlap to the focal size.
12 # mean shared membership per focal community member
13 # the input is the author tf-idf matrix
14
def overlap_density(inpath, outpath, agg=pd.DataFrame.sum):
    """Aggregate the off-diagonal entries of a similarity matrix per column.

    Reads the feather file at ``inpath``, drops its ``_subreddit`` label
    column, zeroes the main diagonal, and applies ``agg`` along axis 0 so
    that one value is produced per remaining column.  The result is written
    to ``outpath`` as a feather file and also returned.

    Args:
        inpath: path of the input feather file.  Presumably a square
            subreddit-by-subreddit similarity matrix plus a ``_subreddit``
            column -- confirm against the code that writes it.
        outpath: path of the output feather file; missing parent
            directories are created.
        agg: callable invoked as ``agg(df, 0)``; defaults to
            ``pd.DataFrame.sum``.

    Returns:
        DataFrame with an ``index`` column (the input column labels) and an
        ``overlap_density`` column (the aggregated values).
    """
    df = pd.read_feather(inpath)
    # Use the keyword form: passing axis positionally was removed in pandas 2.0.
    df = df.drop(columns='_subreddit')

    # Zero the diagonal on an explicit copy and rebuild the frame.  Writing
    # through ``df.values`` only works when pandas happens to return a
    # writable view; for mixed dtypes it is a copy (the write is silently
    # lost) and under copy-on-write it is read-only (the write raises).
    values = df.to_numpy(copy=True)
    np.fill_diagonal(values, 0)
    df = pd.DataFrame(values, index=df.index, columns=df.columns)

    df = agg(df, 0).reset_index()
    # reset_index names the aggregated column 0 when the Series is unnamed.
    df = df.rename({0: 'overlap_density'}, axis='columns')

    outpath = Path(outpath)
    outpath.parent.mkdir(parents=True, exist_ok=True)
    df.to_feather(outpath)
    return df
25
def overlap_density_weekly(inpath, outpath, agg=GroupBy.sum):
    """Aggregate long-format weekly overlaps per (subreddit, week).

    Reads the parquet file at ``inpath``, discards rows whose ``subreddit``
    equals ``variable`` (the diagonal), groups the rest by ``subreddit`` and
    ``week``, applies ``agg`` to the grouped frame, and writes the result to
    ``outpath`` as a feather file.

    Args:
        inpath: path of the input parquet file; must contain ``subreddit``,
            ``variable`` and ``week`` columns.
        outpath: path of the output feather file; missing parent
            directories are created.
        agg: callable applied to the ``DataFrameGroupBy`` object; defaults
            to ``GroupBy.sum``.

    Returns:
        The aggregated DataFrame with the group keys restored as columns.
    """
    pairs = pd.read_parquet(inpath)

    # Keep only off-diagonal rows, i.e. pairs of two different communities.
    is_off_diagonal = pairs.subreddit != pairs.variable
    grouped = pairs.loc[is_off_diagonal].groupby(['subreddit', 'week'])
    res = agg(grouped).reset_index()

    destination = Path(outpath)
    destination.parent.mkdir(parents=True, exist_ok=True)
    res.to_feather(destination)
    return res
35
36
37 # inpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet";
38 # min_df=1;
39 # included_subreddits=None;
40 # topN=10000;
41 # outpath="/gscratch/comdata/output/reddit_density/wang_overlaps_10000.feather"
42
43 # to_date=2019-10-28
44
45
def author_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather",
                           outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", agg=pd.DataFrame.sum):
    """Run overlap_density on the comment-author similarity file.

    Registered as the ``authors`` command of this script.

    Args:
        inpath: input feather file handed to ``overlap_density``.
        outpath: output feather file handed to ``overlap_density``.
        agg: aggregation callable, or a string containing a Python
            expression that evaluates to one (the form used when the
            aggregation is chosen on the command line).
    """
    if isinstance(agg, str):
        # SECURITY: eval executes arbitrary code.  This is tolerable only
        # because the string comes from the operator's own command line;
        # never route untrusted input into this parameter.
        agg = eval(agg)

    overlap_density(inpath, outpath, agg)
52
def term_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather",
                         outpath="/gscratch/comdata/output/reddit_density/comment_term_similarity_10000.feather", agg=pd.DataFrame.sum):
    """Run overlap_density on the comment-term similarity file.

    Registered as the ``terms`` command of this script.

    Args:
        inpath: input feather file handed to ``overlap_density``.
        outpath: output feather file handed to ``overlap_density``.
        agg: aggregation callable, or a string containing a Python
            expression that evaluates to one (the form used when the
            aggregation is chosen on the command line).
    """
    if isinstance(agg, str):
        # SECURITY: eval executes arbitrary code.  This is tolerable only
        # because the string comes from the operator's own command line;
        # never route untrusted input into this parameter.
        agg = eval(agg)

    overlap_density(inpath, outpath, agg)
60
def author_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/subreddit_authors_10000_weekly.parquet",
                                  outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000_weekly.feather", agg=GroupBy.sum):
    """Run overlap_density_weekly on the weekly author similarity file.

    Registered as the ``author_weekly`` command of this script.

    Args:
        inpath: input parquet file handed to ``overlap_density_weekly``.
        outpath: output feather file handed to ``overlap_density_weekly``.
        agg: aggregation callable applied to the grouped frame, or a string
            containing a Python expression that evaluates to one (the form
            used when the aggregation is chosen on the command line).
    """
    if isinstance(agg, str):
        # SECURITY: eval executes arbitrary code.  This is tolerable only
        # because the string comes from the operator's own command line;
        # never route untrusted input into this parameter.
        agg = eval(agg)

    overlap_density_weekly(inpath, outpath, agg)
67
def term_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet",
                                outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000_weekly.parquet", agg=GroupBy.sum):
    """Run overlap_density_weekly on the weekly term similarity file.

    Registered as the ``term_weekly`` command of this script.

    Args:
        inpath: input parquet file handed to ``overlap_density_weekly``.
        outpath: output file handed to ``overlap_density_weekly``.
        agg: aggregation callable applied to the grouped frame, or a string
            containing a Python expression that evaluates to one (the form
            used when the aggregation is chosen on the command line).
    """
    # NOTE(review): the default outpath ends in ".parquet", but
    # overlap_density_weekly writes with to_feather, so the file is in
    # feather format despite its extension.  Left unchanged because
    # downstream readers may rely on this exact path -- confirm and rename.
    if isinstance(agg, str):
        # SECURITY: eval executes arbitrary code.  This is tolerable only
        # because the string comes from the operator's own command line;
        # never route untrusted input into this parameter.
        agg = eval(agg)

    overlap_density_weekly(inpath, outpath, agg)
74
75
if __name__ == "__main__":
    # Map each command-line sub-command name to the function that runs it.
    commands = {
        'authors': author_overlap_density,
        'terms': term_overlap_density,
        'author_weekly': author_overlap_density_weekly,
        'term_weekly': term_overlap_density_weekly,
    }
    fire.Fire(commands)
81

Community Data Science Collective || Want to submit a patch?