]> code.communitydata.science - cdsc_reddit.git/blob - density/overlap_density.py
add note to try other tf normalization strategies.
[cdsc_reddit.git] / density / overlap_density.py
1 import pandas as pd
2 from pandas.core.groupby import DataFrameGroupBy as GroupBy
3 import fire
4 import numpy as np
5 import sys
6 sys.path.append("..")
7 sys.path.append("../similarities")
8 from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval
9
10 # Overlap density: the mean of the ratio of the overlap to the focal size,
11 # i.e. the mean shared membership per focal community member.
12 # The input is the author tf-idf matrix. Note: the default `agg` used below is a sum, not a mean.
13
14 def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
15     df = pd.read_feather(inpath)
16     df = df.drop('subreddit',1)
17     np.fill_diagonal(df.values,0)
18     df = agg(df, 0).reset_index()
19     df = df.rename({0:'overlap_density'},axis='columns')
20     df.to_feather(outpath)
21     return df
22
23 def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
24     df = pd.read_parquet(inpath)
25     # exclude the diagonal
26     df = df.loc[df.subreddit != df.variable]
27     res = agg(df.groupby(['subreddit','week'])).reset_index()
28     res.to_feather(outpath)
29     return res
30
31
32 # inpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet";
33 # min_df=1;
34 # included_subreddits=None;
35 # topN=10000;
36 # outpath="/gscratch/comdata/output/reddit_density/wang_overlaps_10000.feather"
37
38 # to_date=2019-10-28
39
40
def author_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather",
                           outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", agg=pd.DataFrame.sum):
    """Command-line entry point: run ``overlap_density`` with author-file defaults.

    Args:
        inpath: feather file to read.
        outpath: feather file to write.
        agg: aggregation callable, or a string holding a Python expression
            that evaluates to one (e.g. ``"pd.DataFrame.mean"``), which is
            how the value arrives from the command line.
    """
    # NOTE(security): ``eval`` executes arbitrary Python.  Only call this
    # with trusted input; do not expose ``agg`` to untrusted callers.
    if isinstance(agg, str):
        agg = eval(agg)

    overlap_density(inpath, outpath, agg)
47
def term_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather",
                         outpath="/gscratch/comdata/output/reddit_density/comment_term_similarity_10000.feather", agg=pd.DataFrame.sum):
    """Command-line entry point: run ``overlap_density`` with term-file defaults.

    Args:
        inpath: feather file to read.
        outpath: feather file to write.
        agg: aggregation callable, or a string holding a Python expression
            that evaluates to one (e.g. ``"pd.DataFrame.mean"``), which is
            how the value arrives from the command line.
    """
    # NOTE(security): ``eval`` executes arbitrary Python.  Only call this
    # with trusted input; do not expose ``agg`` to untrusted callers.
    if isinstance(agg, str):
        agg = eval(agg)

    overlap_density(inpath, outpath, agg)
55
def author_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/subreddit_authors_10000_weekly.parquet",
                                  outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000_weekly.feather", agg=GroupBy.sum):
    """Command-line entry point: run ``overlap_density_weekly`` with author-file defaults.

    Args:
        inpath: parquet file to read.
        outpath: feather file to write.
        agg: aggregation callable applied to the groupby object, or a
            string holding a Python expression that evaluates to one
            (e.g. ``"GroupBy.mean"``), which is how the value arrives from
            the command line.
    """
    # NOTE(security): ``eval`` executes arbitrary Python.  Only call this
    # with trusted input; do not expose ``agg`` to untrusted callers.
    if isinstance(agg, str):
        agg = eval(agg)

    overlap_density_weekly(inpath, outpath, agg)
62
def term_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet",
                                outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000_weekly.parquet", agg=GroupBy.sum):
    """Command-line entry point: run ``overlap_density_weekly`` with term-file defaults.

    Args:
        inpath: parquet file to read.
        outpath: file to write.  NOTE(review): the default name ends in
            ``.parquet`` but ``overlap_density_weekly`` writes with
            ``to_feather``, so the file content is feather.  The default is
            left unchanged so existing consumers keep working -- confirm
            which format downstream readers expect.
        agg: aggregation callable applied to the groupby object, or a
            string holding a Python expression that evaluates to one
            (e.g. ``"GroupBy.mean"``), which is how the value arrives from
            the command line.
    """
    # NOTE(security): ``eval`` executes arbitrary Python.  Only call this
    # with trusted input; do not expose ``agg`` to untrusted callers.
    if isinstance(agg, str):
        agg = eval(agg)

    overlap_density_weekly(inpath, outpath, agg)
69
70
if __name__ == "__main__":
    # Map each command-line sub-command name to the function that handles it.
    commands = {
        'authors': author_overlap_density,
        'terms': term_overlap_density,
        'author_weekly': author_overlap_density_weekly,
        'term_weekly': term_overlap_density_weekly,
    }
    fire.Fire(commands)
76

Community Data Science Collective || Want to submit a patch?