2 from pandas.core.groupby import DataFrameGroupBy as GroupBy
7 sys.path.append("../similarities")
8 from similarities.similarities_helper import read_tfidf_matrix, reindex_tfidf, reindex_tfidf_time_interval
# Overlap density: the mean of the ratio of the overlap to the focal size,
# i.e. the mean shared membership per focal community member.
# The input is the author tf-idf matrix.
14 def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
15 df = pd.read_feather(inpath)
16 df = df.drop('subreddit',1)
17 np.fill_diagonal(df.values,0)
18 df = agg(df, 0).reset_index()
19 df = df.rename({0:'overlap_density'},axis='columns')
20 df.to_feather(outpath)
23 def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
24 df = pd.read_parquet(inpath)
25 # exclude the diagonal
26 df = df.loc[df.subreddit != df.variable]
27 res = agg(df.groupby(['subreddit','week'])).reset_index()
28 res.to_feather(outpath)
32 # inpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet";
34 # included_subreddits=None;
36 # outpath="/gscratch/comdata/output/reddit_density/wang_overlaps_10000.feather"
def author_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather",
                           outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", agg=pd.DataFrame.sum):
    """Run ``overlap_density`` with default paths for the comment-author files.

    Args:
        inpath: feather file to read.
        outpath: feather file to write.
        agg: aggregation callable forwarded unchanged to ``overlap_density``.
    """
    overlap_density(inpath=inpath, outpath=outpath, agg=agg)
def term_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather",
                         outpath="/gscratch/comdata/output/reddit_density/comment_term_similarity_10000.feather", agg=pd.DataFrame.sum):
    """Run ``overlap_density`` with default paths for the comment-term files.

    Args:
        inpath: feather file to read.
        outpath: feather file to write.
        agg: aggregation callable forwarded unchanged to ``overlap_density``.
    """
    overlap_density(inpath=inpath, outpath=outpath, agg=agg)
def author_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/subreddit_authors_10000_weekly.parquet",
                                  outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000_weekly.feather", agg=GroupBy.sum):
    """Run ``overlap_density_weekly`` with default paths for the weekly author files.

    Args:
        inpath: parquet file to read.
        outpath: feather file to write.
        agg: aggregation callable forwarded unchanged to
            ``overlap_density_weekly``.
    """
    overlap_density_weekly(inpath=inpath, outpath=outpath, agg=agg)
def term_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet",
                                outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000_weekly.parquet", agg=GroupBy.sum):
    """Run ``overlap_density_weekly`` with default paths for the weekly term files.

    Args:
        inpath: parquet file to read.
        outpath: file to write.
        agg: aggregation callable forwarded unchanged to
            ``overlap_density_weekly``.
    """
    # NOTE(review): the default outpath ends in ".parquet", but
    # overlap_density_weekly writes with to_feather -- confirm which format
    # downstream readers expect.
    overlap_density_weekly(inpath=inpath, outpath=outpath, agg=agg)
if __name__ == "__main__":
    # Command-line entry point: each key is a subcommand name and each value
    # is the function that subcommand runs.
    commands = {
        'authors': author_overlap_density,
        'terms': term_overlap_density,
        'author_weekly': author_overlap_density_weekly,
        'term_weekly': term_overlap_density_weekly,
        # NOTE(review): wang_overlap_density is not defined in the part of the
        # file visible here -- confirm it exists, otherwise building this
        # dict raises NameError.
        'wang_overlaps': wang_overlap_density,
    }
    fire.Fire(commands)