2 from pandas.core.groupby import DataFrameGroupBy as GroupBy
3 from pathlib import Path
8 sys.path.append("../similarities")
9 from similarities.similarities_helper import reindex_tfidf
# Overlap density: the mean of the ratio of the overlap to the focal community's size,
# i.e. the mean shared membership per focal community member.
# The input is the author tf-idf matrix.
def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
    """Aggregate the off-diagonal entries of a similarity table, per column.

    Reads a feather file, drops its ``_subreddit`` label column, zeroes the
    main diagonal of what remains, reduces along axis 0 with ``agg`` and
    writes the result (one row per remaining column) to ``outpath`` as
    feather, creating the parent directory if needed.

    Args:
        inpath: feather file to read. Assumed to hold a square matrix whose
            rows are in the same order as its non-label columns, so that the
            diagonal is each entry's similarity with itself -- TODO confirm.
        outpath: feather file to write; missing parent directories are
            created.
        agg: callable invoked as ``agg(df, 0)`` that returns a Series
            (default ``pd.DataFrame.sum``).

    Returns:
        None. The output file has the columns produced by ``reset_index()``
        with the aggregated values renamed to ``overlap_density``.
    """
    df = pd.read_feather(inpath)
    # Pass the column by keyword: the positional ``axis`` argument of
    # DataFrame.drop was removed in pandas 2.0.
    df = df.drop(columns='_subreddit')

    # Zero the diagonal on an explicit writable copy. Mutating ``df.values``
    # in place only works when it happens to be a writable view of the frame;
    # it silently does nothing when it is a copy and raises when the array is
    # read-only (pandas Copy-on-Write).
    values = df.to_numpy(copy=True)
    np.fill_diagonal(values, 0)
    df = pd.DataFrame(values, index=df.index, columns=df.columns)

    df = agg(df, 0).reset_index()
    df = df.rename({0:'overlap_density'},axis='columns')

    outpath = Path(outpath)
    outpath.parent.mkdir(parents=True, exist_ok = True)
    df.to_feather(outpath)
def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
    """Aggregate off-diagonal similarities per (subreddit, week).

    Reads a parquet file in long format, removes rows where ``subreddit``
    equals ``variable`` (the diagonal), groups the rest by ``subreddit`` and
    ``week``, applies ``agg`` to the groups and writes the result to
    ``outpath`` as feather, creating the parent directory if needed.

    Args:
        inpath: parquet file with at least the columns ``subreddit``,
            ``variable`` and ``week``; the remaining columns are presumably
            the numeric similarity values -- TODO confirm.
        outpath: feather file to write; missing parent directories are
            created.
        agg: callable applied to the ``DataFrameGroupBy`` object
            (default ``GroupBy.sum``).

    Returns:
        None.
    """
    df = pd.read_parquet(inpath)
    # exclude the diagonal
    df = df.loc[df.subreddit != df.variable]
    # ``variable`` is only needed for the filter above. Left in the frame it
    # gets aggregated too: since pandas 2.0 GroupBy.sum defaults to
    # numeric_only=False, so the labels would be string-concatenated (or
    # raise) instead of being silently excluded as before.
    df = df.drop(columns='variable')
    res = agg(df.groupby(['subreddit','week'])).reset_index()

    outpath = Path(outpath)
    outpath.parent.mkdir(parents=True, exist_ok = True)
    res.to_feather(outpath)
37 # inpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet";
39 # included_subreddits=None;
41 # outpath="/gscratch/comdata/output/reddit_density/wang_overlaps_10000.feather"
def author_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather",
                           outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", agg=pd.DataFrame.sum):
    """Run ``overlap_density`` with default paths for the comment-author data.

    Args:
        inpath: feather file handed to ``overlap_density`` as its input.
        outpath: feather file ``overlap_density`` writes.
        agg: aggregation callable forwarded unchanged.
    """
    overlap_density(inpath=inpath, outpath=outpath, agg=agg)
def term_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather",
                         outpath="/gscratch/comdata/output/reddit_density/comment_term_similarity_10000.feather", agg=pd.DataFrame.sum):
    """Run ``overlap_density`` with default paths for the comment-term data.

    Args:
        inpath: feather file handed to ``overlap_density`` as its input.
        outpath: feather file ``overlap_density`` writes.
        agg: aggregation callable forwarded unchanged.
    """
    overlap_density(inpath=inpath, outpath=outpath, agg=agg)
def author_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/subreddit_authors_10000_weekly.parquet",
                                  outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000_weekly.feather", agg=GroupBy.sum):
    """Run ``overlap_density_weekly`` with default paths for the weekly author data.

    Args:
        inpath: parquet file handed to ``overlap_density_weekly`` as its input.
        outpath: feather file ``overlap_density_weekly`` writes.
        agg: group-wise aggregation callable forwarded unchanged.
    """
    overlap_density_weekly(inpath=inpath, outpath=outpath, agg=agg)
def term_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet",
                                outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000_weekly.parquet", agg=GroupBy.sum):
    """Run ``overlap_density_weekly`` with default paths for the weekly term data.

    NOTE(review): the default ``outpath`` ends in ``.parquet``, but
    ``overlap_density_weekly`` writes its result with ``to_feather`` --
    confirm which format downstream readers expect.

    Args:
        inpath: parquet file handed to ``overlap_density_weekly`` as its input.
        outpath: file ``overlap_density_weekly`` writes.
        agg: group-wise aggregation callable forwarded unchanged.
    """
    overlap_density_weekly(inpath=inpath, outpath=outpath, agg=agg)
if __name__ == "__main__":
    # Map each CLI sub-command name to the function python-fire should run.
    commands = {
        'authors': author_overlap_density,
        'terms': term_overlap_density,
        'author_weekly': author_overlap_density_weekly,
        'term_weekly': term_overlap_density_weekly,
    }
    fire.Fire(commands)