2 from pandas.core.groupby import DataFrameGroupBy as GroupBy
6 def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
7 df = pd.read_feather(inpath)
8 df = df.drop('subreddit',1)
9 np.fill_diagonal(df.values,0)
10 df = agg(df, 0).reset_index()
11 df = df.rename({0:'overlap_density'},axis='columns')
12 df.to_feather(outpath)
15 def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
16 df = pd.read_parquet(inpath)
17 # exclude the diagonal
18 df = df.loc[df.subreddit != df.variable]
19 res = agg(df.groupby(['subreddit','week'])).reset_index()
20 res.to_feather(outpath)
def author_overlap_density(
    inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather",
    outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather",
    agg=pd.DataFrame.sum,
):
    """Run ``overlap_density`` with the comment-author file paths as defaults.

    Args:
        inpath: Feather file to read.
        outpath: Feather file to write.
        agg: Aggregation callable forwarded unchanged to ``overlap_density``.
    """
    overlap_density(inpath=inpath, outpath=outpath, agg=agg)
def term_overlap_density(
    inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather",
    outpath="/gscratch/comdata/output/reddit_density/comment_term_similarity_10000.feather",
    agg=pd.DataFrame.sum,
):
    """Run ``overlap_density`` with the comment-term file paths as defaults.

    Args:
        inpath: Feather file to read.
        outpath: Feather file to write.
        agg: Aggregation callable forwarded unchanged to ``overlap_density``.
    """
    overlap_density(inpath=inpath, outpath=outpath, agg=agg)
def author_overlap_density_weekly(
    inpath="/gscratch/comdata/output/reddit_similarity/subreddit_authors_10000_weekly.parquet",
    outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000_weekly.feather",
    agg=GroupBy.sum,
):
    """Run ``overlap_density_weekly`` with the weekly author paths as defaults.

    Args:
        inpath: Parquet file to read.
        outpath: Feather file to write.
        agg: Aggregation callable forwarded unchanged to
            ``overlap_density_weekly``.
    """
    overlap_density_weekly(inpath=inpath, outpath=outpath, agg=agg)
def term_overlap_density_weekly(
    inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet",
    outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000_weekly.parquet",
    agg=GroupBy.sum,
):
    """Run ``overlap_density_weekly`` with the weekly term paths as defaults.

    Args:
        inpath: Parquet file to read.
        outpath: File to write.
        agg: Aggregation callable forwarded unchanged to
            ``overlap_density_weekly``.
    """
    # NOTE(review): the default outpath ends in ".parquet", but
    # overlap_density_weekly writes its result with to_feather -- confirm
    # which format downstream readers of this path expect.
    overlap_density_weekly(inpath=inpath, outpath=outpath, agg=agg)
if __name__ == "__main__":
    # Map each command-line sub-command name to the function it runs;
    # fire derives the CLI flags from each function's parameters.
    commands = {
        'authors': author_overlap_density,
        'terms': term_overlap_density,
        'author_weekly': author_overlap_density_weekly,
        'term_weekly': term_overlap_density_weekly,
    }
    fire.Fire(commands)