]> code.communitydata.science - cdsc_reddit.git/blob - density/overlap_density.py
Some improvements to run affinity clustering on larger dataset and
[cdsc_reddit.git] / density / overlap_density.py
1 import pandas as pd
2 from pandas.core.groupby import DataFrameGroupBy as GroupBy
3 import fire
4 import numpy as np
5
6 def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
7     df = pd.read_feather(inpath)
8     df = df.drop('subreddit',1)
9     np.fill_diagonal(df.values,0)
10     df = agg(df, 0).reset_index()
11     df = df.rename({0:'overlap_density'},axis='columns')
12     df.to_feather(outpath)
13     return df
14
15 def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
16     df = pd.read_parquet(inpath)
17     # exclude the diagonal
18     df = df.loc[df.subreddit != df.variable]
19     res = agg(df.groupby(['subreddit','week'])).reset_index()
20     res.to_feather(outpath)
21     return res
22
def author_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather",
                           outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", agg=pd.DataFrame.sum):
    """Run ``overlap_density`` with the author-similarity default paths.

    Args:
        inpath: Input feather file handed to ``overlap_density``.
        outpath: Output feather file handed to ``overlap_density``.
        agg: Aggregation callable, or a string holding a Python expression
            that evaluates to one (e.g. ``"pd.DataFrame.mean"``), which is
            how a value typed on the command line arrives.
    """
    if isinstance(agg, str):
        # NOTE(security): eval() executes arbitrary code. This is tolerable
        # only while `agg` comes from the operator's own command line; do
        # not route untrusted input here.
        agg = eval(agg)

    overlap_density(inpath, outpath, agg)
29
def term_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather",
                         outpath="/gscratch/comdata/output/reddit_density/comment_term_similarity_10000.feather", agg=pd.DataFrame.sum):
    """Run ``overlap_density`` with the term-similarity default paths.

    Args:
        inpath: Input feather file handed to ``overlap_density``.
        outpath: Output feather file handed to ``overlap_density``.
        agg: Aggregation callable, or a string holding a Python expression
            that evaluates to one (e.g. ``"pd.DataFrame.mean"``), which is
            how a value typed on the command line arrives.
    """
    if isinstance(agg, str):
        # NOTE(security): eval() executes arbitrary code. This is tolerable
        # only while `agg` comes from the operator's own command line; do
        # not route untrusted input here.
        agg = eval(agg)

    overlap_density(inpath, outpath, agg)
37
def author_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/subreddit_authors_10000_weekly.parquet",
                                  outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000_weekly.feather", agg=GroupBy.sum):
    """Run ``overlap_density_weekly`` with the weekly author default paths.

    Args:
        inpath: Input parquet file handed to ``overlap_density_weekly``.
        outpath: Output feather file handed to ``overlap_density_weekly``.
        agg: Aggregation callable for the grouped frame, or a string holding
            a Python expression that evaluates to one (e.g.
            ``"GroupBy.mean"``), which is how a value typed on the command
            line arrives.
    """
    if isinstance(agg, str):
        # NOTE(security): eval() executes arbitrary code. This is tolerable
        # only while `agg` comes from the operator's own command line; do
        # not route untrusted input here.
        agg = eval(agg)

    overlap_density_weekly(inpath, outpath, agg)
44
def term_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet",
                                outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000_weekly.parquet", agg=GroupBy.sum):
    """Run ``overlap_density_weekly`` with the weekly term default paths.

    Args:
        inpath: Input parquet file handed to ``overlap_density_weekly``.
        outpath: Output file handed to ``overlap_density_weekly``.
        agg: Aggregation callable for the grouped frame, or a string holding
            a Python expression that evaluates to one (e.g.
            ``"GroupBy.mean"``), which is how a value typed on the command
            line arrives.
    """
    # NOTE(review): the default `outpath` ends in ".parquet", but
    # overlap_density_weekly writes its result with to_feather() -- confirm
    # which format downstream readers expect before changing either side.
    if isinstance(agg, str):
        # NOTE(security): eval() executes arbitrary code. This is tolerable
        # only while `agg` comes from the operator's own command line; do
        # not route untrusted input here.
        agg = eval(agg)

    overlap_density_weekly(inpath, outpath, agg)
51
52
if __name__ == "__main__":
    # Sub-command name -> handler; python-fire exposes each entry as a CLI
    # command whose flags mirror the handler's keyword arguments.
    commands = {
        'authors': author_overlap_density,
        'terms': term_overlap_density,
        'author_weekly': author_overlap_density_weekly,
        'term_weekly': term_overlap_density_weekly,
    }
    fire.Fire(commands)

Community Data Science Collective || Want to submit a patch?