density/overlap_density.py

   1 import pandas as pd
   2 from pandas.core.groupby import DataFrameGroupBy as GroupBy
   3 import fire
   4 import numpy as np
   5
   6 def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
   7     df = pd.read_feather(inpath)
   8     df = df.drop('subreddit',1)
   9     np.fill_diagonal(df.values,0)
  10     df = agg(df, 0).reset_index()
  11     df = df.rename({0:'overlap_density'},axis='columns')
  12     df.to_feather(outpath)
  13     return df
  14
  15 def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
  16     df = pd.read_parquet(inpath)
  17     # exclude the diagonal
  18     df = df.loc[df.subreddit != df.variable]
  19     res = agg(df.groupby(['subreddit','week'])).reset_index()
  20     res.to_feather(outpath)
  21     return res
  22
  23 def author_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather",
  24                            outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", agg=pd.DataFrame.sum):
  25     if type(agg) == str:
  26         agg = eval(agg)
  27
  28     overlap_density(inpath, outpath, agg)
  29
  30 def term_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather",
  31                          outpath="/gscratch/comdata/output/reddit_density/comment_term_similarity_10000.feather", agg=pd.DataFrame.sum):
  32
  33     if type(agg) == str:
  34         agg = eval(agg)
  35
  36     overlap_density(inpath, outpath, agg)
  37
  38 def author_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/subreddit_authors_10000_weekly.parquet",
  39                                   outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000_weekly.feather", agg=GroupBy.sum):
  40     if type(agg) == str:
  41         agg = eval(agg)
  42
  43     overlap_density_weekly(inpath, outpath, agg)
  44
  45 def term_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet",
  46                                 outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000_weekly.parquet", agg=GroupBy.sum):
  47     if type(agg) == str:
  48         agg = eval(agg)
  49
  50     overlap_density_weekly(inpath, outpath, agg)
  51
  52
  53 if __name__ == "__main__":
  54     fire.Fire({'authors':author_overlap_density,
  55                'terms':term_overlap_density,
  56                'author_weekly':author_overlap_density_weekly,
  57                'term_weekly':term_overlap_density_weekly})