density/overlap_density.py

   1 import pandas as pd
   2 from pandas.core.groupby import DataFrameGroupBy as GroupBy
   3 import fire
   4 import numpy as np
   5 import sys
   6 sys.path.append("..")
   7 sys.path.append("../similarities")
   8 from similarities.similarities_helper import read_tfidf_matrix, reindex_tfidf, reindex_tfidf_time_interval
   9
  10 # this is the mean of the ratio of the overlap to the focal size.
  11 # mean shared membership per focal community member
  12 # the input is the author tf-idf matrix
  13
  14 def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
  15     df = pd.read_feather(inpath)
  16     df = df.drop('subreddit',1)
  17     np.fill_diagonal(df.values,0)
  18     df = agg(df, 0).reset_index()
  19     df = df.rename({0:'overlap_density'},axis='columns')
  20     df.to_feather(outpath)
  21     return df
  22
  23 def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
  24     df = pd.read_parquet(inpath)
  25     # exclude the diagonal
  26     df = df.loc[df.subreddit != df.variable]
  27     res = agg(df.groupby(['subreddit','week'])).reset_index()
  28     res.to_feather(outpath)
  29     return res
  30
  31
  32 # inpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet";
  33 # min_df=1;
  34 # included_subreddits=None;
  35 # topN=10000;
  36 # outpath="/gscratch/comdata/output/reddit_density/wang_overlaps_10000.feather"
  37
  38 # to_date=2019-10-28
  39
  40
  41 def author_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather",
  42                            outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", agg=pd.DataFrame.sum):
  43     if type(agg) == str:
  44         agg = eval(agg)
  45
  46     overlap_density(inpath, outpath, agg)
  47
  48 def term_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather",
  49                          outpath="/gscratch/comdata/output/reddit_density/comment_term_similarity_10000.feather", agg=pd.DataFrame.sum):
  50
  51     if type(agg) == str:
  52         agg = eval(agg)
  53
  54     overlap_density(inpath, outpath, agg)
  55
  56 def author_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/subreddit_authors_10000_weekly.parquet",
  57                                   outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000_weekly.feather", agg=GroupBy.sum):
  58     if type(agg) == str:
  59         agg = eval(agg)
  60
  61     overlap_density_weekly(inpath, outpath, agg)
  62
  63 def term_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet",
  64                                 outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000_weekly.parquet", agg=GroupBy.sum):
  65     if type(agg) == str:
  66         agg = eval(agg)
  67
  68     overlap_density_weekly(inpath, outpath, agg)
  69
  70
  71 if __name__ == "__main__":
  72     fire.Fire({'authors':author_overlap_density,
  73                'terms':term_overlap_density,
  74                'author_weekly':author_overlap_density_weekly,
  75                'term_weekly':term_overlap_density_weekly,
  76                'wang_overlaps':wang_overlap_density})