X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/b4dd9acbd8e499d87714413b8260240341ebf7d7..2d21ff1137dfaf83c5a51fdcd8900503c50a06ab:/density/overlap_density.py?ds=sidebyside diff --git a/density/overlap_density.py b/density/overlap_density.py index a1e9f6d..2036824 100644 --- a/density/overlap_density.py +++ b/density/overlap_density.py @@ -1,11 +1,12 @@ import pandas as pd from pandas.core.groupby import DataFrameGroupBy as GroupBy +from pathlib import Path import fire import numpy as np import sys sys.path.append("..") sys.path.append("../similarities") -from similarities.similarities_helper import read_tfidf_matrix, reindex_tfidf, reindex_tfidf_time_interval +from similarities.similarities_helper import reindex_tfidf # this is the mean of the ratio of the overlap to the focal size. # mean shared membership per focal community member @@ -13,10 +14,12 @@ from similarities.similarities_helper import read_tfidf_matrix, reindex_tfidf, r def overlap_density(inpath, outpath, agg = pd.DataFrame.sum): df = pd.read_feather(inpath) - df = df.drop('subreddit',1) + df = df.drop('_subreddit',1) np.fill_diagonal(df.values,0) df = agg(df, 0).reset_index() df = df.rename({0:'overlap_density'},axis='columns') + outpath = Path(outpath) + outpath.parent.mkdir(parents=True, exist_ok = True) df.to_feather(outpath) return df @@ -25,6 +28,8 @@ def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum): # exclude the diagonal df = df.loc[df.subreddit != df.variable] res = agg(df.groupby(['subreddit','week'])).reset_index() + outpath = Path(outpath) + outpath.parent.mkdir(parents=True, exist_ok = True) res.to_feather(outpath) return res @@ -72,5 +77,5 @@ if __name__ == "__main__": fire.Fire({'authors':author_overlap_density, 'terms':term_overlap_density, 'author_weekly':author_overlap_density_weekly, - 'term_weekly':term_overlap_density_weekly, - 'wang_overlaps':wang_overlap_density}) + 'term_weekly':term_overlap_density_weekly}) +