X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/56269deee3d33620550d67bdd3c1a7b64eb3f7e4..refs/heads/synced/excise_reindex:/density/overlap_density.py diff --git a/density/overlap_density.py b/density/overlap_density.py index 2bddb8b..ef0eb26 100644 --- a/density/overlap_density.py +++ b/density/overlap_density.py @@ -1,14 +1,25 @@ import pandas as pd from pandas.core.groupby import DataFrameGroupBy as GroupBy +from pathlib import Path import fire import numpy as np +import sys +# sys.path.append("..") +# sys.path.append("../similarities") +# from similarities.similarities_helper import pull_tfidf + +# this is the mean of the ratio of the overlap to the focal size. +# mean shared membership per focal community member +# the input is the author tf-idf matrix def overlap_density(inpath, outpath, agg = pd.DataFrame.sum): df = pd.read_feather(inpath) - df = df.drop('subreddit',1) + df = df.drop('_subreddit',1) np.fill_diagonal(df.values,0) df = agg(df, 0).reset_index() df = df.rename({0:'overlap_density'},axis='columns') + outpath = Path(outpath) + outpath.parent.mkdir(parents=True, exist_ok = True) df.to_feather(outpath) return df @@ -17,9 +28,21 @@ def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum): # exclude the diagonal df = df.loc[df.subreddit != df.variable] res = agg(df.groupby(['subreddit','week'])).reset_index() + outpath = Path(outpath) + outpath.parent.mkdir(parents=True, exist_ok = True) res.to_feather(outpath) return res + +# inpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet"; +# min_df=1; +# included_subreddits=None; +# topN=10000; +# outpath="/gscratch/comdata/output/reddit_density/wang_overlaps_10000.feather" + +# to_date=2019-10-28 + + def author_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather", outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", agg=pd.DataFrame.sum): if type(agg) == str: @@ -55,3 +78,4 @@ if __name__ == "__main__": 'terms':term_overlap_density, 'author_weekly':author_overlap_density_weekly, 'term_weekly':term_overlap_density_weekly}) +