X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/56269deee3d33620550d67bdd3c1a7b64eb3f7e4..14ab979f5910637809dec24617276eb7bd0d9554:/density/overlap_density.py diff --git a/density/overlap_density.py b/density/overlap_density.py index 2bddb8b..5a8e91a 100644 --- a/density/overlap_density.py +++ b/density/overlap_density.py @@ -2,6 +2,14 @@ import pandas as pd from pandas.core.groupby import DataFrameGroupBy as GroupBy import fire import numpy as np +import sys +sys.path.append("..") +sys.path.append("../similarities") +from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval + +# this is the mean of the ratio of the overlap to the focal size. +# mean shared membership per focal community member +# the input is the author tf-idf matrix def overlap_density(inpath, outpath, agg = pd.DataFrame.sum): df = pd.read_feather(inpath) @@ -20,6 +28,16 @@ def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum): res.to_feather(outpath) return res + +# inpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet"; +# min_df=1; +# included_subreddits=None; +# topN=10000; +# outpath="/gscratch/comdata/output/reddit_density/wang_overlaps_10000.feather" + +# to_date=2019-10-28 + + def author_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather", outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", agg=pd.DataFrame.sum): if type(agg) == str: @@ -55,3 +73,4 @@ if __name__ == "__main__": 'terms':term_overlap_density, 'author_weekly':author_overlap_density_weekly, 'term_weekly':term_overlap_density_weekly}) +