timeseries/choose_clusters.py

   1 from pyarrow import dataset as ds
   2 import numpy as np
   3 import pandas as pd
   4 import plotnine as pn
   5 random = np.random.RandomState(1968)
   6
   7 def load_densities(term_density_file="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather",
   8                    author_density_file="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather"):
   9
  10     term_density = pd.read_feather(term_density_file)
  11     author_density = pd.read_feather(author_density_file)
  12
  13     term_density.rename({'overlap_density':'term_density','index':'subreddit'},axis='columns',inplace=True)
  14     author_density.rename({'overlap_density':'author_density','index':'subreddit'},axis='columns',inplace=True)
  15
  16     density = term_density.merge(author_density,on='subreddit',how='inner')
  17
  18     return density
  19
  20 def load_clusters(term_clusters_file="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather",
  21                   author_clusters_file="/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather"):
  22     term_clusters = pd.read_feather(term_clusters_file)
  23     author_clusters = pd.read_feather(author_clusters_file)
  24
  25     # rename, join and return
  26     term_clusters.rename({'cluster':'term_cluster'},axis='columns',inplace=True)
  27     author_clusters.rename({'cluster':'author_cluster'},axis='columns',inplace=True)
  28
  29     clusters = term_clusters.merge(author_clusters,on='subreddit',how='inner')
  30
  31     return clusters
  32
  33 if __name__ == '__main__':
  34
  35     df = load_densities()
  36     cl = load_clusters()
  37
  38     df['td_rank'] = df.term_density.rank()
  39     df['ad_rank'] = df.author_density.rank()
  40
  41     df['td_percentile'] = df.td_rank / df.shape[0]
  42     df['ad_percentile'] = df.ad_rank / df.shape[0]
  43
  44     df = df.merge(cl, on='subreddit',how='inner')
  45
  46     term_cluster_density = df.groupby('term_cluster').agg({'td_rank':['mean','min','max'],
  47                                                          'ad_rank':['mean','min','max'],
  48                                                          'td_percentile':['mean','min','max'],
  49                                                            'ad_percentile':['mean','min','max'],
  50                                                            'subreddit':['count']})
  51
  52
  53     author_cluster_density = df.groupby('author_cluster').agg({'td_rank':['mean','min','max'],
  54                                                          'ad_rank':['mean','min','max'],
  55                                                          'td_percentile':['mean','min','max'],
  56                                                            'ad_percentile':['mean','min','max'],
  57                                                            'subreddit':['count']})
  58
  59     # which clusters have the most term_density?
  60     term_cluster_density.iloc[term_cluster_density.td_rank['mean'].sort_values().index]
  61
  62     # which clusters have the most author_density?
  63     term_cluster_density.iloc[term_cluster_density.ad_rank['mean'].sort_values(ascending=False).index].loc[term_cluster_density.subreddit['count'] >= 5][0:20]
  64
  65     high_density_term_clusters = term_cluster_density.loc[(term_cluster_density.td_percentile['mean'] > 0.75) & (term_cluster_density.subreddit['count'] > 5)]
  66
  67     # let's just use term density instead of author density for now. We can do a second batch with author density next.
  68     chosen_clusters = high_density_term_clusters.sample(3,random_state=random)
  69
  70     cluster_info = df.loc[df.term_cluster.isin(chosen_clusters.index.values)]
  71
  72     chosen_subreddits = cluster_info.subreddit.values
  73
  74     dataset = ds.dataset("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet",format='parquet')
  75     comments = dataset.to_table(filter=ds.field("subreddit").isin(chosen_subreddits),columns=['id','subreddit','author','CreatedAt'])
  76
  77     comments = comments.to_pandas()
  78
  79     comments['week'] = comments.CreatedAt.dt.date - pd.to_timedelta(comments['CreatedAt'].dt.dayofweek, unit='d')
  80
  81     author_timeseries = comments.loc[:,['subreddit','author','week']].drop_duplicates().groupby(['subreddit','week']).count().reset_index()
  82
  83     for clid in chosen_clusters.index.values:
  84
  85         ts = pd.read_feather(f"data/ts_term_cluster_{clid}.feather")
  86
  87         pn.options.figure_size = (11.7,8.27)
  88         p = pn.ggplot(ts)
  89         p = p + pn.geom_line(pn.aes('week','value',group='subreddit'))
  90         p = p + pn.facet_wrap('~ subreddit')
  91         p.save(f"plots/ts_term_cluster_{clid}.png")
  92
  93
  94         fig, ax = pyplot.subplots(figsize=(11.7,8.27))
  95         g = sns.FacetGrid(ts,row='subreddit')
  96         g.map_dataframe(sns.scatterplot,'week','value',data=ts,ax=ax)