clustering/selection.py

   1 import pandas as pd
   2 import plotnine as pn
   3 from pathlib import Path
   4 from clustering.fit_tsne import fit_tsne
   5 from visualization.tsne_vis import build_visualization
   6
   7 df = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv",index_col=0)
   8
   9 # plot silhouette_score as a function of isolates
  10 df = df.sort_values("silhouette_score")
  11
  12 df['n_isolates'] = df.n_isolates.str.split("\n0").apply(lambda rg: int(rg[1]))
  13 p = pn.ggplot(df,pn.aes(x='n_isolates',y='silhouette_score')) + pn.geom_point()
  14 p.save("isolates_x_score.png")
  15
  16 p = pn.ggplot(df,pn.aes(y='n_clusters',x='n_isolates',color='silhouette_score')) + pn.geom_point()
  17 p.save("clusters_x_isolates.png")
  18
  19 # the best result for hdbscan seems like this one: it has a decent number of
  20 # i think I prefer the 'eom' clustering style because larger clusters are less likely to suffer from ommitted variables
  21 best_eom = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='eom')&(df.min_cluster_size==2)].iloc[df.shape[1]]
  22
  23 best_lsi = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='leaf')&(df.min_cluster_size==2)].iloc[df.shape[1]]
  24
  25 tsne_data = Path("./clustering/authors-tf_lsi850_tsne.feather")
  26
  27 if not tnse_data.exists():
  28     fit_tsne("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather",
  29              tnse_data)
  30
  31 build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
  32                     Path(best_eom.outpath)/(best_eom['name']+'.feather'),
  33                     "./authors-tf_lsi850_best_eom.html")
  34
  35 build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
  36                     Path(best_leaf.outpath)/(best_leaf['name']+'.feather'),
  37                     "./authors-tf_lsi850_best_leaf.html")
  38