]> code.communitydata.science - cdsc_reddit.git/blob - clustering/selection.py
lsi support for weekly similarities
[cdsc_reddit.git] / clustering / selection.py
import pandas as pd
import plotnine as pn
from pathlib import Path
from clustering.fit_tsne import fit_tsne
from visualization.tsne_vis import build_visualization

# Script: inspect the HDBSCAN model-selection table for the LSI author
# similarities, plot isolates against silhouette score / cluster count, pick
# one 'eom' and one 'leaf' clustering, and render a t-SNE visualization of each.

df = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv",index_col=0)

# plot silhouette_score as a function of isolates
df = df.sort_values("silhouette_score")

# n_isolates is stored as text in the CSV; the count is whatever follows the
# first "\n0" in the string (presumably a stringified pandas object -- confirm
# against the code that writes selection_data.csv).
df['n_isolates'] = df.n_isolates.str.split("\n0").apply(lambda rg: int(rg[1]))

p = pn.ggplot(df,pn.aes(x='n_isolates',y='silhouette_score')) + pn.geom_point()
p.save("isolates_x_score.png")

p = pn.ggplot(df,pn.aes(y='n_clusters',x='n_isolates',color='silhouette_score')) + pn.geom_point()
p.save("clusters_x_isolates.png")

# Candidate runs: few isolates, reasonable silhouette score, smallest
# min_cluster_size. (The original note here was left unfinished.)
# I think I prefer the 'eom' clustering style because larger clusters are less
# likely to suffer from omitted variables.
candidates = (df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.min_cluster_size==2)

# NOTE(review): the row position used below is df.shape[1], i.e. the number of
# COLUMNS in df. That looks accidental and raises IndexError when a filtered
# frame has too few rows. Since df is sorted ascending by silhouette_score,
# .iloc[-1] (highest score) may be what was intended -- confirm before changing.
row_position = df.shape[1]

best_eom = df[candidates&(df.cluster_selection_method=='eom')].iloc[row_position]

# Previously assigned to `best_lsi` while the code below read `best_leaf`,
# which raised NameError; the name now matches its use.
best_leaf = df[candidates&(df.cluster_selection_method=='leaf')].iloc[row_position]

tsne_data = Path("./clustering/authors-tf_lsi850_tsne.feather")

# Only fit t-SNE when the cached embedding is missing.
# (Previously misspelled `tnse_data`, which raised NameError.)
if not tsne_data.exists():
    fit_tsne("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather",
             tsne_data)

build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
                    Path(best_eom.outpath)/(best_eom['name']+'.feather'),
                    "./authors-tf_lsi850_best_eom.html")

build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
                    Path(best_leaf.outpath)/(best_leaf['name']+'.feather'),
                    "./authors-tf_lsi850_best_leaf.html")
38

Community Data Science Collective || Want to submit a patch?