from pathlib import Path
import pandas as pd
import plotnine as pn
from clustering.fit_tsne import fit_tsne
from visualization.tsne_vis import build_visualization
df = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv", index_col=0)
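# columns used below: silhouette_score, n_isolates, n_clusters,
# cluster_selection_method, min_cluster_size, outpath, name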
# plot silhouette_score as a function of isolates
df = df.sort_values("silhouette_score")
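# n_isolates appears to have been saved as the string repr of a pandas object
# (e.g. "...\n0    1234"), so split on "\n0" and parse the integer that follows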
df['n_isolates'] = df.n_isolates.str.split("\n0").apply(lambda rg: int(rg[1]))
p = pn.ggplot(df, pn.aes(x='n_isolates', y='silhouette_score')) + pn.geom_point()
p.save("isolates_x_score.png")

p = pn.ggplot(df, pn.aes(y='n_clusters', x='n_isolates', color='silhouette_score')) + pn.geom_point()
p.save("clusters_x_isolates.png")
# the best result for hdbscan seems like this one: it has a decent number of
# clusters and few isolates. I think I prefer the 'eom' cluster selection method
# because larger clusters are less likely to suffer from omitted variables.
# df is sorted ascending by silhouette_score, so the last matching row scores highest
best_eom = df[(df.n_isolates < 5000) & (df.silhouette_score > 0.4) & (df.cluster_selection_method == 'eom') & (df.min_cluster_size == 2)].iloc[-1]
best_leaf = df[(df.n_isolates < 5000) & (df.silhouette_score > 0.4) & (df.cluster_selection_method == 'leaf') & (df.min_cluster_size == 2)].iloc[-1]
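# optional sanity check: inspect the selected runs (assumes the columns listed above)
print(best_eom[['name', 'silhouette_score', 'n_clusters', 'n_isolates']])
print(best_leaf[['name', 'silhouette_score', 'n_clusters', 'n_isolates']])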
tsne_data = Path("./clustering/authors-tf_lsi850_tsne.feather")
if not tsne_data.exists():
    # assuming fit_tsne takes (similarities_path, output_path) and writes the embedding to tsne_data
    fit_tsne("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather",
             tsne_data)
build_visualization(tsne_data,
                    Path(best_eom.outpath) / (best_eom['name'] + '.feather'),
                    "./authors-tf_lsi850_best_eom.html")
build_visualization(tsne_data,
                    Path(best_leaf.outpath) / (best_leaf['name'] + '.feather'),
                    "./authors-tf_lsi850_best_leaf.html")