clustering_data=/gscratch/comdata/output/reddit_clustering
kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]"
hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf"
-affinity_selection_grid="--dampings=[0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[30]"
+affinity_selection_grid="--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]"
authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather
authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI
return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}"
-def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5]):
+def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5],n_cores=10):
"""Run affinity clustering once or more with different parameters.
Usage:
map(int,max_iters),
map(int,convergence_iters),
map(float,preference_quantiles))
- obj.run(1)
+ obj.run(n_cores)
obj.save(savefile)
def test_select_affinity_clustering():
inpath,
outpath,
self.namer,
- self.lsi_dim,
+ [self.lsi_dim],
*args,
**kwargs)
s += f"_lsi-{self.lsi_dim}"
return s
-def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all'):
+def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all',n_cores=30):
"""Run affinity clustering once or more with different parameters.
Usage:
map(int,convergence_iters),
map(float,preference_quantiles))
- obj.run(1)
+ obj.run(n_cores)
obj.save(savefile)
if __name__ == "__main__":
import pandas as pd
from dataclasses import dataclass
from sklearn.metrics import silhouette_score, silhouette_samples
+from collections import Counter
# this is meant to be an interface, not created directly
class clustering_job:
return self.result
def silhouette(self):
- isolates = self.clustering.labels_ == -1
+ counts = Counter(self.clustering.labels_)
+ singletons = [key for key, value in counts.items() if value == 1]
+ isolates = (self.clustering.labels_ == -1) | (np.isin(self.clustering.labels_,np.array(singletons)))
scoremat = self.mat[~isolates][:,~isolates]
- if scoremat.shape[0] > 0:
+ if self.n_clusters > 1:
score = silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed')
silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed')
silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp})
print(f"{n_isolates1} clusters have 1 member")
- n_isolates2 = (cluster_sizes.loc[cluster_sizes.cluster==-1,['subreddit']])
-
+ n_isolates2 = cluster_sizes.loc[cluster_sizes.cluster==-1,:]['subreddit'].to_list()
+ if len(n_isolates2) > 0:
+ n_isolates2 = n_isolates2[0]
print(f"{n_isolates2} subreddits are in cluster -1",flush=True)
if n_isolates1 == 0:
df = pd.read_feather(similarities)
n = df.shape[0]
- mat = np.array(df.drop('subreddit',1),dtype=np.float64)
+ mat = np.array(df.drop('_subreddit',1),dtype=np.float64)
mat[range(n),range(n)] = 1
mat[mat > 1] = 1
dist = 2*np.arccos(mat)/np.pi
tsne_fit_whole = tsne_fit_model.fit_transform(dist)
- plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], 'subreddit':df.subreddit})
+ plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], '_subreddit':df['_subreddit']})
plot_data.to_feather(output)
if lsi_dimensions == 'all':
lsi_paths = list(inpath.glob("*"))
else:
- lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
+ lsi_paths = [inpath / (str(dim) + '.feather') for dim in lsi_dimensions]
- lsi_nums = [p.stem for p in lsi_paths]
+ lsi_nums = [int(p.stem) for p in lsi_paths]
self.hasrun = False
self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))