srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
similarity_data=/gscratch/comdata/output/reddit_similarity
clustering_data=/gscratch/comdata/output/reddit_clustering
-kmeans_selection_grid="--max_iter=3000 --n_init=[10] --n_clusters=[100,500,1000,1500,2000,2500,3000,2350,3500,3570,4000]"
-#selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]"
-all:$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv
-# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS
-# $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS
+kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]"
+hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf"
+affinity_selection_grid="--dampings=[0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[30]"
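+# These grids are passed verbatim to the *_clustering.py entry points below; each
+# bracketed comma list is parsed into a Python list (by python-fire) and the scripts
+# sweep the full cross-product of parameters, writing one selection_data.csv per sweep.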
-$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py
- $(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/kmeans $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(kmeans_selection_grid)
+authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather
+authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI
+authors_10k_output=$(clustering_data)/subreddit_comment_authors_10k
+authors_10k_output_lsi=$(clustering_data)/subreddit_comment_authors_10k_LSI
-$(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py
- $(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/kmeans $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(kmeans_selection_grid)
+authors_tf_10k_input=$(similarity_data)/subreddit_comment_authors-tf_10k.feather
+authors_tf_10k_input_lsi=$(similarity_data)/subreddit_comment_authors-tf_10k_LSI
+authors_tf_10k_output=$(clustering_data)/subreddit_comment_authors-tf_10k
+authors_tf_10k_output_lsi=$(clustering_data)/subreddit_comment_authors-tf_10k_LSI
-$(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather
- $(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(kmeans_selection_grid)
+terms_10k_input=$(similarity_data)/subreddit_comment_terms_10k.feather
+terms_10k_input_lsi=$(similarity_data)/subreddit_comment_terms_10k_LSI
+terms_10k_output=$(clustering_data)/subreddit_comment_terms_10k
+terms_10k_output_lsi=$(clustering_data)/subreddit_comment_terms_10k_LSI
+all:terms_10k authors_10k authors_tf_10k terms_10k_lsi authors_10k_lsi authors_tf_10k_lsi
-affinity_selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]"
-$(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py
- $(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/affinity $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20
+terms_10k:${terms_10k_output}/kmeans/selection_data.csv ${terms_10k_output}/affinity/selection_data.csv ${terms_10k_output}/hdbscan/selection_data.csv
-$(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py
- $(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/affinity $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20
+authors_10k:${authors_10k_output}/kmeans/selection_data.csv ${authors_10k_output}/hdbscan/selection_data.csv ${authors_10k_output}/affinity/selection_data.csv
-$(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather
- $(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/affinity $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20
+authors_tf_10k:${authors_tf_10k_output}/kmeans/selection_data.csv ${authors_tf_10k_output}/hdbscan/selection_data.csv ${authors_tf_10k_output}/affinity/selection_data.csv
-clean:
- rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv
- rm -f $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv
- rm -f $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv
- rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv
- rm -f $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv
- rm -f $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv
+terms_10k_lsi:${terms_10k_output_lsi}/kmeans/selection_data.csv ${terms_10k_output_lsi}/affinity/selection_data.csv ${terms_10k_output_lsi}/hdbscan/selection_data.csv
-PHONY: clean
+authors_10k_lsi:${authors_10k_output_lsi}/kmeans/selection_data.csv ${authors_10k_output_lsi}/hdbscan/selection_data.csv ${authors_10k_output_lsi}/affinity/selection_data.csv
+
+authors_tf_10k_lsi:${authors_tf_10k_output_lsi}/kmeans/selection_data.csv ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
+
+${authors_10k_output}/kmeans/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py kmeans_clustering.py
+ $(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/kmeans --savefile=${authors_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)
+
+${terms_10k_output}/kmeans/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py kmeans_clustering.py
+ $(srun_singularity) python3 kmeans_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/kmeans --savefile=${terms_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)
+
+${authors_tf_10k_output}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py kmeans_clustering.py
+ $(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/kmeans --savefile=${authors_tf_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)
+
+${authors_10k_output}/affinity/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py affinity_clustering.py
+ $(srun_singularity) python3 affinity_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/affinity --savefile=${authors_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)
+
+${terms_10k_output}/affinity/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py affinity_clustering.py
+ $(srun_singularity) python3 affinity_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/affinity --savefile=${terms_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)
+
+${authors_tf_10k_output}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py affinity_clustering.py
+ $(srun_singularity) python3 affinity_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/affinity --savefile=${authors_tf_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)
+
+${authors_10k_output}/hdbscan/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py hdbscan_clustering.py
+ $(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/hdbscan --savefile=${authors_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
+
+${terms_10k_output}/hdbscan/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py hdbscan_clustering.py
+ $(srun_singularity) python3 hdbscan_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/hdbscan --savefile=${terms_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
+
+${authors_tf_10k_output}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py hdbscan_clustering.py
+ $(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/hdbscan --savefile=${authors_tf_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
+
+
+## LSI Models
+${authors_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py kmeans_clustering.py
+ $(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/kmeans --savefile=${authors_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)
+
+${terms_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py kmeans_clustering.py
+ $(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/kmeans --savefile=${terms_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)
+
+${authors_tf_10k_output_lsi}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py kmeans_clustering.py
+ $(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/kmeans --savefile=${authors_tf_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)
+
+${authors_10k_output_lsi}/affinity/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py affinity_clustering.py
+ $(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/affinity --savefile=${authors_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)
+
+${terms_10k_output_lsi}/affinity/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py affinity_clustering.py
+ $(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/affinity --savefile=${terms_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)
+
+${authors_tf_10k_output_lsi}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py affinity_clustering.py
+ $(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/affinity --savefile=${authors_tf_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)
+
+${authors_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py hdbscan_clustering.py
+ $(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/hdbscan --savefile=${authors_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
+
+${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py hdbscan_clustering.py
+ $(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/hdbscan --savefile=${terms_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
+
+${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py
+ $(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
+
+
+
+clean_affinity:
+ rm -f ${authors_10k_output}/affinity/selection_data.csv
+ rm -f ${authors_tf_10k_output}/affinity/selection_data.csv
+ rm -f ${terms_10k_output}/affinity/selection_data.csv
+
+clean_kmeans:
+ rm -f ${authors_10k_output}/kmeans/selection_data.csv
+ rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv
+ rm -f ${terms_10k_output}/kmeans/selection_data.csv
+
+clean_hdbscan:
+ rm -f ${authors_10k_output}/hdbscan/selection_data.csv
+ rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv
+ rm -f ${terms_10k_output}/hdbscan/selection_data.csv
+
+clean_authors:
+ rm -f ${authors_10k_output}/affinity/selection_data.csv
+ rm -f ${authors_10k_output}/kmeans/selection_data.csv
+ rm -f ${authors_10k_output}/hdbscan/selection_data.csv
+
+clean_authors_tf:
+ rm -f ${authors_tf_10k_output}/affinity/selection_data.csv
+ rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv
+ rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv
+
+clean_terms:
+ rm -f ${terms_10k_output}/affinity/selection_data.csv
+ rm -f ${terms_10k_output}/kmeans/selection_data.csv
+ rm -f ${terms_10k_output}/hdbscan/selection_data.csv
+
+clean_lsi_affinity:
+ rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv
+ rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
+ rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv
+
+clean_lsi_kmeans:
+ rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv
+ rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv
+ rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv
+
+clean_lsi_hdbscan:
+ rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv
+ rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv
+ rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv
+
+clean_lsi_authors:
+ rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv
+ rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv
+ rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv
+
+clean_lsi_authors_tf:
+ rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
+ rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv
+ rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv
+
+clean_lsi_terms:
+ rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv
+ rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv
+ rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv
+
+clean: clean_affinity clean_kmeans clean_hdbscan
+
+.PHONY: all clean clean_affinity clean_kmeans clean_hdbscan clean_authors clean_authors_tf clean_terms clean_lsi_affinity clean_lsi_kmeans clean_lsi_hdbscan clean_lsi_authors clean_lsi_authors_tf clean_lsi_terms terms_10k authors_10k authors_tf_10k terms_10k_lsi authors_10k_lsi authors_tf_10k_lsi
# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py
# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS
-from sklearn.metrics import silhouette_score
from sklearn.cluster import AffinityPropagation
-from functools import partial
from dataclasses import dataclass
-from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
-from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
-from multiprocessing import Pool, cpu_count, Array, Process
+from clustering_base import clustering_result, clustering_job
+from grid_sweep import grid_sweep
from pathlib import Path
from itertools import product, starmap
-import numpy as np
-import pandas as pd
import fire
import sys
+import numpy as np
# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.
@dataclass
preference:float
max_iter:int
-@dataclass
-class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin):
- pass
-
class affinity_job(clustering_job):
def __init__(self, infile, outpath, name, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True):
super().__init__(infile,
return self.result
-class affinity_lsi_job(affinity_job, lsi_mixin):
- def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
- super().__init__(infile,
- outpath,
- name,
- *args,
- **kwargs)
- super().set_lsi_dims(lsi_dims)
-
- def get_info(self):
- result = super().get_info()
- self.result = affinity_clustering_result_lsi(**result.__dict__,
- lsi_dimensions=self.lsi_dims)
- return self.result
-
class affinity_grid_sweep(grid_sweep):
def __init__(self,
inpath,
return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}"
-class _affinity_lsi_grid_sweep(grid_sweep):
- def __init__(self,
- inpath,
- outpath,
- lsi_dim,
- *args,
- **kwargs):
- self.lsi_dim = lsi_dim
- self.jobtype = affinity_lsi_job
- super().__init__(self.jobtype,
- inpath,
- outpath,
- self.namer,
- self.lsi_dim,
- *args,
- **kwargs)
-
- def namer(self, *args, **kwargs):
- s = affinity_grid_sweep.namer(self, *args[1:], **kwargs)
- s += f"_lsi-{self.lsi_dim}"
- return s
-
-class affinity_lsi_grid_sweep(lsi_grid_sweep):
- def __init__(self,
- inpath,
- lsi_dims,
- outpath,
- dampings=[0.9],
- max_iters=[10000],
- convergence_iters=[30],
- preference_quantiles=[0.5]):
-
- super().__init__(affinity_lsi_job,
- _affinity_lsi_grid_sweep,
- inpath,
- lsi_dims,
- outpath,
- dampings,
- max_iters,
- convergence_iters,
- preference_quantiles)
+def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5]):
+ """Run affinity clustering once or more with different parameters.
-
+ Usage:
+    affinity_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --preference_quantiles=<csv> --convergence_iters=<csv>
+
+    Keyword arguments:
+    savefile: path to save the metadata and diagnostics
+    inpath: path to feather data containing a labeled matrix of subreddit similarities.
+    outpath: path to write the fitted affinity clusterings.
+    dampings: one or more numbers in [0.5, 1); the damping parameter in affinity propagation clustering.
+    preference_quantiles: one or more numbers in (0,1) for selecting the 'preference' parameter.
+    convergence_iters: one or more integers; the number of iterations without improvement before stopping.
+    max_iters: one or more maximum iteration counts.
+ """
+ obj = affinity_grid_sweep(inpath,
+ outpath,
+ map(float,dampings),
+ map(int,max_iters),
+ map(int,convergence_iters),
+ map(float,preference_quantiles))
+ obj.run(1)
+ obj.save(savefile)
def test_select_affinity_clustering():
# select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
if __name__ == "__main__":
- fire.Fire{'grid_sweep':affinity_grid_sweep,
- 'grid_sweep_lsi':affinity_lsi_grid_sweep
- 'cluster':affinity_job,
- 'cluster_lsi':affinity_lsi_job}
+ fire.Fire(run_affinity_grid_sweep)
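+
+# Illustrative invocation (placeholder paths; the Makefile passes the real ones):
+#   python3 affinity_clustering.py --savefile=<outdir>/affinity/selection_data.csv \
+#     --inpath=<similarities.feather> --outpath=<outdir>/affinity \
+#     --dampings=[0.5,0.85,0.95] --preference_quantiles=[0.3,0.5] --convergence_iters=[30]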
--- /dev/null
+import fire
+from affinity_clustering import affinity_clustering_result, affinity_job, affinity_grid_sweep
+from grid_sweep import grid_sweep
+from lsi_base import lsi_result_mixin, lsi_grid_sweep, lsi_mixin
+from dataclasses import dataclass
+
+@dataclass
+class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin):
+ pass
+
+
+class affinity_lsi_job(affinity_job, lsi_mixin):
+ def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
+ super().__init__(infile,
+ outpath,
+ name,
+ *args,
+ **kwargs)
+ super().set_lsi_dims(lsi_dims)
+
+ def get_info(self):
+ result = super().get_info()
+ self.result = affinity_clustering_result_lsi(**result.__dict__,
+ lsi_dimensions=self.lsi_dims)
+ return self.result
+
+class affinity_lsi_grid_sweep(lsi_grid_sweep):
+ def __init__(self,
+ inpath,
+ lsi_dims,
+ outpath,
+ dampings=[0.9],
+ max_iters=[10000],
+ convergence_iters=[30],
+ preference_quantiles=[0.5]):
+
+ super().__init__(affinity_lsi_job,
+ _affinity_lsi_grid_sweep,
+ inpath,
+ lsi_dims,
+ outpath,
+ dampings,
+ max_iters,
+ convergence_iters,
+ preference_quantiles)
+
+
+class _affinity_lsi_grid_sweep(grid_sweep):
+ def __init__(self,
+ inpath,
+ outpath,
+ lsi_dim,
+ *args,
+ **kwargs):
+ self.lsi_dim = lsi_dim
+ self.jobtype = affinity_lsi_job
+ super().__init__(self.jobtype,
+ inpath,
+ outpath,
+ self.namer,
+ self.lsi_dim,
+ *args,
+ **kwargs)
+
+ def namer(self, *args, **kwargs):
+ s = affinity_grid_sweep.namer(self, *args[1:], **kwargs)
+ s += f"_lsi-{self.lsi_dim}"
+ return s
+
+def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all'):
+ """Run affinity clustering once or more with different parameters.
+
+ Usage:
+        affinity_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --preference_quantiles=<csv> --convergence_iters=<csv> --lsi_dimensions=<"all"|csv>
+
+    Keyword arguments:
+    savefile: path to save the metadata and diagnostics
+    inpath: path to a folder of feather files with LSI similarity labeled matrices of subreddit similarities.
+    outpath: path to write the fitted affinity clusterings.
+    dampings: one or more numbers in [0.5, 1); the damping parameter in affinity propagation clustering.
+    preference_quantiles: one or more numbers in (0,1) for selecting the 'preference' parameter.
+    convergence_iters: one or more integers; the number of iterations without improvement before stopping.
+    max_iters: one or more maximum iteration counts.
+ lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
+ """
+
+ obj = affinity_lsi_grid_sweep(inpath,
+ lsi_dimensions,
+ outpath,
+ map(float,dampings),
+ map(int,max_iters),
+ map(int,convergence_iters),
+ map(float,preference_quantiles))
+
+ obj.run(1)
+ obj.save(savefile)
+
+if __name__ == "__main__":
+ fire.Fire(run_affinity_lsi_grid_sweep)
import pandas as pd
from dataclasses import dataclass
from sklearn.metrics import silhouette_score, silhouette_samples
-from itertools import product, chain
-from multiprocessing import Pool, cpu_count
-
-def sim_to_dist(mat):
- dist = 1-mat
- dist[dist < 0] = 0
- np.fill_diagonal(dist,0)
- return dist
-
-class grid_sweep:
- def __init__(self, jobtype, inpath, outpath, namer, *args):
- self.jobtype = jobtype
- self.namer = namer
- grid = list(product(*args))
- inpath = Path(inpath)
- outpath = Path(outpath)
- self.hasrun = False
- self.grid = [(inpath,outpath,namer(*g)) + g for g in grid]
- self.jobs = [jobtype(*g) for g in self.grid]
-
- def run(self, cores=20):
- if cores is not None and cores > 1:
- with Pool(cores) as pool:
- infos = pool.map(self.jobtype.get_info, self.jobs)
- else:
- infos = map(self.jobtype.get_info, self.jobs)
-
- self.infos = pd.DataFrame(infos)
- self.hasrun = True
-
- def save(self, outcsv):
- if not self.hasrun:
- self.run()
- outcsv = Path(outcsv)
- outcsv.parent.mkdir(parents=True, exist_ok=True)
- self.infos.to_csv(outcsv)
-
-
-class lsi_grid_sweep(grid_sweep):
- def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, *args, **kwargs):
- self.jobtype = jobtype
- self.subsweep = subsweep
- inpath = Path(inpath)
- if lsi_dimensions == 'all':
- lsi_paths = list(inpath.glob("*"))
- else:
- lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
-
- lsi_nums = [p.stem for p in lsi_paths]
- self.hasrun = False
- self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
- self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))
-
# this is meant to be an interface, not created directly
class clustering_job:
name=self.name,
n_clusters=self.n_clusters,
n_isolates=self.n_isolates,
- silhouette_samples = str(self.silsampout.resolve())
+ silhouette_samples = self.silsampout
)
return self.result
def silhouette(self):
isolates = self.clustering.labels_ == -1
scoremat = self.mat[~isolates][:,~isolates]
- score = silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed')
- silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed')
- silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp})
- self.outpath.mkdir(parents=True, exist_ok=True)
- self.silsampout = self.outpath / ("silhouette_samples-" + self.name + ".feather")
- silhouette_samp.to_feather(self.silsampout)
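+        # if every subreddit is an isolate (label -1) the score matrix is empty,
+        # so skip scoring and leave silsampout unset (None)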
+ if scoremat.shape[0] > 0:
+ score = silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed')
+ silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed')
+ silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp})
+ self.outpath.mkdir(parents=True, exist_ok=True)
+ silsampout = self.outpath / ("silhouette_samples-" + self.name + ".feather")
+ self.silsampout = silsampout.resolve()
+ silhouette_samp.to_feather(self.silsampout)
+ else:
+ score = None
+ self.silsampout = None
return score
def read_distance_mat(self, similarities, use_threads=True):
return cluster_data
-
-class lsi_mixin():
- def set_lsi_dims(self, lsi_dims):
- self.lsi_dims = lsi_dims
-
@dataclass
class clustering_result:
outpath:Path
n_clusters:int
n_isolates:int
silhouette_samples:str
-
-@dataclass
-class lsi_result_mixin:
- lsi_dimensions:int
--- /dev/null
+from pathlib import Path
+from multiprocessing import Pool, cpu_count
+from itertools import product, chain
+import pandas as pd
+
+class grid_sweep:
+ def __init__(self, jobtype, inpath, outpath, namer, *args):
+ self.jobtype = jobtype
+ self.namer = namer
+ grid = list(product(*args))
+ inpath = Path(inpath)
+ outpath = Path(outpath)
+ self.hasrun = False
+ self.grid = [(inpath,outpath,namer(*g)) + g for g in grid]
+ self.jobs = [jobtype(*g) for g in self.grid]
+
+ def run(self, cores=20):
+ if cores is not None and cores > 1:
+ with Pool(cores) as pool:
+ infos = pool.map(self.jobtype.get_info, self.jobs)
+ else:
+ infos = map(self.jobtype.get_info, self.jobs)
+
+ self.infos = pd.DataFrame(infos)
+ self.hasrun = True
+
+ def save(self, outcsv):
+ if not self.hasrun:
+ self.run()
+ outcsv = Path(outcsv)
+ outcsv.parent.mkdir(parents=True, exist_ok=True)
+ self.infos.to_csv(outcsv)
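+
+# Minimal usage sketch (illustrative; `my_job` is hypothetical, the real job classes
+# live in kmeans_clustering.py, affinity_clustering.py and hdbscan_clustering.py).
+# grid_sweep builds one job per point in the cross-product of the parameter lists,
+# naming each with the supplied namer:
+#
+#   class my_sweep(grid_sweep):
+#       def __init__(self, inpath, outpath, n_clusters):
+#           super().__init__(my_job, inpath, outpath, self.namer, n_clusters)
+#
+#       def namer(self, n_clusters):
+#           return f"nclusters-{n_clusters}"
+#
+#   my_sweep("similarities.feather", "output", [100, 200]).save("output/selection_data.csv")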
-from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
-from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
+from clustering_base import clustering_result, clustering_job
+from grid_sweep import grid_sweep
from dataclasses import dataclass
import hdbscan
from sklearn.neighbors import NearestNeighbors
import numpy as np
from itertools import product, starmap, chain
import pandas as pd
-from sklearn.metrics import silhouette_score, silhouette_samples
-from pathlib import Path
-from multiprocessing import Pool, cpu_count
+from multiprocessing import cpu_count
import fire
-from pyarrow.feather import write_feather
def test_select_hdbscan_clustering():
# select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
# check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
# silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
# c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering)
-
-class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
- def __init__(self,
- inpath,
- lsi_dims,
- outpath,
- min_cluster_sizes,
- min_samples,
- cluster_selection_epsilons,
- cluster_selection_methods
- ):
-
- super().__init__(hdbscan_lsi_job,
- _hdbscan_lsi_grid_sweep,
- inpath,
- lsi_dims,
- outpath,
- min_cluster_sizes,
- min_samples,
- cluster_selection_epsilons,
- cluster_selection_methods)
-
class hdbscan_grid_sweep(grid_sweep):
def __init__(self,
inpath,
cluster_selection_method):
return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"
-
-class _hdbscan_lsi_grid_sweep(grid_sweep):
- def __init__(self,
- inpath,
- outpath,
- lsi_dim,
- *args,
- **kwargs):
-
- self.lsi_dim = lsi_dim
- self.jobtype = hdbscan_lsi_job
- super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
-
-
- def namer(self, *args, **kwargs):
- s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
- s += f"_lsi-{self.lsi_dim}"
- return s
-
@dataclass
class hdbscan_clustering_result(clustering_result):
min_cluster_size:int
cluster_selection_epsilon:float
cluster_selection_method:str
-@dataclass
-class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
- pass
-
class hdbscan_job(clustering_job):
def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
super().__init__(infile,
cluster_selection_method=self.cluster_selection_method)
return self.result
-class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
- def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
- super().__init__(
- infile,
- outpath,
- name,
- *args,
- **kwargs)
- super().set_lsi_dims(lsi_dims)
-
- def get_info(self):
- partial_result = super().get_info()
- self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
- lsi_dimensions=self.lsi_dims)
- return self.result
-
-# def select_hdbscan_clustering(inpath,
-# outpath,
-# outfile=None,
-# min_cluster_sizes=[2],
-# min_samples=[1],
-# cluster_selection_epsilons=[0],
-# cluster_selection_methods=['eom'],
-# lsi_dimensions='all'
-# ):
-
-# inpath = Path(inpath)
-# outpath = Path(outpath)
-# outpath.mkdir(exist_ok=True, parents=True)
-
-# if lsi_dimensions is None:
-# lsi_paths = [inpath]
-# elif lsi_dimensions == 'all':
-# lsi_paths = list(inpath.glob("*"))
-
-# else:
-# lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
-
-# if lsi_dimensions is not None:
-# lsi_nums = [p.stem for p in lsi_paths]
-# else:
-# lsi_nums = [None]
-# grid = list(product(lsi_nums,
-# min_cluster_sizes,
-# min_samples,
-# cluster_selection_epsilons,
-# cluster_selection_methods))
-
-# # fix the output file names
-# names = list(map(lambda t:'_'.join(map(str,t)),grid))
-
-# grid = [(inpath/(str(t[0])+'.feather'),outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]
-
-# with Pool(int(cpu_count()/4)) as pool:
-# mods = starmap(hdbscan_clustering, grid)
-
-# res = pd.DataFrame(mods)
-# if outfile is None:
-# outfile = outpath / "selection_data.csv"
-
-# res.to_csv(outfile)
-
-# def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
-# subreddits, mat = read_similarity_mat(similarities)
-# mat = sim_to_dist(mat)
-# clustering = _hdbscan_clustering(mat,
-# min_cluster_size=min_cluster_size,
-# min_samples=min_samples,
-# cluster_selection_epsilon=cluster_selection_epsilon,
-# cluster_selection_method=cluster_selection_method,
-# metric='precomputed',
-# core_dist_n_jobs=cpu_count()
-# )
-
-# cluster_data = process_clustering_result(clustering, subreddits)
-# isolates = clustering.labels_ == -1
-# scoremat = mat[~isolates][:,~isolates]
-# score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
-# cluster_data.to_feather(output)
-# silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
-# silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp})
-# silsampout = output.parent / ("silhouette_samples" + output.name)
-# silhouette_samp.to_feather(silsampout)
-
-# result = hdbscan_clustering_result(outpath=output,
-# silhouette_samples=silsampout,
-# silhouette_score=score,
-# name=name,
-# min_cluster_size=min_cluster_size,
-# min_samples=min_samples,
-# cluster_selection_epsilon=cluster_selection_epsilon,
-# cluster_selection_method=cluster_selection_method,
-# lsi_dimensions=lsi_dim,
-# n_isolates=isolates.sum(),
-# n_clusters=len(set(clustering.labels_))
-# )
-
-
-
-# return(result)
-
-# # for all runs we should try cluster_selection_epsilon = None
-# # for terms we should try cluster_selection_epsilon around 0.56-0.66
-# # for authors we should try cluster_selection_epsilon around 0.98-0.99
-# def _hdbscan_clustering(mat, *args, **kwargs):
-# print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
-
-# print(mat)
-# clusterer = hdbscan.HDBSCAN(*args,
-# **kwargs,
-# )
-
-# clustering = clusterer.fit(mat.astype('double'))
+def run_hdbscan_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']):
+ """Run hdbscan clustering once or more with different parameters.
-# return(clustering)
+ Usage:
+ hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf">
+
+    Keyword arguments:
+    savefile: path to save the metadata and diagnostics
+    inpath: path to feather data containing a labeled matrix of subreddit similarities.
+    outpath: path to write the fitted hdbscan clusterings.
+    min_cluster_sizes: one or more integers indicating the minimum cluster size
+    min_samples: one or more integers indicating the minimum number of samples used in the algorithm
+    cluster_selection_epsilons: one or more similarity thresholds for the transition from dbscan to hdbscan
+    cluster_selection_methods: one or more of "eom" or "leaf"; "eom" gives larger clusters.
+ """
+ obj = hdbscan_grid_sweep(inpath,
+ outpath,
+ map(int,min_cluster_sizes),
+ map(int,min_samples),
+ map(float,cluster_selection_epsilons),
+                             cluster_selection_methods)
+ obj.run()
+ obj.save(savefile)
def KNN_distances_plot(mat,outname,k=2):
nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat)
KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png')
if __name__ == "__main__":
- fire.Fire{'grid_sweep':hdbscan_grid_sweep,
- 'grid_sweep_lsi':hdbscan_lsi_grid_sweep
- 'cluster':hdbscan_job,
- 'cluster_lsi':hdbscan_lsi_job}
+ fire.Fire(run_hdbscan_grid_sweep)
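+
+# Illustrative invocation (placeholder paths; the Makefile passes the real ones):
+#   python3 hdbscan_clustering.py --savefile=<outdir>/hdbscan/selection_data.csv \
+#     --inpath=<similarities.feather> --outpath=<outdir>/hdbscan \
+#     --min_cluster_sizes=[2,3] --min_samples=[2,3] --cluster_selection_epsilons=[0,0.05] --cluster_selection_methods=eom,leaf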
# test_select_hdbscan_clustering()
#fire.Fire(select_hdbscan_clustering)
--- /dev/null
+from hdbscan_clustering import hdbscan_job, hdbscan_grid_sweep, hdbscan_clustering_result
+from lsi_base import lsi_grid_sweep, lsi_mixin, lsi_result_mixin
+from grid_sweep import grid_sweep
+import fire
+from dataclasses import dataclass
+
+@dataclass
+class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
+ pass
+
+class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
+ def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
+ super().__init__(
+ infile,
+ outpath,
+ name,
+ *args,
+ **kwargs)
+ super().set_lsi_dims(lsi_dims)
+
+ def get_info(self):
+ partial_result = super().get_info()
+ self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
+ lsi_dimensions=self.lsi_dims)
+ return self.result
+
+class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
+ def __init__(self,
+ inpath,
+ lsi_dims,
+ outpath,
+ min_cluster_sizes,
+ min_samples,
+ cluster_selection_epsilons,
+ cluster_selection_methods
+ ):
+
+ super().__init__(hdbscan_lsi_job,
+ _hdbscan_lsi_grid_sweep,
+ inpath,
+ lsi_dims,
+ outpath,
+ min_cluster_sizes,
+ min_samples,
+ cluster_selection_epsilons,
+ cluster_selection_methods)
+
+
+
+class _hdbscan_lsi_grid_sweep(grid_sweep):
+ def __init__(self,
+ inpath,
+ outpath,
+ lsi_dim,
+ *args,
+ **kwargs):
+ print(args)
+ print(kwargs)
+
+ self.lsi_dim = lsi_dim
+ self.jobtype = hdbscan_lsi_job
+ super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
+
+
+ def namer(self, *args, **kwargs):
+ s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
+ s += f"_lsi-{self.lsi_dim}"
+ return s
+
+def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'],lsi_dimensions='all'):
+ """Run hdbscan clustering once or more with different parameters.
+
+ Usage:
+        hdbscan_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf"> --lsi_dimensions=<"all"|csv>
+
+    Keyword arguments:
+    savefile: path to save the metadata and diagnostics
+    inpath: path to a folder of feather files with LSI similarity labeled matrices of subreddit similarities.
+    outpath: path to write the fitted hdbscan clusterings.
+    min_cluster_sizes: one or more integers indicating the minimum cluster size
+    min_samples: one or more integers indicating the minimum number of samples used in the algorithm
+    cluster_selection_epsilons: one or more similarity thresholds for the transition from dbscan to hdbscan
+    cluster_selection_methods: one or more of "eom" or "leaf"; "eom" gives larger clusters.
+ lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
+ """
+
+ obj = hdbscan_lsi_grid_sweep(inpath,
+ lsi_dimensions,
+ outpath,
+ map(int,min_cluster_sizes),
+ map(int,min_samples),
+ map(float,cluster_selection_epsilons),
+ cluster_selection_methods
+ )
+
+ obj.run(10)
+ obj.save(savefile)
+
+
+if __name__ == "__main__":
+ fire.Fire(run_hdbscan_lsi_grid_sweep)
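+
+# Illustrative invocation (placeholder paths): same flags as hdbscan_clustering.py, but
+# --inpath points at a directory of per-dimension LSI similarity matrices and
+# --lsi_dimensions=all (the default) sweeps every dimensionality found there.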
from sklearn.cluster import KMeans
import fire
from pathlib import Path
-from multiprocessing import cpu_count
from dataclasses import dataclass
-from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
-from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
-
+from clustering_base import clustering_result, clustering_job
+from grid_sweep import grid_sweep
@dataclass
class kmeans_clustering_result(clustering_result):
n_init:int
max_iter:int
-@dataclass
-class kmeans_clustering_result_lsi(kmeans_clustering_result, lsi_result_mixin):
- pass
-
class kmeans_job(clustering_job):
def __init__(self, infile, outpath, name, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
super().__init__(infile,
def get_info(self):
result = super().get_info()
self.result = kmeans_clustering_result(**result.__dict__,
- n_init=n_init,
- max_iter=max_iter)
+ n_init=self.n_init,
+ max_iter=self.max_iter)
return self.result
-class kmeans_lsi_job(kmeans_job, lsi_mixin):
- def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
- super().__init__(infile,
- outpath,
- name,
- *args,
- **kwargs)
- super().set_lsi_dims(lsi_dims)
-
- def get_info(self):
- result = super().get_info()
- self.result = kmeans_clustering_result_lsi(**result.__dict__,
- lsi_dimensions=self.lsi_dims)
- return self.result
-
-
class kmeans_grid_sweep(grid_sweep):
+
def __init__(self,
inpath,
outpath,
max_iter):
return f"nclusters-{n_clusters}_nit-{n_init}_maxit-{max_iter}"
-class _kmeans_lsi_grid_sweep(grid_sweep):
- def __init__(self,
- inpath,
- outpath,
- lsi_dim,
- *args,
- **kwargs):
- self.lsi_dim = lsi_dim
- self.jobtype = kmeans_lsi_job
- super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
-
- def namer(self, *args, **kwargs):
- s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs)
- s += f"_lsi-{self.lsi_dim}"
- return s
-
-class kmeans_lsi_grid_sweep(lsi_grid_sweep):
- def __init__(self,
- inpath,
- lsi_dims,
- outpath,
- n_clusters,
- n_inits,
- max_iters
- ):
-
- super().__init__(kmeans_lsi_job,
- _kmeans_lsi_grid_sweep,
- inpath,
- lsi_dims,
- outpath,
- n_clusters,
- n_inits,
- max_iters)
-
def test_select_kmeans_clustering():
- # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
- # "test_hdbscan_author30k",
- # min_cluster_sizes=[2],
- # min_samples=[1,2],
- # cluster_selection_epsilons=[0,0.05,0.1,0.15],
- # cluster_selection_methods=['eom','leaf'],
- # lsi_dimensions='all')
inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
outpath = "test_kmeans";
n_clusters=[200,300,400];
gs.run(20)
gs.save("test_hdbscan/lsi_sweep.csv")
+def run_kmeans_grid_sweep(savefile, inpath, outpath, n_clusters=[500], n_inits=[1], max_iters=[3000]):
+ """Run kmeans clustering once or more with different parameters.
+
+ Usage:
+ kmeans_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --n_clusters=<csv number of clusters> --n_inits=<csv> --max_iters=<csv>
+
+    Keyword arguments:
+    savefile: path to save the metadata and diagnostics
+    inpath: path to feather data containing a labeled matrix of subreddit similarities.
+    outpath: path to write the fitted kmeans clusterings.
+    n_clusters: one or more numbers of kmeans clusters to select.
+    n_inits: one or more numbers of different initializations to use for each clustering.
+    max_iters: one or more maximum iteration counts.
+ """
-if __name__ == "__main__":
+ obj = kmeans_grid_sweep(inpath,
+ outpath,
+ map(int,n_clusters),
+ map(int,n_inits),
+ map(int,max_iters))
- fire.Fire{'grid_sweep':kmeans_grid_sweep,
- 'grid_sweep_lsi':kmeans_lsi_grid_sweep
- 'cluster':kmeans_job,
- 'cluster_lsi':kmeans_lsi_job}
+
+ obj.run(1)
+ obj.save(savefile)
+
+if __name__ == "__main__":
+ fire.Fire(run_kmeans_grid_sweep)
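+
+# Illustrative invocation (placeholder paths; the Makefile passes the real ones):
+#   python3 kmeans_clustering.py --savefile=<outdir>/kmeans/selection_data.csv \
+#     --inpath=<similarities.feather> --outpath=<outdir>/kmeans \
+#     --n_clusters=[100,500,1000] --n_inits=[10] --max_iters=[3000]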
--- /dev/null
+import fire
+from dataclasses import dataclass
+from kmeans_clustering import kmeans_job, kmeans_clustering_result, kmeans_grid_sweep
+from lsi_base import lsi_mixin, lsi_result_mixin, lsi_grid_sweep
+from grid_sweep import grid_sweep
+
+@dataclass
+class kmeans_clustering_result_lsi(kmeans_clustering_result, lsi_result_mixin):
+ pass
+
+class kmeans_lsi_job(kmeans_job, lsi_mixin):
+ def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
+ super().__init__(infile,
+ outpath,
+ name,
+ *args,
+ **kwargs)
+ super().set_lsi_dims(lsi_dims)
+
+ def get_info(self):
+ result = super().get_info()
+ self.result = kmeans_clustering_result_lsi(**result.__dict__,
+ lsi_dimensions=self.lsi_dims)
+ return self.result
+
+class _kmeans_lsi_grid_sweep(grid_sweep):
+ def __init__(self,
+ inpath,
+ outpath,
+ lsi_dim,
+ *args,
+ **kwargs):
+ print(args)
+ print(kwargs)
+ self.lsi_dim = lsi_dim
+ self.jobtype = kmeans_lsi_job
+ super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
+
+ def namer(self, *args, **kwargs):
+ s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs)
+ s += f"_lsi-{self.lsi_dim}"
+ return s
+
+class kmeans_lsi_grid_sweep(lsi_grid_sweep):
+
+ def __init__(self,
+ inpath,
+ lsi_dims,
+ outpath,
+ n_clusters,
+ n_inits,
+ max_iters
+ ):
+
+ super().__init__(kmeans_lsi_job,
+ _kmeans_lsi_grid_sweep,
+ inpath,
+ lsi_dims,
+ outpath,
+ n_clusters,
+ n_inits,
+ max_iters)
+
+def run_kmeans_lsi_grid_sweep(savefile, inpath, outpath, n_clusters=[500], n_inits=[1], max_iters=[3000], lsi_dimensions="all"):
+ """Run kmeans clustering once or more with different parameters.
+
+ Usage:
+    kmeans_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --lsi_dimensions=<"all"|csv number of LSI dimensions to use> --n_clusters=<csv number of clusters> --n_inits=<csv> --max_iters=<csv>
+
+    Keyword arguments:
+    savefile: path to save the metadata and diagnostics
+    inpath: path to a folder of feather files with LSI similarity labeled matrices of subreddit similarities.
+    outpath: path to write the fitted kmeans clusterings.
+    lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
+    n_clusters: one or more numbers of kmeans clusters to select.
+    n_inits: one or more numbers of different initializations to use for each clustering.
+    max_iters: one or more maximum iteration counts.
+ """
+
+ obj = kmeans_lsi_grid_sweep(inpath,
+ lsi_dimensions,
+ outpath,
+ list(map(int,n_clusters)),
+ list(map(int,n_inits)),
+ list(map(int,max_iters))
+ )
+
+ obj.run(1)
+ obj.save(savefile)
+
+
+if __name__ == "__main__":
+ fire.Fire(run_kmeans_lsi_grid_sweep)
--- /dev/null
+from clustering_base import clustering_job, clustering_result
+from grid_sweep import grid_sweep
+from dataclasses import dataclass
+from itertools import chain
+from pathlib import Path
+
+class lsi_mixin():
+ def set_lsi_dims(self, lsi_dims):
+ self.lsi_dims = lsi_dims
+
+@dataclass
+class lsi_result_mixin:
+ lsi_dimensions:int
+
+class lsi_grid_sweep(grid_sweep):
+ def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, *args, **kwargs):
+ self.jobtype = jobtype
+ self.subsweep = subsweep
+ inpath = Path(inpath)
+ if lsi_dimensions == 'all':
+ lsi_paths = list(inpath.glob("*"))
+ else:
+ lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
+
+ lsi_nums = [p.stem for p in lsi_paths]
+ self.hasrun = False
+ self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
+ self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))
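+
+# Expected layout (illustrative): INPATH is a directory with one similarity matrix per
+# LSI dimensionality (e.g. INPATH/100.feather, INPATH/200.feather); each subsweep's
+# outputs are suffixed with "_lsi-<dim>" by the subclass namers.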