X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/7df8436067dba9a9e6867424002d01593e4bcd25..b7c39a3494ce214f315fd7e3bb0bf99bc58070d1:/clustering/clustering.py?ds=sidebyside diff --git a/clustering/clustering.py b/clustering/clustering.py index 85be3fe..6ee7842 100755 --- a/clustering/clustering.py +++ b/clustering/clustering.py @@ -3,7 +3,7 @@ import sys import pandas as pd import numpy as np -from sklearn.cluster import AffinityPropagation, KMeans +from sklearn.cluster import AffinityPropagation import fire from pathlib import Path from multiprocessing import cpu_count @@ -46,24 +46,6 @@ def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000, print(f"saved {output}") return clustering -def kmeans_clustering(similarities, *args, **kwargs): - subreddits, mat = read_similarity_mat(similarities) - mat = sim_to_dist(mat) - clustering = _kmeans_clustering(mat, *args, **kwargs) - cluster_data = process_clustering_result(clustering, subreddits) - return(cluster_data) - -def _kmeans_clustering(mat, output, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True): - - clustering = KMeans(n_clusters=n_clusters, - n_init=n_init, - max_iter=max_iter, - random_state=random_state, - verbose=verbose - ).fit(mat) - - return clustering - if __name__ == "__main__":