# Provenance (from the original patch header):
#   From: Nate E TeBlunthuis
#   Date: Tue, 17 Nov 2020 23:59:20 +0000 (-0800)
#   Subject: Update code for clustering + tsne.
#   X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/commitdiff_plain/f8ff8b2d0f634d4671de090b3c1ceba12be958d6?ds=sidebyside;hp=82d184d9c608db47f5c37f17a18962f07169cbc7
#   Patch: new file clustering.py (index 0000000..552d8ae)
import pandas as pd
import numpy as np
from sklearn.cluster import AffinityPropagation
import fire


def affinity_clustering(similarities, output, damping=0.5, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968):
    """Cluster subreddits with affinity propagation on a precomputed similarity matrix.

    Reads the similarity dataframe, runs sklearn's AffinityPropagation on it,
    prints a few summary statistics about the resulting clusters, and writes
    a (subreddit, cluster) table to ``output``.

    similarities: feather file with a dataframe of similarity scores. It must
        have a 'subreddit' column; all remaining columns are used as the
        square similarity matrix (one row and one column per subreddit).
    output: path of the feather file to write, with columns 'subreddit' and
        'cluster'.
    damping, max_iter, convergence_iter, random_state: passed through
        unchanged to sklearn.cluster.AffinityPropagation.
    preference_quantile: parameter controlling how many clusters to make.
        higher values = more clusters. 0.85 is a good value with 3000
        subreddits. The preference handed to AffinityPropagation is this
        quantile of all entries of the similarity matrix.

    Raises ValueError if the similarity matrix is not square.
    """
    df = pd.read_feather(similarities)
    n = df.shape[0]

    # Use the keyword form: passing ``axis`` positionally to DataFrame.drop
    # was deprecated in pandas 1.3 and removed in pandas 2.0.
    # np.array makes a fresh copy, so the in-place edits below (and sklearn's
    # copy=False) never touch the dataframe's own buffer.
    mat = np.array(df.drop(columns='subreddit'))
    if mat.shape != (n, n):
        raise ValueError(
            f"expected a square {n}x{n} similarity matrix, got shape {mat.shape}"
        )

    # Force self-similarity to 1 before taking the quantile; the diagonal
    # entries are part of the values the quantile is computed over.
    np.fill_diagonal(mat, 1)

    preference = np.quantile(mat, preference_quantile)

    clustering = AffinityPropagation(damping=damping,
                                     max_iter=max_iter,
                                     convergence_iter=convergence_iter,
                                     copy=False,
                                     preference=preference,
                                     affinity='precomputed',
                                     random_state=random_state).fit(mat)

    print(f"clustering took {clustering.n_iter_} iterations")
    labels = clustering.labels_

    print(f"found {len(set(labels))} clusters")

    cluster_data = pd.DataFrame({'subreddit': df.subreddit, 'cluster': labels})

    # After count(), the 'subreddit' column holds the member count per cluster.
    cluster_sizes = cluster_data.groupby("cluster").count()
    print(f"the largest cluster has {cluster_sizes.subreddit.max()} members")

    print(f"the median cluster has {cluster_sizes.subreddit.median()} members")

    print(f"{(cluster_sizes.subreddit==1).sum()} clusters have 1 member")

    cluster_data.to_feather(output)


if __name__ == "__main__":
    # Expose the function's parameters as command-line arguments.
    fire.Fire(affinity_clustering)