X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/e6294b5b90135a5163441c8dc62252dd6a188412..2d21ff1137dfaf83c5a51fdcd8900503c50a06ab:/clustering/fit_tsne.py diff --git a/clustering/fit_tsne.py b/clustering/fit_tsne.py index 28b0fd3..55d7239 100644 --- a/clustering/fit_tsne.py +++ b/clustering/fit_tsne.py @@ -5,7 +5,7 @@ from numpy import random import numpy as np from sklearn.manifold import TSNE -similarities = "term_similarities_10000.feather" +similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=10000, early_exaggeration=20): ''' @@ -17,7 +17,7 @@ def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=1000 df = pd.read_feather(similarities) n = df.shape[0] - mat = np.array(df.drop('subreddit',1),dtype=np.float64) + mat = np.array(df.drop('_subreddit',1),dtype=np.float64) mat[range(n),range(n)] = 1 mat[mat > 1] = 1 dist = 2*np.arccos(mat)/np.pi @@ -26,7 +26,7 @@ def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=1000 tsne_fit_whole = tsne_fit_model.fit_transform(dist) - plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], 'subreddit':df.subreddit}) + plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], '_subreddit':df['_subreddit']}) plot_data.to_feather(output)