code.communitydata.science - cdsc_reddit.git/commitdiff
git-annex in nathante@mox2.hyak.local:/gscratch/comdata/users/nathante/cdsc-reddit
author Nate E TeBlunthuis <nathante@n2344.hyak.local>
Wed, 18 Nov 2020 00:31:48 +0000 (16:31 -0800)
committer Nate E TeBlunthuis <nathante@n2344.hyak.local>
Wed, 18 Nov 2020 00:31:48 +0000 (16:31 -0800)
fit_tsne.py
visualization/data/term_affinityprop_10000.feather [new symlink]
visualization/data/term_affinityprop_3000.feather [new symlink]
visualization/data/term_tsne_10000.feather [new symlink]
visualization/data/term_tsne_3000.feather [new symlink]

index 7de2ac015f7960a5a37dfa7b5d54fcb6d1ebaae6..28b0fd30630e4a666d619974f321cffb4ba37470 100644 (file)
@@ -1,35 +1,34 @@
+import fire
 import pyarrow
 import pandas as pd
 from numpy import random
 import numpy as np
 from sklearn.manifold import TSNE
 
 import pyarrow
 import pandas as pd
 from numpy import random
 import numpy as np
 from sklearn.manifold import TSNE
 
-df = pd.read_feather("reddit_term_similarity_3000.feather")
-df = df.sort_values(['i','j'])
+similarities = "term_similarities_10000.feather"
 
 
-n = max(df.i.max(),df.j.max())
+def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=10000, early_exaggeration=20):
+    '''
+    similarities: feather file with a dataframe of similarity scores
+    learning_rate: parameter controlling how fast the model converges. Too low and you get outliers. Too high and you get a ball.
+    perplexity: number of neighbors to use. the default of 50 is often good.
 
 
-def zero_pad(grp):
-    p = grp.shape[0]
-    grp = grp.sort_values('j')
-    return np.concatenate([np.zeros(n-p),np.ones(1),np.array(grp.value)])
+    '''
+    df = pd.read_feather(similarities)
 
 
-col_names = df.sort_values('j').loc[:,['subreddit_j']].drop_duplicates()
-first_name = list(set(df.subreddit_i) - set(df.subreddit_j))[0]
-col_names = [first_name] + list(col_names.subreddit_j)
-mat = df.groupby('i').apply(zero_pad)
-mat.loc[n] = np.concatenate([np.zeros(n),np.ones(1)])
-mat = np.stack(mat)
+    n = df.shape[0]
+    mat = np.array(df.drop('subreddit',1),dtype=np.float64)
+    mat[range(n),range(n)] = 1
+    mat[mat > 1] = 1
+    dist = 2*np.arccos(mat)/np.pi
+    tsne_model = TSNE(2,learning_rate=750,perplexity=50,n_iter=10000,metric='precomputed',early_exaggeration=20,n_jobs=-1)
+    tsne_fit_model = tsne_model.fit(dist)
 
 
-mat = mat + np.tril(mat.transpose(),k=-1)
-dist = 2*np.arccos(mat)/np.pi
+    tsne_fit_whole = tsne_fit_model.fit_transform(dist)
 
 
-tsne_model = TSNE(2,learning_rate=750,perplexity=50,n_iter=10000,metric='precomputed',early_exaggeration=20,n_jobs=-1)
+    plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], 'subreddit':df.subreddit})
 
 
-tsne_fit_model = tsne_model.fit(dist)
+    plot_data.to_feather(output)
 
 
-tsne_fit_whole = tsne_fit_model.fit_transform(dist)
-
-plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], 'subreddit':col_names})
-
-plot_data.to_feather("tsne_subreddit_fit.feather")
+if __name__ == "__main__":
+    fire.Fire(fit_tsne)
diff --git a/visualization/data/term_affinityprop_10000.feather b/visualization/data/term_affinityprop_10000.feather
new file mode 120000 (symlink)
index 0000000..188939f
--- /dev/null
@@ -0,0 +1 @@
+../../.git/annex/objects/Qk/wG/SHA256E-s145210--14a2ad6660d1e4015437eff556ec349dd10a115a4f96594152a29e83d00aa784/SHA256E-s145210--14a2ad6660d1e4015437eff556ec349dd10a115a4f96594152a29e83d00aa784
\ No newline at end of file
diff --git a/visualization/data/term_affinityprop_3000.feather b/visualization/data/term_affinityprop_3000.feather
new file mode 120000 (symlink)
index 0000000..c9b4233
--- /dev/null
@@ -0,0 +1 @@
+../../.git/annex/objects/w7/2f/SHA256E-s44458--f1c5247775ecf06514a0ff9e523e944bc8fcd9d0fdb6f214cc1329b759d4354e/SHA256E-s44458--f1c5247775ecf06514a0ff9e523e944bc8fcd9d0fdb6f214cc1329b759d4354e
\ No newline at end of file
diff --git a/visualization/data/term_tsne_10000.feather b/visualization/data/term_tsne_10000.feather
new file mode 120000 (symlink)
index 0000000..764f2e0
--- /dev/null
@@ -0,0 +1 @@
+../../.git/annex/objects/WX/v3/SHA256E-s190874--c2aea719f989dde297ca5f13371e156693c574e44acd9a0e313e5e3a3ad4b543/SHA256E-s190874--c2aea719f989dde297ca5f13371e156693c574e44acd9a0e313e5e3a3ad4b543
\ No newline at end of file
diff --git a/visualization/data/term_tsne_3000.feather b/visualization/data/term_tsne_3000.feather
new file mode 120000 (symlink)
index 0000000..21f156f
--- /dev/null
@@ -0,0 +1 @@
+../../.git/annex/objects/mq/2z/SHA256E-s58834--2e7b3ee11f47011fd9b34bddf8f1e788d35ab9c9e0bb6a1301b0b916135400cf/SHA256E-s58834--2e7b3ee11f47011fd9b34bddf8f1e788d35ab9c9e0bb6a1301b0b916135400cf
\ No newline at end of file

Community Data Science Collective || Want to submit a patch?