X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/ce549c6c97058325ac6f1b9dab20406af1dbb2af..98c1317af5da5aafd1e7acb31911ca4333312571:/similarities/similarities_helper.py?ds=inline

diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py
index 13845d1..d97e519 100644
--- a/similarities/similarities_helper.py
+++ b/similarities/similarities_helper.py
@@ -97,6 +97,7 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu
             'relative_tf':ds.field('relative_tf').cast('float32'),
             'tf_idf':ds.field('tf_idf').cast('float32')}
 
+        print(projection)
 
     df = tfidf_ds.to_table(filter=ds_filter,columns=projection)
 
@@ -240,7 +241,6 @@ def test_lsi_sims():
 def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=1968,algorithm='randomized',lsi_model_save=None,lsi_model_load=None):
     # first compute the lsi of the matrix
     # then take the column similarities
-    print("running LSI",flush=True)
 
     if type(n_components) is int:
         n_components = [n_components]
@@ -249,10 +249,14 @@ def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=196
     
     svd_components = n_components[0]
     
-    if lsi_model_load is not None:
+    if lsi_model_load is not None and Path(lsi_model_load).exists():
+        print("loading LSI")
         mod = pickle.load(open(lsi_model_load ,'rb'))
+        lsi_model_save = lsi_model_load
 
     else:
+        print("running LSI",flush=True)
+
         svd = TruncatedSVD(n_components=svd_components,random_state=random_state,algorithm=algorithm,n_iter=n_iter)
         mod = svd.fit(tfidfmat.T)