X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/b7c39a3494ce214f315fd7e3bb0bf99bc58070d1..55b75ea6fcf421e95f4fe6b180dcec6e64676619:/similarities/similarities_helper.py?ds=sidebyside diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py index 13845d1..03c10b2 100644 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@ -97,6 +97,7 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu 'relative_tf':ds.field('relative_tf').cast('float32'), 'tf_idf':ds.field('tf_idf').cast('float32')} + print(projection) df = tfidf_ds.to_table(filter=ds_filter,columns=projection) @@ -240,7 +241,6 @@ def test_lsi_sims(): def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=1968,algorithm='randomized',lsi_model_save=None,lsi_model_load=None): # first compute the lsi of the matrix # then take the column similarities - print("running LSI",flush=True) if type(n_components) is int: n_components = [n_components] @@ -249,15 +249,20 @@ def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=196 svd_components = n_components[0] - if lsi_model_load is not None: + if lsi_model_load is not None and Path(lsi_model_load).exists(): + print("loading LSI") mod = pickle.load(open(lsi_model_load ,'rb')) + lsi_model_save = lsi_model_load else: + print("running LSI",flush=True) + svd = TruncatedSVD(n_components=svd_components,random_state=random_state,algorithm=algorithm,n_iter=n_iter) mod = svd.fit(tfidfmat.T) lsimat = mod.transform(tfidfmat.T) if lsi_model_save is not None: + Path(lsi_model_save).parent.mkdir(exist_ok=True, parents=True) pickle.dump(mod, open(lsi_model_save,'wb')) sims_list = []