code.communitydata.science - cdsc_reddit.git/blobdiff - similarities/similarities_helper.py
add note to try other tf normalization strategies.
[cdsc_reddit.git] / similarities / similarities_helper.py
index 57a36ca924ea5d0d991e976347cac0362b97ac20..3ace8f29f3922838009adfb4dccf77e5f03b1e34 100644 (file)
@@ -60,7 +60,7 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre
     if included_subreddits is None:
         included_subreddits = select_topN_subreddits(topN)
     else:
-        included_subreddits = set(open(included_subreddits))
+        included_subreddits = set(map(str.strip,map(str.lower,open(included_subreddits))))
 
     if exclude_phrases == True:
         tfidf = tfidf.filter(~f.col(term_colname).contains("_"))

Community Data Science Collective || Want to submit a patch?