X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/f0176d9f0de93f0e4f3ab1d676c852c2e5fad3b3..53f5b8c03c55aab7fa535a851c61d47e5bf65857:/similarities/similarities_helper.py?ds=inline diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py index 57a36ca..3ace8f2 100644 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@ -60,7 +60,7 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre if included_subreddits is None: included_subreddits = select_topN_subreddits(topN) else: - included_subreddits = set(open(included_subreddits)) + included_subreddits = set(map(str.strip,map(str.lower,open(included_subreddits)))) if exclude_phrases == True: tfidf = tfidf.filter(~f.col(term_colname).contains("_"))