# Restrict the tf-idf table to `included_subreddits`, renumber the surviving
# subreddit ids, and keep only the terms that occur often enough among them.

# Default threshold: one tenth of the number of included subreddits.
if min_df is None:
    min_df = 0.1 * len(included_subreddits)

# Drop every row whose subreddit is not in the requested set.
tfidf = tfidf.filter(f.col("subreddit").isin(included_subreddits))

# reset the subreddit ids
# row_number() over an ordered window numbers the distinct surviving ids
# consecutively in `subreddit_id` order; the result is attached to each row as
# `subreddit_id_new`.
# NOTE(review): the window has no partitionBy, so Spark evaluates it on a
# single partition -- presumably fine because only distinct ids are involved.
sub_ids = tfidf.select('subreddit_id').distinct()
sub_ids = sub_ids.withColumn(
    "subreddit_id_new",
    f.row_number().over(Window.orderBy("subreddit_id")),
)
tfidf = tfidf.join(sub_ids, 'subreddit_id')

# only use terms in at least min_df included subreddits
# `new_count` is the number of remaining tf-idf rows per term.
# NOTE(review): this equals a document frequency only if tfidf holds one row
# per (term, subreddit) pair -- confirm against the producer of `tfidf`.
new_count = tfidf.groupBy('term_id').agg(f.count('term_id').alias('new_count'))
term_ids = term_ids.join(new_count, 'term_id')
# Filter on the joined *column*: `new_count` by itself is the DataFrame built
# above, and comparing a DataFrame with a number raises TypeError.
term_ids = term_ids.filter(f.col('new_count') >= min_df)