code.communitydata.science - cdsc_reddit.git/blobdiff - similarities/weekly_cosine_similarities.py
Some improvements to run affinity clustering on larger dataset and
[cdsc_reddit.git] / similarities / weekly_cosine_similarities.py
index 2b3c90be18cd08d7fbed3ae6e5c62bd77966499f..54856b030d10aa123e609da067ec6dcc9f74df62 100644 (file)
@@ -35,7 +35,7 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None,
     subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
     spark.stop()
 
-    weeks = list(subreddit_names.week.drop_duplicates())
+    weeks = sorted(list(subreddit_names.week.drop_duplicates()))
     for week in weeks:
         print(f"loading matrix: {week}")
         mat = read_tfidf_matrix_weekly(tempdir.name, term_colname, week)

Community Data Science Collective || Want to submit a patch?