X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/a60747292e91a47d122158659182f82bfd2e922a:/top_subreddits_by_comments.py..e6294b5b90135a5163441c8dc62252dd6a188412:/similarities/top_subreddits_by_comments.py diff --git a/top_subreddits_by_comments.py b/similarities/top_subreddits_by_comments.py similarity index 56% rename from top_subreddits_by_comments.py rename to similarities/top_subreddits_by_comments.py index 9e172c5..214c7e0 100644 --- a/top_subreddits_by_comments.py +++ b/similarities/top_subreddits_by_comments.py @@ -1,14 +1,6 @@ from pyspark.sql import functions as f from pyspark.sql import SparkSession from pyspark.sql import Window -from pyspark.mllib.linalg.distributed import RowMatrix, CoordinateMatrix -import numpy as np -import pyarrow -import pandas as pd -import fire -from itertools import islice -from pathlib import Path -from similarities_helper import cosine_similarities spark = SparkSession.builder.getOrCreate() conf = spark.sparkContext.getConf() @@ -21,10 +13,10 @@ df = df.filter(~df.subreddit.like("u_%")) df = df.groupBy('subreddit').agg(f.count('id').alias("n_comments")) win = Window.orderBy(f.col('n_comments').desc()) -df = df.withColumn('comments_rank',f.rank().over(win)) +df = df.withColumn('comments_rank', f.rank().over(win)) df = df.toPandas() df = df.sort_values("n_comments") -df.to_csv('/gscratch/comdata/users/nathante/cdsc-reddit/subreddits_by_num_comments.csv',index=False) +df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv', index=False)