X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/a60747292e91a47d122158659182f82bfd2e922a..e6294b5b90135a5163441c8dc62252dd6a188412:/top_subreddits_by_comments.py

diff --git a/top_subreddits_by_comments.py b/top_subreddits_by_comments.py
deleted file mode 100644
index 9e172c5..0000000
--- a/top_subreddits_by_comments.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from pyspark.sql import functions as f
-from pyspark.sql import SparkSession
-from pyspark.sql import Window
-from pyspark.mllib.linalg.distributed import RowMatrix, CoordinateMatrix
-import numpy as np
-import pyarrow
-import pandas as pd
-import fire
-from itertools import islice
-from pathlib import Path
-from similarities_helper import cosine_similarities
-
-spark = SparkSession.builder.getOrCreate()
-conf = spark.sparkContext.getConf()
-
-df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet")
-
-# remove /u/ pages
-df = df.filter(~df.subreddit.like("u_%"))
-
-df = df.groupBy('subreddit').agg(f.count('id').alias("n_comments"))
-
-win = Window.orderBy(f.col('n_comments').desc())
-df = df.withColumn('comments_rank',f.rank().over(win))
-
-df = df.toPandas()
-
-df = df.sort_values("n_comments")
-
-df.to_csv('/gscratch/comdata/users/nathante/cdsc-reddit/subreddits_by_num_comments.csv',index=False)