X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/a60747292e91a47d122158659182f82bfd2e922a..e6294b5b90135a5163441c8dc62252dd6a188412:/sort_tf_comments.py

diff --git a/sort_tf_comments.py b/sort_tf_comments.py
deleted file mode 100644
index abb097e..0000000
--- a/sort_tf_comments.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env python3
-
-from pyspark.sql import functions as f
-from pyspark.sql import SparkSession
-
-spark = SparkSession.builder.getOrCreate()
-df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test.parquet_temp/")
-
-df = df.repartition(2000,'term')
-df = df.sort(['term','week','subreddit'])
-df = df.sortWithinPartitions(['term','week','subreddit'])
-
-df.write.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test_sorted_tf.parquet_temp",mode='overwrite',compression='snappy')