#!/usr/bin/env python3
"""Sort the term-frequency Parquet dataset by term, week and subreddit.

The script reads a Parquet dataset, repartitions it on the ``term`` column,
orders the rows by ``term``, ``week`` and ``subreddit``, and writes the result
back out as snappy-compressed Parquet, replacing any previous output.

Provenance: sort_tf_comments.py, added in commit
40be7bedb668e26f2ad09d3cc15e6f4bb605d143 ("code to sort tf") by
Nate E TeBlunthuis, Tue, 4 Aug 2020 00:56:36 +0000 (-0700), from
https://code.communitydata.science/cdsc_reddit.git
"""

from pyspark.sql import functions as f
from pyspark.sql import SparkSession

# Input and output locations of the Parquet datasets.
INPUT_PATH = "/gscratch/comdata/users/nathante/reddit_tfidf_test.parquet_temp/"
OUTPUT_PATH = "/gscratch/comdata/users/nathante/reddit_tfidf_test_sorted_tf.parquet_temp"

# Number of partitions requested when redistributing rows by term.
NUM_PARTITIONS = 2000

# Columns that define the row ordering, most significant first.
SORT_COLUMNS = ['term', 'week', 'subreddit']

spark = SparkSession.builder.getOrCreate()

# NOTE(review): the global sort() performs its own shuffle, so the preceding
# repartition on 'term' presumably does not survive into the output, and the
# trailing sortWithinPartitions() looks redundant after a global sort.
# Confirm the intended output layout before simplifying this chain.
df = (
    spark.read.parquet(INPUT_PATH)
    .repartition(NUM_PARTITIONS, 'term')
    .sort(SORT_COLUMNS)
    .sortWithinPartitions(SORT_COLUMNS)
)

df.write.parquet(OUTPUT_PATH, mode='overwrite', compression='snappy')