#!/usr/bin/env python3
# Provenance: payload of the git patch that creates sort_tf_comments.py
#   commit:  40be7bedb668e26f2ad09d3cc15e6f4bb605d143
#   author:  Nate E TeBlunthuis
#   date:    Mon, 3 Aug 2020 17:56:36 -0700
#   subject: code to sort tf
"""Sort a parquet dataset by term, week and subreddit and write it back out.

Reads the parquet dataset at ``INPUT_PATH``, orders its rows by the
columns listed in ``SORT_COLUMNS``, and writes the result to
``OUTPUT_PATH`` as snappy-compressed parquet, overwriting any output that
already exists there.

The script runs at import time; it is meant to be executed directly
(e.g. through ``spark-submit``), not imported.
"""

from pyspark.sql import functions as f
from pyspark.sql import SparkSession

INPUT_PATH = "/gscratch/comdata/users/nathante/reddit_tfidf_test.parquet_temp/"
OUTPUT_PATH = "/gscratch/comdata/users/nathante/reddit_tfidf_test_sorted_tf.parquet_temp"

# Sort keys, most significant first.
SORT_COLUMNS = ['term', 'week', 'subreddit']

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet(INPUT_PATH)

# One global sort is sufficient. It range-partitions the rows on the sort
# keys and orders them inside every partition, so a hash repartition on
# 'term' placed before it would be thrown away by the sort's own shuffle,
# and a sortWithinPartitions on the same keys placed after it would find
# each partition already ordered. Doing either only adds shuffle/sort work
# on the full dataset without changing what gets written.
# NOTE(review): the number of output partitions therefore follows Spark's
# shuffle-partition configuration; if a specific file count is required,
# set it via configuration rather than a repartition ahead of the sort.
df = df.sort(SORT_COLUMNS)

df.write.parquet(OUTPUT_PATH, mode='overwrite', compression='snappy')