]> code.communitydata.science - cdsc_reddit.git/commitdiff
code to sort tf
author Nate E TeBlunthuis <nathante@n2347.hyak.local>
Tue, 4 Aug 2020 00:56:36 +0000 (17:56 -0700)
committer Nate E TeBlunthuis <nathante@n2347.hyak.local>
Tue, 4 Aug 2020 00:56:36 +0000 (17:56 -0700)
sort_tf_comments.py [new file with mode: 0644]

diff --git a/sort_tf_comments.py b/sort_tf_comments.py
new file mode 100644 (file)
index 0000000..abb097e
--- /dev/null
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+# Read a parquet dataset, order its rows by (term, week, subreddit), and write the result to a new parquet path.
+from pyspark.sql import functions as f  # NOTE(review): `f` is not referenced in the lines shown here
+from pyspark.sql import SparkSession
+# Reuse the active Spark session if there is one; otherwise create it.
+spark = SparkSession.builder.getOrCreate()
+df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test.parquet_temp/")
+# NOTE(review): the global sort below presumably re-shuffles the data, discarding the 2000-way layout, and makes the per-partition sort redundant; confirm which layout is intended.
+df = df.repartition(2000,'term')  # 2000 partitions keyed on the 'term' column
+df = df.sort(['term','week','subreddit'])  # order the whole DataFrame by these columns
+df = df.sortWithinPartitions(['term','week','subreddit'])  # same keys, applied inside each partition
+# mode='overwrite' replaces any existing data at the output path; files are snappy-compressed.
+df.write.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test_sorted_tf.parquet_temp",mode='overwrite',compression='snappy')

Community Data Science Collective || Want to submit a patch?