code.communitydata.science - cdsc_reddit.git/blobdiff - datasets/submissions_2_parquet_part1.py
changes from dirty branch.
[cdsc_reddit.git] / datasets / submissions_2_parquet_part1.py
index 77ae09f33ca260261ea919cbf23bbe822aa940c4..d1a8a3d392bb1e75e24ff907a72f9710bc6d7b29 100755 (executable)
@@ -58,7 +58,7 @@ def parse_submission(post, names = None):
 def parse_dump(partition):
 
     N=10000
-    stream = open_fileset([f"/gscratch/comdata/raw_data/reddit_dumps/submissions/{partition}"])
+    stream = open_fileset([f"/gscratch/comdata/raw_data/submissions/{partition}"])
     rows = map(parse_submission,stream)
     schema = pa.schema([
         pa.field('id', pa.string(),nullable=True),
@@ -102,7 +102,7 @@ def parse_dump(partition):
 
         writer.close()
 
-def gen_task_list(dumpdir="/gscratch/comdata/raw_data/reddit_dumps/submissions"):
+def gen_task_list(dumpdir="/gscratch/comdata/raw_data/submissions"):
     files = list(find_dumps(dumpdir,base_pattern="RS_20*.*"))
     with open("submissions_task_list.sh",'w') as of:
         for fpath in files:

Community Data Science Collective || Want to submit a patch?