X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/197518a222a321a8027c3dc5a4121350c47d0779..refs/heads/icwsm_dataverse:/datasets/submissions_2_parquet_part1.py diff --git a/datasets/submissions_2_parquet_part1.py b/datasets/submissions_2_parquet_part1.py index 77ae09f..d1a8a3d 100755 --- a/datasets/submissions_2_parquet_part1.py +++ b/datasets/submissions_2_parquet_part1.py @@ -58,7 +58,7 @@ def parse_submission(post, names = None): def parse_dump(partition): N=10000 - stream = open_fileset([f"/gscratch/comdata/raw_data/reddit_dumps/submissions/{partition}"]) + stream = open_fileset([f"/gscratch/comdata/raw_data/submissions/{partition}"]) rows = map(parse_submission,stream) schema = pa.schema([ pa.field('id', pa.string(),nullable=True), @@ -102,7 +102,7 @@ def parse_dump(partition): writer.close() -def gen_task_list(dumpdir="/gscratch/comdata/raw_data/reddit_dumps/submissions"): +def gen_task_list(dumpdir="/gscratch/comdata/raw_data/submissions"): files = list(find_dumps(dumpdir,base_pattern="RS_20*.*")) with open("submissions_task_list.sh",'w') as of: for fpath in files: