datasets/comments_2_parquet_part1.py

   1 #!/usr/bin/env python3
   2 import os
   3 import json
   4 from datetime import datetime
   5 from multiprocessing import Pool
   6 from itertools import islice
   7 from helper import open_input_file, find_dumps
   8 import pandas as pd
   9 import pyarrow as pa
  10 import pyarrow.parquet as pq
  11 from pathlib import Path
  12 import fire
  13
  14 def parse_comment(comment, names= None):
  15     if names is None:
  16         names = ["id","subreddit","link_id","parent_id","created_utc","author","ups","downs","score","edited","subreddit_type","subreddit_id","stickied","is_submitter","body","error"]
  17
  18     try:
  19         comment = json.loads(comment)
  20     except json.decoder.JSONDecodeError as e:
  21         print(e)
  22         print(comment)
  23         row = [None for _ in names]
  24         row[-1] = "json.decoder.JSONDecodeError|{0}|{1}".format(e,comment)
  25         return tuple(row)
  26
  27     row = []
  28     for name in names:
  29         if name == 'created_utc':
  30             row.append(datetime.fromtimestamp(int(comment['created_utc']),tz=None))
  31         elif name == 'edited':
  32             val = comment[name]
  33             if type(val) == bool:
  34                 row.append(val)
  35                 row.append(None)
  36             else:
  37                 row.append(True)
  38                 row.append(datetime.fromtimestamp(int(val),tz=None))
  39         elif name == "time_edited":
  40             continue
  41         elif name not in comment:
  42             row.append(None)
  43
  44         else:
  45             row.append(comment[name])
  46
  47     return tuple(row)
  48
  49
  50 #    conf = sc._conf.setAll([('spark.executor.memory', '20g'), ('spark.app.name', 'extract_reddit_timeline'), ('spark.executor.cores', '26'), ('spark.cores.max', '26'), ('spark.driver.memory','84g'),('spark.driver.maxResultSize','0'),('spark.local.dir','/gscratch/comdata/spark_tmp')])
  51
  52 def parse_dump(partition):
  53
  54     dumpdir = f"/gscratch/comdata/raw_data/reddit_dumps/comments/{partition}"
  55
  56     stream = open_input_file(dumpdir)
  57     rows = map(parse_comment, stream)
  58
  59     schema = pa.schema([
  60         pa.field('id', pa.string(), nullable=True),
  61         pa.field('subreddit', pa.string(), nullable=True),
  62         pa.field('link_id', pa.string(), nullable=True),
  63         pa.field('parent_id', pa.string(), nullable=True),
  64         pa.field('created_utc', pa.timestamp('ms'), nullable=True),
  65         pa.field('author', pa.string(), nullable=True),
  66         pa.field('ups', pa.int64(), nullable=True),
  67         pa.field('downs', pa.int64(), nullable=True),
  68         pa.field('score', pa.int64(), nullable=True),
  69         pa.field('edited', pa.bool_(), nullable=True),
  70         pa.field('time_edited', pa.timestamp('ms'), nullable=True),
  71         pa.field('subreddit_type', pa.string(), nullable=True),
  72         pa.field('subreddit_id', pa.string(), nullable=True),
  73         pa.field('stickied', pa.bool_(), nullable=True),
  74         pa.field('is_submitter', pa.bool_(), nullable=True),
  75         pa.field('body', pa.string(), nullable=True),
  76         pa.field('error', pa.string(), nullable=True),
  77     ])
  78
  79     p = Path("/gscratch/comdata/output/temp/reddit_comments.parquet")
  80     p.mkdir(exist_ok=True,parents=True)
  81
  82     N=10000
  83     with pq.ParquetWriter(f"/gscratch/comdata/output/temp/reddit_comments.parquet/{partition}.parquet",
  84                           schema=schema,
  85                           compression='snappy',
  86                           flavor='spark') as writer:
  87
  88         while True:
  89             chunk = islice(rows,N)
  90             pddf = pd.DataFrame(chunk, columns=schema.names)
  91             table = pa.Table.from_pandas(pddf,schema=schema)
  92             if table.shape[0] == 0:
  93                 break
  94             writer.write_table(table)
  95
  96         writer.close()
  97
  98
  99 def gen_task_list(dumpdir="/gscratch/comdata/raw_data/reddit_dumps/comments", overwrite=True):
 100     files = list(find_dumps(dumpdir,base_pattern="RC_20*.*"))
 101     with open("comments_task_list.sh",'w') as of:
 102         for fpath in files:
 103             partition = os.path.split(fpath)[1]
 104             if (not Path(f"/gscratch/comdata/output/temp/reddit_comments.parquet/{partition}.parquet").exists()) or (overwrite is True):
 105                 of.write(f'python3 comments_2_parquet_part1.py parse_dump {partition}\n')
 106
 107
 108 if __name__ == '__main__':
 109     fire.Fire({'parse_dump':parse_dump,
 110               'gen_task_list':gen_task_list})
 111