#!/usr/bin/env python3
# Extract variables from pushshift comments to parquet.
import pyspark
from pyspark.sql import functions as f
from pyspark.sql.types import *
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

conf = SparkConf().setAppName("Reddit comments to parquet")
conf = conf.set('spark.sql.crossJoin.enabled', "true")

spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

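# Glob matching the compressed pushshift comment dumps on the local filesystem.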
globstr = "/gscratch/comdata/raw_data/reddit_dumps/RC_20*.bz2"

import re
import glob
import json
from subprocess import Popen, PIPE
from datetime import datetime
import pandas as pd
from multiprocessing import Pool

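# Generator that yields each comment (parsed from json) across all files matching
# the glob. Not used by the Spark pipeline below, which reads the files directly
# with sc.textFile.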
def open_fileset(globstr):
    files = glob.glob(globstr)
    for fh in files:
        print(fh)
        lines = open_input_file(fh)
        for line in lines:
            yield json.loads(line)

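# Return a file-like object of the decompressed contents of a dump file, shelling
# out to the decompressor that matches the file extension. If no extension matches,
# cmd stays unbound and the NameError fallback opens the file as plain text.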
def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]
    elif re.match(r'.*\.bz', input_filename):
        cmd = ["bzcat", "-dk", input_filename]
    elif re.match(r'.*\.xz', input_filename):
        cmd = ["xzcat", "-dk", input_filename]
    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        input_file = open(input_filename, 'r')
    return input_file

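# Predicate for restricting comments to a set of subreddits (unused below).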
def include_row(comment, subreddits_to_track=[]):
    subreddit = comment['subreddit'].lower()
    return subreddit in subreddits_to_track

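# Parse a single json line from a comment dump into a tuple of the fields named in
# `names` (defaulting to the schema fields below). The 'edited' field in pushshift
# data is either False or an epoch timestamp, so it is split into an 'edited'
# boolean and a 'time_edited' timestamp. Malformed json yields a row of Nones with
# the exception recorded in the trailing 'error' field.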
def parse_comment(comment, names=None):
    if names is None:
        names = ["id", "subreddit", "link_id", "parent_id", "created_utc", "author", "ups", "downs", "score", "edited", "time_edited", "subreddit_type", "subreddit_id", "stickied", "is_submitter", "body", "error"]

    try:
        comment = json.loads(comment)
    except json.decoder.JSONDecodeError as e:
        print(e)
        print(comment)
        row = [None for _ in names]
        row[-1] = "json.decoder.JSONDecodeError|{0}|{1}".format(e, comment)
        return tuple(row)

    row = []
    for name in names:
        if name == 'created_utc':
            row.append(datetime.fromtimestamp(int(comment['created_utc']), tz=None))
        elif name == 'edited':
            val = comment[name]
            if type(val) == bool:
                row.append(val)
                row.append(None)
            else:
                row.append(True)
                row.append(datetime.fromtimestamp(int(val), tz=None))
        elif name == "time_edited":
            continue
        elif name not in comment:
            row.append(None)
        else:
            row.append(comment[name])

    return tuple(row)

#    conf = sc._conf.setAll([('spark.executor.memory', '20g'), ('spark.app.name', 'extract_reddit_timeline'), ('spark.executor.cores', '26'), ('spark.cores.max', '26'), ('spark.driver.memory','84g'),('spark.driver.maxResultSize','0'),('spark.local.dir','/gscratch/comdata/spark_tmp')])

sqlContext = SQLContext(sc)

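# Read the dumps as an RDD of raw json lines; Spark decompresses the bz2 files transparently.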
comments = sc.textFile(globstr)

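# Explicit schema for the parsed comments; fields missing from older dumps are left null.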
schema = StructType().add("id", StringType(), True)
schema = schema.add("subreddit", StringType(), True)
schema = schema.add("link_id", StringType(), True)
schema = schema.add("parent_id", StringType(), True)
schema = schema.add("created_utc", DateType(), True)
schema = schema.add("author", StringType(), True)
schema = schema.add("ups", LongType(), True)
schema = schema.add("downs", LongType(), True)
schema = schema.add("score", LongType(), True)
schema = schema.add("edited", BooleanType(), True)
schema = schema.add("time_edited", DateType(), True)
schema = schema.add("subreddit_type", StringType(), True)
schema = schema.add("subreddit_id", StringType(), True)
schema = schema.add("stickied", BooleanType(), True)
schema = schema.add("is_submitter", BooleanType(), True)
schema = schema.add("body", StringType(), True)
schema = schema.add("error", StringType(), True)

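# Parse each line into a tuple in the same order as the schema's field names.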
rows = comments.map(lambda c: parse_comment(c, schema.fieldNames()))

df = sqlContext.createDataFrame(rows, schema)

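# Derive partition columns from the comment timestamp plus a 3-character prefix of
# the sha2 hash of the subreddit name, then sort so rows from the same
# subreddit/author/thread end up adjacent in the output files.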
df = df.withColumnRenamed("created_utc", "CreatedAt")
df = df.withColumn("Month", f.month(f.col("CreatedAt")))
df = df.withColumn("Year", f.year(f.col("CreatedAt")))
df = df.withColumn("Day", f.dayofmonth(f.col("CreatedAt")))
df = df.withColumn("subreddit_hash", f.sha2(f.col("subreddit"), 256)[0:3])
df = df.sort(["subreddit", "author", "link_id", "parent_id", "Year", "Month", "Day"], ascending=True)
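# Write a single parquet dataset partitioned by Year and Month.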
df.write.parquet("/gscratch/comdata/output/reddit_dumps/reddit_comments.parquet", partitionBy=["Year", "Month"], mode='overwrite')
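
# A minimal sketch of reading the output back (the filter values below are illustrative):
#   comments = spark.read.parquet("/gscratch/comdata/output/reddit_dumps/reddit_comments.parquet")
#   comments.filter((f.col("Year") == 2015) & (f.col("Month") == 6)).select("subreddit", "body").show()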
