From: Nate E TeBlunthuis Date: Sat, 3 Oct 2020 23:42:22 +0000 (-0700) Subject: Update reddit comments data with daily dumps. X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/commitdiff_plain/4ced659d1961630c20a1ef817422f242f723af7f Update reddit comments data with daily dumps. --- diff --git a/check_comments_shas.py b/check_comments_shas.py old mode 100644 new mode 100755 index a2bc89b..199261c --- a/check_comments_shas.py +++ b/check_comments_shas.py @@ -5,8 +5,10 @@ import requests from os import path import hashlib -shasums = requests.get("https://files.pushshift.io/reddit/comments/sha256sums.txt").text +shasums1 = requests.get("https://files.pushshift.io/reddit/comments/sha256sum.txt").text +shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text +shasums = shasums1 + shasums2 dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments" for l in shasums.strip().split('\n'): diff --git a/comments_2_parquet.sh b/comments_2_parquet.sh index 9233138..e9818c1 100755 --- a/comments_2_parquet.sh +++ b/comments_2_parquet.sh @@ -1,9 +1,6 @@ ## needs to be run by hand since i don't have a nice way of waiting on a parallel-sql job to complete #!/usr/bin/env bash - - - echo "#!/usr/bin/bash" > job_script.sh echo "source $(pwd)/../bin/activate" >> job_script.sh echo "python3 $(pwd)/comments_2_parquet_part1.py" >> job_script.sh diff --git a/helper.py b/helper.py index b401cad..af87f71 100644 --- a/helper.py +++ b/helper.py @@ -40,6 +40,8 @@ def open_input_file(input_filename): cmd = ["xzcat",'-dk', '-T 20',input_filename] elif re.match(r'.*\.zst',input_filename): cmd = ['zstd','-dck', input_filename] + elif re.match(r'.*\.gz',input_filename): + cmd = ['gzip','-dc', input_filename] try: input_file = Popen(cmd, stdout=PIPE).stdout except NameError as e: diff --git a/pull_pushshift_comments.sh b/pull_pushshift_comments.sh index 243e464..3f6d2c9 100755 --- a/pull_pushshift_comments.sh +++ b/pull_pushshift_comments.sh @@ -4,8 +4,11 @@ user_agent='nathante teblunthuis ' output_dir='/gscratch/comdata/raw_data/reddit_dumps/comments' base_url='https://files.pushshift.io/reddit/comments/' -wget -r --no-parent -A 'RC_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url -wget -r --no-parent -A 'RC_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url -wget -r --no-parent -A 'RC_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url +wget -r --no-parent -A 'RC_201*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url +wget -r --no-parent -A 'RC_201*.xz' -U $user_agent -P $output_dir -nd -nc $base_url +wget -r --no-parent -A 'RC_201*.zst' -U $user_agent -P $output_dir -nd -nc $base_url -./check_comment_shas.py +# starting in 2020 we use daily dumps not monthly dumps +wget -r --no-parent -A 'RC_202*.gz' -U $user_agent -P $output_dir -nd -nc $base_url/daily/ + +./check_comments_shas.py