dumps/check_comments_shas.py

   1 #!/usr/bin/env python3
   2 # run from a build_machine
   3
   4 import requests
   5 from os import path
   6 import hashlib
   7
   8 shasums1 = requests.get("https://files.pushshift.io/reddit/comments/sha256sum.txt").text
   9 shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text
  10
  11 shasums = shasums1 + shasums2
  12 dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments"
  13
  14 for l in shasums.strip().split('\n'):
  15     sha256_hash = hashlib.sha256()
  16     parts = l.split(' ')
  17
  18     correct_sha256 = parts[0]
  19     filename = parts[-1]
  20     print(f"checking {filename}")
  21     fpath = path.join(dumpdir,filename)
  22     if path.isfile(fpath):
  23         with open(fpath,'rb') as f:
  24             for byte_block in iter(lambda: f.read(4096),b""):
  25                 sha256_hash.update(byte_block)
  26
  27         if sha256_hash.hexdigest() == correct_sha256:
  28             print(f"{filename} checks out")
  29         else:
  30             print(f"ERROR! {filename} has the wrong hash. Redownload and recheck!")
  31     else:
  32         print(f"Skipping {filename} as it doesn't exist")
  33