# helper.py -- recovered from a git blobdiff (new file, index 0000000..4dc6210)
# of https://code.communitydata.science/cdsc_reddit.git
"""Helpers for locating compressed dump files and streaming their lines."""
from subprocess import Popen, PIPE
import re
from collections import defaultdict
from os import path
import glob


def find_dumps(dumpdir, base_pattern):
    """Yield one path per dump matched by ``base_pattern`` inside ``dumpdir``.

    Files are grouped by their path with the final extension removed. When a
    dump exists with a single extension that file is yielded as-is. When it
    exists with several extensions, only the most preferred one is yielded,
    in the order ``.zst``, ``.xz``, ``.bz2``.

    Args:
        dumpdir: Directory to search.
        base_pattern: Glob pattern, joined onto ``dumpdir``.

    Yields:
        The selected path for each dump, as a string.

    Raises:
        AssertionError: A dump has several extensions and none of them is in
            the priority list.
    """
    files = glob.glob(path.join(dumpdir, base_pattern))

    # Map each extension-less path to every extension it was found with.
    dumpext = defaultdict(list)
    for fpath in files:
        fname, ext = path.splitext(fpath)
        dumpext[fname].append(ext)

    ext_priority = ['.zst', '.xz', '.bz2']

    for base, exts in dumpext.items():
        if len(exts) == 1:
            yield base + exts[0]
            continue

        # Take only the first (most preferred) match; yielding every match
        # would make callers process the same dump more than once.
        chosen = next((ext for ext in ext_priority if ext in exts), None)
        if chosen is None:
            # Raised explicitly (rather than via ``assert``) so the check
            # still runs under ``python -O``.
            raise AssertionError(
                "no supported extension for %s (found %s)" % (base, exts)
            )
        yield base + chosen


def open_fileset(files):
    """Yield every line of every file in ``files``, in order.

    Each filename is printed before it is read. Lines come from
    ``open_input_file``, so they are ``bytes`` for decompressed inputs and
    ``str`` for plain files.

    Args:
        files: Iterable of input filenames.

    Yields:
        Lines of each input, one at a time.
    """
    for fh in files:
        print(fh)
        # Close the handle/pipe once it is exhausted (or the generator is
        # closed early) instead of leaking one descriptor per file.
        with open_input_file(fh) as lines:
            for line in lines:
                yield line


def _decompress_command(input_filename):
    """Return the decompressor argv for ``input_filename``, or None if plain."""
    if input_filename.endswith('.7z'):
        return ["7za", "x", "-so", input_filename, '*']
    if input_filename.endswith('.gz'):
        return ["zcat", input_filename]
    if input_filename.endswith(('.bz2', '.bz')):
        return ["bzcat", "-dk", input_filename]
    if input_filename.endswith('.xz'):
        return ["xzcat", '-dk', '-T 20', input_filename]
    if input_filename.endswith('.zst'):
        return ['zstd', '-dck', input_filename]
    return None


def open_input_file(input_filename):
    """Open ``input_filename`` for reading, decompressing it when needed.

    The decompressor is chosen from the filename suffix (``.7z``, ``.gz``,
    ``.bz2``, ``.bz``, ``.xz``, ``.zst``) and run as an external process.
    Only a true suffix counts, so a name like ``notes.xz.txt`` is read as a
    plain file.

    Args:
        input_filename: Path of the file to open.

    Returns:
        The binary stdout pipe of the decompressor process for a recognised
        suffix, otherwise a text-mode file object.

    Note:
        The child process is not waited on here; closing the returned pipe
        is the caller's responsibility.
    """
    cmd = _decompress_command(input_filename)
    if cmd is None:
        return open(input_filename, 'r')
    return Popen(cmd, stdout=PIPE).stdout