"""Helpers for locating compressed dump files and streaming their lines."""
# Recovered from the git blobdiff of helper.py in
# https://code.communitydata.science/cdsc_reddit.git

from subprocess import Popen, PIPE
import re
from collections import defaultdict
from os import path
import glob

# When the same dump exists in several formats, the earliest entry wins.
_EXT_PRIORITY = ('.zst', '.xz', '.bz2')

# (filename suffix, argv placed before the filename, argv placed after it).
# Every command writes the decompressed data to its stdout.
_DECOMPRESSORS = (
    ('.7z', ["7za", "x", "-so"], ['*']),
    ('.gz', ["zcat"], []),
    ('.bz2', ["bzcat", "-dk"], []),
    ('.bz', ["bzcat", "-dk"], []),
    # NOTE(review): '-T 20' is passed as a single argv entry, exactly as the
    # original did; confirm the installed xz accepts the embedded space.
    ('.xz', ["xzcat", '-dk', '-T 20'], []),
    ('.zst', ['zstd', '-dck'], []),
)


def find_dumps(dumpdir, base_pattern):
    """Yield one path per dump matching ``base_pattern`` inside ``dumpdir``.

    Files are grouped by their path without the final extension.  A dump that
    exists with a single extension is yielded as-is, whatever that extension
    is.  A dump that exists with several extensions is yielded exactly once,
    using the first extension of ``_EXT_PRIORITY`` that is present.

    Args:
        dumpdir: Directory to search.
        base_pattern: Glob pattern, joined onto ``dumpdir``.

    Yields:
        The chosen path for each dump, in the order ``glob`` reported them.

    Raises:
        AssertionError: A dump has several extensions and none of them is in
            ``_EXT_PRIORITY``.
    """
    files = glob.glob(path.join(dumpdir, base_pattern))

    # Map each extension-less path to the extensions found for it.
    dumpext = defaultdict(list)
    for fpath in files:
        fname, ext = path.splitext(fpath)
        dumpext[fname].append(ext)

    for base, exts in dumpext.items():
        if len(exts) == 1:
            yield base + exts[0]
            continue

        # Pick only the best available format; yielding every match would
        # make callers process the same dump more than once.
        preferred = next((ext for ext in _EXT_PRIORITY if ext in exts), None)
        if preferred is None:
            # Raised explicitly (not via ``assert``) so the check survives -O.
            raise AssertionError(
                "no preferred extension for %s among %s" % (base, exts))
        yield base + preferred


def open_fileset(files):
    """Yield every line of every file in ``files``, in order.

    Each filename is printed before it is read.  Lines come from
    ``open_input_file``, so they are ``bytes`` for files read through a
    decompressor pipe and ``str`` for files opened directly.

    Args:
        files: Iterable of input filenames.

    Yields:
        The lines of each file in turn.
    """
    for fh in files:
        print(fh)
        # Close each handle once it is exhausted (or the generator is closed)
        # instead of leaving one open descriptor per input file.
        with open_input_file(fh) as lines:
            for line in lines:
                yield line


def _decompress_command(input_filename):
    """Return the argv that decompresses ``input_filename``, or ``None``."""
    for suffix, before, after in _DECOMPRESSORS:
        # Match on the real end of the name; a suffix appearing elsewhere in
        # the path (e.g. in a directory name) must not select a decompressor.
        if input_filename.endswith(suffix):
            return before + [input_filename] + after
    return None


def open_input_file(input_filename):
    """Open ``input_filename`` for reading, decompressing it if necessary.

    Names ending in ``.7z``, ``.gz``, ``.bz2``, ``.bz``, ``.xz`` or ``.zst``
    are streamed through the matching external command; anything else is
    opened directly in text mode.

    Args:
        input_filename: Path of the file to read.

    Returns:
        A readable file object: the binary stdout pipe of the decompressor
        process, or a text-mode file for unrecognised extensions.

    Raises:
        OSError: The decompressor executable cannot be started, or the plain
            file cannot be opened.
    """
    cmd = _decompress_command(input_filename)
    if cmd is None:
        return open(input_filename, 'r')
    return Popen(cmd, stdout=PIPE).stdout