X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/40d45637702fb51feb9f99ff7f6d71787af765ed..4ced659d1961630c20a1ef817422f242f723af7f:/helper.py?ds=sidebyside diff --git a/helper.py b/helper.py index 4dc6210..af87f71 100644 --- a/helper.py +++ b/helper.py @@ -17,16 +17,8 @@ def find_dumps(dumpdir, base_pattern): ext_priority = ['.zst','.xz','.bz2'] for base, exts in dumpext.items(): - found = False - if len(exts) == 1: - yield base + exts[0] - found = True - else: - for ext in ext_priority: - if ext in exts: - yield base + ext - found = True - assert(found == True) + ext = [ext for ext in ext_priority if ext in exts][0] + yield base + ext def open_fileset(files): for fh in files: @@ -48,6 +40,8 @@ def open_input_file(input_filename): cmd = ["xzcat",'-dk', '-T 20',input_filename] elif re.match(r'.*\.zst',input_filename): cmd = ['zstd','-dck', input_filename] + elif re.match(r'.*\.gz',input_filename): + cmd = ['gzip','-dc', input_filename] try: input_file = Popen(cmd, stdout=PIPE).stdout except NameError as e: