Reuse code for term and author cosine similarity.

[cdsc_reddit.git] / helper.py
diff --git a/helper.py b/helper.py

index 4dc6210bef50e8142f2c56fbc788700cc761a082..af87f71d265501f1f3ca25ecb64882f4ff9997da 100644 (file)
--- a/helper.py
+++ b/helper.py
@@ -17,16 +17,8 @@ def find_dumps(dumpdir, base_pattern):
      ext_priority = ['.zst','.xz','.bz2']
  
      for base, exts in dumpext.items():
-        found = False
-        if len(exts) == 1:
-            yield base + exts[0]
-            found = True
-        else:
-            for ext in ext_priority:
-                if ext in exts:
-                    yield base + ext
-                    found = True
-        assert(found == True)
+        ext = [ext for ext in ext_priority if ext in exts][0]
+        yield base + ext
  
  def open_fileset(files):
      for fh in files:
@@ -48,6 +40,8 @@ def open_input_file(input_filename):
          cmd = ["xzcat",'-dk', '-T 20',input_filename]
      elif re.match(r'.*\.zst',input_filename):
          cmd = ['zstd','-dck', input_filename]
+    elif re.match(r'.*\.gz',input_filename):
+        cmd = ['gzip','-dc', input_filename]
      try:
          input_file = Popen(cmd, stdout=PIPE).stdout
      except NameError as e: