Build comments dataset similarly to submissions and improve partitioning scheme

[cdsc_reddit.git] / helper.py
diff --git a/helper.py b/helper.py

new file mode 100644 (file)

index 0000000..4dc6210
--- /dev/null
+++ b/helper.py
@@ -0,0 +1,57 @@
+from subprocess import Popen, PIPE
+import re
+from collections import defaultdict
+from os import path
+import glob
+
+def find_dumps(dumpdir, base_pattern):
+    """Yield exactly one file path for every dump found in ``dumpdir``.
+
+    Files matching the glob ``base_pattern`` inside ``dumpdir`` are grouped
+    by their path with the final extension removed.  A dump that exists in
+    only one form is yielded as-is, whatever its extension.  A dump that
+    exists in several forms is yielded once, using the first extension from
+    the priority list ``.zst``, ``.xz``, ``.bz2`` that is present.
+
+    Args:
+        dumpdir: Directory to search.
+        base_pattern: Glob pattern, relative to ``dumpdir``.
+
+    Yields:
+        One path (as returned by ``glob``) per distinct dump.
+
+    Raises:
+        AssertionError: A dump exists under several extensions and none of
+            them is in the priority list.
+    """
+    files = glob.glob(path.join(dumpdir, base_pattern))
+
+    # Map "path without final extension" -> every extension seen for it.
+    dumpext = defaultdict(list)
+    for fpath in files:
+        fname, ext = path.splitext(fpath)
+        dumpext[fname].append(ext)
+
+    ext_priority = ['.zst', '.xz', '.bz2']
+
+    for base, exts in dumpext.items():
+        if len(exts) == 1:
+            yield base + exts[0]
+            continue
+
+        # Take only the FIRST preferred extension: yielding every match
+        # would hand the same dump to the caller more than once.
+        chosen = next((ext for ext in ext_priority if ext in exts), None)
+        if chosen is None:
+            # Raised explicitly (rather than via ``assert``) so the check
+            # still runs under ``python -O``; the exception type is kept.
+            raise AssertionError(
+                "no supported extension for dump %r (found %r)" % (base, exts)
+            )
+        yield base + chosen
+
+def open_fileset(files):
+    """Chain the lines of several input files into a single stream.
+
+    Each path in ``files`` is printed to stdout, opened with
+    ``open_input_file``, and iterated to exhaustion before the next path
+    is touched.
+
+    Args:
+        files: Iterable of input file paths.
+
+    Yields:
+        Every line of every file, in the order the files were given.
+    """
+    for dump_path in files:
+        # Progress output: shows which file is currently being read.
+        print(dump_path)
+        yield from open_input_file(dump_path)
+
+def open_input_file(input_filename):
+    """Open ``input_filename`` for reading, decompressing it if necessary.
+
+    The decompressor is picked from the filename suffix:
+
+    * ``.7z``           -> ``7za x -so``
+    * ``.gz``           -> ``zcat``
+    * ``.bz2`` / ``.bz`` -> ``bzcat -dk``
+    * ``.xz``           -> ``xzcat -dk -T 20``
+    * ``.zst``          -> ``zstd -dck``
+
+    Any other name is opened directly as a text file.
+
+    Args:
+        input_filename: Path of the file to read.
+
+    Returns:
+        A readable file object.  For compressed inputs this is the stdout
+        pipe of the decompressor subprocess (opened without text mode, so
+        it yields ``bytes``); for everything else it is the result of
+        ``open(input_filename, 'r')`` (which yields ``str``).
+    """
+    # Only the END of the name decides the format; a name that merely
+    # contains ".xz" or ".bz" somewhere in the middle is a plain file.
+    if input_filename.endswith('.7z'):
+        cmd = ["7za", "x", "-so", input_filename, '*']
+    elif input_filename.endswith('.gz'):
+        cmd = ["zcat", input_filename]
+    elif input_filename.endswith(('.bz2', '.bz')):
+        cmd = ["bzcat", "-dk", input_filename]
+    elif input_filename.endswith('.xz'):
+        # Option and value are separate argv entries, exactly as a shell
+        # would split "-T 20".
+        cmd = ["xzcat", "-dk", "-T", "20", input_filename]
+    elif input_filename.endswith('.zst'):
+        cmd = ["zstd", "-dck", input_filename]
+    else:
+        cmd = None
+
+    if cmd is None:
+        # Unrecognised suffix: assume an uncompressed text file.
+        return open(input_filename, 'r')
+
+    # NOTE: only the pipe is returned; the Popen object itself is dropped,
+    # so the caller cannot wait on or inspect the child process.
+    return Popen(cmd, stdout=PIPE).stdout
+