]> code.communitydata.science - cdsc_reddit.git/blob - helper.py
Build comments dataset similarly to submissions and improve partitioning scheme
[cdsc_reddit.git] / helper.py
1 from subprocess import Popen, PIPE
2 import re
3 from collections import defaultdict
4 from os import path
5 import glob
6
def find_dumps(dumpdir, base_pattern):
    """Yield one path for each dump in *dumpdir* matching *base_pattern*.

    Paths returned by the glob are grouped by their name with the final
    extension removed, so the same dump stored under several compression
    formats is treated as a single dump.

    * If a dump exists under exactly one extension, that path is yielded
      whatever the extension is.
    * If it exists under several, only the highest-priority one among
      ``.zst``, ``.xz``, ``.bz2`` is yielded.

    Raises:
        AssertionError: a dump exists under several extensions and none of
            them is in the priority list.
    """
    files = glob.glob(path.join(dumpdir, base_pattern))

    # Map "path without final extension" -> list of extensions found for it.
    dumpext = defaultdict(list)
    for fpath in files:
        fname, ext = path.splitext(fpath)
        dumpext[fname].append(ext)

    # Most preferred format first.
    ext_priority = ['.zst', '.xz', '.bz2']

    for base, exts in dumpext.items():
        if len(exts) == 1:
            yield base + exts[0]
            continue

        # Pick only the first (best) match: yielding every available format
        # would hand the same dump to the caller more than once.
        chosen = next((ext for ext in ext_priority if ext in exts), None)
        if chosen is None:
            # Raised explicitly (rather than via `assert`) so the check still
            # runs under `python -O`; the exception type is kept for callers.
            raise AssertionError(
                "no supported extension for %s among %s" % (base, exts))
        yield base + chosen
30
def open_fileset(files):
    """Yield every line of every file in *files*, one file after another.

    Each filename is printed before its contents are read, and the file is
    opened through ``open_input_file``.
    """
    for filename in files:
        print(filename)
        for record in open_input_file(filename):
            yield record
37
def open_input_file(input_filename):
    """Open *input_filename* for reading, decompressing it if necessary.

    The compression format is chosen from the filename suffix and the file
    is streamed through the matching external tool:

    * ``.7z``          -> ``7za x -so``
    * ``.gz``          -> ``zcat``
    * ``.bz2``, ``.bz`` -> ``bzcat -dk``
    * ``.xz``          -> ``xzcat -dk -T 20``
    * ``.zst``         -> ``zstd -dck``

    Any other name is opened directly as a text file.

    Returns:
        For a compressed file, the stdout pipe of the decompression
        subprocess (a binary stream); otherwise a text-mode file object.
    """
    # Match on the real suffix only, so names such as "dump.xz.txt" or
    # "notes.bzr.log" are not mistaken for compressed archives.
    if input_filename.endswith('.7z'):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif input_filename.endswith('.gz'):
        cmd = ["zcat", input_filename]
    elif input_filename.endswith(('.bz2', '.bz')):
        cmd = ["bzcat", "-dk", input_filename]
    elif input_filename.endswith('.xz'):
        cmd = ["xzcat", '-dk', '-T 20', input_filename]
    elif input_filename.endswith('.zst'):
        cmd = ['zstd', '-dck', input_filename]
    else:
        # No known compression suffix: read the file as-is.
        return open(input_filename, 'r')

    return Popen(cmd, stdout=PIPE).stdout
57

Community Data Science Collective || Want to submit a patch?