code.communitydata.science - cdsc_reddit.git/blob - dumps/remove_duplicate_comments.py
from pathlib import Path
from itertools import chain, groupby

dumpdir = Path("/gscratch/comdata/raw_data/reddit_dumps/comments")

# Collect every comment dump, regardless of compression format.
zst_files = dumpdir.glob("*.zst")
bz2_files = dumpdir.glob("*.bz2")
xz_files = dumpdir.glob("*.xz")

# groupby only groups adjacent items, so sort by the grouping key (the stem).
all_files = sorted(chain(zst_files, bz2_files, xz_files), key=lambda p: p.stem)
groups = groupby(all_files, key=lambda p: p.stem)

kept_paths = []
removed_paths = []

# Prefer .zst over .xz over .bz2 when the same dump exists in multiple formats.
priority = ['.zst', '.xz', '.bz2']

for stem, files in groups:
    keep_file = None
    remove_files = []
    for f in files:
        if keep_file is None:
            keep_file = f
        elif priority.index(keep_file.suffix) > priority.index(f.suffix):
            # f's suffix ranks higher (lower index), so keep f instead.
            remove_files.append(keep_file)
            keep_file = f
        else:
            remove_files.append(f)
    kept_paths.append(keep_file)
    removed_paths.extend(remove_files)

# Move the lower-priority duplicates into a holding directory rather than deleting them.
(dumpdir / "to_remove").mkdir(exist_ok=True)

for f in removed_paths:
    f.rename(f.parent / "to_remove" / f.name)
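
For reference, a minimal sketch of how the suffix priority above resolves a set of duplicate dumps; the RC_2015-01 filenames are hypothetical examples, not paths taken from the dump directory.

from pathlib import Path

# Hypothetical duplicates of the same monthly dump in three formats.
candidates = [Path("RC_2015-01.bz2"), Path("RC_2015-01.zst"), Path("RC_2015-01.xz")]
priority = ['.zst', '.xz', '.bz2']

# The file whose suffix appears earliest in the priority list is the one kept.
keep = min(candidates, key=lambda p: priority.index(p.suffix))
assert keep.name == "RC_2015-01.zst"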
