1 from pathlib import Path
2 from itertools import chain, groupby
# Directory holding the compressed dump files to de-duplicate.
dumpdir = Path("/gscratch/comdata/raw_data/reddit_dumps/submissions")

# Collect every dump file in any of the three supported compression formats.
zst_files = dumpdir.glob("*.zst")
bz2_files = dumpdir.glob("*.bz2")
xz_files = dumpdir.glob("*.xz")

# groupby() only merges *adjacent* items with equal keys, so the list must be
# sorted by the very same key (the stem). Sorting by full path is not
# equivalent: e.g. "a.bz2" < "a.c.zst" < "a.zst" would split stem "a" in two.
all_files = sorted(chain(zst_files, bz2_files, xz_files), key=lambda p: p.stem)
groups = groupby(all_files, key=lambda p: p.stem)

# Compression formats ordered from most to least preferred: when the same
# stem exists in several formats, the one with the lowest index here is kept.
priority = ['.zst', '.xz', '.bz2']

kept_paths = []
removed_paths = []

# NOTE(review): parts of this loop (initial candidate selection and the
# fallback branch) were not visible in the reviewed text and have been
# reconstructed from the surrounding statements -- confirm against the
# original before running.
for stem, files in groups:
    keep_file = None
    remove_files = []
    for f in files:
        if keep_file is None:
            # First file seen for this stem is the provisional keeper.
            keep_file = f
        elif priority.index(keep_file.suffix) > priority.index(f.suffix):
            # f uses a more preferred format: demote the current keeper.
            remove_files.append(keep_file)
            keep_file = f
        else:
            remove_files.append(f)
    kept_paths.append(keep_file)
    removed_paths.extend(remove_files)

# Duplicates are moved aside rather than deleted. exist_ok=True lets the
# script be re-run after a previous (possibly partial) run.
(dumpdir / "to_remove").mkdir(exist_ok=True)

for f in removed_paths:
    f.rename(f.parent / "to_remove" / f.name)