X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/3f9da4074733981fa1dda6e88af75a19054e6c52..7db6288923361564d569b1dfb2b1922664dac5c7:/wikiq diff --git a/wikiq b/wikiq index f25874e..ad4d549 100755 --- a/wikiq +++ b/wikiq @@ -15,7 +15,7 @@ from hashlib import sha1 from mw.xml_dump import Iterator from mw.lib import persistence -from mw.lib import reverts +import mwreverts from urllib.parse import quote TO_ENCODE = ('title', 'editor') PERSISTENCE_RADIUS=7 @@ -147,7 +147,7 @@ class WikiqParser(): state = persistence.State() window = deque(maxlen=PERSISTENCE_RADIUS) - rev_detector = reverts.Detector() + rev_detector = mwreverts.Detector() # Iterate through a page's revisions for rev in page: @@ -272,12 +272,12 @@ class WikiqParser(): def open_input_file(input_filename): - if re.match(r'.*\.7z', input_filename): - cmd = ["7za", "x", "-so", input_filename, '*.xml'] - elif re.match(r'.*\.gz', input_filename): - cmd = ["zcat", input_filename] - elif re.match(r'.*\.bz2', input_filename): + if re.match(r'.*\.7z$', input_filename): + cmd = ["7za", "x", "-so", input_filename, '*'] + elif re.match(r'.*\.gz$', input_filename): cmd = ["zcat", input_filename] + elif re.match(r'.*\.bz2$', input_filename): + cmd = ["bzcat", "-dk", input_filename] try: input_file = Popen(cmd, stdout=PIPE).stdout @@ -322,24 +322,25 @@ if len(args.dumpfiles) > 0: for filename in args.dumpfiles: input_file = open_input_file(filename) - # open file for output + # open directory for output + if args.output_dir: + output_dir = args.output_dir[0] + else: + output_dir = "." + + print("Processing file: %s" % filename, file=sys.stderr) + if args.stdout: output_file = sys.stdout else: - if args.output_dir: - output_dir = args.output_dir[0] - else: - output_dir = "." - filename = os.path.join(output_dir, os.path.basename(filename)) output_file = open_output_file(filename) wikiq = WikiqParser(input_file, output_file, - collapse_user=args.collapse_user, + collapse_user=args.collapse_user, persist=args.persist, urlencode=args.urlencode) - print("Processing file: %s" % filename, file=sys.stderr) wikiq.process() @@ -348,7 +349,7 @@ if len(args.dumpfiles) > 0: output_file.close() else: wikiq = WikiqParser(sys.stdin, sys.stdout, - collapse_user=args.collapse_user, + collapse_user=args.collapse_user, persist=args.persist, urlencode=args.urlencode) wikiq.process()