X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/ba886ecf4c1cc5003882be8fa2f8d2e733471c52..7db6288923361564d569b1dfb2b1922664dac5c7:/wikiq diff --git a/wikiq b/wikiq index 7a2f8e4..ad4d549 100755 --- a/wikiq +++ b/wikiq @@ -15,7 +15,7 @@ from hashlib import sha1 from mw.xml_dump import Iterator from mw.lib import persistence -from mw.lib import reverts +import mwreverts from urllib.parse import quote TO_ENCODE = ('title', 'editor') PERSISTENCE_RADIUS=7 @@ -147,7 +147,7 @@ class WikiqParser(): state = persistence.State() window = deque(maxlen=PERSISTENCE_RADIUS) - rev_detector = reverts.Detector() + rev_detector = mwreverts.Detector() # Iterate through a page's revisions for rev in page: @@ -277,7 +277,7 @@ def open_input_file(input_filename): elif re.match(r'.*\.gz$', input_filename): cmd = ["zcat", input_filename] elif re.match(r'.*\.bz2$', input_filename): - cmd = ["zcat", input_filename] + cmd = ["bzcat", "-dk", input_filename] try: input_file = Popen(cmd, stdout=PIPE).stdout