from mw.xml_dump import Iterator
from mw.lib import persistence
-from mw.lib import reverts
+import mwreverts
from urllib.parse import quote
TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS=7
state = persistence.State()
window = deque(maxlen=PERSISTENCE_RADIUS)
- rev_detector = reverts.Detector()
+ rev_detector = mwreverts.Detector()
# Iterate through a page's revisions
for rev in page:
def open_input_file(input_filename):
- if re.match(r'.*\.7z', input_filename):
- cmd = ["7za", "x", "-so", input_filename, '*.xml']
- elif re.match(r'.*\.gz', input_filename):
- cmd = ["zcat", input_filename]
- elif re.match(r'.*\.bz2', input_filename):
+ if re.match(r'.*\.7z$', input_filename):
+ cmd = ["7za", "x", "-so", input_filename, '*']
+ elif re.match(r'.*\.gz$', input_filename):
cmd = ["zcat", input_filename]
+ elif re.match(r'.*\.bz2$', input_filename):
+ cmd = ["bzcat", "-dk", input_filename]
try:
input_file = Popen(cmd, stdout=PIPE).stdout
for filename in args.dumpfiles:
input_file = open_input_file(filename)
- # open file for output
+ # select the output directory (defaults to the current directory)
+ if args.output_dir:
+ output_dir = args.output_dir[0]
+ else:
+ output_dir = "."
+
+ print("Processing file: %s" % filename, file=sys.stderr)
+
if args.stdout:
output_file = sys.stdout
else:
- if args.output_dir:
- output_dir = args.output_dir[0]
- else:
- output_dir = "."
-
filename = os.path.join(output_dir, os.path.basename(filename))
output_file = open_output_file(filename)
wikiq = WikiqParser(input_file, output_file,
- collapse_user=args.collapse_user,
+ collapse_user=args.collapse_user,
persist=args.persist,
urlencode=args.urlencode)
- print("Processing file: %s" % filename, file=sys.stderr)
wikiq.process()
output_file.close()
else:
wikiq = WikiqParser(sys.stdin, sys.stdout,
- collapse_user=args.collapse_user,
+ collapse_user=args.collapse_user,
persist=args.persist,
urlencode=args.urlencode)
wikiq.process()