From: Benjamin Mako Hill Date: Thu, 17 May 2018 21:37:20 +0000 (-0700) Subject: a number of small updates and fixes X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/commitdiff_plain/ba886ecf4c1cc5003882be8fa2f8d2e733471c52?ds=inline a number of small updates and fixes - fix regex for filename/filetype matches - unload all files, not just ones that end with xml, in 7z archives - fix bug that broke stdout - minor cosmetic fixes - updated mediawiki-utilities submodule to latest version --- diff --git a/Mediawiki-Utilities b/Mediawiki-Utilities index beba46e..f732941 160000 --- a/Mediawiki-Utilities +++ b/Mediawiki-Utilities @@ -1 +1 @@ -Subproject commit beba46e3eee8e0582cc3a5515dfa658ffbd18f9d +Subproject commit f7329417ebb2f03d1e9b8a626236a3c0ce65c814 diff --git a/wikiq b/wikiq index f25874e..7a2f8e4 100755 --- a/wikiq +++ b/wikiq @@ -272,11 +272,11 @@ class WikiqParser(): def open_input_file(input_filename): - if re.match(r'.*\.7z', input_filename): - cmd = ["7za", "x", "-so", input_filename, '*.xml'] - elif re.match(r'.*\.gz', input_filename): + if re.match(r'.*\.7z$', input_filename): + cmd = ["7za", "x", "-so", input_filename, '*'] + elif re.match(r'.*\.gz$', input_filename): cmd = ["zcat", input_filename] - elif re.match(r'.*\.bz2', input_filename): + elif re.match(r'.*\.bz2$', input_filename): cmd = ["zcat", input_filename] try: @@ -322,24 +322,25 @@ if len(args.dumpfiles) > 0: for filename in args.dumpfiles: input_file = open_input_file(filename) - # open file for output + # open directory for output + if args.output_dir: + output_dir = args.output_dir[0] + else: + output_dir = "." + + print("Processing file: %s" % filename, file=sys.stderr) + if args.stdout: output_file = sys.stdout else: - if args.output_dir: - output_dir = args.output_dir[0] - else: - output_dir = "." 
- filename = os.path.join(output_dir, os.path.basename(filename)) output_file = open_output_file(filename) wikiq = WikiqParser(input_file, output_file, - collapse_user=args.collapse_user, + collapse_user=args.collapse_user, persist=args.persist, urlencode=args.urlencode) - print("Processing file: %s" % filename, file=sys.stderr) wikiq.process() @@ -348,7 +349,7 @@ if len(args.dumpfiles) > 0: output_file.close() else: wikiq = WikiqParser(sys.stdin, sys.stdout, - collapse_user=args.collapse_user, + collapse_user=args.collapse_user, persist=args.persist, urlencode=args.urlencode) wikiq.process()