code.communitydata.science - mediawiki_dump_tools.git/commitdiff
a number of small updates and fixes
author Benjamin Mako Hill <mako@atdot.cc>
Thu, 17 May 2018 21:37:20 +0000 (14:37 -0700)
committer Benjamin Mako Hill <mako@atdot.cc>
Thu, 17 May 2018 21:37:20 +0000 (14:37 -0700)
- fix regex for filename/filetype matches
- extract all files from 7z archives, not just ones that end with .xml
- fix bug that broke stdout
- minor cosmetic fixes
- updated mediawiki-utilities submodule to latest version

Mediawiki-Utilities
wikiq

index beba46e3eee8e0582cc3a5515dfa658ffbd18f9d..f7329417ebb2f03d1e9b8a626236a3c0ce65c814 160000 (submodule)
@@ -1 +1 @@
-Subproject commit beba46e3eee8e0582cc3a5515dfa658ffbd18f9d
+Subproject commit f7329417ebb2f03d1e9b8a626236a3c0ce65c814
diff --git a/wikiq b/wikiq
index f25874e41b24edf898c4e773d9d8c73581eeddb8..7a2f8e45ca20d61464e042704f675f214fd0be09 100755 (executable)
--- a/wikiq
+++ b/wikiq
@@ -272,11 +272,11 @@ class WikiqParser():
 
 
 def open_input_file(input_filename):
 
 
 def open_input_file(input_filename):
-    if re.match(r'.*\.7z', input_filename):
-        cmd = ["7za", "x", "-so", input_filename, '*.xml'] 
-    elif re.match(r'.*\.gz', input_filename):
+    if re.match(r'.*\.7z$', input_filename):
+        cmd = ["7za", "x", "-so", input_filename, '*'] 
+    elif re.match(r'.*\.gz$', input_filename):
         cmd = ["zcat", input_filename] 
         cmd = ["zcat", input_filename] 
-    elif re.match(r'.*\.bz2', input_filename):
+    elif re.match(r'.*\.bz2$', input_filename):
         cmd = ["zcat", input_filename] 
 
     try:
         cmd = ["zcat", input_filename] 
 
     try:
@@ -322,24 +322,25 @@ if len(args.dumpfiles) > 0:
     for filename in args.dumpfiles:
         input_file = open_input_file(filename)
 
     for filename in args.dumpfiles:
         input_file = open_input_file(filename)
 
-        # open file for output
+        # open directory for output
+        if args.output_dir:
+            output_dir = args.output_dir[0]
+        else:
+            output_dir = "."
+
+        print("Processing file: %s" % filename, file=sys.stderr)
+
         if args.stdout:
             output_file = sys.stdout
         else:
         if args.stdout:
             output_file = sys.stdout
         else:
-            if args.output_dir:
-                output_dir = args.output_dir[0]
-            else:
-                output_dir = "."
-
             filename = os.path.join(output_dir, os.path.basename(filename))
             output_file = open_output_file(filename)
 
         wikiq = WikiqParser(input_file, output_file, 
             filename = os.path.join(output_dir, os.path.basename(filename))
             output_file = open_output_file(filename)
 
         wikiq = WikiqParser(input_file, output_file, 
-                           collapse_user=args.collapse_user,
+                            collapse_user=args.collapse_user,
                             persist=args.persist,
                             urlencode=args.urlencode)
 
                             persist=args.persist,
                             urlencode=args.urlencode)
 
-        print("Processing file: %s" % filename, file=sys.stderr)
 
         wikiq.process()
 
 
         wikiq.process()
 
@@ -348,7 +349,7 @@ if len(args.dumpfiles) > 0:
         output_file.close()
 else:
     wikiq = WikiqParser(sys.stdin, sys.stdout,
         output_file.close()
 else:
     wikiq = WikiqParser(sys.stdin, sys.stdout,
-                       collapse_user=args.collapse_user,
+                        collapse_user=args.collapse_user,
                         persist=args.persist,
                         urlencode=args.urlencode)
     wikiq.process()
                         persist=args.persist,
                         urlencode=args.urlencode)
     wikiq.process()

Community Data Science Collective || Want to submit a patch?