added support to parse namespaces from title

author Benjamin Mako Hill <mako@atdot.cc>

Thu, 23 Jul 2015 19:12:20 +0000 (12:12 -0700)

committer Benjamin Mako Hill <mako@atdot.cc>

Thu, 23 Jul 2015 19:12:20 +0000 (12:12 -0700)
author Benjamin Mako Hill <mako@atdot.cc>
Thu, 23 Jul 2015 19:12:20 +0000 (12:12 -0700)
committer Benjamin Mako Hill <mako@atdot.cc>
Thu, 23 Jul 2015 19:12:20 +0000 (12:12 -0700)
diff --git a/Mediawiki-Utilities b/Mediawiki-Utilities

index ddd3ea3442ca0450ab16f88c5fab674551d35ee7..beba46e3eee8e0582cc3a5515dfa658ffbd18f9d 160000 (submodule)
--- a/Mediawiki-Utilities
+++ b/Mediawiki-Utilities
@@ -1 +1 @@
-Subproject commit ddd3ea3442ca0450ab16f88c5fab674551d35ee7
+Subproject commit beba46e3eee8e0582cc3a5515dfa658ffbd18f9d
diff --git a/wikiq b/wikiq

index f115fdc94b4ca1f25cdbcf18846d299a91bee673..b11f3a480e5c3d7c74b48aaabf1c40b822b79094 100755 (executable)
--- a/wikiq
+++ b/wikiq
@@ -104,6 +104,22 @@ class WikiqParser():
          self.collapse_user = collapse_user
          self.persist = persist
          self.printed_header = False
+        self.namespaces = []
+
+    def __get_namespace_from_title(self, title):
+        default_ns = None
+
+        for ns in self.namespaces:
+            # the unnamed (None) namespace is the default: remember its id and keep looking
+            if ns == None:
+                default_ns = self.namespaces[ns]
+                continue
+
+            if title.startswith(ns + ":"):
+                return self.namespaces[ns]
+
+        # if we've made it this far with no matches, we return the default namespace
+        return default_ns
  
      def process(self):
          print("Processing file: %s" % self.input_file.name, file=sys.stderr)
@@ -116,6 +132,9 @@ class WikiqParser():
          # Construct dump file iterator
          dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
  
+        # extract a mapping of namespace names to namespace ids
+        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.namespaces}
+
          page_count = 0
          rev_count = 0
          # Iterate through pages
@@ -134,7 +153,7 @@ class WikiqParser():
                              'articleid' : page.id,
                              'editor_id' : "" if rev.contributor.id == None else rev.contributor.id,
                              'title' : '"' + page.title + '"',
-                            'namespace' : page.namespace,
+                            'namespace' : page.namespace if page.namespace else self.__get_namespace_from_title(page.title),
                              'deleted' : "TRUE" if rev.text.deleted else "FALSE" } 
  
                  # if revisions are deleted, /many/ things will be missing
author	Benjamin Mako Hill <mako@atdot.cc>
	Thu, 23 Jul 2015 19:12:20 +0000 (12:12 -0700)
committer	Benjamin Mako Hill <mako@atdot.cc>
	Thu, 23 Jul 2015 19:12:20 +0000 (12:12 -0700)
Mediawiki-Utilities		patch \| blob \| history
wikiq		patch \| blob \| history