fix code to work with bzip files
diff --git a/wikiq b/wikiq
index f115fdc94b4ca1f25cdbcf18846d299a91bee673..c2ac412b1b776da978c8bb2ed7cc69c6868cbbac 100755 (executable)
--- a/wikiq
+++ b/wikiq
@@ -104,6 +104,22 @@ class WikiqParser():
         self.collapse_user = collapse_user
         self.persist = persist
         self.printed_header = False
+        self.namespaces = {}
+
+    def __get_namespace_from_title(self, title):
+        default_ns = None
+
+        for ns in self.namespaces:
+            # an unnamed (None) namespace marks the default; remember its id
+            if ns is None:
+                default_ns = self.namespaces[ns]
+                continue
+
+            if title.startswith(ns + ":"):
+                return self.namespaces[ns]
+
+        # if we've made it this far with no matches, we return the default namespace
+        return default_ns
 
     def process(self):
         print("Processing file: %s" % self.input_file.name, file=sys.stderr)
 
@@ -116,6 +132,9 @@ class WikiqParser():
         # Construct dump file iterator
         dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
 
+        # extract a mapping of namespace names to ids
+        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.namespaces}
+
         page_count = 0
         rev_count = 0
         # Iterate through pages
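The comprehension above builds the name-to-id mapping the title lookup iterates over. A self-contained sketch with stand-in namespace objects (the real ones are yielded by the dump iterator's namespaces attribute):

from collections import namedtuple

# stand-in for the namespace objects the XML dump iterator yields
Namespace = namedtuple("Namespace", ["name", "id"])
dump_namespaces = [Namespace(None, 0), Namespace("Talk", 1), Namespace("User", 2)]

namespaces = {ns.name: ns.id for ns in dump_namespaces}
# {None: 0, 'Talk': 1, 'User': 2} -- the None key is the unnamed main namespace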
@@ -134,7 +153,7 @@ class WikiqParser():
                             'articleid' : page.id,
                             'editor_id' : "" if rev.contributor.id == None else rev.contributor.id,
                             'title' : '"' + page.title + '"',
-                            'namespace' : page.namespace,
+                            'namespace' : page.namespace if page.namespace else self.__get_namespace_from_title(page.title),
                             'deleted' : "TRUE" if rev.text.deleted else "FALSE" } 
 
                 # if revisions are deleted, /many/ things will be missing
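The one-line change in this hunk prefers the namespace the parser supplied and derives it from the title prefix only as a fallback. Note that a namespace id of 0 (the main namespace) is falsy in Python, so it also takes the lookup path; the lookup then returns the default namespace id, so the result comes out the same. A sketch of the expression, reusing the toy mapping from the earlier example:

resolve = lambda page_ns, title: page_ns if page_ns else get_namespace_from_title(title, namespaces)
print(resolve(None, "Talk:Foo"))  # missing namespace: derived from the title -> 1
print(resolve(2, "User:Bar"))     # parser-supplied namespace wins -> 2
print(resolve(0, "Foo"))          # 0 is falsy, but the lookup returns the default -> 0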
@@ -250,7 +269,7 @@ def open_input_file(input_filename):
     elif re.match(r'.*\.gz', input_filename):
         cmd = ["zcat", input_filename] 
     elif re.match(r'.*\.bz2', input_filename):
-        cmd = ["zcat", input_filename] 
+        cmd = ["bzcat", input_filename] 
 
     try:
         input_file = Popen(cmd, stdout=PIPE).stdout
 
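This last hunk is the fix named in the commit message: the .bz2 branch previously reused zcat, which cannot read bzip2 streams, and now shells out to bzcat. In isolation, the streaming pattern wikiq relies on looks like this (the filename is illustrative):

from subprocess import Popen, PIPE

# bzcat writes the decompressed dump to its stdout; wikiq reads it as a pipe
input_file = Popen(["bzcat", "dump.xml.bz2"], stdout=PIPE).stdout
for line in input_file:
    pass  # feed lines to the XML dump iterator

An in-process alternative would be the standard library's bz2.open("dump.xml.bz2", "rb"); shelling out to bzcat keeps decompression in a separate process, which can overlap with the Python-side parsing.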
