code.communitydata.science - mediawiki_dump_tools.git/blobdiff - wikiq
Prefix page titles with namespace names.
[mediawiki_dump_tools.git] / wikiq
diff --git a/wikiq b/wikiq
index 8a12d90980f6379d096b6fac0fb70d8f274bdb3f..bc6b06ded1f7f0ca1e8c4e81c8337e3a02930118 100755 (executable)
--- a/wikiq
+++ b/wikiq
@@ -3,6 +3,7 @@
 # original wikiq headers are: title articleid revid date_time anon
 # editor editor_id minor text_size text_entropy text_md5 reversion
 # additions_size deletions_size
+import pdb
 import argparse
 import sys
 import os, os.path
@@ -32,11 +33,15 @@ class WikiqIterator():
         self.fh = fh
         self.collapse_user = collapse_user
         self.mwiterator = Dump.from_file(self.fh)
+        self.namespace_map = { ns.id : ns.name for ns in
+                               self.mwiterator.site_info.namespaces }
         self.__pages = self.load_pages()
 
     def load_pages(self):
         for page in self.mwiterator:
-            yield WikiqPage(page, collapse_user=self.collapse_user)
+            yield WikiqPage(page,
+                            namespace_map = self.namespace_map,
+                            collapse_user=self.collapse_user)
 
     def __iter__(self):
         return self.__pages
@@ -49,13 +54,14 @@ class WikiqPage():
                  'restrictions', 'mwpage', '__revisions',
                  'collapse_user')
     
-    def __init__(self, page, collapse_user=False):
+    def __init__(self, page, namespace_map, collapse_user=False):
         self.id = page.id
-        self.title = page.title
         self.namespace = page.namespace
-        self.redirect = page.redirect
+        if page.namespace != 0:
+            self.title = ':'.join([namespace_map[page.namespace], page.title])
+        else:
+            self.title = page.title
         self.restrictions = page.restrictions
-        
         self.collapse_user = collapse_user
         self.mwpage = page
         self.__revisions = self.rev_list()
@@ -111,7 +117,6 @@ class WikiqPage():
 
 class WikiqParser():
 
-
     def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):
         
         self.input_file = input_file

Community Data Science Collective || Want to submit a patch?