self.collapse_user = collapse_user
self.persist = persist
self.printed_header = False
+ self.namespaces = []
+
def __get_namespace_from_title(self, title):
    """Return the namespace id for *title* based on its "Name:" prefix.

    ``self.namespaces`` maps namespace name -> namespace id (built in
    ``process()`` from the dump's siteinfo). The entry whose key is
    ``None`` is treated as the unnamed default (main) namespace —
    presumably namespace 0; its id is returned when no named prefix
    matches. Returns ``None`` if no match exists and no default entry
    was seen (e.g. before ``process()`` has populated the mapping).
    """
    default_ns = None

    for ns_name in self.namespaces:
        # The None key has no textual prefix: remember its id as the
        # fallback and keep scanning for an explicit prefix match.
        # (PEP 8: compare to None with `is`, not `==`.)
        if ns_name is None:
            default_ns = self.namespaces[ns_name]
            continue

        if title.startswith(ns_name + ":"):
            return self.namespaces[ns_name]

    # No named namespace prefix matched; fall back to the default.
    return default_ns
def process(self):
print("Processing file: %s" % self.input_file.name, file=sys.stderr)
# Construct dump file iterator
dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
+        # extract list of namespaces
+ self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.namespaces}
+
page_count = 0
rev_count = 0
# Iterate through pages
'articleid' : page.id,
'editor_id' : "" if rev.contributor.id == None else rev.contributor.id,
'title' : '"' + page.title + '"',
- 'namespace' : page.namespace,
+ 'namespace' : page.namespace if page.namespace else self.__get_namespace_from_title(page.title),
'deleted' : "TRUE" if rev.text.deleted else "FALSE" }
# if revisions are deleted, /many/ things will be missing
elif re.match(r'.*\.gz', input_filename):
cmd = ["zcat", input_filename]
elif re.match(r'.*\.bz2', input_filename):
- cmd = ["zcat", input_filename]
+ cmd = ["bzcat", input_filename]
try:
input_file = Popen(cmd, stdout=PIPE).stdout