summary |
shortlog |
log |
commit | commitdiff |
tree
raw |
patch |
inline | side by side (from parent 1:
108c844)
This is necessary for wikis (e.g., Wikia XML dumps) that do not include
namespace metadata as tags within each <page>.
-Subproject commit ddd3ea3442ca0450ab16f88c5fab674551d35ee7
+Subproject commit beba46e3eee8e0582cc3a5515dfa658ffbd18f9d
self.collapse_user = collapse_user
self.persist = persist
self.printed_header = False
self.collapse_user = collapse_user
self.persist = persist
self.printed_header = False
+ self.namespaces = []
+
+ def __get_namespace_from_title(self, title):
+ default_ns = None
+
+ for ns in self.namespaces:
+ # the unnamed (None) entry is the default namespace: remember its id and move on
+ if ns == None:
+ default_ns = self.namespaces[ns]
+ continue
+
+ if title.startswith(ns + ":"):
+ return self.namespaces[ns]
+
+ # if we've made it this far with no matches, we return the default namespace
+ return default_ns
def process(self):
print("Processing file: %s" % self.input_file.name, file=sys.stderr)
def process(self):
print("Processing file: %s" % self.input_file.name, file=sys.stderr)
# Construct dump file iterator
dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
# Construct dump file iterator
dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
+ # extract list of namespaces
+ self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.namespaces}
+
page_count = 0
rev_count = 0
# Iterate through pages
page_count = 0
rev_count = 0
# Iterate through pages
'articleid' : page.id,
'editor_id' : "" if rev.contributor.id == None else rev.contributor.id,
'title' : '"' + page.title + '"',
'articleid' : page.id,
'editor_id' : "" if rev.contributor.id == None else rev.contributor.id,
'title' : '"' + page.title + '"',
- 'namespace' : page.namespace,
+ 'namespace' : page.namespace if page.namespace else self.__get_namespace_from_title(page.title),
'deleted' : "TRUE" if rev.text.deleted else "FALSE" }
# if revisions are deleted, /many/ things will be missing
'deleted' : "TRUE" if rev.text.deleted else "FALSE" }
# if revisions are deleted, /many/ things will be missing