X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/dba793c6ac595e7a5c0ac795575c28231f06f8cb..bf396ad366988d8dfa8afd00cbc49df8454cc611:/wikiq diff --git a/wikiq b/wikiq index 8a12d90..bc6b06d 100755 --- a/wikiq +++ b/wikiq @@ -3,6 +3,7 @@ # original wikiq headers are: title articleid revid date_time anon # editor editor_id minor text_size text_entropy text_md5 reversion # additions_size deletions_size +import pdb import argparse import sys import os, os.path @@ -32,11 +33,15 @@ class WikiqIterator(): self.fh = fh self.collapse_user = collapse_user self.mwiterator = Dump.from_file(self.fh) + self.namespace_map = { ns.id : ns.name for ns in + self.mwiterator.site_info.namespaces } self.__pages = self.load_pages() def load_pages(self): for page in self.mwiterator: - yield WikiqPage(page, collapse_user=self.collapse_user) + yield WikiqPage(page, + namespace_map = self.namespace_map, + collapse_user=self.collapse_user) def __iter__(self): return self.__pages @@ -49,13 +54,14 @@ class WikiqPage(): 'restrictions', 'mwpage', '__revisions', 'collapse_user') - def __init__(self, page, collapse_user=False): + def __init__(self, page, namespace_map, collapse_user=False): self.id = page.id - self.title = page.title self.namespace = page.namespace - self.redirect = page.redirect + if page.namespace != 0: + self.title = ':'.join([namespace_map[page.namespace], page.title]) + else: + self.title = page.title self.restrictions = page.restrictions - self.collapse_user = collapse_user self.mwpage = page self.__revisions = self.rev_list() @@ -111,7 +117,6 @@ class WikiqPage(): class WikiqParser(): - def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False): self.input_file = input_file