X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/118b8b17225a0653b95cca816851d5ba744e62c3..311810a36cacb4199e2ee88e55c51b582f9332a3:/bin/wikiq

diff --git a/bin/wikiq b/bin/wikiq
index bc6b06d..e324d85 100755
--- a/bin/wikiq
+++ b/bin/wikiq
@@ -3,336 +3,17 @@
 # original wikiq headers are: title articleid revid date_time anon
 # editor editor_id minor text_size text_entropy text_md5 reversion
 # additions_size deletions_size
-import pdb
 import argparse
 import sys
-import os, os.path
-import re
+import os
+sys.path.append("..")
+from wikiq_util import calculate_persistence
+from wikiq_util import WikiqIterator
+from wikiq_util import WikiqPage
+from wikiq_util import WikiqParser
+from wikiq_util import open_input_file
+from wikiq_util import open_output_file
 
-from subprocess import Popen, PIPE
-from collections import deque
-from hashlib import sha1
-
-from mwxml import Dump
-
-from deltas.tokenizers import wikitext_split
-import mwpersistence
-import mwreverts
-from urllib.parse import quote
-TO_ENCODE = ('title', 'editor')
-PERSISTENCE_RADIUS=7
-from deltas import SequenceMatcher
-
-def calculate_persistence(tokens_added):
-    return(sum([(len(x.revisions)-1) for x in tokens_added]),
-           len(tokens_added))
-
-
-class WikiqIterator():
-    def __init__(self, fh, collapse_user=False):
-        self.fh = fh
-        self.collapse_user = collapse_user
-        self.mwiterator = Dump.from_file(self.fh)
-        self.namespace_map = { ns.id : ns.name for ns in
-                               self.mwiterator.site_info.namespaces }
-        self.__pages = self.load_pages()
-
-    def load_pages(self):
-        for page in self.mwiterator:
-            yield WikiqPage(page,
-                            namespace_map = self.namespace_map,
-                            collapse_user=self.collapse_user)
-
-    def __iter__(self):
-        return self.__pages
-
-    def __next__(self):
-        return next(self._pages)
-
-class WikiqPage():
-    __slots__ = ('id', 'title', 'namespace', 'redirect',
-                 'restrictions', 'mwpage', '__revisions',
-                 'collapse_user')
-
-    def __init__(self, page, namespace_map, collapse_user=False):
-        self.id = page.id
-        self.namespace = page.namespace
-        if page.namespace != 0:
-            self.title = ':'.join([namespace_map[page.namespace], page.title])
-        else:
-            self.title = page.title
-        self.restrictions = page.restrictions
-        self.collapse_user = collapse_user
-        self.mwpage = page
-        self.__revisions = self.rev_list()
-
-    def rev_list(self):
-        # Outline for how we want to handle collapse_user=True
-        # iteration   rev.user   prev_rev.user   add prev_rev?
-        #         0          A            None           Never
-        #         1          A               A           False
-        #         2          B               A            True
-        #         3          A               B            True
-        #         4          A               A           False
-        # Post-loop                          A          Always
-        for i, rev in enumerate(self.mwpage):
-            # never yield the first time
-            if i == 0:
-                if self.collapse_user:
-                    collapsed_revs = 1
-                    rev.collapsed_revs = collapsed_revs
-
-            else:
-                if self.collapse_user:
-                    # yield if this is the last edit in a seq by a user and reset
-                    # also yield if we do know who the user is
-
-                    if rev.deleted.user or prev_rev.deleted.user:
-                        yield prev_rev
-                        collapsed_revs = 1
-                        rev.collapsed_revs = collapsed_revs
-
-                    elif not rev.user.text == prev_rev.user.text:
-                        yield prev_rev
-                        collapsed_revs = 1
-                        rev.collapsed_revs = collapsed_revs
-                    # otherwise, add one to the counter
-                    else:
-                        collapsed_revs += 1
-                        rev.collapsed_revs = collapsed_revs
-                # if collapse_user is false, we always yield
-                else:
-                    yield prev_rev
-
-            prev_rev = rev
-
-        # also yield the final time
-        yield prev_rev
-
-    def __iter__(self):
-        return self.__revisions
-
-    def __next__(self):
-        return next(self.__revisions)
-
-class WikiqParser():
-
-    def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):
-
-        self.input_file = input_file
-        self.output_file = output_file
-        self.collapse_user = collapse_user
-        self.persist = persist
-        self.persist_legacy = persist_legacy
-        self.printed_header = False
-        self.namespaces = []
-        self.urlencode = urlencode
-
-    def __get_namespace_from_title(self, title):
-        default_ns = None
-
-        for ns in self.namespaces:
-            # skip if the namespace is not defined
-            if ns == None:
-                default_ns = self.namespaces[ns]
-                continue
-
-            if title.startswith(ns + ":"):
-                return self.namespaces[ns]
-
-        # if we've made it this far with no matches, we return the default namespace
-        return default_ns
-
-    def process(self):
-
-        # create a regex that creates the output filename
-        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
-        #                         r'output/wikiq-\1-\2.tsv',
-        #                         input_filename)
-
-        # Construct dump file iterator
-        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
-
-        # extract list of namspaces
-        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}
-
-        page_count = 0
-        rev_count = 0
-
-
-        # Iterate through pages
-        for page in dump:
-            rev_detector = mwreverts.Detector()
-
-            if self.persist or self.persist_legacy:
-                window = deque(maxlen=PERSISTENCE_RADIUS)
-
-                if not self.persist_legacy:
-                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
-                                                    revert_radius=PERSISTENCE_RADIUS)
-
-                else:
-                    from mw.lib import persistence
-                    state = persistence.State()
-
-            # Iterate through a page's revisions
-            for rev in page:
-
-                rev_data = {'revid' : rev.id,
-                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
-                            'articleid' : page.id,
-                            'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
-                            'title' : '"' + page.title + '"',
-                            'namespace' : page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title),
-                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" }
-
-                # if revisions are deleted, /many/ things will be missing
-                if rev.deleted.text:
-                    rev_data['text_chars'] = ""
-                    rev_data['sha1'] = ""
-                    rev_data['revert'] = ""
-                    rev_data['reverteds'] = ""
-
-                else:
-                    # rev.text can be None if the page has no text
-                    if not rev.text:
-                        rev.text = ""
-                    # if text exists, we'll check for a sha1 and generate one otherwise
-
-                    if rev.sha1:
-                        text_sha1 = rev.sha1
-                    else:
-
-                        text_sha1 = sha1(bytes(rev.text,
-                                               "utf8")).hexdigest()
-
-                    rev_data['sha1'] = text_sha1
-
-                    # TODO rev.bytes doesn't work.. looks like a bug
-                    rev_data['text_chars'] = len(rev.text)
-
-                    # generate revert data
-                    revert = rev_detector.process(text_sha1, rev.id)
-
-                    if revert:
-                        rev_data['revert'] = "TRUE"
-                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
-                    else:
-                        rev_data['revert'] = "FALSE"
-                        rev_data['reverteds'] = ""
-
-                # if the fact that the edit was minor can be hidden, this might be an issue
-                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"
-
-                if not rev.deleted.user:
-                    # wrap user-defined editors in quotes for fread
-                    rev_data['editor'] = '"' + rev.user.text + '"'
-                    rev_data['anon'] = "TRUE" if rev.user.id == None else "FALSE"
-
-                else:
-                    rev_data['anon'] = ""
-                    rev_data['editor'] = ""
-
-                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
-                #    redirect = True
-                #else:
-                #    redirect = False
-
-                #TODO missing: additions_size deletions_size
-
-                # if collapse user was on, lets run that
-                if self.collapse_user:
-                    rev_data['collapsed_revs'] = rev.collapsed_revs
-
-                if self.persist or self.persist_legacy:
-                    if rev.deleted.text:
-
-                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
-                            old_rev_data[k] = None
-                    else:
-
-                        if not self.persist_legacy:
-                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
-
-                        else:
-                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
-
-                        window.append((rev.id, rev_data, tokens_added, tokens_removed))
-
-                        if len(window) == PERSISTENCE_RADIUS:
-                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
-
-                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
-
-                            old_rev_data["token_revs"] = num_token_revs
-                            old_rev_data["tokens_added"] = num_tokens
-                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
-                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1
-
-                            self.print_rev_data(old_rev_data)
-
-                else:
-                    self.print_rev_data(rev_data)
-
-                rev_count += 1
-
-            if self.persist or self.persist_legacy:
-                # print out metadata for the last RADIUS revisions
-                for i, item in enumerate(window):
-                    # if the window was full, we've already printed item 0
-                    if len(window) == PERSISTENCE_RADIUS and i == 0:
-                        continue
-
-                    rev_id, rev_data, tokens_added, tokens_removed = item
-                    num_token_revs, num_tokens = calculate_persistence(tokens_added)
-
-                    rev_data["token_revs"] = num_token_revs
-                    rev_data["tokens_added"] = num_tokens
-                    rev_data["tokens_removed"] = len(tokens_removed)
-                    rev_data["tokens_window"] = len(window)-(i+1)
-
-                    self.print_rev_data(rev_data)
-
-            page_count += 1
-
-        print("Done: %s revisions and %s pages."
-              % (rev_count, page_count),
-              file=sys.stderr)
-
-    def print_rev_data(self, rev_data):
-        # if it's the first time through, print the header
-        if self.urlencode:
-            for field in TO_ENCODE:
-                rev_data[field] = quote(str(rev_data[field]))
-
-        if not self.printed_header:
-            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
-            self.printed_header = True
-
-        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
-
-
-def open_input_file(input_filename):
-    if re.match(r'.*\.7z$', input_filename):
-        cmd = ["7za", "x", "-so", input_filename, '*']
-    elif re.match(r'.*\.gz$', input_filename):
-        cmd = ["zcat", input_filename]
-    elif re.match(r'.*\.bz2$', input_filename):
-        cmd = ["bzcat", "-dk", input_filename]
-
-    try:
-        input_file = Popen(cmd, stdout=PIPE).stdout
-    except NameError:
-        input_file = open(input_filename, 'r')
-
-    return input_file
-
-def open_output_file(input_filename):
-    # create a regex that creates the output filename
-    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
-    output_filename = re.sub(r'\.xml', '', output_filename)
-    output_filename = output_filename + ".tsv"
-    output_file = open(output_filename, "w")
-
-    return output_file
 
 parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.')
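
Note on the refactor: the hunk above removes the helper classes and functions from bin/wikiq and replaces them with imports from the new wikiq_util module, while the argparse setup that begins at the context line above continues unchanged. The sketch below shows one plausible way the slimmed-down script ties those imports to the parser, assuming it keeps the behavior of the removed code; the flag names and the main() wrapper are illustrative assumptions and are not part of this diff.

# Hypothetical sketch of the refactored bin/wikiq main flow (not from this diff).
# WikiqParser, open_input_file, open_output_file and their signatures come from
# the removed code above; the argparse flag names below are assumptions.
import argparse
import sys
sys.path.append("..")
from wikiq_util import WikiqParser, open_input_file, open_output_file

def main():
    parser = argparse.ArgumentParser(
        description='Parse MediaWiki XML database dumps into tab delimitted data.')
    parser.add_argument('dumpfiles', nargs='*', help='XML dump file(s) to parse')      # assumed
    parser.add_argument('--collapse-user', action='store_true', dest='collapse_user')  # assumed
    parser.add_argument('--persistence', action='store_true', dest='persist')          # assumed
    parser.add_argument('--url-encode', action='store_true', dest='urlencode')         # assumed
    args = parser.parse_args()

    for filename in args.dumpfiles:
        input_file = open_input_file(filename)    # picks 7za/zcat/bzcat based on the extension
        output_file = open_output_file(filename)  # writes a .tsv named after the dump
        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=args.persist,
                            urlencode=args.urlencode)
        wikiq.process()
        output_file.close()

if __name__ == "__main__":
    main()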
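
Of the helpers that move to wikiq_util, calculate_persistence is the least self-describing. Per the removed definition above, it returns the total number of later revisions the added tokens survived (the sum of len(x.revisions) - 1 over the tokens) together with the number of tokens added. A small illustration, using a stand-in namedtuple where mwpersistence would normally supply the token objects:

# Illustration only: Token is a stand-in for the token objects produced by
# mwpersistence; calculate_persistence only looks at the .revisions attribute.
from collections import namedtuple
from wikiq_util import calculate_persistence

Token = namedtuple('Token', ['revisions'])

tokens_added = [
    Token(revisions=[101, 102, 103, 104]),  # persisted through 3 later revisions
    Token(revisions=[101]),                 # removed right away: persists through 0
]

token_revs, num_tokens = calculate_persistence(tokens_added)
print(token_revs, num_tokens)  # -> 3 2  (3 persisting token-revisions, 2 tokens added)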