X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/0c2d72b881b174621ef811a7c8cef4e5c1103e97..4089ebae9272bb74c8d6f5f4404969e125dfbd70:/wikiq diff --git a/wikiq b/wikiq index f8e5fd6..731f59a 100755 --- a/wikiq +++ b/wikiq @@ -1,9 +1,8 @@ - #!/usr/bin/env python3 +#!/usr/bin/env python3 # original wikiq headers are: title articleid revid date_time anon # editor editor_id minor text_size text_entropy text_md5 reversion # additions_size deletions_size -import pdb import argparse import sys import os, os.path @@ -18,7 +17,7 @@ from mwxml import Dump, Page from deltas.tokenizers import wikitext_split from mwdiffs.utilities import dump2diffs import mwpersistence -from mwpersistence.state import Version, apply_opdocs, apply_operations, persist_revision_once +from mwpersistence.state import DiffState from mwpersistence import Token from mwpersistence.utilities import diffs2persistence @@ -30,46 +29,8 @@ from deltas import SegmentMatcher TO_ENCODE = ('title', 'editor') PERSISTENCE_RADIUS=7 -# this is a simple override of mwpersistence.DiffState that doesn't do anything special for reverts. -class WikiqDiffState(mwpersistence.DiffState): - def _update(self, text=None, checksum=None, opdocs=None, revision=None): - if checksum is None: - if text is None: - raise TypeError("Either 'text' or 'checksum' must be " + - "specified.") - else: - checksum = sha1(bytes(text, 'utf8')).hexdigest() - - current_version = Version() - - # the main difference we have is that we don't do anything special for reverts - if opdocs is not None: - transition = apply_opdocs(opdocs, self.last.tokens or []) - current_version.tokens, _, _ = transition - else: - # NOTICE: HEAVY COMPUTATION HERE!!! - # - # Diffs usually run in O(n^2) -- O(n^3) time and most - # tokenizers produce a lot of tokens. - if self.diff_processor is None: - raise RuntimeError("DiffState cannot process raw text " + - "without a diff_engine specified.") - operations, _, current_tokens = \ - self.diff_processor.process(text, token_class=Token) - - transition = apply_operations(operations, - self.last.tokens or [], - current_tokens) - current_version.tokens, _, _ = transition - - # Record persistence - persist_revision_once(current_version.tokens, revision) - - # Update last version - self.last = current_version - - # Return the tranisitoned state - return transition +ws_lex = ['break','whitespace'] +punct_lex = ['period','qmark','epoint','comma','colon','scolon','paren_open','paren_close','brack_open','brack_close','dbrack_close','dbrack_open','tab_close','tab_open','dcurly_close','dcurly_open','equals','bar','etc','bold','italic','tag','comment_end','comment_start'] class PersistMethod: none = 0 @@ -77,25 +38,36 @@ class PersistMethod: segment = 2 legacy = 3 -def calculate_persistence(tokens_added): +def calculate_persistence(tokens_added, tokens_removed, exclude_ws = False, exclude_punct = False, legacy = False): + + if not legacy: + cond = lambda t: not (exclude_punct and (t.type in punct_lex)) \ + and not(exclude_ws and (t.type in ws_lex)) + + tokens_added = [t for t in tokens_added if cond(t)] + tokens_removed = [t for t in tokens_removed if cond(t)] + return(sum([(len(x.revisions)-1) for x in tokens_added]), - len(tokens_added)) + len(tokens_added), + len(tokens_removed) + ) class WikiqIterator(Dump): @classmethod def from_file(cls, fh, collapse_user = False): - cls = super(WikiqIterator, cls).from_file(fh) cls.fh = fh cls.collapse_user = collapse_user - cls.namespace_map = { ns.id : ns.name for ns in - cls.site_info.namespaces } + cls = super(WikiqIterator, cls).from_file(fh) return cls @classmethod - def process_item(cls, item_element, namespace_map, collapse_user = False): + def process_item(cls, item_element, namespace_map): + if not hasattr(cls,'inv_namespace_map'): + cls.inv_namespace_map = {ns.id:name for name, ns in namespace_map.items()} + if item_element.tag == "page": - return WikiqPage.from_element(item_element, namespace_map, collapse_user) + return WikiqPage.from_element(item_element, namespace_map, cls.inv_namespace_map, cls.collapse_user) elif item_element.tag == "logitem": return LogItem.from_element(item_element, namespace_map) else: @@ -107,11 +79,9 @@ class WikiqPage(Page): 'restrictions','collapse_user') @classmethod - def from_element(cls, item_element, namespace_map, collapse_user = False): + def from_element(cls, item_element, namespace_map, inv_namespace_map, collapse_user = False): cls.prev_rev = None - inv_namespace_map = {ns.id:name for name,ns in namespace_map.items()} - cls = super(WikiqPage, cls).from_element(item_element, namespace_map) # following mwxml, we assume namespace 0 in cases where @@ -171,7 +141,8 @@ class WikiqPage(Page): if self.collapse_user: collapsed_revs = 1 - rev.collapsed_revs = collapsed_revs + self.prev_rev.collapsed_revs = collapsed_revs + prev_rev = self.prev_rev for rev in self.revisions: rev = WikiqPage._correct_sha(rev) @@ -220,12 +191,6 @@ class WikiqPage(Page): revision.page = self yield revision - # def __iter__(self): - # return self.__revisions - - # def __next__(self): - # return next(self.__revisions) - class WikiqParser(): def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False, namespaces = None): @@ -300,11 +265,11 @@ class WikiqParser(): window = deque(maxlen=PERSISTENCE_RADIUS) if self.persist == PersistMethod.sequence: - state = WikiqDiffState(SequenceMatcher(tokenizer = wikitext_split), + state = DiffState(SequenceMatcher(tokenizer = wikitext_split), revert_radius=PERSISTENCE_RADIUS) elif self.persist == PersistMethod.segment: - state = WikiqDiffState(SegmentMatcher(tokenizer = wikitext_split), + state = DiffState(SegmentMatcher(tokenizer = wikitext_split), revert_radius=PERSISTENCE_RADIUS) else: @@ -395,11 +360,11 @@ class WikiqParser(): if len(window) == PERSISTENCE_RADIUS: old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0] - num_token_revs, num_tokens = calculate_persistence(old_tokens_added) + num_token_revs, num_tokens_added, num_tokens_removed = calculate_persistence(old_tokens_added, old_tokens_removed, legacy = self.persist == PersistMethod.legacy) old_rev_data["token_revs"] = num_token_revs - old_rev_data["tokens_added"] = num_tokens - old_rev_data["tokens_removed"] = len(old_tokens_removed) + old_rev_data["tokens_added"] = num_tokens_added + old_rev_data["tokens_removed"] = num_tokens_removed old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1 self.print_rev_data(old_rev_data) @@ -417,11 +382,12 @@ class WikiqParser(): continue rev_id, rev_data, tokens_added, tokens_removed = item - num_token_revs, num_tokens = calculate_persistence(tokens_added) + + num_token_revs, num_tokens_added, num_tokens_removed = calculate_persistence(tokens_added, tokens_removed, legacy = self.persist == PersistMethod.legacy) rev_data["token_revs"] = num_token_revs - rev_data["tokens_added"] = num_tokens - rev_data["tokens_removed"] = len(tokens_removed) + rev_data["tokens_added"] = num_tokens_added + rev_data["tokens_removed"] = num_tokens_removed rev_data["tokens_window"] = len(window)-(i+1) self.print_rev_data(rev_data)