X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/0c2d72b881b174621ef811a7c8cef4e5c1103e97..refs/heads/tests:/wikiq?ds=sidebyside diff --git a/wikiq b/wikiq index f8e5fd6..334f195 100755 --- a/wikiq +++ b/wikiq @@ -1,9 +1,8 @@ - #!/usr/bin/env python3 +#!/usr/bin/env python3 # original wikiq headers are: title articleid revid date_time anon # editor editor_id minor text_size text_entropy text_md5 reversion # additions_size deletions_size -import pdb import argparse import sys import os, os.path @@ -18,7 +17,7 @@ from mwxml import Dump, Page from deltas.tokenizers import wikitext_split from mwdiffs.utilities import dump2diffs import mwpersistence -from mwpersistence.state import Version, apply_opdocs, apply_operations, persist_revision_once +from mwpersistence.state import DiffState from mwpersistence import Token from mwpersistence.utilities import diffs2persistence @@ -30,46 +29,8 @@ from deltas import SegmentMatcher TO_ENCODE = ('title', 'editor') PERSISTENCE_RADIUS=7 -# this is a simple override of mwpersistence.DiffState that doesn't do anything special for reverts. -class WikiqDiffState(mwpersistence.DiffState): - def _update(self, text=None, checksum=None, opdocs=None, revision=None): - if checksum is None: - if text is None: - raise TypeError("Either 'text' or 'checksum' must be " + - "specified.") - else: - checksum = sha1(bytes(text, 'utf8')).hexdigest() - - current_version = Version() - - # the main difference we have is that we don't do anything special for reverts - if opdocs is not None: - transition = apply_opdocs(opdocs, self.last.tokens or []) - current_version.tokens, _, _ = transition - else: - # NOTICE: HEAVY COMPUTATION HERE!!! - # - # Diffs usually run in O(n^2) -- O(n^3) time and most - # tokenizers produce a lot of tokens. - if self.diff_processor is None: - raise RuntimeError("DiffState cannot process raw text " + - "without a diff_engine specified.") - operations, _, current_tokens = \ - self.diff_processor.process(text, token_class=Token) - - transition = apply_operations(operations, - self.last.tokens or [], - current_tokens) - current_version.tokens, _, _ = transition - - # Record persistence - persist_revision_once(current_version.tokens, revision) - - # Update last version - self.last = current_version - - # Return the tranisitoned state - return transition +ws_lex = ['break','whitespace'] +punct_lex = ['period','qmark','epoint','comma','colon','scolon','paren_open','paren_close','brack_open','brack_close','dbrack_close','dbrack_open','tab_close','tab_open','dcurly_close','dcurly_open','equals','bar','etc','bold','italic','tag','comment_end','comment_start'] class PersistMethod: none = 0 @@ -77,25 +38,36 @@ class PersistMethod: segment = 2 legacy = 3 -def calculate_persistence(tokens_added): +def calculate_persistence(tokens_added, tokens_removed, exclude_ws = False, exclude_punct = False, legacy = False): + + if not legacy: + cond = lambda t: not (exclude_punct and (t.type in punct_lex)) \ + and not(exclude_ws and (t.type in ws_lex)) + + tokens_added = [t for t in tokens_added if cond(t)] + tokens_removed = [t for t in tokens_removed if cond(t)] + return(sum([(len(x.revisions)-1) for x in tokens_added]), - len(tokens_added)) + len(tokens_added), + len(tokens_removed) + ) class WikiqIterator(Dump): @classmethod def from_file(cls, fh, collapse_user = False): - cls = super(WikiqIterator, cls).from_file(fh) cls.fh = fh cls.collapse_user = collapse_user - cls.namespace_map = { ns.id : ns.name for ns in - cls.site_info.namespaces } + cls = super(WikiqIterator, cls).from_file(fh) return cls @classmethod - def process_item(cls, item_element, namespace_map, collapse_user = False): + def process_item(cls, item_element, namespace_map): + if not hasattr(cls,'inv_namespace_map'): + cls.inv_namespace_map = {ns.id:name for name, ns in namespace_map.items()} + if item_element.tag == "page": - return WikiqPage.from_element(item_element, namespace_map, collapse_user) + return WikiqPage.from_element(item_element, namespace_map, cls.inv_namespace_map, cls.collapse_user) elif item_element.tag == "logitem": return LogItem.from_element(item_element, namespace_map) else: @@ -107,11 +79,9 @@ class WikiqPage(Page): 'restrictions','collapse_user') @classmethod - def from_element(cls, item_element, namespace_map, collapse_user = False): + def from_element(cls, item_element, namespace_map, inv_namespace_map, collapse_user = False): cls.prev_rev = None - inv_namespace_map = {ns.id:name for name,ns in namespace_map.items()} - cls = super(WikiqPage, cls).from_element(item_element, namespace_map) # following mwxml, we assume namespace 0 in cases where @@ -160,9 +130,8 @@ class WikiqPage(Page): # 2 B A True # 3 A B True # 4 A A False - # Post-loop A Always + # Post-loop A Always def __find_next_revision(self): - if self.prev_rev is None: prev_rev = WikiqPage._correct_sha(next(self.revisions)) self.prev_rev = prev_rev @@ -171,7 +140,8 @@ class WikiqPage(Page): if self.collapse_user: collapsed_revs = 1 - rev.collapsed_revs = collapsed_revs + self.prev_rev.collapsed_revs = collapsed_revs + prev_rev = self.prev_rev for rev in self.revisions: rev = WikiqPage._correct_sha(rev) @@ -220,15 +190,9 @@ class WikiqPage(Page): revision.page = self yield revision - # def __iter__(self): - # return self.__revisions - - # def __next__(self): - # return next(self.__revisions) - class WikiqParser(): - def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False, namespaces = None): + def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False, namespaces = None, exclude_punct = False, exclude_ws = False): """ Parameters: persist : what persistence method to use. Takes a PersistMethod value @@ -245,11 +209,9 @@ class WikiqParser(): else: self.namespace_filter = None - # create a regex that creates the output filename - # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$', - # r'output/wikiq-\1-\2.tsv', - # input_filename) - + self.exclude_punct = exclude_punct + self.exclude_ws = exclude_ws + # Construct dump file iterator self.dump = WikiqIterator.from_file(self.input_file, self.collapse_user) @@ -261,29 +223,6 @@ class WikiqParser(): if self.persist == PersistMethod.segment: self.diff_engine = SegmentMatcher(tokenizer = wikitext_split) - # def __get_namespace_from_title(self, title): - # default_ns = None - - # for ns in self.namespaces: - # # skip if the namespace is not defined - # if ns == None: - # default_ns = self.namespaces[ns] - # continue - - # if title.startswith(ns + ":"): - # return self.namespaces[ns] - - # # if we've made it this far with no matches, we return the default namespace - # return default_ns - - # def _set_namespace(self, rev_docs): - - # for rev_data in rev_docs: - # if 'namespace' not in rev_data['page']: - # namespace = self.__get_namespace_from_title(page['title']) - # rev_data['page']['namespace'] = namespace - # yield rev_data - def process(self): page_count = 0 rev_count = 0 @@ -300,11 +239,11 @@ class WikiqParser(): window = deque(maxlen=PERSISTENCE_RADIUS) if self.persist == PersistMethod.sequence: - state = WikiqDiffState(SequenceMatcher(tokenizer = wikitext_split), + state = DiffState(SequenceMatcher(tokenizer = wikitext_split), revert_radius=PERSISTENCE_RADIUS) elif self.persist == PersistMethod.segment: - state = WikiqDiffState(SegmentMatcher(tokenizer = wikitext_split), + state = DiffState(SegmentMatcher(tokenizer = wikitext_split), revert_radius=PERSISTENCE_RADIUS) else: @@ -367,16 +306,11 @@ class WikiqParser(): rev_data['anon'] = "" rev_data['editor'] = "" - #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I): - # redirect = True - #else: - # redirect = False + # we can easily add redirect info + # rev_data['redirect'] = rev.page.redirect - #TODO missing: additions_size deletions_size - - # if collapse user was on, lets run that - # if self.collapse_user: - # rev_data.collapsed_revs = rev.collapsed_revs + if self.collapse_user: + rev_data['collapsed_revs'] = rev.collapsed_revs if self.persist != PersistMethod.none: if rev.deleted.text: @@ -395,11 +329,19 @@ class WikiqParser(): if len(window) == PERSISTENCE_RADIUS: old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0] - num_token_revs, num_tokens = calculate_persistence(old_tokens_added) - + num_token_revs, \ + num_tokens_added, \ + num_tokens_removed = \ + calculate_persistence( + old_tokens_added, + old_tokens_removed, + exclude_ws = self.exclude_ws, + exclude_punct = self.exclude_punct, + legacy = self.persist == PersistMethod.legacy) + old_rev_data["token_revs"] = num_token_revs - old_rev_data["tokens_added"] = num_tokens - old_rev_data["tokens_removed"] = len(old_tokens_removed) + old_rev_data["tokens_added"] = num_tokens_added + old_rev_data["tokens_removed"] = num_tokens_removed old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1 self.print_rev_data(old_rev_data) @@ -417,11 +359,20 @@ class WikiqParser(): continue rev_id, rev_data, tokens_added, tokens_removed = item - num_token_revs, num_tokens = calculate_persistence(tokens_added) + + num_token_revs, \ + num_tokens_added, \ + num_tokens_removed = calculate_persistence( + tokens_added, + tokens_removed, + exclude_ws = self.exclude_ws, + exclude_punct = self.exclude_punct, + legacy = self.persist == PersistMethod.legacy) + rev_data["token_revs"] = num_token_revs - rev_data["tokens_added"] = num_tokens - rev_data["tokens_removed"] = len(tokens_removed) + rev_data["tokens_added"] = num_tokens_added + rev_data["tokens_removed"] = num_tokens_removed rev_data["tokens_window"] = len(window)-(i+1) self.print_rev_data(rev_data) @@ -492,7 +443,11 @@ parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true", parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append', help="Id number of namspace to include. Can be specified more than once.") +parser.add_argument('--exclude-whitespace', dest="exclude_ws", action="store_true", + help="Flag to remove whitespace from persistence measures.") +parser.add_argument('--exclude-punctuation', dest="exclude_punct", action="store_true", + help="Flag to remove punctuation from persistence measures.") args = parser.parse_args() @@ -534,7 +489,9 @@ if len(args.dumpfiles) > 0: collapse_user=args.collapse_user, persist=persist, urlencode=args.urlencode, - namespaces = namespaces) + namespaces = namespaces, + exclude_punct = args.exclude_punct, + exclude_ws = args.exclude_ws) wikiq.process()