def calculate_persistence(tokens_added, tokens_removed, exclude_ws = True, exclude_punct = False):
    """Summarise persistence counts for one revision's added/removed tokens.

    Tokens whose ``type`` attribute names an excluded lexical class are
    dropped from both lists before anything is counted.

    Args:
        tokens_added: iterable of token objects exposing ``.type`` and
            ``.revisions`` (a sized collection).
        tokens_removed: iterable of token objects exposing ``.type``.
        exclude_ws: when true, ignore tokens typed 'break' or 'whitespace'.
        exclude_punct: when true, ignore tokens whose type is one of the
            punctuation/markup classes listed in ``punct_lex``.

    Returns:
        A 3-tuple ``(token_revs, num_tokens_added, num_tokens_removed)``:
        the sum of ``len(t.revisions) - 1`` over the kept added tokens, the
        number of kept added tokens, and the number of kept removed tokens.
    """
    ws_lex = ('break', 'whitespace')
    punct_lex = ('period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
                 'paren_open', 'paren_close', 'brack_open', 'brack_close',
                 'dbrack_close', 'dbrack_open', 'tab_close', 'tab_open',
                 'dcurly_close', 'dcurly_open', 'equals', 'bar', 'etc',
                 'bold', 'italic', 'tag', 'comment_end', 'comment_start')

    # Collect the excluded types in one set instead of chaining lambdas that
    # close over their own name: that approach crashed when both flags were
    # off (a bool was called) and, with both flags on, silently applied only
    # the last filter because the name is resolved at call time.
    excluded = set()
    if exclude_ws:
        excluded.update(ws_lex)
    if exclude_punct:
        excluded.update(punct_lex)

    tokens_added = [t for t in tokens_added if t.type not in excluded]
    tokens_removed = [t for t in tokens_removed if t.type not in excluded]

    token_revs = sum(len(t.revisions) - 1 for t in tokens_added)
    return (token_revs, len(tokens_added), len(tokens_removed))
old_rev_data["tokens_added"] = num_tokens_added + old_rev_data["tokens_removed"] = num_tokens_removed old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1 self.print_rev_data(old_rev_data) @@ -417,11 +432,12 @@ class WikiqParser(): continue rev_id, rev_data, tokens_added, tokens_removed = item - num_token_revs, num_tokens = calculate_persistence(tokens_added) + + num_token_revs, num_tokens_added, num_tokens_removed = calculate_persistence(tokens_added, tokens_removed) rev_data["token_revs"] = num_token_revs - rev_data["tokens_added"] = num_tokens - rev_data["tokens_removed"] = len(tokens_removed) + rev_data["tokens_added"] = num_tokens_added + rev_data["tokens_removed"] = num_tokens_removed rev_data["tokens_window"] = len(window)-(i+1) self.print_rev_data(rev_data)