- #!/usr/bin/env python3
+#!/usr/bin/env python3
# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
-import pdb
import argparse
import sys
import os, os.path
segment = 2
legacy = 3
-def calculate_persistence(tokens_added):
+def calculate_persistence(tokens_added, tokens_removed, exclude_ws = True, exclude_punct = False):
+    """Summarize how long a revision's added tokens persisted.
+
+    Tokens whose ``type`` is in an excluded lexicon are dropped from both
+    lists before anything is counted.
+
+    Args:
+        tokens_added: tokens introduced by the revision; each must expose
+            ``type`` and ``revisions``.
+        tokens_removed: tokens deleted by the revision; each must expose
+            ``type``.
+        exclude_ws: when true, ignore tokens typed 'break' or 'whitespace'.
+        exclude_punct: when true, ignore tokens whose type is listed in
+            ``punct_lex`` below.
+
+    Returns:
+        A 3-tuple ``(token_revs, num_added, num_removed)`` where
+        ``token_revs`` is the sum of ``len(t.revisions) - 1`` over the kept
+        added tokens, and the other two are the sizes of the filtered lists.
+    """
+    ws_lex = ['break','whitespace']
+    punct_lex = ['period','qmark','epoint','comma','colon','scolon','paren_open','paren_close','brack_open','brack_close','dbrack_close','dbrack_open','tab_close','tab_open','dcurly_close','dcurly_open','equals','bar','etc','bold','italic','tag','comment_end','comment_start']
+
+    # Collect every excluded type into one set instead of chaining lambdas:
+    # a lambda that refers to the name it is being assigned to sees itself
+    # (late binding), which silently dropped the whitespace filter when both
+    # flags were set, and a plain ``True`` is not callable when neither was.
+    excluded = set()
+    if exclude_ws:
+        excluded.update(ws_lex)
+    if exclude_punct:
+        excluded.update(punct_lex)
+
+    tokens_added = [t for t in tokens_added if t.type not in excluded]
+    tokens_removed = [t for t in tokens_removed if t.type not in excluded]
+
     return(sum([(len(x.revisions)-1) for x in tokens_added]),
-           len(tokens_added))
+           len(tokens_added),
+           len(tokens_removed)
+    )
class WikiqIterator(Dump):
if len(window) == PERSISTENCE_RADIUS:
old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
- num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
+ num_token_revs, num_tokens_added, num_tokens_removed = calculate_persistence(old_tokens_added, old_tokens_removed)
old_rev_data["token_revs"] = num_token_revs
- old_rev_data["tokens_added"] = num_tokens
- old_rev_data["tokens_removed"] = len(old_tokens_removed)
+ old_rev_data["tokens_added"] = num_tokens_added
+ old_rev_data["tokens_removed"] = num_tokens_removed
old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1
self.print_rev_data(old_rev_data)
continue
rev_id, rev_data, tokens_added, tokens_removed = item
- num_token_revs, num_tokens = calculate_persistence(tokens_added)
+
+ num_token_revs, num_tokens_added, num_tokens_removed = calculate_persistence(tokens_added, tokens_removed)
rev_data["token_revs"] = num_token_revs
- rev_data["tokens_added"] = num_tokens
- rev_data["tokens_removed"] = len(tokens_removed)
+ rev_data["tokens_added"] = num_tokens_added
+ rev_data["tokens_removed"] = num_tokens_removed
rev_data["tokens_window"] = len(window)-(i+1)
self.print_rev_data(rev_data)