code.communitydata.science - mediawiki_dump_tools.git/blobdiff - wikiq
wikiq mostly functional, but reverters take all the credit for the content they restore.
[mediawiki_dump_tools.git] / wikiq
diff --git a/wikiq b/wikiq
index f8e5fd616aeeb094258c2523a6daa39b8064c437..35cdbc0e43cb8965fa452f850393eec3eada4dd7 100755 (executable)
--- a/wikiq
+++ b/wikiq
@@ -1,9 +1,8 @@
- #!/usr/bin/env python3
+#!/usr/bin/env python3
 
 # original wikiq headers are: title articleid revid date_time anon
 # editor editor_id minor text_size text_entropy text_md5 reversion
 # additions_size deletions_size
-import pdb 
 import argparse
 import sys
 import os, os.path
@@ -77,9 +76,25 @@ class PersistMethod:
     segment = 2
     legacy = 3
 
-def calculate_persistence(tokens_added):
+def calculate_persistence(tokens_added, tokens_removed, exclude_ws = True, exclude_punct = False):
+    """Return (token_revs, tokens_added, tokens_removed) counts for one revision.
+
+    Tokens whose .type is in ws_lex (when exclude_ws) or punct_lex (when exclude_punct)
+    are dropped before counting; token_revs sums len(t.revisions)-1 over kept added tokens.
+    """
+    ws_lex = ['break','whitespace']
+    punct_lex = ['period','qmark','epoint','comma','colon','scolon','paren_open','paren_close','brack_open','brack_close','dbrack_close','dbrack_open','tab_close','tab_open','dcurly_close','dcurly_open','equals','bar','etc','bold','italic','tag','comment_end','comment_start']
+    # A single set lets both flags apply together, and an empty set keeps every
+    # token when neither flag is set (no callable/bool mix-up, no self-referencing lambda).
+    excluded = set(ws_lex if exclude_ws else []) | set(punct_lex if exclude_punct else [])
+
+    tokens_added = [t for t in tokens_added if t.type not in excluded]
+    tokens_removed = [t for t in tokens_removed if t.type not in excluded]
+
     return(sum([(len(x.revisions)-1) for x in tokens_added]),
-           len(tokens_added))
+           len(tokens_added),
+           len(tokens_removed)
+    )
 
 class WikiqIterator(Dump):
 
@@ -395,11 +410,11 @@ class WikiqParser():
                         if len(window) == PERSISTENCE_RADIUS:
                             old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
                             
-                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
+                            num_token_revs, num_tokens_added, num_tokens_removed  = calculate_persistence(old_tokens_added, old_tokens_removed)
 
                             old_rev_data["token_revs"] = num_token_revs
-                            old_rev_data["tokens_added"] = num_tokens
-                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
+                            old_rev_data["tokens_added"] = num_tokens_added
+                            old_rev_data["tokens_removed"] = num_tokens_removed
                             old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1
 
                             self.print_rev_data(old_rev_data)
@@ -417,11 +432,12 @@ class WikiqParser():
                         continue
 
                     rev_id, rev_data, tokens_added, tokens_removed = item
-                    num_token_revs, num_tokens = calculate_persistence(tokens_added)
+
+                    num_token_revs, num_tokens_added, num_tokens_removed = calculate_persistence(tokens_added, tokens_removed)
 
                     rev_data["token_revs"] = num_token_revs
-                    rev_data["tokens_added"] = num_tokens
-                    rev_data["tokens_removed"] = len(tokens_removed)
+                    rev_data["tokens_added"] = num_tokens_added
+                    rev_data["tokens_removed"] = num_tokens_removed
                     rev_data["tokens_window"] = len(window)-(i+1)
                     
                     self.print_rev_data(rev_data)

Community Data Science Collective || Want to submit a patch?