]> code.communitydata.science - mediawiki_dump_tools.git/blobdiff - wikiq_util.py
Use dask to parallelize and scale user level datasets
[mediawiki_dump_tools.git] / wikiq_util.py
index 495c31c0ed45d59af23ad92a5022f36942b12cb9..d475a423cf7dcf0a656618c79a84d4087a04f0b8 100644 (file)
@@ -8,10 +8,18 @@ from mwxml import Dump
 import mwpersistence
 import mwreverts
 from urllib.parse import quote
 import mwpersistence
 import mwreverts
 from urllib.parse import quote
+from urllib.parse import unquote
 from deltas import SequenceMatcher
 
 TO_ENCODE = ('title', 'editor')
 from deltas import SequenceMatcher
 
 TO_ENCODE = ('title', 'editor')
-PERSISTENCE_RADIUS=7
+PERSISTENCE_RADIUS = 7
+
+def try_unquote(obj):
+    if type(obj) is str:
+        obj = unquote(obj)
+        return obj.strip('\"')
+    else:
+        return
 
 def calculate_persistence(tokens_added):
     return(sum([(len(x.revisions)-1) for x in tokens_added]),
 
 def calculate_persistence(tokens_added):
     return(sum([(len(x.revisions)-1) for x in tokens_added]),
@@ -323,3 +331,23 @@ def open_output_file(input_filename):
     output_file = open(output_filename, "w")
 
     return output_file
     output_file = open(output_filename, "w")
 
     return output_file
+
+
+class IPCheck(object):
+
+    # IP address regexes taken from https://gist.github.com/mnordhoff/2213179
+    ipv4_address = re.compile('^(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$')
+
+    ipv6_address_or_addrz = re.compile('^(?:(?:[0-9A-Fa-f]{1,4}:){6}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|::(?:[0-9A-Fa-f]{1,4}:){5}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){4}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){3}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,2}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){2}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,3}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}:(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,4}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,5}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}|(?:(?:[0-9A-Fa-f]{1,4}:){,6}[0-9A-Fa-f]{1,4})?::)(?:%25(?:[A-Za-z0-9\\-._~]|%[0-9A-Fa-f]{2})+)?$')
+
+    @staticmethod
+    def is_ip(username):
+        if not type(username) is str:
+            return False
+
+        '''Check if a username is an ip (v4 or v6) address. We use this as
+        a marker of whether the user is anonymous.'''
+        if IPCheck.ipv4_address.match(username) or  IPCheck.ipv6_address_or_addrz.match(username):
+            return True
+        else:
+            return False

Community Data Science Collective || Want to submit a patch?