import re
import sys
from collections import deque
from hashlib import sha1
from subprocess import PIPE, Popen
from urllib.parse import quote
from urllib.parse import unquote

import mwpersistence
import mwreverts
from deltas import SequenceMatcher
from deltas.tokenizers import wikitext_split
from mwxml import Dump

# Fields of a revision row that get percent-encoded when urlencode is enabled.
TO_ENCODE = ('title', 'editor')
# Size of the sliding window of revisions used for token persistence.
PERSISTENCE_RADIUS = 7


def try_unquote(obj):
    """Percent-decode *obj* and strip surrounding double quotes.

    Returns None when *obj* is not exactly a ``str``.
    """
    if type(obj) is str:
        obj = unquote(obj)
        return obj.strip('\"')
    else:
        return None


def calculate_persistence(tokens_added):
    """Summarise how long the given tokens persisted.

    Returns a ``(token_revs, num_tokens)`` tuple where ``token_revs`` is the
    sum of ``len(token.revisions) - 1`` over all tokens and ``num_tokens`` is
    ``len(tokens_added)``.
    """
    return (sum(len(x.revisions) - 1 for x in tokens_added),
            len(tokens_added))


class WikiqIterator():
    """Iterate over the pages of a dump file as WikiqPage objects."""

    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        # namespace id -> namespace name, used to build full page titles
        self.namespace_map = {ns.id: ns.name
                              for ns in self.mwiterator.site_info.namespaces}
        self.__pages = self.load_pages()

    def load_pages(self):
        """Lazily wrap every page of the dump in a WikiqPage."""
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map=self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        # The attribute is name-mangled (self.__pages); the original read
        # self._pages, which does not exist and raised AttributeError.
        return next(self.__pages)


class WikiqPage():
    """A single page plus an iterator over its (optionally collapsed) revisions."""

    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # Pages outside namespace 0 get "<namespace name>:<title>".
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        """Yield the page's revisions, one step behind the underlying iterator.

        With ``collapse_user`` enabled, consecutive revisions by the same
        editor are collapsed: only the last revision of each run is yielded,
        and every revision gets a ``collapsed_revs`` attribute holding its
        position within the run.

        Outline for how collapse_user=True is handled:

            iteration   rev.user   prev_rev.user   add prev_rev?
            0           A          None            Never
            1           A          A               False
            2           B          A               True
            3           A          B               True
            4           A          A               False
            Post-loop   A                          Always
        """
        prev_rev = None
        collapsed_revs = 0

        for rev in self.mwpage:
            if self.collapse_user:
                # A run continues only when both editors are known and equal;
                # a deleted (unknown) user always terminates the run.
                same_editor = (prev_rev is not None
                               and not rev.deleted.user
                               and not prev_rev.deleted.user
                               and rev.user.text == prev_rev.user.text)
                if same_editor:
                    collapsed_revs += 1
                else:
                    # never yield on the first revision: nothing precedes it
                    if prev_rev is not None:
                        yield prev_rev
                    collapsed_revs = 1
                rev.collapsed_revs = collapsed_revs
            elif prev_rev is not None:
                # if collapse_user is false, we always yield
                yield prev_rev

            prev_rev = rev

        # Also yield the final revision. Guarded so that a page without any
        # revisions yields nothing instead of raising UnboundLocalError.
        if prev_rev is not None:
            yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)


class WikiqParser():
    """Convert a dump into tab-separated rows, one row per revision."""

    def __init__(self, input_file, output_file, collapse_user=False,
                 persist=False, urlencode=False, persist_legacy=False):
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.persist_legacy = persist_legacy
        self.printed_header = False
        # replaced by a {name: id} dict at the start of process()
        self.namespaces = []
        self.urlencode = urlencode

    def __get_namespace_from_title(self, title):
        """Guess a namespace id from a "<namespace>:<title>" style title.

        Returns the id of the first namespace whose name prefixes *title*,
        or the default namespace's id (None if no default was found).
        """
        default_ns = None

        for ns in self.namespaces:
            # A namespace without a name is the default one. Falsy check so
            # that an empty-string name is treated like None
            # (presumably how mwxml reports the main namespace -- TODO confirm).
            if not ns:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

    def process(self):
        """Read every page and revision of the input and print one row each.

        When persistence is enabled, rows are delayed until
        PERSISTENCE_RADIUS revisions have been seen so that token
        persistence can be measured over the following revisions.
        A summary line is written to stderr at the end.
        """
        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name: ns.id
                           for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0
        use_persistence = self.persist or self.persist_legacy

        # Iterate through pages
        for page in dump:
            rev_detector = mwreverts.Detector()

            if use_persistence:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if not self.persist_legacy:
                    state = mwpersistence.DiffState(
                        SequenceMatcher(tokenizer=wikitext_split),
                        revert_radius=PERSISTENCE_RADIUS)
                else:
                    # imported lazily: only needed for the legacy algorithm
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:
                rev_data = {
                    'revid': rev.id,
                    'date_time': rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                    'articleid': page.id,
                    'editor_id': "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
                    'title': '"' + page.title + '"',
                    'namespace': page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title),
                    'deleted': "TRUE" if rev.deleted.text else "FALSE",
                }

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""
                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # if text exists, we'll check for a sha1 and generate one otherwise
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"
                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                # if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #     redirect = True
                # else:
                #     redirect = False

                # TODO missing: additions_size deletions_size

                # if collapse user was on, lets run that
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if use_persistence:
                    if rev.deleted.text:
                        # No text means no token statistics. The original
                        # wrote these into old_rev_data, which is either
                        # unbound or an already-printed row, and never
                        # emitted this revision at all. Fill the current
                        # row and print it straight away (it is therefore
                        # emitted ahead of revisions still in the window).
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                        self.print_rev_data(rev_data)
                    else:
                        if not self.persist_legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        # Once the window is full, the oldest entry has been
                        # observed for PERSISTENCE_RADIUS-1 later revisions
                        # and can be written out.
                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS - 1

                            self.print_rev_data(old_rev_data)
                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if use_persistence:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    # number of later revisions this one was observed for
                    rev_data["tokens_window"] = len(window) - (i + 1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        """Write *rev_data* as one tab-separated row, columns sorted by key.

        The fields named in TO_ENCODE are percent-encoded in place when
        ``urlencode`` is set. The header row is written before the first row.
        """
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        columns = sorted(rev_data.keys())

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in columns]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(rev_data[k]) for k in columns]), file=self.output_file)


def open_input_file(input_filename):
    """Open *input_filename* for reading, decompressing by file extension.

    ``.7z``, ``.gz`` and ``.bz2`` files are streamed through the stdout pipe
    of ``7za``, ``zcat`` and ``bzcat`` respectively; any other file is opened
    directly in text mode.
    """
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]
    else:
        # Not a recognised archive: read it as-is. (The original reached this
        # case by catching the NameError of the unassigned ``cmd``.)
        return open(input_filename, 'r')

    return Popen(cmd, stdout=PIPE).stdout


def open_output_file(input_filename):
    """Open the ``.tsv`` output file derived from *input_filename* for writing.

    The name is built by dropping a trailing ``.7z``/``.gz``/``.bz2``
    extension, removing ``.xml`` and appending ``.tsv``.
    """
    # create a regex that creates the output filename
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file


class IPCheck(object):
    """Recognise user names that are IPv4 or IPv6 addresses."""

    # IP address regexes taken from https://gist.github.com/mnordhoff/2213179
    ipv4_address = re.compile('^(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$')

    ipv6_address_or_addrz = re.compile('^(?:(?:[0-9A-Fa-f]{1,4}:){6}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|::(?:[0-9A-Fa-f]{1,4}:){5}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){4}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){3}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,2}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){2}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,3}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}:(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,4}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,5}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}|(?:(?:[0-9A-Fa-f]{1,4}:){,6}[0-9A-Fa-f]{1,4})?::)(?:%25(?:[A-Za-z0-9\\-._~]|%[0-9A-Fa-f]{2})+)?$')

    @staticmethod
    def is_ip(username):
        '''Check if a username is an ip (v4 or v6) address. We use this as
        a marker of whether the user is anonymous.'''
        if not type(username) is str:
            return False

        if IPCheck.ipv4_address.match(username) or IPCheck.ipv6_address_or_addrz.match(username):
            return True
        else:
            return False