From 311810a36cacb4199e2ee88e55c51b582f9332a3 Mon Sep 17 00:00:00 2001 From: groceryheist Date: Sun, 12 Aug 2018 21:33:19 -0700 Subject: [PATCH] refactor wikiq to seperate script from classes and functions. Code reuse in testing. --- bin/wikiq | 335 ++------------------------------------------ tests/Wikiq_Test.py | 161 ++++++--------------- wikiq_util.py | 325 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 374 insertions(+), 447 deletions(-) create mode 100644 wikiq_util.py diff --git a/bin/wikiq b/bin/wikiq index bc6b06d..e324d85 100755 --- a/bin/wikiq +++ b/bin/wikiq @@ -3,336 +3,17 @@ # original wikiq headers are: title articleid revid date_time anon # editor editor_id minor text_size text_entropy text_md5 reversion # additions_size deletions_size -import pdb import argparse import sys -import os, os.path -import re +import os +sys.path.append("..") +from wikiq_util import calculate_persistence +from wikiq_util import WikiqIterator +from wikiq_util import WikiqPage +from wikiq_util import WikiqParser +from wikiq_util import open_input_file +from wikiq_util import open_output_file -from subprocess import Popen, PIPE -from collections import deque -from hashlib import sha1 - -from mwxml import Dump - -from deltas.tokenizers import wikitext_split -import mwpersistence -import mwreverts -from urllib.parse import quote -TO_ENCODE = ('title', 'editor') -PERSISTENCE_RADIUS=7 -from deltas import SequenceMatcher - -def calculate_persistence(tokens_added): - return(sum([(len(x.revisions)-1) for x in tokens_added]), - len(tokens_added)) - - -class WikiqIterator(): - def __init__(self, fh, collapse_user=False): - self.fh = fh - self.collapse_user = collapse_user - self.mwiterator = Dump.from_file(self.fh) - self.namespace_map = { ns.id : ns.name for ns in - self.mwiterator.site_info.namespaces } - self.__pages = self.load_pages() - - def load_pages(self): - for page in self.mwiterator: - yield WikiqPage(page, - namespace_map = self.namespace_map, - collapse_user=self.collapse_user) - - def __iter__(self): - return self.__pages - - def __next__(self): - return next(self._pages) - -class WikiqPage(): - __slots__ = ('id', 'title', 'namespace', 'redirect', - 'restrictions', 'mwpage', '__revisions', - 'collapse_user') - - def __init__(self, page, namespace_map, collapse_user=False): - self.id = page.id - self.namespace = page.namespace - if page.namespace != 0: - self.title = ':'.join([namespace_map[page.namespace], page.title]) - else: - self.title = page.title - self.restrictions = page.restrictions - self.collapse_user = collapse_user - self.mwpage = page - self.__revisions = self.rev_list() - - def rev_list(self): - # Outline for how we want to handle collapse_user=True - # iteration rev.user prev_rev.user add prev_rev? 
- # 0 A None Never - # 1 A A False - # 2 B A True - # 3 A B True - # 4 A A False - # Post-loop A Always - for i, rev in enumerate(self.mwpage): - # never yield the first time - if i == 0: - if self.collapse_user: - collapsed_revs = 1 - rev.collapsed_revs = collapsed_revs - - else: - if self.collapse_user: - # yield if this is the last edit in a seq by a user and reset - # also yield if we do know who the user is - - if rev.deleted.user or prev_rev.deleted.user: - yield prev_rev - collapsed_revs = 1 - rev.collapsed_revs = collapsed_revs - - elif not rev.user.text == prev_rev.user.text: - yield prev_rev - collapsed_revs = 1 - rev.collapsed_revs = collapsed_revs - # otherwise, add one to the counter - else: - collapsed_revs += 1 - rev.collapsed_revs = collapsed_revs - # if collapse_user is false, we always yield - else: - yield prev_rev - - prev_rev = rev - - # also yield the final time - yield prev_rev - - def __iter__(self): - return self.__revisions - - def __next__(self): - return next(self.__revisions) - -class WikiqParser(): - - def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False): - - self.input_file = input_file - self.output_file = output_file - self.collapse_user = collapse_user - self.persist = persist - self.persist_legacy = persist_legacy - self.printed_header = False - self.namespaces = [] - self.urlencode = urlencode - - def __get_namespace_from_title(self, title): - default_ns = None - - for ns in self.namespaces: - # skip if the namespace is not defined - if ns == None: - default_ns = self.namespaces[ns] - continue - - if title.startswith(ns + ":"): - return self.namespaces[ns] - - # if we've made it this far with no matches, we return the default namespace - return default_ns - - def process(self): - - # create a regex that creates the output filename - # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$', - # r'output/wikiq-\1-\2.tsv', - # input_filename) - - # Construct dump file iterator - dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user) - - # extract list of namspaces - self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces} - - page_count = 0 - rev_count = 0 - - - # Iterate through pages - for page in dump: - rev_detector = mwreverts.Detector() - - if self.persist or self.persist_legacy: - window = deque(maxlen=PERSISTENCE_RADIUS) - - if not self.persist_legacy: - state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split), - revert_radius=PERSISTENCE_RADIUS) - - else: - from mw.lib import persistence - state = persistence.State() - - # Iterate through a page's revisions - for rev in page: - - rev_data = {'revid' : rev.id, - 'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'), - 'articleid' : page.id, - 'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id, - 'title' : '"' + page.title + '"', - 'namespace' : page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title), - 'deleted' : "TRUE" if rev.deleted.text else "FALSE" } - - # if revisions are deleted, /many/ things will be missing - if rev.deleted.text: - rev_data['text_chars'] = "" - rev_data['sha1'] = "" - rev_data['revert'] = "" - rev_data['reverteds'] = "" - - else: - # rev.text can be None if the page has no text - if not rev.text: - rev.text = "" - # if text exists, we'll check for a sha1 and generate one otherwise - - if rev.sha1: - text_sha1 = rev.sha1 - else: - - text_sha1 = sha1(bytes(rev.text, 
"utf8")).hexdigest() - - rev_data['sha1'] = text_sha1 - - # TODO rev.bytes doesn't work.. looks like a bug - rev_data['text_chars'] = len(rev.text) - - # generate revert data - revert = rev_detector.process(text_sha1, rev.id) - - if revert: - rev_data['revert'] = "TRUE" - rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"' - else: - rev_data['revert'] = "FALSE" - rev_data['reverteds'] = "" - - # if the fact that the edit was minor can be hidden, this might be an issue - rev_data['minor'] = "TRUE" if rev.minor else "FALSE" - - if not rev.deleted.user: - # wrap user-defined editors in quotes for fread - rev_data['editor'] = '"' + rev.user.text + '"' - rev_data['anon'] = "TRUE" if rev.user.id == None else "FALSE" - - else: - rev_data['anon'] = "" - rev_data['editor'] = "" - - #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I): - # redirect = True - #else: - # redirect = False - - #TODO missing: additions_size deletions_size - - # if collapse user was on, lets run that - if self.collapse_user: - rev_data['collapsed_revs'] = rev.collapsed_revs - - if self.persist or self.persist_legacy: - if rev.deleted.text: - - for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]: - old_rev_data[k] = None - else: - - if not self.persist_legacy: - _, tokens_added, tokens_removed = state.update(rev.text, rev.id) - - else: - _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1) - - window.append((rev.id, rev_data, tokens_added, tokens_removed)) - - if len(window) == PERSISTENCE_RADIUS: - old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0] - - num_token_revs, num_tokens = calculate_persistence(old_tokens_added) - - old_rev_data["token_revs"] = num_token_revs - old_rev_data["tokens_added"] = num_tokens - old_rev_data["tokens_removed"] = len(old_tokens_removed) - old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1 - - self.print_rev_data(old_rev_data) - - else: - self.print_rev_data(rev_data) - - rev_count += 1 - - if self.persist or self.persist_legacy: - # print out metadata for the last RADIUS revisions - for i, item in enumerate(window): - # if the window was full, we've already printed item 0 - if len(window) == PERSISTENCE_RADIUS and i == 0: - continue - - rev_id, rev_data, tokens_added, tokens_removed = item - num_token_revs, num_tokens = calculate_persistence(tokens_added) - - rev_data["token_revs"] = num_token_revs - rev_data["tokens_added"] = num_tokens - rev_data["tokens_removed"] = len(tokens_removed) - rev_data["tokens_window"] = len(window)-(i+1) - - self.print_rev_data(rev_data) - - page_count += 1 - - print("Done: %s revisions and %s pages." 
% (rev_count, page_count), - file=sys.stderr) - - def print_rev_data(self, rev_data): - # if it's the first time through, print the header - if self.urlencode: - for field in TO_ENCODE: - rev_data[field] = quote(str(rev_data[field])) - - if not self.printed_header: - print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file) - self.printed_header = True - - print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file) - - -def open_input_file(input_filename): - if re.match(r'.*\.7z$', input_filename): - cmd = ["7za", "x", "-so", input_filename, '*'] - elif re.match(r'.*\.gz$', input_filename): - cmd = ["zcat", input_filename] - elif re.match(r'.*\.bz2$', input_filename): - cmd = ["bzcat", "-dk", input_filename] - - try: - input_file = Popen(cmd, stdout=PIPE).stdout - except NameError: - input_file = open(input_filename, 'r') - - return input_file - -def open_output_file(input_filename): - # create a regex that creates the output filename - output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename) - output_filename = re.sub(r'\.xml', '', output_filename) - output_filename = output_filename + ".tsv" - output_file = open(output_filename, "w") - - return output_file parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.') diff --git a/tests/Wikiq_Test.py b/tests/Wikiq_Test.py index e995551..09cb3c5 100644 --- a/tests/Wikiq_Test.py +++ b/tests/Wikiq_Test.py @@ -16,30 +16,26 @@ from io import StringIO # wikia and wikipedia data DONE # malformed xmls DONE -class Test_Wikipedia(unittest.TestCase): - def setUp(self): +class Test_Wikiq(unittest.TestCase): + + def mkoutputdir(self): if not os.path.exists("test_output"): os.mkdir("test_output") - self.wiki = 'ikwiki-20180301-pages-meta-history' + def setuptoutputfiles(self, suffix="xml.7z"): self.wikiq_out_name = self.wiki + ".tsv" self.test_output_dir = os.path.join(".", "test_output") self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) - - self.infile = "{0}.xml.bz2".format(self.wiki) - self.base_call = "../bin/wikiq {0} -o {1}" + self.infile = "{0}.{1}".format(self.wiki,suffix) self.input_dir = "dumps" self.input_file = os.path.join(".", self.input_dir,self.infile) self.baseline_output_dir = "baseline_output" - def test_WP_url_encode(self): - test_filename = "url-encode_" + self.wikiq_out_name + def run_and_check_output(self, call, test_filename): test_file = os.path.join(self.test_output_dir, test_filename) if os.path.exists(test_file): os.remove(test_file) - - call = self.base_call.format(self.input_file, self.test_output_dir) - call = call + " --url-encode" + proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) proc.wait() @@ -51,142 +47,74 @@ class Test_Wikipedia(unittest.TestCase): baseline = pd.read_table(baseline_file) assert_frame_equal(test,baseline) +class Test_Wikipedia(Test_Wikiq): + def setUp(self): + print(os.path.abspath(".")) + self.mkoutputdir() + self.wiki = 'ikwiki-20180301-pages-meta-history' + self.setuptoutputfiles(suffix="xml.bz2") + self.base_call = "../bin/wikiq {0} -o {1}" -class Test_Basic(unittest.TestCase): + def test_WP_url_encode(self): + test_filename = "url-encode_" + self.wikiq_out_name + call = self.base_call.format(self.input_file, self.test_output_dir) + call = call + " --url-encode" + self.run_and_check_output(call, test_filename) - def setUp(self): - if not os.path.exists("test_output"): - os.mkdir("test_output") - self.wiki = 'sailormoon' - self.wikiq_out_name = 
self.wiki + ".tsv" - self.test_output_dir = os.path.join(".", "test_output") - self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) +class Test_Basic(Test_Wikiq): - self.infile = "{0}.xml.7z".format(self.wiki) + def setUp(self): + self.mkoutputdir() + self.wiki="sailormoon" + self.setuptoutputfiles() self.base_call = "../bin/wikiq {0} -o {1}" - self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir,self.infile) - self.baseline_output_dir = "baseline_output" def test_noargs(self): - test_filename = "noargs_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) - if os.path.exists(test_file): - os.remove(test_file) call = self.base_call.format(self.input_file, self.test_output_dir) - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) - proc.wait() - - copyfile(self.call_output, test_file) - - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) - - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline) - + print(call) + self.run_and_check_output(call, test_filename) def test_collapse_user(self): test_filename = "collapse-user_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) - if os.path.exists(test_file): - os.remove(test_file) call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --collapse-user" - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) - proc.wait() - - copyfile(self.call_output, test_file) - - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline) + self.run_and_check_output(call, test_filename) def test_pwr_legacy(self): test_filename = "persistence_legacy_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) - if os.path.exists(test_file): - os.remove(test_file) call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --persistence-legacy" - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) - proc.wait() - - - copyfile(self.call_output, test_file) - - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) - - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline) + self.run_and_check_output(call, test_filename) def test_pwr(self): test_filename = "persistence_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) - if os.path.exists(test_file): - os.remove(test_file) call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --persistence" - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) - proc.wait() - - - copyfile(self.call_output, test_file) - - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) - - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline) - + self.run_and_check_output(call, test_filename) def test_url_encode(self): test_filename = "url-encode_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) - if os.path.exists(test_file): - os.remove(test_file) - call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --url-encode" - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) - proc.wait() - - 
copyfile(self.call_output, test_file) - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline) - + self.run_and_check_output(call, test_filename) -class Test_Malformed(unittest.TestCase): +class Test_Malformed(Test_Wikiq): def setUp(self): - if not os.path.exists("test_output"): - os.mkdir("test_output") - - self.wiki = 'twinpeaks' - self.wikiq_out_name = self.wiki + ".tsv" - self.test_output_dir = os.path.join(".", "test_output") - self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) - - self.infile = "{0}.xml.7z".format(self.wiki) + self.mkoutputdir() + self.wiki="twinpeaks" + self.setuptoutputfiles() self.base_call = "../bin/wikiq {0} -o {1}" - self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir,self.infile) - def test_malformed_noargs(self): - call = self.base_call.format(self.input_file, self.test_output_dir) proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True) proc.wait() @@ -194,28 +122,21 @@ class Test_Malformed(unittest.TestCase): errlines = str(errs).split("\\n") self.assertEqual(errlines[-2],'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0') -class Test_Stdout(unittest.TestCase): +class Test_Stdout(Test_Wikiq): def setUp(self): + self.mkoutputdir() self.wiki = 'sailormoon' - self.wikiq_out_name = self.wiki + ".tsv" + self.setuptoutputfiles() - self.infile = "{0}.xml.7z".format(self.wiki) - self.base_call = "../bin/wikiq {0} --stdout" - self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir,self.infile) - self.baseline_output_dir = "baseline_output" def test_noargs(self): - - call = self.base_call.format(self.input_file) - proc = subprocess.run(call,stdout=subprocess.PIPE,shell=True) - outs = proc.stdout.decode("utf8") - + self.base_call = ["../bin/wikiq", self.input_file, "--stdout"] + proc = subprocess.Popen(self.base_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8') + outs = proc.stdout test_file = "noargs_" + self.wikiq_out_name baseline_file = os.path.join(".", self.baseline_output_dir, test_file) - print(baseline_file) - test = pd.read_table(StringIO(outs)) + test = pd.read_table(outs) baseline = pd.read_table(baseline_file) assert_frame_equal(test,baseline) diff --git a/wikiq_util.py b/wikiq_util.py new file mode 100644 index 0000000..495c31c --- /dev/null +++ b/wikiq_util.py @@ -0,0 +1,325 @@ +import sys +import re +from subprocess import Popen, PIPE +from collections import deque +from hashlib import sha1 +from deltas.tokenizers import wikitext_split +from mwxml import Dump +import mwpersistence +import mwreverts +from urllib.parse import quote +from deltas import SequenceMatcher + +TO_ENCODE = ('title', 'editor') +PERSISTENCE_RADIUS=7 + +def calculate_persistence(tokens_added): + return(sum([(len(x.revisions)-1) for x in tokens_added]), + len(tokens_added)) + +class WikiqIterator(): + def __init__(self, fh, collapse_user=False): + self.fh = fh + self.collapse_user = collapse_user + self.mwiterator = Dump.from_file(self.fh) + self.namespace_map = { ns.id : ns.name for ns in + self.mwiterator.site_info.namespaces } + self.__pages = self.load_pages() + + def load_pages(self): + for page in self.mwiterator: + yield WikiqPage(page, + namespace_map = self.namespace_map, + collapse_user=self.collapse_user) + + def __iter__(self): + return self.__pages + + def __next__(self): + return 
next(self._pages) + + +class WikiqPage(): + __slots__ = ('id', 'title', 'namespace', 'redirect', + 'restrictions', 'mwpage', '__revisions', + 'collapse_user') + + def __init__(self, page, namespace_map, collapse_user=False): + self.id = page.id + self.namespace = page.namespace + if page.namespace != 0: + self.title = ':'.join([namespace_map[page.namespace], page.title]) + else: + self.title = page.title + self.restrictions = page.restrictions + self.collapse_user = collapse_user + self.mwpage = page + self.__revisions = self.rev_list() + + def rev_list(self): + # Outline for how we want to handle collapse_user=True + # iteration rev.user prev_rev.user add prev_rev? + # 0 A None Never + # 1 A A False + # 2 B A True + # 3 A B True + # 4 A A False + # Post-loop A Always + for i, rev in enumerate(self.mwpage): + # never yield the first time + if i == 0: + if self.collapse_user: + collapsed_revs = 1 + rev.collapsed_revs = collapsed_revs + + else: + if self.collapse_user: + # yield if this is the last edit in a seq by a user and reset + # also yield if we do know who the user is + + if rev.deleted.user or prev_rev.deleted.user: + yield prev_rev + collapsed_revs = 1 + rev.collapsed_revs = collapsed_revs + + elif not rev.user.text == prev_rev.user.text: + yield prev_rev + collapsed_revs = 1 + rev.collapsed_revs = collapsed_revs + # otherwise, add one to the counter + else: + collapsed_revs += 1 + rev.collapsed_revs = collapsed_revs + # if collapse_user is false, we always yield + else: + yield prev_rev + + prev_rev = rev + + # also yield the final time + yield prev_rev + + def __iter__(self): + return self.__revisions + + def __next__(self): + return next(self.__revisions) + +class WikiqParser(): + + def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False): + + self.input_file = input_file + self.output_file = output_file + self.collapse_user = collapse_user + self.persist = persist + self.persist_legacy = persist_legacy + self.printed_header = False + self.namespaces = [] + self.urlencode = urlencode + + def __get_namespace_from_title(self, title): + default_ns = None + + for ns in self.namespaces: + # skip if the namespace is not defined + if ns == None: + default_ns = self.namespaces[ns] + continue + + if title.startswith(ns + ":"): + return self.namespaces[ns] + + # if we've made it this far with no matches, we return the default namespace + return default_ns + + def process(self): + + # create a regex that creates the output filename + # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$', + # r'output/wikiq-\1-\2.tsv', + # input_filename) + + # Construct dump file iterator + dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user) + + # extract list of namspaces + self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces} + + page_count = 0 + rev_count = 0 + + + # Iterate through pages + for page in dump: + rev_detector = mwreverts.Detector() + + if self.persist or self.persist_legacy: + window = deque(maxlen=PERSISTENCE_RADIUS) + + if not self.persist_legacy: + state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split), + revert_radius=PERSISTENCE_RADIUS) + + else: + from mw.lib import persistence + state = persistence.State() + + # Iterate through a page's revisions + for rev in page: + + rev_data = {'revid' : rev.id, + 'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'), + 'articleid' : page.id, + 'editor_id' : "" if rev.deleted.user == True or rev.user.id 
is None else rev.user.id, + 'title' : '"' + page.title + '"', + 'namespace' : page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title), + 'deleted' : "TRUE" if rev.deleted.text else "FALSE" } + + # if revisions are deleted, /many/ things will be missing + if rev.deleted.text: + rev_data['text_chars'] = "" + rev_data['sha1'] = "" + rev_data['revert'] = "" + rev_data['reverteds'] = "" + + else: + # rev.text can be None if the page has no text + if not rev.text: + rev.text = "" + # if text exists, we'll check for a sha1 and generate one otherwise + + if rev.sha1: + text_sha1 = rev.sha1 + else: + + text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest() + + rev_data['sha1'] = text_sha1 + + # TODO rev.bytes doesn't work.. looks like a bug + rev_data['text_chars'] = len(rev.text) + + # generate revert data + revert = rev_detector.process(text_sha1, rev.id) + + if revert: + rev_data['revert'] = "TRUE" + rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"' + else: + rev_data['revert'] = "FALSE" + rev_data['reverteds'] = "" + + # if the fact that the edit was minor can be hidden, this might be an issue + rev_data['minor'] = "TRUE" if rev.minor else "FALSE" + + if not rev.deleted.user: + # wrap user-defined editors in quotes for fread + rev_data['editor'] = '"' + rev.user.text + '"' + rev_data['anon'] = "TRUE" if rev.user.id == None else "FALSE" + + else: + rev_data['anon'] = "" + rev_data['editor'] = "" + + #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I): + # redirect = True + #else: + # redirect = False + + #TODO missing: additions_size deletions_size + + # if collapse user was on, lets run that + if self.collapse_user: + rev_data['collapsed_revs'] = rev.collapsed_revs + + if self.persist or self.persist_legacy: + if rev.deleted.text: + + for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]: + old_rev_data[k] = None + else: + + if not self.persist_legacy: + _, tokens_added, tokens_removed = state.update(rev.text, rev.id) + + else: + _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1) + + window.append((rev.id, rev_data, tokens_added, tokens_removed)) + + if len(window) == PERSISTENCE_RADIUS: + old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0] + + num_token_revs, num_tokens = calculate_persistence(old_tokens_added) + + old_rev_data["token_revs"] = num_token_revs + old_rev_data["tokens_added"] = num_tokens + old_rev_data["tokens_removed"] = len(old_tokens_removed) + old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1 + + self.print_rev_data(old_rev_data) + + else: + self.print_rev_data(rev_data) + + rev_count += 1 + + if self.persist or self.persist_legacy: + # print out metadata for the last RADIUS revisions + for i, item in enumerate(window): + # if the window was full, we've already printed item 0 + if len(window) == PERSISTENCE_RADIUS and i == 0: + continue + + rev_id, rev_data, tokens_added, tokens_removed = item + num_token_revs, num_tokens = calculate_persistence(tokens_added) + + rev_data["token_revs"] = num_token_revs + rev_data["tokens_added"] = num_tokens + rev_data["tokens_removed"] = len(tokens_removed) + rev_data["tokens_window"] = len(window)-(i+1) + + self.print_rev_data(rev_data) + + page_count += 1 + + print("Done: %s revisions and %s pages." 
% (rev_count, page_count), + file=sys.stderr) + + def print_rev_data(self, rev_data): + # if it's the first time through, print the header + if self.urlencode: + for field in TO_ENCODE: + rev_data[field] = quote(str(rev_data[field])) + + if not self.printed_header: + print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file) + self.printed_header = True + + print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file) + + +def open_input_file(input_filename): + if re.match(r'.*\.7z$', input_filename): + cmd = ["7za", "x", "-so", input_filename, '*'] + elif re.match(r'.*\.gz$', input_filename): + cmd = ["zcat", input_filename] + elif re.match(r'.*\.bz2$', input_filename): + cmd = ["bzcat", "-dk", input_filename] + + try: + input_file = Popen(cmd, stdout=PIPE).stdout + except NameError: + input_file = open(input_filename, 'r') + + return input_file + +def open_output_file(input_filename): + # create a regex that creates the output filename + output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename) + output_filename = re.sub(r'\.xml', '', output_filename) + output_filename = output_filename + ".tsv" + output_file = open(output_filename, "w") + + return output_file -- 2.39.5
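
For context, here is a minimal sketch (not part of the patch) of how the slimmed-down bin/wikiq entry point can drive the helpers this commit extracts into wikiq_util.py. It only uses names visible in the diff: WikiqParser(input_file, output_file, collapse_user=, persist=, urlencode=, persist_legacy=), open_input_file(), open_output_file(), and the command-line flags exercised by the tests (--collapse-user, --persistence, --persistence-legacy, --url-encode, --stdout). The exact argparse wiring is an assumption, and handling of the -o output directory used by the tests is omitted for brevity.

import argparse
import sys

from wikiq_util import WikiqParser, open_input_file, open_output_file

def main():
    # Illustrative sketch only -- not the actual committed bin/wikiq script.
    parser = argparse.ArgumentParser(
        description='Parse MediaWiki XML database dumps into tab delimited data.')
    parser.add_argument('dumpfiles', nargs='*',
                        help='XML dump file(s): .7z, .gz, .bz2, or plain XML')
    parser.add_argument('--collapse-user', dest='collapse_user', action='store_true')
    parser.add_argument('--persistence', dest='persist', action='store_true')
    parser.add_argument('--persistence-legacy', dest='persist_legacy', action='store_true')
    parser.add_argument('--url-encode', dest='urlencode', action='store_true')
    parser.add_argument('--stdout', dest='stdout', action='store_true')
    args = parser.parse_args()

    for filename in args.dumpfiles:
        # open_input_file shells out to 7za/zcat/bzcat for compressed dumps
        # and falls back to a plain open() otherwise.
        input_file = open_input_file(filename)

        # open_output_file derives "<dump>.tsv" from the input filename;
        # routing output into an -o directory (as the test suite does) is
        # left out of this sketch.
        output_file = sys.stdout if args.stdout else open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=args.persist,
                            persist_legacy=args.persist_legacy,
                            urlencode=args.urlencode)
        wikiq.process()

if __name__ == "__main__":
    main()

One design note on the committed version: the patched bin/wikiq relies on sys.path.append(".."), which resolves relative to the current working directory rather than the script's location, so the wikiq_util import succeeds when the script is invoked from a sibling directory such as tests/ (as the test suite does) but may fail when run from elsewhere.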