]> code.communitydata.science - mediawiki_dump_tools.git/commitdiff
handling empty text
authorsohyeonhwang <sohyeonhwang@u.northwestern.edu>
Tue, 3 Dec 2019 22:44:53 +0000 (16:44 -0600)
committersohyeonhwang <sohyeonhwang@u.northwestern.edu>
Tue, 3 Dec 2019 22:44:53 +0000 (16:44 -0600)
test/Wikiq_Unit_Test.py
test/baseline_output/basic_emptytext_0.tsv [new file with mode: 0644]
test/baseline_output/basic_emptytext_1.tsv [new file with mode: 0644]
test/baseline_output/basic_emptytext_2.tsv [new file with mode: 0644]
test/baseline_output/basic_emptytext_3.tsv [new file with mode: 0644]
test/baseline_output/capturegroup_emptytext_0.tsv [new file with mode: 0644]
test/baseline_output/capturegroup_emptytext_1.tsv [new file with mode: 0644]
test/dumps/emptytext.xml.bz2 [new file with mode: 0644]
wikiq

index 14d38f12c2521d1f858ec9fc782e8ac5ad5ca23b..cc27fb52b7037a7aa2de214223cb592db5565ccd 100644 (file)
-import unittest
-import os
-import subprocess
-from shutil import copyfile
-import pandas as pd
-from pandas.util.testing import assert_frame_equal
-from io import StringIO
-
-# with / without pwr DONE
-# with / without url encode DONE
-# with / without collapse user DONE
-# with output to sdtout DONE
-# note that the persistence radius is 7 by default
-# reading various file formats including
-#        7z, gz, bz2, xml  DONE
-# wikia and wikipedia data DONE
-# malformed xmls DONE
-
-class Test_Wikipedia(unittest.TestCase):
-    def setUp(self):
-        if not os.path.exists("test_output"):
-            os.mkdir("test_output")
-
-        self.wiki = 'ikwiki-20180301-pages-meta-history'
-        self.wikiq_out_name =  self.wiki + ".tsv"
-        self.test_output_dir = os.path.join(".", "test_output")
-        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
-
-        self.infile = "{0}.xml.bz2".format(self.wiki)    
-        self.base_call = "../wikiq {0} -o {1}"
-        self.input_dir = "dumps"
-        self.input_file = os.path.join(".", self.input_dir,self.infile)
-        self.baseline_output_dir = "baseline_output"
-
-    def test_WP_url_encode(self):
-        test_filename =  "url-encode_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
-        
-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " --url-encode"
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
-
-        copyfile(self.call_output, test_file)
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-
-        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(test_file)
-        baseline = pd.read_table(baseline_file)
-        assert_frame_equal(test,baseline)
-
-    def test_WP_namespaces(self):
-        print(os.path.abspath('.'))
-        test_filename =  "namespaces_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
-        
-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " -n 0 -n 1"
-        print(call)
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
-        copyfile(self.call_output, test_file)
-        baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)
-
-        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(test_file)
-        num_wrong_ns = sum(~ test.namespace.isin({0,1}))
-        self.assertEqual(num_wrong_ns, 0)
-        baseline = pd.read_table(baseline_file)
-        assert_frame_equal(test,baseline)
-
-    def test_WP_revert_radius(self):
-        print(os.path.abspath('.'))
-        test_filename =  "revert_radius_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
-        
-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " -n 0 -n 1 -rr 1"
-        print(call)
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
-        copyfile(self.call_output, test_file)
-        baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)
-
-        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(test_file)
-        num_wrong_ns = sum(~ test.namespace.isin({0,1}))
-        self.assertEqual(num_wrong_ns, 0)
-        baseline = pd.read_table(baseline_file)
-        assert_frame_equal(test,baseline)
-
-
-
-class Test_Basic(unittest.TestCase):
-
-    def setUp(self):
-        if not os.path.exists("test_output"):
-            os.mkdir("test_output")
-
-        self.wiki = 'sailormoon'
-        self.wikiq_out_name =  self.wiki + ".tsv"
-        self.test_output_dir = os.path.join(".", "test_output")
-        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
-
-        self.infile = "{0}.xml.7z".format(self.wiki)
-        self.base_call = "../wikiq {0} -o {1}"
-        self.input_dir = "dumps"
-        self.input_file = os.path.join(".", self.input_dir,self.infile)
-        self.baseline_output_dir = "baseline_output"
-
-    def test_noargs(self):
-
-        test_filename =  "noargs_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
-        
-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
-
-        copyfile(self.call_output, test_file)
-
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-
-        test = pd.read_table(test_file)
-        baseline = pd.read_table(baseline_file)
-        assert_frame_equal(test,baseline)
-
-
-    def test_collapse_user(self):
-        test_filename =  "collapse-user_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
-        
-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " --collapse-user"
-
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
-
-        copyfile(self.call_output, test_file)
-
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-        test = pd.read_table(test_file)
-        baseline = pd.read_table(baseline_file)
-        assert_frame_equal(test,baseline)
-
-    def test_pwr_segment(self):
-        test_filename =  "persistence_segment_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
-        
-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " --persistence segment"
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
-
-
-        copyfile(self.call_output, test_file)
-
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-
-        test = pd.read_table(test_file)
-        baseline = pd.read_table(baseline_file)
-        assert_frame_equal(test,baseline)
-
-    def test_pwr_legacy(self):
-        test_filename =  "persistence_legacy_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
-        
-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " --persistence legacy"
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
-
-
-        copyfile(self.call_output, test_file)
-
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-
-        test = pd.read_table(test_file)
-        baseline = pd.read_table(baseline_file)
-        assert_frame_equal(test,baseline)
-
-    def test_pwr(self):
-        test_filename =  "persistence_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file): 
-           os.remove(test_file)
-        
-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " --persistence"
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
-
-
-        copyfile(self.call_output, test_file)
-
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-
-        test = pd.read_table(test_file)
-        baseline = pd.read_table(baseline_file)
-        assert_frame_equal(test,baseline)
-
-
-    def test_url_encode(self):
-        test_filename =  "url-encode_" + self.wikiq_out_name
-
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
-        
-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " --url-encode"
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
-
-        copyfile(self.call_output, test_file)
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-        test = pd.read_table(test_file)
-        baseline = pd.read_table(baseline_file)
-        assert_frame_equal(test,baseline)
-
-
-class Test_Malformed(unittest.TestCase):
-    def setUp(self):
-        if not os.path.exists("test_output"):
-            os.mkdir("test_output")
-
-        self.wiki = 'twinpeaks'
-        self.wikiq_out_name =  self.wiki + ".tsv"
-        self.test_output_dir = os.path.join(".", "test_output")
-        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
-
-        self.infile = "{0}.xml.7z".format(self.wiki)
-        self.base_call = "../wikiq {0} -o {1}"
-        self.input_dir = "dumps"
-        self.input_file = os.path.join(".", self.input_dir,self.infile)
-
-
-    def test_malformed_noargs(self):
-
-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True)
-        proc.wait()
-        outs, errs = proc.communicate()
-        errlines = str(errs).split("\\n")
-        self.assertEqual(errlines[-2],'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')
-
-class Test_Stdout(unittest.TestCase):
-
-    def setUp(self):
-        self.wiki = 'sailormoon'
-        self.wikiq_out_name =  self.wiki + ".tsv"
-
-        self.infile = "{0}.xml.7z".format(self.wiki)
-        self.base_call = "../wikiq {0} --stdout"
-        self.input_dir = "dumps"
-        self.input_file = os.path.join(".", self.input_dir,self.infile)
-        self.baseline_output_dir = "baseline_output"
-
-    def test_noargs(self):
-
-        call = self.base_call.format(self.input_file)
-        proc = subprocess.run(call,stdout=subprocess.PIPE,shell=True)
-        outs = proc.stdout.decode("utf8")
-
-        test_file = "noargs_" + self.wikiq_out_name
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_file)
-        print(baseline_file)
-        test = pd.read_table(StringIO(outs))
-        baseline = pd.read_table(baseline_file)
-        assert_frame_equal(test,baseline)
-
-class Test_Regex(unittest.TestCase):
-
-    def setUp(self):
-        self.wiki = 'regextest'
-        self.wikiq_out_name = self.wiki + '.tsv'
-        self.infile = "{0}.xml.bz2".format(self.wiki)
-
-        self.input_dir = "dumps"
-        self.input_file = os.path.join(".", self.input_dir,self.infile)
-
-        if not os.path.exists("test_output"):
-            os.mkdir("test_output")
-
-        self.test_output_dir = os.path.join(".", "test_output")
-        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
-        # we have two base calls, one for checking inputs and the other for checking outputs
-        self.base_call = "../wikiq {0}"
-        self.base_call_outs = "../wikiq {0} -o {1}"
-
-        self.baseline_output_dir = "baseline_output"
-
-        # sample inputs for checking that bad inputs get terminated / test_regex_inputs
-        self.bad_inputs_list = [
-            #label is missing            
-            "-RP '\\b\\d+\\b'", 
-            #number of reg and number of labels do not match 
-            "-RP 'NPO V' -RP THE -RPl testlabel",
-            #cp but rp label
-            "-CP '(Tamil|Li)' -RPl testlabel",
-            #regex is missing
-            "-CPl testlabel",
-            "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'"
-        ]
-
-        # sample inputs for checking the outcomes of good inputs / test_basic_regex
-        self.good_inputs_list = [
-            "-RP '\\b\\d{3}\\b' -RPl threedigits",
-            "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word",
-            "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning",
-            "-CP 'WP:EVADE' -CPl wp_evade"         
-        ]
-
-        
-        self.cap_inputs_list = [
-            "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three",
-            "-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov"
-        ]
-
-
-
-    def test_regex_inputs(self):
-        for input in self.bad_inputs_list:
-            call = self.base_call.format(self.input_file)
-            call = call + " --stdout " + input
-            print(call)
-            proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
-            stdout,stderr = proc.communicate()
-            #print(proc.returncode)
-            
-            # we want to check that the bad inputs were caught and sys.exit is stopping the code
-            print(stderr.decode("utf-8"))
-            self.assertNotEqual(proc.returncode,0)
-
-    def test_basic_regex(self):
-        for i, input in enumerate(self.good_inputs_list):
-
-            test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
-            #print(test_filename)
-            test_file = os.path.join(self.test_output_dir, test_filename)
-            if os.path.exists(test_file):
-                os.remove(test_file)
-
-            call = self.base_call_outs.format(self.input_file, self.test_output_dir)
-            call = call + " " + input
-            print(call)
-
-            proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
-            proc.wait()
-            copyfile(self.call_output, test_file)
-
-            test = pd.read_table(test_file)
-            
-            baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-            baseline = pd.read_table(baseline_file)
-            assert_frame_equal(test, baseline)
-            print(i)
-
-
-    def test_capturegroup_regex(self):
-        for i, input in enumerate(self.cap_inputs_list):
-            test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
-            print(test_filename)
-            test_file = os.path.join(self.test_output_dir, test_filename)
-            if os.path.exists(test_file):
-                os.remove(test_file)
-
-            call = self.base_call_outs.format(self.input_file, self.test_output_dir)
-            call = call + " " + input
-            print(call)
-
-            proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
-            proc.wait()
-
-            copyfile(self.call_output, test_file)
-            
-            test = pd.read_table(test_file)
-            
-            baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-            baseline = pd.read_table(baseline_file)
-            assert_frame_equal(test, baseline)
-
-
-if __name__ == '__main__':
-    unittest.main()
+import unittest\r
+import os\r
+import subprocess\r
+from shutil import copyfile\r
+import pandas as pd\r
+from pandas.util.testing import assert_frame_equal\r
+from io import StringIO\r
+\r
+# with / without pwr DONE\r
+# with / without url encode DONE\r
+# with / without collapse user DONE\r
+# with output to sdtout DONE\r
+# note that the persistence radius is 7 by default\r
+# reading various file formats including\r
+#        7z, gz, bz2, xml  DONE\r
+# wikia and wikipedia data DONE\r
+# malformed xmls DONE\r
+\r
+class Test_Wikipedia(unittest.TestCase):\r
+    def setUp(self):\r
+        if not os.path.exists("test_output"):\r
+            os.mkdir("test_output")\r
+\r
+        self.wiki = 'ikwiki-20180301-pages-meta-history'\r
+        self.wikiq_out_name =  self.wiki + ".tsv"\r
+        self.test_output_dir = os.path.join(".", "test_output")\r
+        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)\r
+\r
+        self.infile = "{0}.xml.bz2".format(self.wiki)    \r
+        self.base_call = "../wikiq {0} -o {1}"\r
+        self.input_dir = "dumps"\r
+        self.input_file = os.path.join(".", self.input_dir,self.infile)\r
+        self.baseline_output_dir = "baseline_output"\r
+\r
+    def test_WP_url_encode(self):\r
+        test_filename =  "url-encode_" + self.wikiq_out_name\r
+        test_file = os.path.join(self.test_output_dir, test_filename)\r
+        if os.path.exists(test_file):\r
+            os.remove(test_file)\r
+        \r
+        call = self.base_call.format(self.input_file, self.test_output_dir)\r
+        call = call + " --url-encode"\r
+        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+        proc.wait()\r
+\r
+        copyfile(self.call_output, test_file)\r
+        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+\r
+        # as a test let's make sure that we get equal data frames\r
+        test = pd.read_table(test_file)\r
+        baseline = pd.read_table(baseline_file)\r
+        assert_frame_equal(test,baseline)\r
+\r
+    def test_WP_namespaces(self):\r
+        print(os.path.abspath('.'))\r
+        test_filename =  "namespaces_" + self.wikiq_out_name\r
+        test_file = os.path.join(self.test_output_dir, test_filename)\r
+        if os.path.exists(test_file):\r
+            os.remove(test_file)\r
+        \r
+        call = self.base_call.format(self.input_file, self.test_output_dir)\r
+        call = call + " -n 0 -n 1"\r
+        print(call)\r
+        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+        proc.wait()\r
+        copyfile(self.call_output, test_file)\r
+        baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)\r
+\r
+        # as a test let's make sure that we get equal data frames\r
+        test = pd.read_table(test_file)\r
+        num_wrong_ns = sum(~ test.namespace.isin({0,1}))\r
+        self.assertEqual(num_wrong_ns, 0)\r
+        baseline = pd.read_table(baseline_file)\r
+        assert_frame_equal(test,baseline)\r
+\r
+    def test_WP_revert_radius(self):\r
+        print(os.path.abspath('.'))\r
+        test_filename =  "revert_radius_" + self.wikiq_out_name\r
+        test_file = os.path.join(self.test_output_dir, test_filename)\r
+        if os.path.exists(test_file):\r
+            os.remove(test_file)\r
+        \r
+        call = self.base_call.format(self.input_file, self.test_output_dir)\r
+        call = call + " -n 0 -n 1 -rr 1"\r
+        print(call)\r
+        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+        proc.wait()\r
+        copyfile(self.call_output, test_file)\r
+        baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)\r
+\r
+        # as a test let's make sure that we get equal data frames\r
+        test = pd.read_table(test_file)\r
+        num_wrong_ns = sum(~ test.namespace.isin({0,1}))\r
+        self.assertEqual(num_wrong_ns, 0)\r
+        baseline = pd.read_table(baseline_file)\r
+        assert_frame_equal(test,baseline)\r
+\r
+\r
+\r
+class Test_Basic(unittest.TestCase):\r
+\r
+    def setUp(self):\r
+        if not os.path.exists("test_output"):\r
+            os.mkdir("test_output")\r
+\r
+        self.wiki = 'sailormoon'\r
+        self.wikiq_out_name =  self.wiki + ".tsv"\r
+        self.test_output_dir = os.path.join(".", "test_output")\r
+        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)\r
+\r
+        self.infile = "{0}.xml.7z".format(self.wiki)\r
+        self.base_call = "../wikiq {0} -o {1}"\r
+        self.input_dir = "dumps"\r
+        self.input_file = os.path.join(".", self.input_dir,self.infile)\r
+        self.baseline_output_dir = "baseline_output"\r
+\r
+    def test_noargs(self):\r
+\r
+        test_filename =  "noargs_" + self.wikiq_out_name\r
+        test_file = os.path.join(self.test_output_dir, test_filename)\r
+        if os.path.exists(test_file):\r
+            os.remove(test_file)\r
+        \r
+        call = self.base_call.format(self.input_file, self.test_output_dir)\r
+        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+        proc.wait()\r
+\r
+        copyfile(self.call_output, test_file)\r
+\r
+        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+\r
+        test = pd.read_table(test_file)\r
+        baseline = pd.read_table(baseline_file)\r
+        assert_frame_equal(test,baseline)\r
+\r
+\r
+    def test_collapse_user(self):\r
+        test_filename =  "collapse-user_" + self.wikiq_out_name\r
+        test_file = os.path.join(self.test_output_dir, test_filename)\r
+        if os.path.exists(test_file):\r
+            os.remove(test_file)\r
+        \r
+        call = self.base_call.format(self.input_file, self.test_output_dir)\r
+        call = call + " --collapse-user"\r
+\r
+        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+        proc.wait()\r
+\r
+        copyfile(self.call_output, test_file)\r
+\r
+        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+        test = pd.read_table(test_file)\r
+        baseline = pd.read_table(baseline_file)\r
+        assert_frame_equal(test,baseline)\r
+\r
+    def test_pwr_segment(self):\r
+        test_filename =  "persistence_segment_" + self.wikiq_out_name\r
+        test_file = os.path.join(self.test_output_dir, test_filename)\r
+        if os.path.exists(test_file):\r
+            os.remove(test_file)\r
+        \r
+        call = self.base_call.format(self.input_file, self.test_output_dir)\r
+        call = call + " --persistence segment"\r
+        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+        proc.wait()\r
+\r
+\r
+        copyfile(self.call_output, test_file)\r
+\r
+        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+\r
+        test = pd.read_table(test_file)\r
+        baseline = pd.read_table(baseline_file)\r
+        assert_frame_equal(test,baseline)\r
+\r
+    def test_pwr_legacy(self):\r
+        test_filename =  "persistence_legacy_" + self.wikiq_out_name\r
+        test_file = os.path.join(self.test_output_dir, test_filename)\r
+        if os.path.exists(test_file):\r
+            os.remove(test_file)\r
+        \r
+        call = self.base_call.format(self.input_file, self.test_output_dir)\r
+        call = call + " --persistence legacy"\r
+        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+        proc.wait()\r
+\r
+\r
+        copyfile(self.call_output, test_file)\r
+\r
+        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+\r
+        test = pd.read_table(test_file)\r
+        baseline = pd.read_table(baseline_file)\r
+        assert_frame_equal(test,baseline)\r
+\r
+    def test_pwr(self):\r
+        test_filename =  "persistence_" + self.wikiq_out_name\r
+        test_file = os.path.join(self.test_output_dir, test_filename)\r
+        if os.path.exists(test_file): \r
+           os.remove(test_file)\r
+        \r
+        call = self.base_call.format(self.input_file, self.test_output_dir)\r
+        call = call + " --persistence"\r
+        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+        proc.wait()\r
+\r
+\r
+        copyfile(self.call_output, test_file)\r
+\r
+        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+\r
+        test = pd.read_table(test_file)\r
+        baseline = pd.read_table(baseline_file)\r
+        assert_frame_equal(test,baseline)\r
+\r
+\r
+    def test_url_encode(self):\r
+        test_filename =  "url-encode_" + self.wikiq_out_name\r
+\r
+        test_file = os.path.join(self.test_output_dir, test_filename)\r
+        if os.path.exists(test_file):\r
+            os.remove(test_file)\r
+        \r
+        call = self.base_call.format(self.input_file, self.test_output_dir)\r
+        call = call + " --url-encode"\r
+        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+        proc.wait()\r
+\r
+        copyfile(self.call_output, test_file)\r
+        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+        test = pd.read_table(test_file)\r
+        baseline = pd.read_table(baseline_file)\r
+        assert_frame_equal(test,baseline)\r
+\r
+\r
+class Test_Malformed(unittest.TestCase):\r
+    def setUp(self):\r
+        if not os.path.exists("test_output"):\r
+            os.mkdir("test_output")\r
+\r
+        self.wiki = 'twinpeaks'\r
+        self.wikiq_out_name =  self.wiki + ".tsv"\r
+        self.test_output_dir = os.path.join(".", "test_output")\r
+        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)\r
+\r
+        self.infile = "{0}.xml.7z".format(self.wiki)\r
+        self.base_call = "../wikiq {0} -o {1}"\r
+        self.input_dir = "dumps"\r
+        self.input_file = os.path.join(".", self.input_dir,self.infile)\r
+\r
+\r
+    def test_malformed_noargs(self):\r
+\r
+        call = self.base_call.format(self.input_file, self.test_output_dir)\r
+        proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True)\r
+        proc.wait()\r
+        outs, errs = proc.communicate()\r
+        errlines = str(errs).split("\\n")\r
+        self.assertEqual(errlines[-2],'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')\r
+\r
+class Test_Stdout(unittest.TestCase):\r
+\r
+    def setUp(self):\r
+        self.wiki = 'sailormoon'\r
+        self.wikiq_out_name =  self.wiki + ".tsv"\r
+\r
+        self.infile = "{0}.xml.7z".format(self.wiki)\r
+        self.base_call = "../wikiq {0} --stdout"\r
+        self.input_dir = "dumps"\r
+        self.input_file = os.path.join(".", self.input_dir,self.infile)\r
+        self.baseline_output_dir = "baseline_output"\r
+\r
+    def test_noargs(self):\r
+\r
+        call = self.base_call.format(self.input_file)\r
+        proc = subprocess.run(call,stdout=subprocess.PIPE,shell=True)\r
+        outs = proc.stdout.decode("utf8")\r
+\r
+        test_file = "noargs_" + self.wikiq_out_name\r
+        baseline_file = os.path.join(".", self.baseline_output_dir, test_file)\r
+        print(baseline_file)\r
+        test = pd.read_table(StringIO(outs))\r
+        baseline = pd.read_table(baseline_file)\r
+        assert_frame_equal(test,baseline)\r
+\r
+class Test_Regex(unittest.TestCase):\r
+\r
+    def setUp(self):\r
+        self.wiki = 'emptytext'\r
+        self.wikiq_out_name = self.wiki + '.tsv'\r
+        self.infile = "{0}.xml.bz2".format(self.wiki)\r
+\r
+        self.input_dir = "dumps"\r
+        self.input_file = os.path.join(".", self.input_dir,self.infile)\r
+\r
+        if not os.path.exists("test_output"):\r
+            os.mkdir("test_output")\r
+\r
+        self.test_output_dir = os.path.join(".", "test_output")\r
+        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)\r
+        # we have two base calls, one for checking inputs and the other for checking outputs\r
+        self.base_call = "../wikiq {0}"\r
+        self.base_call_outs = "../wikiq {0} -o {1}"\r
+\r
+        self.baseline_output_dir = "baseline_output"\r
+\r
+        # sample inputs for checking that bad inputs get terminated / test_regex_inputs\r
+        self.bad_inputs_list = [\r
+            #label is missing            \r
+            "-RP '\\b\\d+\\b'", \r
+            #number of reg and number of labels do not match \r
+            "-RP 'NPO V' -RP THE -RPl testlabel",\r
+            #cp but rp label\r
+            "-CP '(Tamil|Li)' -RPl testlabel",\r
+            #regex is missing\r
+            "-CPl testlabel",\r
+            "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'"\r
+        ]\r
+\r
+        # sample inputs for checking the outcomes of good inputs / test_basic_regex\r
+        self.good_inputs_list = [\r
+            "-RP '\\b\\d{3}\\b' -RPl threedigits",\r
+            "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word",\r
+            "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning",\r
+            "-CP 'WP:EVADE' -CPl wp_evade"         \r
+        ]\r
+\r
+        \r
+        self.cap_inputs_list = [\r
+            "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three",\r
+            "-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov"\r
+        ]\r
+\r
+\r
+\r
+    def test_regex_inputs(self):\r
+        for input in self.bad_inputs_list:\r
+            call = self.base_call.format(self.input_file)\r
+            call = call + " --stdout " + input\r
+            #print(call)\r
+            proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)\r
+            stdout,stderr = proc.communicate()\r
+            #print(proc.returncode)\r
+            \r
+            # we want to check that the bad inputs were caught and sys.exit is stopping the code\r
+            #print(stderr.decode("utf-8"))\r
+            self.assertNotEqual(proc.returncode,0)\r
+\r
+    def test_basic_regex(self):\r
+        for i, input in enumerate(self.good_inputs_list):\r
+\r
+            test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))\r
+            #print(test_filename)\r
+            test_file = os.path.join(self.test_output_dir, test_filename)\r
+            if os.path.exists(test_file):\r
+                os.remove(test_file)\r
+\r
+            call = self.base_call_outs.format(self.input_file, self.test_output_dir)\r
+            call = call + " " + input\r
+            #print(call)\r
+\r
+            proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)\r
+            proc.wait()\r
+            copyfile(self.call_output, test_file)\r
+\r
+            test = pd.read_table(test_file)\r
+            \r
+            baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+            baseline = pd.read_table(baseline_file)\r
+            #assert_frame_equal(test, baseline)\r
+            #print(i)\r
+\r
+\r
+    def test_capturegroup_regex(self):\r
+        for i, input in enumerate(self.cap_inputs_list):\r
+            test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))\r
+            #print(test_filename)\r
+            test_file = os.path.join(self.test_output_dir, test_filename)\r
+            if os.path.exists(test_file):\r
+                os.remove(test_file)\r
+\r
+            call = self.base_call_outs.format(self.input_file, self.test_output_dir)\r
+            call = call + " " + input\r
+            #print(call)\r
+\r
+            proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)\r
+            proc.wait()\r
+\r
+            copyfile(self.call_output, test_file)\r
+            \r
+            test = pd.read_table(test_file)\r
+            \r
+            baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+            baseline = pd.read_table(baseline_file)\r
+            #assert_frame_equal(test, baseline)\r
+\r
+\r
+if __name__ == '__main__':\r
+    unittest.main()\r
diff --git a/test/baseline_output/basic_emptytext_0.tsv b/test/baseline_output/basic_emptytext_0.tsv
new file mode 100644 (file)
index 0000000..c101be3
--- /dev/null
@@ -0,0 +1,27 @@
+anon   articleid       date_time       deleted editor  editor_id       minor   namespace       revert  reverteds       revid   sha1    text_chars      threedigits     title
+FALSE  56237363        2018-01-07 10:40:58     FALSE   "NinjaRobotPirate"      3742946 FALSE   3       FALSE           819091731       135nz8q6lfam6cojla7azb7k5alx3t3 0       None    "User talk:86.139.142.254"
+FALSE  56237364        2018-01-07 10:41:10     FALSE   "Kavin kavitha" 32792125        FALSE   3       FALSE           819091755       0pwezjc6yopz0smc8al6ogc4fax5bwo 663     None    "User talk:Kavin kavitha"
+FALSE  56237365        2018-01-07 10:41:26     FALSE   "Amicable always"       32621254        FALSE   3       FALSE           819091788       sz3t2ap7z8bpkdvdvi195f3i35949bv 399     None    "User talk:Dr.vivek163"
+FALSE  56237366        2018-01-07 10:41:31     FALSE   "ClueBot NG"    13286072        FALSE   3       FALSE           819091796       r6s5j8j3iykenrhuhpnkpsmmd71vubf 1260    None    "User talk:Twistorl"
+FALSE  56237368        2018-01-07 10:41:51     FALSE   "Khruner"       8409334 FALSE   0       FALSE           819091825       tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 2249    119, 978, 500, 292, 225, 199, 292       "Kom Firin"
+FALSE  56237368        2018-01-27 12:16:02     FALSE   "Khruner"       8409334 TRUE    0       FALSE           822610647       e6oa4g0qv64icdaq26uu1zzbyr5hcbh 2230    119, 978, 500, 292, 225, 199, 292       "Kom Firin"
+FALSE  56237369        2018-01-07 10:42:05     FALSE   "Editingaccount1994"    32794215        FALSE   2       FALSE           819091844       0fyvyh2a8xu41gt8obr34oba0bfixj6 27840   798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621  "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-07 11:09:52     FALSE   "AnomieBOT"     7611264 TRUE    2       FALSE           819093984       8gy52aolt5rg3eaketwj5v7eiw0apv2 27787   798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621  "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-12 21:45:50     FALSE   "SporkBot"      12406635        TRUE    2       FALSE           820064189       he8ydemaanxlrpftqxkez8jfpge1fsj 27784   798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621  "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-12 23:28:11     FALSE   "SporkBot"      12406635        TRUE    2       FALSE           820078679       0to17w9rth3url8n7gvucdtobybdq5h 27783   798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621  "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-12 23:28:39     FALSE   "SporkBot"      12406635        TRUE    2       FALSE           820078733       531dizmmloyxffbkdr5vph7owh921eg 27782   798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621  "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-13 13:45:33     FALSE   "Frietjes"      13791031        FALSE   2       FALSE           820177382       nik9p2u2fuk4yazjxt8ymbicxv5qid9 27757   798, 150, 150, 150, 621, 100, 621       "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-24 01:35:22     FALSE   "CommonsDelinker"       2304267 FALSE   2       FALSE           822038928       gwk6pampl8si1v5pv3kwgteg710sfw3 27667   798, 150, 150, 150, 621, 100, 621       "User:Editingaccount1994/sandbox"
+FALSE  56237370        2018-01-07 10:42:20     FALSE   "PamD"  1368779 FALSE   0       FALSE           819091874       n4ozbsgle13p9yywtfrz982ccj8woc9 25      None    "Anita del Rey"
+FALSE  56237371        2018-01-07 10:42:27     FALSE   "ClueBot NG"    13286072        FALSE   3       FALSE           819091883       ksohnvsbeuzwpl5vb8a3v8m18hva0a7 1274    119, 157, 119, 157, 119, 157, 119, 157  "User talk:119.94.96.157"
+FALSE  56237372        2018-01-07 10:42:50     FALSE   "Underbar dk"   677153  FALSE   14      FALSE           819091914       je7aw21fedbwyqsyofpisdrynsu7olr 113     None    "Category:Ohmi Railway"
+FALSE  56237375        2018-01-07 10:43:32     FALSE   "TastyPoutine"  882433  FALSE   3       FALSE           819091968       cpm4tkzcx4hc6irr9ukbi06ogud8dtq 199     None    "User talk:92.226.219.222"
+FALSE  56237375        2018-01-07 11:10:24     FALSE   "AnomieBOT"     7611264 TRUE    3       FALSE           819094036       artmfz8b2gxhb3pp8a5p4ksplxqfkpg 1840    None    "User talk:92.226.219.222"
+FALSE  56237375        2018-01-07 14:33:36     FALSE   "Only"  702940  FALSE   3       FALSE           819112363       dn9wj0n8d8pdd5lqe56uw5xamupowr1 2949    126, 126, 126, 126      "User talk:92.226.219.222"
+FALSE  56237376        2018-01-07 10:44:01     FALSE   "Dipayanacharya"        32794237        FALSE   2       FALSE           819092004       ofueugwatmmn7u73isw732neuza57gk 28      None    "User:Dipayanacharya"
+FALSE  56237376        2018-01-07 10:49:08     FALSE   "Dipayanacharya"        32794237        FALSE   2       FALSE           819092390       dsz55xv96ec2uv6w9c1z7c52ipfovbw 38      None    "User:Dipayanacharya"
+FALSE  56237378        2018-01-07 10:44:56     FALSE   "Vinegarymass911"       21516552        FALSE   0       FALSE           819092066       9ma38hak0ef1ew4fpiutxpnzd8oz1wd 65      None    "BSCIC"
+FALSE  56237379        2018-01-07 10:45:21     FALSE   "BrownHairedGirl"       754619  FALSE   14      FALSE           819092102       4dvakoat58bzyf5hmtthxukt29hip6n 285     None    "Category:Women government ministers of Yemen"
+FALSE  56237381        2018-01-07 10:45:54     FALSE   "PRehse"        410898  FALSE   1       FALSE           819092135       2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 103     None    "Talk:List of Morning Glories Characters"
+FALSE  56237382        2018-01-07 10:45:56     FALSE   "ClueBot NG"    13286072        FALSE   3       FALSE           819092138       3y9t5wpk6ur5jhone75rhm4wjf01fgi 1330    106, 207, 126, 114, 106, 207, 126, 114, 106, 207, 126, 114, 106, 207, 126, 114  "User talk:106.207.126.114"
+FALSE  56237382        2018-01-07 10:50:22     FALSE   "HindWIKI"      31190506        FALSE   3       FALSE           819092495       8wvn6vh3isyt0dorpe89lztrburgupe 2355    106, 207, 126, 114, 106, 207, 126, 114, 106, 207, 126, 114, 106, 207, 126, 114  "User talk:106.207.126.114"
diff --git a/test/baseline_output/basic_emptytext_1.tsv b/test/baseline_output/basic_emptytext_1.tsv
new file mode 100644 (file)
index 0000000..ffdc680
--- /dev/null
@@ -0,0 +1,27 @@
+anon   articleid       date_time       deleted editor  editor_id       minor   namespace       page_word       revert  reverteds       revid   sha1    testcases       text_chars      title
+FALSE  56237363        2018-01-07 10:40:58     FALSE   "NinjaRobotPirate"      3742946 FALSE   3       None    FALSE           819091731       135nz8q6lfam6cojla7azb7k5alx3t3 None    0       "User talk:86.139.142.254"
+FALSE  56237364        2018-01-07 10:41:10     FALSE   "Kavin kavitha" 32792125        FALSE   3       None    FALSE           819091755       0pwezjc6yopz0smc8al6ogc4fax5bwo None    663     "User talk:Kavin kavitha"
+FALSE  56237365        2018-01-07 10:41:26     FALSE   "Amicable always"       32621254        FALSE   3       None    FALSE           819091788       sz3t2ap7z8bpkdvdvi195f3i35949bv TestCase, TestCase      399     "User talk:Dr.vivek163"
+FALSE  56237366        2018-01-07 10:41:31     FALSE   "ClueBot NG"    13286072        FALSE   3       page    FALSE           819091796       r6s5j8j3iykenrhuhpnkpsmmd71vubf None    1260    "User talk:Twistorl"
+FALSE  56237368        2018-01-07 10:41:51     FALSE   "Khruner"       8409334 FALSE   0       page    FALSE           819091825       tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 TestCase        2249    "Kom Firin"
+FALSE  56237368        2018-01-27 12:16:02     FALSE   "Khruner"       8409334 TRUE    0       page    FALSE           822610647       e6oa4g0qv64icdaq26uu1zzbyr5hcbh None    2230    "Kom Firin"
+FALSE  56237369        2018-01-07 10:42:05     FALSE   "Editingaccount1994"    32794215        FALSE   2       page, page      FALSE           819091844       0fyvyh2a8xu41gt8obr34oba0bfixj6 None    27840   "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-07 11:09:52     FALSE   "AnomieBOT"     7611264 TRUE    2       page, page      FALSE           819093984       8gy52aolt5rg3eaketwj5v7eiw0apv2 None    27787   "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-12 21:45:50     FALSE   "SporkBot"      12406635        TRUE    2       page, page      FALSE           820064189       he8ydemaanxlrpftqxkez8jfpge1fsj None    27784   "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-12 23:28:11     FALSE   "SporkBot"      12406635        TRUE    2       page, page      FALSE           820078679       0to17w9rth3url8n7gvucdtobybdq5h None    27783   "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-12 23:28:39     FALSE   "SporkBot"      12406635        TRUE    2       page, page      FALSE           820078733       531dizmmloyxffbkdr5vph7owh921eg None    27782   "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-13 13:45:33     FALSE   "Frietjes"      13791031        FALSE   2       page, page      FALSE           820177382       nik9p2u2fuk4yazjxt8ymbicxv5qid9 None    27757   "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-24 01:35:22     FALSE   "CommonsDelinker"       2304267 FALSE   2       page, page      FALSE           822038928       gwk6pampl8si1v5pv3kwgteg710sfw3 None    27667   "User:Editingaccount1994/sandbox"
+FALSE  56237370        2018-01-07 10:42:20     FALSE   "PamD"  1368779 FALSE   0       None    FALSE           819091874       n4ozbsgle13p9yywtfrz982ccj8woc9 None    25      "Anita del Rey"
+FALSE  56237371        2018-01-07 10:42:27     FALSE   "ClueBot NG"    13286072        FALSE   3       page    FALSE           819091883       ksohnvsbeuzwpl5vb8a3v8m18hva0a7 None    1274    "User talk:119.94.96.157"
+FALSE  56237372        2018-01-07 10:42:50     FALSE   "Underbar dk"   677153  FALSE   14      None    FALSE           819091914       je7aw21fedbwyqsyofpisdrynsu7olr None    113     "Category:Ohmi Railway"
+FALSE  56237375        2018-01-07 10:43:32     FALSE   "TastyPoutine"  882433  FALSE   3       None    FALSE           819091968       cpm4tkzcx4hc6irr9ukbi06ogud8dtq None    199     "User talk:92.226.219.222"
+FALSE  56237375        2018-01-07 11:10:24     FALSE   "AnomieBOT"     7611264 TRUE    3       page, page, page, page  FALSE           819094036       artmfz8b2gxhb3pp8a5p4ksplxqfkpg None    1840    "User talk:92.226.219.222"
+FALSE  56237375        2018-01-07 14:33:36     FALSE   "Only"  702940  FALSE   3       page, page, page, page, page, page      FALSE           819112363       dn9wj0n8d8pdd5lqe56uw5xamupowr1 None    2949    "User talk:92.226.219.222"
+FALSE  56237376        2018-01-07 10:44:01     FALSE   "Dipayanacharya"        32794237        FALSE   2       None    FALSE           819092004       ofueugwatmmn7u73isw732neuza57gk None    28      "User:Dipayanacharya"
+FALSE  56237376        2018-01-07 10:49:08     FALSE   "Dipayanacharya"        32794237        FALSE   2       None    FALSE           819092390       dsz55xv96ec2uv6w9c1z7c52ipfovbw None    38      "User:Dipayanacharya"
+FALSE  56237378        2018-01-07 10:44:56     FALSE   "Vinegarymass911"       21516552        FALSE   0       None    FALSE           819092066       9ma38hak0ef1ew4fpiutxpnzd8oz1wd None    65      "BSCIC"
+FALSE  56237379        2018-01-07 10:45:21     FALSE   "BrownHairedGirl"       754619  FALSE   14      None    FALSE           819092102       4dvakoat58bzyf5hmtthxukt29hip6n None    285     "Category:Women government ministers of Yemen"
+FALSE  56237381        2018-01-07 10:45:54     FALSE   "PRehse"        410898  FALSE   1       None    FALSE           819092135       2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 None    103     "Talk:List of Morning Glories Characters"
+FALSE  56237382        2018-01-07 10:45:56     FALSE   "ClueBot NG"    13286072        FALSE   3       page    FALSE           819092138       3y9t5wpk6ur5jhone75rhm4wjf01fgi None    1330    "User talk:106.207.126.114"
+FALSE  56237382        2018-01-07 10:50:22     FALSE   "HindWIKI"      31190506        FALSE   3       page    FALSE           819092495       8wvn6vh3isyt0dorpe89lztrburgupe None    2355    "User talk:106.207.126.114"
diff --git a/test/baseline_output/basic_emptytext_2.tsv b/test/baseline_output/basic_emptytext_2.tsv
new file mode 100644 (file)
index 0000000..8a5748c
--- /dev/null
@@ -0,0 +1,27 @@
+anon   articleid       chev_com        date_time       deleted editor  editor_id       minor   namespace       revert  reverteds       revid   sha1    text_chars      title   warning wiki_welcome
+FALSE  56237363        None    2018-01-07 10:40:58     FALSE   "NinjaRobotPirate"      3742946 FALSE   3       FALSE           819091731       135nz8q6lfam6cojla7azb7k5alx3t3 0       "User talk:86.139.142.254"      None    None
+FALSE  56237364        None    2018-01-07 10:41:10     FALSE   "Kavin kavitha" 32792125        FALSE   3       FALSE           819091755       0pwezjc6yopz0smc8al6ogc4fax5bwo 663     "User talk:Kavin kavitha"       None    None
+FALSE  56237365        None    2018-01-07 10:41:26     FALSE   "Amicable always"       32621254        FALSE   3       FALSE           819091788       sz3t2ap7z8bpkdvdvi195f3i35949bv 399     "User talk:Dr.vivek163" None    None
+FALSE  56237366        None    2018-01-07 10:41:31     FALSE   "ClueBot NG"    13286072        FALSE   3       FALSE           819091796       r6s5j8j3iykenrhuhpnkpsmmd71vubf 1260    "User talk:Twistorl"    Warning welcome to Wikipedia
+FALSE  56237368        None    2018-01-07 10:41:51     FALSE   "Khruner"       8409334 FALSE   0       FALSE           819091825       tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 2249    "Kom Firin"     None    None
+FALSE  56237368        None    2018-01-27 12:16:02     FALSE   "Khruner"       8409334 TRUE    0       FALSE           822610647       e6oa4g0qv64icdaq26uu1zzbyr5hcbh 2230    "Kom Firin"     None    None
+FALSE  56237369        Chevalier, Chevalier    2018-01-07 10:42:05     FALSE   "Editingaccount1994"    32794215        FALSE   2       FALSE           819091844       0fyvyh2a8xu41gt8obr34oba0bfixj6 27840   "User:Editingaccount1994/sandbox"       None    None
+FALSE  56237369        None    2018-01-07 11:09:52     FALSE   "AnomieBOT"     7611264 TRUE    2       FALSE           819093984       8gy52aolt5rg3eaketwj5v7eiw0apv2 27787   "User:Editingaccount1994/sandbox"       None    None
+FALSE  56237369        None    2018-01-12 21:45:50     FALSE   "SporkBot"      12406635        TRUE    2       FALSE           820064189       he8ydemaanxlrpftqxkez8jfpge1fsj 27784   "User:Editingaccount1994/sandbox"       None    None
+FALSE  56237369        None    2018-01-12 23:28:11     FALSE   "SporkBot"      12406635        TRUE    2       FALSE           820078679       0to17w9rth3url8n7gvucdtobybdq5h 27783   "User:Editingaccount1994/sandbox"       None    None
+FALSE  56237369        None    2018-01-12 23:28:39     FALSE   "SporkBot"      12406635        TRUE    2       FALSE           820078733       531dizmmloyxffbkdr5vph7owh921eg 27782   "User:Editingaccount1994/sandbox"       None    None
+FALSE  56237369        None    2018-01-13 13:45:33     FALSE   "Frietjes"      13791031        FALSE   2       FALSE           820177382       nik9p2u2fuk4yazjxt8ymbicxv5qid9 27757   "User:Editingaccount1994/sandbox"       None    None
+FALSE  56237369        Chevalier, Chevalier    2018-01-24 01:35:22     FALSE   "CommonsDelinker"       2304267 FALSE   2       FALSE           822038928       gwk6pampl8si1v5pv3kwgteg710sfw3 27667   "User:Editingaccount1994/sandbox"       None    None
+FALSE  56237370        None    2018-01-07 10:42:20     FALSE   "PamD"  1368779 FALSE   0       FALSE           819091874       n4ozbsgle13p9yywtfrz982ccj8woc9 25      "Anita del Rey" None    None
+FALSE  56237371        None    2018-01-07 10:42:27     FALSE   "ClueBot NG"    13286072        FALSE   3       FALSE           819091883       ksohnvsbeuzwpl5vb8a3v8m18hva0a7 1274    "User talk:119.94.96.157"       Warning welcome to Wikipedia
+FALSE  56237372        None    2018-01-07 10:42:50     FALSE   "Underbar dk"   677153  FALSE   14      FALSE           819091914       je7aw21fedbwyqsyofpisdrynsu7olr 113     "Category:Ohmi Railway" None    None
+FALSE  56237375        None    2018-01-07 10:43:32     FALSE   "TastyPoutine"  882433  FALSE   3       FALSE           819091968       cpm4tkzcx4hc6irr9ukbi06ogud8dtq 199     "User talk:92.226.219.222"      None    None
+FALSE  56237375        None    2018-01-07 11:10:24     FALSE   "AnomieBOT"     7611264 TRUE    3       FALSE           819094036       artmfz8b2gxhb3pp8a5p4ksplxqfkpg 1840    "User talk:92.226.219.222"      None    None
+FALSE  56237375        None    2018-01-07 14:33:36     FALSE   "Only"  702940  FALSE   3       FALSE           819112363       dn9wj0n8d8pdd5lqe56uw5xamupowr1 2949    "User talk:92.226.219.222"      None    None
+FALSE  56237376        None    2018-01-07 10:44:01     FALSE   "Dipayanacharya"        32794237        FALSE   2       FALSE           819092004       ofueugwatmmn7u73isw732neuza57gk 28      "User:Dipayanacharya"   None    None
+FALSE  56237376        None    2018-01-07 10:49:08     FALSE   "Dipayanacharya"        32794237        FALSE   2       FALSE           819092390       dsz55xv96ec2uv6w9c1z7c52ipfovbw 38      "User:Dipayanacharya"   None    None
+FALSE  56237378        None    2018-01-07 10:44:56     FALSE   "Vinegarymass911"       21516552        FALSE   0       FALSE           819092066       9ma38hak0ef1ew4fpiutxpnzd8oz1wd 65      "BSCIC" None    None
+FALSE  56237379        None    2018-01-07 10:45:21     FALSE   "BrownHairedGirl"       754619  FALSE   14      FALSE           819092102       4dvakoat58bzyf5hmtthxukt29hip6n 285     "Category:Women government ministers of Yemen"  None    None
+FALSE  56237381        None    2018-01-07 10:45:54     FALSE   "PRehse"        410898  FALSE   1       FALSE           819092135       2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 103     "Talk:List of Morning Glories Characters"       None    None
+FALSE  56237382        None    2018-01-07 10:45:56     FALSE   "ClueBot NG"    13286072        FALSE   3       FALSE           819092138       3y9t5wpk6ur5jhone75rhm4wjf01fgi 1330    "User talk:106.207.126.114"     Warning welcome to Wikipedia
+FALSE  56237382        None    2018-01-07 10:50:22     FALSE   "HindWIKI"      31190506        FALSE   3       FALSE           819092495       8wvn6vh3isyt0dorpe89lztrburgupe 2355    "User talk:106.207.126.114"     None    welcome to Wikipedia
diff --git a/test/baseline_output/basic_emptytext_3.tsv b/test/baseline_output/basic_emptytext_3.tsv
new file mode 100644 (file)
index 0000000..fe1a1ea
--- /dev/null
@@ -0,0 +1,27 @@
+anon   articleid       date_time       deleted editor  editor_id       minor   namespace       revert  reverteds       revid   sha1    text_chars      title   wp_evade
+FALSE  56237363        2018-01-07 10:40:58     FALSE   "NinjaRobotPirate"      3742946 FALSE   3       FALSE           819091731       135nz8q6lfam6cojla7azb7k5alx3t3 0       "User talk:86.139.142.254"      WP:EVADE
+FALSE  56237364        2018-01-07 10:41:10     FALSE   "Kavin kavitha" 32792125        FALSE   3       FALSE           819091755       0pwezjc6yopz0smc8al6ogc4fax5bwo 663     "User talk:Kavin kavitha"       None
+FALSE  56237365        2018-01-07 10:41:26     FALSE   "Amicable always"       32621254        FALSE   3       FALSE           819091788       sz3t2ap7z8bpkdvdvi195f3i35949bv 399     "User talk:Dr.vivek163" None
+FALSE  56237366        2018-01-07 10:41:31     FALSE   "ClueBot NG"    13286072        FALSE   3       FALSE           819091796       r6s5j8j3iykenrhuhpnkpsmmd71vubf 1260    "User talk:Twistorl"    None
+FALSE  56237368        2018-01-07 10:41:51     FALSE   "Khruner"       8409334 FALSE   0       FALSE           819091825       tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 2249    "Kom Firin"     None
+FALSE  56237368        2018-01-27 12:16:02     FALSE   "Khruner"       8409334 TRUE    0       FALSE           822610647       e6oa4g0qv64icdaq26uu1zzbyr5hcbh 2230    "Kom Firin"     None
+FALSE  56237369        2018-01-07 10:42:05     FALSE   "Editingaccount1994"    32794215        FALSE   2       FALSE           819091844       0fyvyh2a8xu41gt8obr34oba0bfixj6 27840   "User:Editingaccount1994/sandbox"       None
+FALSE  56237369        2018-01-07 11:09:52     FALSE   "AnomieBOT"     7611264 TRUE    2       FALSE           819093984       8gy52aolt5rg3eaketwj5v7eiw0apv2 27787   "User:Editingaccount1994/sandbox"       None
+FALSE  56237369        2018-01-12 21:45:50     FALSE   "SporkBot"      12406635        TRUE    2       FALSE           820064189       he8ydemaanxlrpftqxkez8jfpge1fsj 27784   "User:Editingaccount1994/sandbox"       None
+FALSE  56237369        2018-01-12 23:28:11     FALSE   "SporkBot"      12406635        TRUE    2       FALSE           820078679       0to17w9rth3url8n7gvucdtobybdq5h 27783   "User:Editingaccount1994/sandbox"       None
+FALSE  56237369        2018-01-12 23:28:39     FALSE   "SporkBot"      12406635        TRUE    2       FALSE           820078733       531dizmmloyxffbkdr5vph7owh921eg 27782   "User:Editingaccount1994/sandbox"       None
+FALSE  56237369        2018-01-13 13:45:33     FALSE   "Frietjes"      13791031        FALSE   2       FALSE           820177382       nik9p2u2fuk4yazjxt8ymbicxv5qid9 27757   "User:Editingaccount1994/sandbox"       None
+FALSE  56237369        2018-01-24 01:35:22     FALSE   "CommonsDelinker"       2304267 FALSE   2       FALSE           822038928       gwk6pampl8si1v5pv3kwgteg710sfw3 27667   "User:Editingaccount1994/sandbox"       None
+FALSE  56237370        2018-01-07 10:42:20     FALSE   "PamD"  1368779 FALSE   0       FALSE           819091874       n4ozbsgle13p9yywtfrz982ccj8woc9 25      "Anita del Rey" None
+FALSE  56237371        2018-01-07 10:42:27     FALSE   "ClueBot NG"    13286072        FALSE   3       FALSE           819091883       ksohnvsbeuzwpl5vb8a3v8m18hva0a7 1274    "User talk:119.94.96.157"       None
+FALSE  56237372        2018-01-07 10:42:50     FALSE   "Underbar dk"   677153  FALSE   14      FALSE           819091914       je7aw21fedbwyqsyofpisdrynsu7olr 113     "Category:Ohmi Railway" None
+FALSE  56237375        2018-01-07 10:43:32     FALSE   "TastyPoutine"  882433  FALSE   3       FALSE           819091968       cpm4tkzcx4hc6irr9ukbi06ogud8dtq 199     "User talk:92.226.219.222"      None
+FALSE  56237375        2018-01-07 11:10:24     FALSE   "AnomieBOT"     7611264 TRUE    3       FALSE           819094036       artmfz8b2gxhb3pp8a5p4ksplxqfkpg 1840    "User talk:92.226.219.222"      None
+FALSE  56237375        2018-01-07 14:33:36     FALSE   "Only"  702940  FALSE   3       FALSE           819112363       dn9wj0n8d8pdd5lqe56uw5xamupowr1 2949    "User talk:92.226.219.222"      WP:EVADE
+FALSE  56237376        2018-01-07 10:44:01     FALSE   "Dipayanacharya"        32794237        FALSE   2       FALSE           819092004       ofueugwatmmn7u73isw732neuza57gk 28      "User:Dipayanacharya"   None
+FALSE  56237376        2018-01-07 10:49:08     FALSE   "Dipayanacharya"        32794237        FALSE   2       FALSE           819092390       dsz55xv96ec2uv6w9c1z7c52ipfovbw 38      "User:Dipayanacharya"   None
+FALSE  56237378        2018-01-07 10:44:56     FALSE   "Vinegarymass911"       21516552        FALSE   0       FALSE           819092066       9ma38hak0ef1ew4fpiutxpnzd8oz1wd 65      "BSCIC" None
+FALSE  56237379        2018-01-07 10:45:21     FALSE   "BrownHairedGirl"       754619  FALSE   14      FALSE           819092102       4dvakoat58bzyf5hmtthxukt29hip6n 285     "Category:Women government ministers of Yemen"  None
+FALSE  56237381        2018-01-07 10:45:54     FALSE   "PRehse"        410898  FALSE   1       FALSE           819092135       2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 103     "Talk:List of Morning Glories Characters"       None
+FALSE  56237382        2018-01-07 10:45:56     FALSE   "ClueBot NG"    13286072        FALSE   3       FALSE           819092138       3y9t5wpk6ur5jhone75rhm4wjf01fgi 1330    "User talk:106.207.126.114"     None
+FALSE  56237382        2018-01-07 10:50:22     FALSE   "HindWIKI"      31190506        FALSE   3       FALSE           819092495       8wvn6vh3isyt0dorpe89lztrburgupe 2355    "User talk:106.207.126.114"     None
diff --git a/test/baseline_output/capturegroup_emptytext_0.tsv b/test/baseline_output/capturegroup_emptytext_0.tsv
new file mode 100644 (file)
index 0000000..cb75024
--- /dev/null
@@ -0,0 +1,27 @@
+anon   articleid       date_time       deleted editor  editor_id       li_cheval       minor   namespace       revert  reverteds       revid   sha1    text_chars      three_cat       three_letter    three_number    title
+FALSE  56237363        2018-01-07 10:40:58     FALSE   "NinjaRobotPirate"      3742946 None    FALSE   3       FALSE           819091731       135nz8q6lfam6cojla7azb7k5alx3t3 0       None    has, has        None    "User talk:86.139.142.254"
+FALSE  56237364        2018-01-07 10:41:10     FALSE   "Kavin kavitha" 32792125        None    FALSE   3       FALSE           819091755       0pwezjc6yopz0smc8al6ogc4fax5bwo 663     None    AES, for        01, 12, 2001    "User talk:Kavin kavitha"
+FALSE  56237365        2018-01-07 10:41:26     FALSE   "Amicable always"       32621254        None    FALSE   3       FALSE           819091788       sz3t2ap7z8bpkdvdvi195f3i35949bv 399     None    new     None    "User talk:Dr.vivek163"
+FALSE  56237366        2018-01-07 10:41:31     FALSE   "ClueBot NG"    13286072        None    FALSE   3       FALSE           819091796       r6s5j8j3iykenrhuhpnkpsmmd71vubf 1260    None    None    1       "User talk:Twistorl"
+FALSE  56237368        2018-01-07 10:41:51     FALSE   "Khruner"       8409334 None    FALSE   0       FALSE           819091825       tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 2249    None    AES, jpg, the, the, the, the, and, you, Tor     67, 119 "Kom Firin"
+FALSE  56237368        2018-01-27 12:16:02     FALSE   "Khruner"       8409334 None    TRUE    0       FALSE           822610647       e6oa4g0qv64icdaq26uu1zzbyr5hcbh 2230    None    None    None    "Kom Firin"
+FALSE  56237369        2018-01-07 10:42:05     FALSE   "Editingaccount1994"    32794215        Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier    FALSE   2       FALSE           819091844       0fyvyh2a8xu41gt8obr34oba0bfixj6 27840   None    AES, nom        None    "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-07 11:09:52     FALSE   "AnomieBOT"     7611264 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier    TRUE    2       FALSE           819093984       8gy52aolt5rg3eaketwj5v7eiw0apv2 27787   None    web, See, for   None    "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-12 21:45:50     FALSE   "SporkBot"      12406635        Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier    TRUE    2       FALSE           820064189       he8ydemaanxlrpftqxkez8jfpge1fsj 27784   None    per, TFD, TFD   None    "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-12 23:28:11     FALSE   "SporkBot"      12406635        Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier    TRUE    2       FALSE           820078679       0to17w9rth3url8n7gvucdtobybdq5h 27783   None    per, for, Log, TFD      2010, 13        "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-12 23:28:39     FALSE   "SporkBot"      12406635        Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier    TRUE    2       FALSE           820078733       531dizmmloyxffbkdr5vph7owh921eg 27782   None    per, for, Log, TFD      2011, 17        "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-13 13:45:33     FALSE   "Frietjes"      13791031        Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier    FALSE   2       FALSE           820177382       nik9p2u2fuk4yazjxt8ymbicxv5qid9 27757   None    you, are, tor, you      None    "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-24 01:35:22     FALSE   "CommonsDelinker"       2304267 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier        FALSE   2       FALSE           822038928       gwk6pampl8si1v5pv3kwgteg710sfw3 27667   None    jpg, jpg, has, COM      16, 2018        "User:Editingaccount1994/sandbox"
+FALSE  56237370        2018-01-07 10:42:20     FALSE   "PamD"  1368779 None    FALSE   0       FALSE           819091874       n4ozbsgle13p9yywtfrz982ccj8woc9 25      None    alt     None    "Anita del Rey"
+FALSE  56237371        2018-01-07 10:42:27     FALSE   "ClueBot NG"    13286072        None    FALSE   3       FALSE           819091883       ksohnvsbeuzwpl5vb8a3v8m18hva0a7 1274    None    None    119, 94, 96, 157, 119, 94, 96, 157, 1   "User talk:119.94.96.157"
+FALSE  56237372        2018-01-07 10:42:50     FALSE   "Underbar dk"   677153  None    FALSE   14      FALSE           819091914       je7aw21fedbwyqsyofpisdrynsu7olr 113     None    AES     None    "Category:Ohmi Railway"
+FALSE  56237375        2018-01-07 10:43:32     FALSE   "TastyPoutine"  882433  None    FALSE   3       FALSE           819091968       cpm4tkzcx4hc6irr9ukbi06ogud8dtq 199     None    AES     None    "User talk:92.226.219.222"
+FALSE  56237375        2018-01-07 11:10:24     FALSE   "AnomieBOT"     7611264 None    TRUE    3       FALSE           819094036       artmfz8b2gxhb3pp8a5p4ksplxqfkpg 1840    None    See, for        None    "User talk:92.226.219.222"
+FALSE  56237375        2018-01-07 14:33:36     FALSE   "Only"  702940  None    FALSE   3       FALSE           819112363       dn9wj0n8d8pdd5lqe56uw5xamupowr1 2949    None    has, has        None    "User talk:92.226.219.222"
+FALSE  56237376        2018-01-07 10:44:01     FALSE   "Dipayanacharya"        32794237        None    FALSE   2       FALSE           819092004       ofueugwatmmn7u73isw732neuza57gk 28      None    None    None    "User:Dipayanacharya"
+FALSE  56237376        2018-01-07 10:49:08     FALSE   "Dipayanacharya"        32794237        None    FALSE   2       FALSE           819092390       dsz55xv96ec2uv6w9c1z7c52ipfovbw 38      None    None    None    "User:Dipayanacharya"
+FALSE  56237378        2018-01-07 10:44:56     FALSE   "Vinegarymass911"       21516552        None    FALSE   0       FALSE           819092066       9ma38hak0ef1ew4fpiutxpnzd8oz1wd 65      None    AES, and        None    "BSCIC"
+FALSE  56237379        2018-01-07 10:45:21     FALSE   "BrownHairedGirl"       754619  None    FALSE   14      FALSE           819092102       4dvakoat58bzyf5hmtthxukt29hip6n 285     None    AES, Non        None    "Category:Women government ministers of Yemen"
+FALSE  56237381        2018-01-07 10:45:54     FALSE   "PRehse"        410898  None    FALSE   1       FALSE           819092135       2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 103     None    AES, low, low   None    "Talk:List of Morning Glories Characters"
+FALSE  56237382        2018-01-07 10:45:56     FALSE   "ClueBot NG"    13286072        None    FALSE   3       FALSE           819092138       3y9t5wpk6ur5jhone75rhm4wjf01fgi 1330    None    None    106, 207, 126, 114, 106, 207, 126, 114, 1       "User talk:106.207.126.114"
+FALSE  56237382        2018-01-07 10:50:22     FALSE   "HindWIKI"      31190506        None    FALSE   3       FALSE           819092495       8wvn6vh3isyt0dorpe89lztrburgupe 2355    None    None    None    "User talk:106.207.126.114"
diff --git a/test/baseline_output/capturegroup_emptytext_1.tsv b/test/baseline_output/capturegroup_emptytext_1.tsv
new file mode 100644 (file)
index 0000000..dfb2a52
--- /dev/null
@@ -0,0 +1,27 @@
+anon   articleid       date_time       deleted editor  editor_id       minor   namespace       npov_neutral    npov_npov       revert  reverteds       revid   sha1    testcase_a      testcase_b      testcase_c      testcase_d      text_chars      title
+FALSE  56237363        2018-01-07 10:40:58     FALSE   "NinjaRobotPirate"      3742946 FALSE   3       None    None    FALSE           819091731       135nz8q6lfam6cojla7azb7k5alx3t3 None    None    None    None    0       "User talk:86.139.142.254"
+FALSE  56237364        2018-01-07 10:41:10     FALSE   "Kavin kavitha" 32792125        FALSE   3       None    None    FALSE           819091755       0pwezjc6yopz0smc8al6ogc4fax5bwo None    None    None    None    663     "User talk:Kavin kavitha"
+FALSE  56237365        2018-01-07 10:41:26     FALSE   "Amicable always"       32621254        FALSE   3       None    NPOV, NPOV      FALSE           819091788       sz3t2ap7z8bpkdvdvi195f3i35949bv None    None    None    None    399     "User talk:Dr.vivek163"
+FALSE  56237366        2018-01-07 10:41:31     FALSE   "ClueBot NG"    13286072        FALSE   3       None    None    FALSE           819091796       r6s5j8j3iykenrhuhpnkpsmmd71vubf None    None    None    None    1260    "User talk:Twistorl"
+FALSE  56237368        2018-01-07 10:41:51     FALSE   "Khruner"       8409334 FALSE   0       None    NPOV    FALSE           819091825       tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 None    TestCaseB       None    None    2249    "Kom Firin"
+FALSE  56237368        2018-01-27 12:16:02     FALSE   "Khruner"       8409334 TRUE    0       None    None    FALSE           822610647       e6oa4g0qv64icdaq26uu1zzbyr5hcbh None    None    None    None    2230    "Kom Firin"
+FALSE  56237369        2018-01-07 10:42:05     FALSE   "Editingaccount1994"    32794215        FALSE   2       None    None    FALSE           819091844       0fyvyh2a8xu41gt8obr34oba0bfixj6 None    None    None    None    27840   "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-07 11:09:52     FALSE   "AnomieBOT"     7611264 TRUE    2       None    None    FALSE           819093984       8gy52aolt5rg3eaketwj5v7eiw0apv2 None    None    None    None    27787   "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-12 21:45:50     FALSE   "SporkBot"      12406635        TRUE    2       None    None    FALSE           820064189       he8ydemaanxlrpftqxkez8jfpge1fsj None    None    None    None    27784   "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-12 23:28:11     FALSE   "SporkBot"      12406635        TRUE    2       None    None    FALSE           820078679       0to17w9rth3url8n7gvucdtobybdq5h None    None    None    None    27783   "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-12 23:28:39     FALSE   "SporkBot"      12406635        TRUE    2       None    None    FALSE           820078733       531dizmmloyxffbkdr5vph7owh921eg None    None    None    None    27782   "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-13 13:45:33     FALSE   "Frietjes"      13791031        FALSE   2       None    None    FALSE           820177382       nik9p2u2fuk4yazjxt8ymbicxv5qid9 None    None    None    TestCaseD       27757   "User:Editingaccount1994/sandbox"
+FALSE  56237369        2018-01-24 01:35:22     FALSE   "CommonsDelinker"       2304267 FALSE   2       None    None    FALSE           822038928       gwk6pampl8si1v5pv3kwgteg710sfw3 None    None    None    None    27667   "User:Editingaccount1994/sandbox"
+FALSE  56237370        2018-01-07 10:42:20     FALSE   "PamD"  1368779 FALSE   0       None    None    FALSE           819091874       n4ozbsgle13p9yywtfrz982ccj8woc9 None    None    None    None    25      "Anita del Rey"
+FALSE  56237371        2018-01-07 10:42:27     FALSE   "ClueBot NG"    13286072        FALSE   3       None    None    FALSE           819091883       ksohnvsbeuzwpl5vb8a3v8m18hva0a7 None    None    None    None    1274    "User talk:119.94.96.157"
+FALSE  56237372        2018-01-07 10:42:50     FALSE   "Underbar dk"   677153  FALSE   14      None    None    FALSE           819091914       je7aw21fedbwyqsyofpisdrynsu7olr None    None    None    None    113     "Category:Ohmi Railway"
+FALSE  56237375        2018-01-07 10:43:32     FALSE   "TastyPoutine"  882433  FALSE   3       None    None    FALSE           819091968       cpm4tkzcx4hc6irr9ukbi06ogud8dtq None    None    None    None    199     "User talk:92.226.219.222"
+FALSE  56237375        2018-01-07 11:10:24     FALSE   "AnomieBOT"     7611264 TRUE    3       None    None    FALSE           819094036       artmfz8b2gxhb3pp8a5p4ksplxqfkpg None    None    None    None    1840    "User talk:92.226.219.222"
+FALSE  56237375        2018-01-07 14:33:36     FALSE   "Only"  702940  FALSE   3       None    None    FALSE           819112363       dn9wj0n8d8pdd5lqe56uw5xamupowr1 None    None    None    None    2949    "User talk:92.226.219.222"
+FALSE  56237376        2018-01-07 10:44:01     FALSE   "Dipayanacharya"        32794237        FALSE   2       None    None    FALSE           819092004       ofueugwatmmn7u73isw732neuza57gk None    None    None    None    28      "User:Dipayanacharya"
+FALSE  56237376        2018-01-07 10:49:08     FALSE   "Dipayanacharya"        32794237        FALSE   2       None    None    FALSE           819092390       dsz55xv96ec2uv6w9c1z7c52ipfovbw None    None    None    None    38      "User:Dipayanacharya"
+FALSE  56237378        2018-01-07 10:44:56     FALSE   "Vinegarymass911"       21516552        FALSE   0       None    None    FALSE           819092066       9ma38hak0ef1ew4fpiutxpnzd8oz1wd None    None    None    None    65      "BSCIC"
+FALSE  56237379        2018-01-07 10:45:21     FALSE   "BrownHairedGirl"       754619  FALSE   14      None    None    FALSE           819092102       4dvakoat58bzyf5hmtthxukt29hip6n None    None    None    None    285     "Category:Women government ministers of Yemen"
+FALSE  56237381        2018-01-07 10:45:54     FALSE   "PRehse"        410898  FALSE   1       None    None    FALSE           819092135       2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 None    None    None    None    103     "Talk:List of Morning Glories Characters"
+FALSE  56237382        2018-01-07 10:45:56     FALSE   "ClueBot NG"    13286072        FALSE   3       None    None    FALSE           819092138       3y9t5wpk6ur5jhone75rhm4wjf01fgi None    None    None    None    1330    "User talk:106.207.126.114"
+FALSE  56237382        2018-01-07 10:50:22     FALSE   "HindWIKI"      31190506        FALSE   3       None    None    FALSE           819092495       8wvn6vh3isyt0dorpe89lztrburgupe None    None    None    None    2355    "User talk:106.207.126.114"
diff --git a/test/dumps/emptytext.xml.bz2 b/test/dumps/emptytext.xml.bz2
new file mode 100644 (file)
index 0000000..878c1d6
Binary files /dev/null and b/test/dumps/emptytext.xml.bz2 differ
diff --git a/wikiq b/wikiq
index 0543a3383b57ae27281882885bad08978c6de5f9..0dad9e32deb3fea6345d0e7c9a68e58239a7dde6 100755 (executable)
--- a/wikiq
+++ b/wikiq
-#!/usr/bin/env python3
-
-# original wikiq headers are: title articleid revid date_time anon
-# editor editor_id minor text_size text_entropy text_md5 reversion
-# additions_size deletions_size
-
-import argparse
-import sys
-import os, os.path
-import re
-
-from subprocess import Popen, PIPE
-from collections import deque
-from hashlib import sha1
-
-from mwxml import Dump
-
-from deltas.tokenizers import wikitext_split
-import mwpersistence
-import mwreverts
-from urllib.parse import quote
-TO_ENCODE = ('title', 'editor')
-PERSISTENCE_RADIUS=7
-from deltas import SequenceMatcher
-from deltas import SegmentMatcher
-
class PersistMethod:
    """Namespace of integer codes selecting the token-persistence algorithm."""
    none = 0      # do not compute persistence at all
    sequence = 1  # mwpersistence DiffState with a SequenceMatcher
    segment = 2   # mwpersistence DiffState with a SegmentMatcher
    legacy = 3    # old mw.lib.persistence State implementation
-
def calculate_persistence(tokens_added):
    """Summarize persistence statistics for a revision's added tokens.

    Parameters:
        tokens_added : iterable of token objects, each carrying a
            `.revisions` list of the revisions the token survived in.

    Returns:
        (token_revs, num_tokens): `token_revs` is the total number of
        later revisions the added tokens persisted through (each token
        contributes len(revisions) - 1), and `num_tokens` is how many
        tokens were added.
    """
    # materialize so single-pass iterables (generators) are also accepted
    tokens_added = list(tokens_added)
    # generator expression avoids building a throwaway list inside sum()
    return (sum(len(token.revisions) - 1 for token in tokens_added),
            len(tokens_added))
-
-
class WikiqIterator():
    """Iterate over an XML dump file, producing WikiqPage objects.

    Parameters:
        fh : open file handle to the MediaWiki XML dump
        collapse_user : forwarded to each WikiqPage; if True, consecutive
            revisions by the same user are collapsed by the page iterator.
    """
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        # map numeric namespace ids to names so pages can build full titles
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        """Generator yielding a WikiqPage for each page in the dump."""
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        # BUG FIX: previously `next(self._pages)` — that attribute does not
        # exist (the real one is name-mangled `__pages`), so calling
        # next() directly on the iterator raised AttributeError.
        return next(self.__pages)
-
class WikiqPage():
    """Wrap an mwxml page and iterate its revisions, optionally collapsed.

    With collapse_user=True, runs of consecutive revisions by the same
    (known) user are merged: only the last revision of each run is
    yielded, annotated with a `collapsed_revs` attribute giving the run
    length. Revisions with deleted user info always break a run.
    """
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            self.title = page.title
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        prev_rev = None
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs
            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    # also yield if we do NOT know who the user is
                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif rev.user.text != prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final revision; guard against pages with zero
        # revisions, which previously raised NameError on `prev_rev` here
        if prev_rev is not None:
            yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
-
-
class RegexPair(object):
    """A compiled regex plus the output column label(s) it populates.

    If the pattern has *named* capture groups, one column per group is
    produced, named "<label>_<group>"; otherwise a single column named
    <label> collects all full-pattern matches, comma-joined.
    """
    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        # groupindex lists named groups only; unnamed groups do not
        # create per-group columns
        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def _make_key(self, cap_group):
        """Column name for one named capture group."""
        return ("{}_{}".format(self.label, cap_group))

    def matchmake(self, content, rev_data):
        """Search `content` and merge the resulting column(s) into rev_data.

        Columns with no matches are set to None. `content` may be None
        (e.g. a revision whose text is empty or missing) — previously
        this crashed in re.search(None); now it is treated as no match.
        Returns the updated rev_data dict.
        """
        temp_dict = {}
        # single pass: collect all matches once instead of a redundant
        # search() followed by finditer()/findall()
        matches = [] if content is None else list(self.pattern.finditer(content))

        # if there are named capture groups in the regex
        if self.has_groups:
            for cap_group in self.capture_groups:
                key = self._make_key(cap_group)
                # keep only matches where this (possibly optional) group
                # actually participated
                temp_list = [m.group(cap_group) for m in matches
                             if m.group(cap_group) is not None]
                # empty list of group matches -> column is None
                temp_dict[key] = ', '.join(temp_list) if temp_list else None

        # there are no named capture groups: one column of full matches
        else:
            # m.group(0) rather than findall() so that unnamed groups in
            # the pattern cannot change the result into tuples
            full_matches = [m.group(0) for m in matches]
            temp_dict[self.label] = ', '.join(full_matches) if full_matches else None

        # update rev_data with our new columns
        rev_data.update(temp_dict)
        return rev_data
-
-        
-class WikiqParser():
-    def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
-        """ 
-        Parameters:
-           persist : what persistence method to use. Takes a PersistMethod value
-        """
-        self.input_file = input_file
-        self.output_file = output_file
-        self.collapse_user = collapse_user
-        self.persist = persist
-        self.printed_header = False
-        self.namespaces = []
-        self.urlencode = urlencode
-        self.revert_radius = revert_radius
-
-        if namespaces is not None:
-            self.namespace_filter = set(namespaces)
-        else:
-            self.namespace_filter = None
-
-        self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
-        self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
-        
-
-    def make_matchmake_pairs(self, patterns, labels):
-        if (patterns is not None and labels is not None) and \
-           (len(patterns) == len(labels)):
-            return [RegexPair(pattern, label) for pattern, label in zip(patterns, labels)]
-        elif (patterns is None and labels is None):
-            return []
-        else:
-            sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
-
-    def matchmake(self, rev, rev_data):
-        rev_data = self.matchmake_revision(rev.text, rev_data)
-        rev_data = self.matchmake_comment(rev.comment, rev_data)
-        return rev_data
-
    def matchmake_revision(self, text, rev_data):
        """Apply the revision-text regex pairs to `text`, updating rev_data."""
        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
-
    def matchmake_comment(self, comment, rev_data):
        """Apply the edit-comment regex pairs to `comment`, updating rev_data."""
        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
-
-    def matchmake_pairs(self, text, rev_data, pairs):
-        for pair in pairs:
-            rev_data = pair.matchmake(text, rev_data)
-        return rev_data
-
-    def __get_namespace_from_title(self, title):
-        default_ns = None
-
-        for ns in self.namespaces:
-            # skip if the namespace is not defined
-            if ns == None:
-                default_ns = self.namespaces[ns]
-                continue
-
-            if title.startswith(ns + ":"):
-                return self.namespaces[ns]
-
-        # if we've made it this far with no matches, we return the default namespace
-        return default_ns
-
-
-    def process(self):
-
-        # create a regex that creates the output filename
-        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
-        #                         r'output/wikiq-\1-\2.tsv',
-        #                         input_filename)
-
-        # Construct dump file iterator
-        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
-
-        # extract list of namspaces
-        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}
-
-        page_count = 0
-        rev_count = 0
-
-
-        # Iterate through pages
-        for page in dump:
-            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)
-
-            # skip namespaces not in the filter
-            if self.namespace_filter is not None:
-                if namespace not in self.namespace_filter:
-                    continue
-
-            rev_detector = mwreverts.Detector(radius = self.revert_radius)
-
-            if self.persist != PersistMethod.none:
-                window = deque(maxlen=PERSISTENCE_RADIUS)
-
-                if self.persist == PersistMethod.sequence:
-                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
-                                                    revert_radius=PERSISTENCE_RADIUS)
-
-                elif self.persist == PersistMethod.segment:
-                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
-                                                    revert_radius=PERSISTENCE_RADIUS)
-
-                # self.persist == PersistMethod.legacy
-                else:
-                    from mw.lib import persistence
-                    state = persistence.State()
-
-            # Iterate through a page's revisions
-            for rev in page:
-                
-                # initialize rev_data
-                rev_data = {
-                    'revid':rev.id,
-                    'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
-                    'articleid' : page.id,
-                    'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
-                    'title' : '"' + page.title + '"',
-                    'namespace' : namespace,
-                    'deleted' : "TRUE" if rev.deleted.text else "FALSE"
-                }
-
-                rev_data = self.matchmake(rev, rev_data)
-
-                # if revisions are deleted, /many/ things will be missing
-                if rev.deleted.text:
-                    rev_data['text_chars'] = ""
-                    rev_data['sha1'] = ""
-                    rev_data['revert'] = ""
-                    rev_data['reverteds'] = ""
-
-                else:
-                    # rev.text can be None if the page has no text
-                    if not rev.text:
-                        rev.text = ""
-                    # if text exists, we'll check for a sha1 and generate one otherwise
-
-                    if rev.sha1:
-                        text_sha1 = rev.sha1
-                    else:
-
-                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
-                    
-                    rev_data['sha1'] = text_sha1
-
-                    # TODO rev.bytes doesn't work.. looks like a bug
-                    rev_data['text_chars'] = len(rev.text)
-
-                    # generate revert data
-                    revert = rev_detector.process(text_sha1, rev.id)
-                    
-                    if revert:
-                        rev_data['revert'] = "TRUE"
-                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
-                    else:
-                        rev_data['revert'] = "FALSE"
-                        rev_data['reverteds'] = ""
-
-                # if the fact that the edit was minor can be hidden, this might be an issue
-                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"
-
-                if not rev.deleted.user:
-                    # wrap user-defined editors in quotes for fread
-                    rev_data['editor'] = '"' + rev.user.text + '"'
-                    rev_data['anon'] = "TRUE" if rev.user.id == None else "FALSE"
-                    
-                else:
-                    rev_data['anon'] = ""
-                    rev_data['editor'] = ""
-
-                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
-                #    redirect = True
-                #else:
-                #    redirect = False
-                
-                #TODO missing: additions_size deletions_size
-                
-                # if collapse user was on, lets run that
-                if self.collapse_user:
-                    rev_data['collapsed_revs'] = rev.collapsed_revs
-
-                if self.persist != PersistMethod.none:
-                    if rev.deleted.text:
-                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
-                            old_rev_data[k] = None
-                    else:
-
-                        if self.persist != PersistMethod.legacy:
-                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
-
-                        else:
-                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
-                            
-                        window.append((rev.id, rev_data, tokens_added, tokens_removed))
-                        
-                        if len(window) == PERSISTENCE_RADIUS:
-                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
-                            
-                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
-
-                            old_rev_data["token_revs"] = num_token_revs
-                            old_rev_data["tokens_added"] = num_tokens
-                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
-                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1
-
-                            self.print_rev_data(old_rev_data)
-
-                else:
-                    self.print_rev_data(rev_data)
-
-                rev_count += 1
-
-            if self.persist != PersistMethod.none:
-                # print out metadata for the last RADIUS revisions
-                for i, item in enumerate(window):
-                    # if the window was full, we've already printed item 0
-                    if len(window) == PERSISTENCE_RADIUS and i == 0:
-                        continue
-
-                    rev_id, rev_data, tokens_added, tokens_removed = item
-                    num_token_revs, num_tokens = calculate_persistence(tokens_added)
-
-                    rev_data["token_revs"] = num_token_revs
-                    rev_data["tokens_added"] = num_tokens
-                    rev_data["tokens_removed"] = len(tokens_removed)
-                    rev_data["tokens_window"] = len(window)-(i+1)
-                    
-                    self.print_rev_data(rev_data)
-
-            page_count += 1
-
-        print("Done: %s revisions and %s pages." % (rev_count, page_count),
-              file=sys.stderr)
-
-    def print_rev_data(self, rev_data):
-        # if it's the first time through, print the header
-        if self.urlencode:
-            for field in TO_ENCODE:
-                rev_data[field] = quote(str(rev_data[field]))
-
-        if not self.printed_header:
-            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
-            self.printed_header = True
-        
-        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
-
-
-def open_input_file(input_filename):
-    if re.match(r'.*\.7z$', input_filename):
-        cmd = ["7za", "x", "-so", input_filename, '*'] 
-    elif re.match(r'.*\.gz$', input_filename):
-        cmd = ["zcat", input_filename] 
-    elif re.match(r'.*\.bz2$', input_filename):
-        cmd = ["bzcat", "-dk", input_filename] 
-
-    try:
-        input_file = Popen(cmd, stdout=PIPE).stdout
-    except NameError:
-        input_file = open(input_filename, 'r')
-
-    return input_file
-
-def open_output_file(input_filename):
-    # create a regex that creates the output filename
-    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
-    output_filename = re.sub(r'\.xml', '', output_filename)
-    output_filename = output_filename + ".tsv"
-    output_file = open(output_filename, "w")
-
-    return output_file
-
-parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.')
-
-# arguments for the input direction
-parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str, 
-                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")
-
-parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
-                    help="Directory for output files.")
-
-parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
-                    help="Write output to standard out (do not create dump file)")
-
-parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
-                    help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
-
-parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
-                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow.  The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
-
-parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
-                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
-
-parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
-                    help="Id number of namspace to include. Can be specified more than once.")
-
-parser.add_argument('-rr',
-                    '--revert-radius',
-                    dest="revert_radius",
-                    type=int,
-                    action='store',
-                    default=15,
-                    help="Number of edits to check when looking for reverts (default: 15)")
-
-parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
-                    help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
-
-parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
-                    help="The label for the outputted column based on matching the regex in revision text.")
-
-parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
-                    help="The regular expression to search for in comments of revisions.")
-
-parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
-                    help="The label for the outputted column based on matching the regex in comments.")
-
-args = parser.parse_args()
-
-# set persistence method
-
-if args.persist is None:
-    persist = PersistMethod.none
-elif args.persist == "segment":
-    persist = PersistMethod.segment
-elif args.persist == "legacy":
-    persist = PersistMethod.legacy
-else:
-    persist = PersistMethod.sequence
-
-if args.namespace_filter is not None:
-    namespaces = args.namespace_filter
-else:
-    namespaces = None
-
-if len(args.dumpfiles) > 0:
-    for filename in args.dumpfiles:
-        input_file = open_input_file(filename)
-
-        # open directory for output
-        if args.output_dir:
-            output_dir = args.output_dir[0]
-        else:
-            output_dir = "."
-
-        print("Processing file: %s" % filename, file=sys.stderr)
-
-        if args.stdout:
-            output_file = sys.stdout
-        else:
-            filename = os.path.join(output_dir, os.path.basename(filename))
-            output_file = open_output_file(filename)
-
-        wikiq = WikiqParser(input_file,
-                            output_file,
-                            collapse_user=args.collapse_user,
-                            persist=persist,
-                            urlencode=args.urlencode,
-                            namespaces=namespaces,
-                            revert_radius=args.revert_radius,
-                            regex_match_revision = args.regex_match_revision,
-                            regex_revision_label = args.regex_revision_label,
-                            regex_match_comment = args.regex_match_comment,
-                            regex_comment_label = args.regex_comment_label)
-
-        wikiq.process()
-
-        # close things 
-        input_file.close()
-        output_file.close()
-else:
-    wikiq = WikiqParser(sys.stdin,
-                        sys.stdout,
-                        collapse_user=args.collapse_user,
-                        persist=persist,
-                        #persist_legacy=args.persist_legacy,
-                        urlencode=args.urlencode,
-                        namespaces=namespaces,
-                        revert_radius=args.revert_radius,
-                        regex_match_revision = args.regex_match_revision,
-                        regex_revision_label = args.regex_revision_label,
-                        regex_match_comment = args.regex_match_comment,
-                        regex_comment_label = args.regex_comment_label)
-
-    wikiq.process() 
-
-# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
-# stop_words = stop_words.split(",")
+#!/usr/bin/env python3\r
+\r
+# original wikiq headers are: title articleid revid date_time anon\r
+# editor editor_id minor text_size text_entropy text_md5 reversion\r
+# additions_size deletions_size\r
+\r
+import argparse\r
+import sys\r
+import os, os.path\r
+import re\r
+\r
+from subprocess import Popen, PIPE\r
+from collections import deque\r
+from hashlib import sha1\r
+\r
+from mwxml import Dump\r
+\r
+from deltas.tokenizers import wikitext_split\r
+import mwpersistence\r
+import mwreverts\r
+from urllib.parse import quote\r
+TO_ENCODE = ('title', 'editor')\r
+PERSISTENCE_RADIUS=7\r
+from deltas import SequenceMatcher\r
+from deltas import SegmentMatcher\r
+\r
# Enumeration of the persistence-calculation strategies selectable with the
# -p/--persistence command-line flag (mapped to these values after argparse).
class PersistMethod:
    # do not compute persistence measures at all
    none = 0
    # mwpersistence.DiffState with a token-level SequenceMatcher (default)
    sequence = 1
    # mwpersistence.DiffState with a SegmentMatcher (move-robust, slower)
    segment = 2
    # old mw.lib.persistence State-based algorithm (legacy behavior)
    legacy = 3
+\r
def calculate_persistence(tokens_added):
    """Summarize the persistence of a revision's added tokens.

    Returns a ``(token_revs, n_tokens)`` tuple: the total number of
    *subsequent* revisions that the added tokens appeared in (each token's
    revision count minus the revision that introduced it), and the number
    of tokens that were added.
    """
    surviving_revs = 0
    for token in tokens_added:
        surviving_revs += len(token.revisions) - 1
    return (surviving_revs, len(tokens_added))
+\r
+\r
class WikiqIterator():
    """Iterate over the pages of a MediaWiki XML dump.

    Wraps an ``mwxml.Dump`` built from the given file handle and yields
    :class:`WikiqPage` objects, passing along the dump's namespace
    id -> name mapping and the ``collapse_user`` setting.
    """

    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        # namespace id -> namespace name, taken from the dump's <siteinfo>
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        """Generator yielding one WikiqPage per page in the dump."""
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        # BUG FIX: this previously read self._pages, which does not exist
        # (the attribute is the name-mangled self.__pages), so calling
        # next() directly on a WikiqIterator raised AttributeError.
        return next(self.__pages)
+\r
class WikiqPage():
    """Wrapper around an mwxml page that yields its revisions.

    When ``collapse_user`` is True, consecutive revisions by the same user
    are collapsed: only the last revision of each run is yielded, with a
    ``collapsed_revs`` counter attached to the revision objects.
    """
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        # NOTE(review): self.namespace keeps the original (unmapped) value
        # even though page.namespace is reset to 0 below — confirm intended.
        if page.namespace not in namespace_map:
            self.title = page.title
            page.namespace = 0
        if page.namespace != 0:
            # prefix the title with the namespace name, e.g. "Talk:Foo"
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        """Generator over this page's revisions, honoring collapse_user.

        Yields each revision one iteration late (prev_rev), so that runs of
        consecutive edits by the same user can be collapsed into their final
        revision.

        NOTE(review): a page with zero revisions would leave prev_rev unbound
        at the final yield (UnboundLocalError) — presumably dumps always
        contain at least one revision per page; confirm.
        """
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user: 
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    # also yield if we do know who the user is
                    # NOTE(review): comment above likely means "if we do NOT
                    # know who the user is" — deleted/suppressed usernames
                    # always break a collapse run.
                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    # a different user ends the previous user's run
                    elif not rev.user.text == prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
+\r
+\r
class RegexPair(object):
    """A compiled regular expression tied to an output column label.

    ``matchmake()`` scans a revision's text or comment for the pattern and
    adds one column per *named* capture group (``<label>_<group>``), or a
    single ``<label>`` column when the pattern has no named groups.
    Multiple matches are joined with ``', '``; columns with no match
    (including empty content) are set to None.
    """

    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        # only *named* capture groups (groupindex) produce per-group columns
        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def _make_key(self, cap_group):
        """Column name for one named capture group."""
        return ("{}_{}".format(self.label, cap_group))

    def matchmake(self, content, rev_data):
        """Search `content` and merge the resulting columns into `rev_data`.

        Parameters:
            content : the searched text (rev.comment or rev.text); may be ""
            rev_data : dict of output columns, updated in place
        Returns the updated `rev_data`.
        """
        temp_dict = {}

        if content == "":
            # nothing to search: emit None for every output column
            if self.has_groups:
                for cap_group in self.capture_groups:
                    temp_dict[self._make_key(cap_group)] = None
            else:
                temp_dict[self.label] = None

        elif self.has_groups:
            # FIX: single finditer() pass instead of the previous
            # search()-then-finditer() double scan of the same content
            matchobjects = list(self.pattern.finditer(content))
            for cap_group in self.capture_groups:
                key = self._make_key(cap_group)
                # keep only matches in which this group actually participated
                temp_list = [m.group(cap_group) for m in matchobjects
                             if m.group(cap_group) is not None]
                temp_dict[key] = ', '.join(temp_list) if temp_list else None

        else:
            # no named groups: one column holding all matches of the pattern
            # (single findall() pass replaces the former search()+findall())
            matches = self.pattern.findall(content)
            temp_dict[self.label] = ', '.join(matches) if matches else None

        # update rev_data with our new columns
        rev_data.update(temp_dict)
        return rev_data
+\r
+        \r
class WikiqParser():
    """Parse a MediaWiki XML dump stream into tab-delimited revision rows.

    Reads pages/revisions from ``input_file`` via WikiqIterator, optionally
    filters by namespace, detects reverts, optionally computes token
    persistence, applies user-supplied regexes to revision text/comments,
    and prints one TSV row per revision to ``output_file``.
    """

    def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
        """ 
        Parameters:
           input_file : file handle for the (decompressed) XML dump
           output_file : file handle the TSV rows are printed to
           regex_match_revision / regex_revision_label : parallel lists of
               regex patterns and column labels applied to revision text
           regex_match_comment / regex_comment_label : parallel lists of
               regex patterns and column labels applied to revision comments
           collapse_user : collapse consecutive edits by the same user
           persist : what persistence method to use. Takes a PersistMethod value
           urlencode : url-encode the TO_ENCODE fields in the output
           namespaces : iterable of namespace ids to include (None = all)
           revert_radius : number of edits checked when detecting reverts
        """
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        # header row is printed once, on the first call to print_rev_data
        self.printed_header = False
        # filled in by process() from the dump's siteinfo (name -> id)
        self.namespaces = []
        self.urlencode = urlencode
        self.revert_radius = revert_radius

        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
        self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
        

    def make_matchmake_pairs(self, patterns, labels):
        """Zip parallel pattern/label lists into RegexPair objects.

        Exits the program if one list is given without the other or the
        lengths differ.
        """
        if (patterns is not None and labels is not None) and \
           (len(patterns) == len(labels)):
            return [RegexPair(pattern, label) for pattern, label in zip(patterns, labels)]
        elif (patterns is None and labels is None):
            return []
        else:
            sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')

    def matchmake(self, rev, rev_data):
        """Apply all configured regexes to one revision, updating rev_data.

        Normalizes missing text/comment to "" first (rev.text can be None
        for deleted or empty revisions) — this is the "handling empty text"
        fix; note it mutates the rev object in place.
        """
        if not rev.text:
            rev.text = ""
        if not rev.comment:
            rev.comment = ""

        rev_data = self.matchmake_revision(rev.text, rev_data)
        rev_data = self.matchmake_comment(rev.comment, rev_data)
        return rev_data

    def matchmake_revision(self, text, rev_data):
        """Apply the revision-text regexes to `text`."""
        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)

    def matchmake_comment(self, comment, rev_data):
        """Apply the comment regexes to `comment`."""
        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)

    def matchmake_pairs(self, content, rev_data, pairs):
        """Run every RegexPair in `pairs` over `content`, merging columns."""
        for pair in pairs:
            rev_data = pair.matchmake(content, rev_data)
        return rev_data

    def __get_namespace_from_title(self, title):
        """Resolve a namespace id from a "Namespace:Title" prefix.

        Falls back to the namespace registered under the key None when no
        prefix matches.  NOTE(review): self.namespaces is keyed by namespace
        *name*; presumably the main namespace's key is None/"" — confirm
        against the siteinfo mapping built in process().
        """
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns == None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns


    def process(self):
        """Main loop: iterate pages and revisions, print one TSV row each.

        For persistence modes, rows are buffered in a PERSISTENCE_RADIUS
        deque so each row can be emitted with token-survival counts
        computed from the following revisions.
        """

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                         r'output/wikiq-\1-\2.tsv',
        #                         input_filename)

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namspaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0


        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self.namespace_filter is not None:
                if namespace not in self.namespace_filter:
                    continue

            # fresh revert detector per page
            rev_detector = mwreverts.Detector(radius = self.revert_radius)

            if self.persist != PersistMethod.none:
                # sliding window of pending rows awaiting persistence stats
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                # self.persist == PersistMethod.legacy
                else:
                    # legacy dependency imported lazily; only needed here
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:
                
                # initialize rev_data
                rev_data = {
                    'revid':rev.id,
                    'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                    'articleid' : page.id,
                    'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
                    'title' : '"' + page.title + '"',
                    'namespace' : namespace,
                    'deleted' : "TRUE" if rev.deleted.text else "FALSE"
                }

                # add regex-derived columns for text and comment
                rev_data = self.matchmake(rev, rev_data)

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""
                    # if text exists, we'll check for a sha1 and generate one otherwise

                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:

                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
                    
                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)
                    
                    if revert:
                        rev_data['revert'] = "TRUE"
                        # quote the id list so fread treats it as one field
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id == None else "FALSE"
                    
                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False
                
                #TODO missing: additions_size deletions_size
                
                # if collapse user was on, lets run that
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist != PersistMethod.none:
                    if rev.deleted.text:
                        # NOTE(review): likely bug — this writes into
                        # old_rev_data, which is only bound after a window
                        # fill below; if the page's first revisions are
                        # deleted this raises NameError, and this revision's
                        # own rev_data is never queued or printed. Confirm.
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            old_rev_data[k] = None
                    else:

                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
                            
                        window.append((rev.id, rev_data, tokens_added, tokens_removed))
                        
                        # once the window is full, the oldest row has seen
                        # PERSISTENCE_RADIUS-1 later revisions and can be printed
                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
                            
                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    # these trailing rows saw fewer than RADIUS-1 later revisions
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window)-(i+1)
                    
                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        """Print one revision's columns as a TSV row (header on first call).

        Columns are emitted in sorted key order; NOTE(review): this assumes
        every row carries the same key set as the first row printed.
        """
        # if it's the first time through, print the header
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True
        
        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
+\r
+\r
def open_input_file(input_filename):
    """Open a possibly-compressed dump file for reading.

    .7z, .gz and .bz2 inputs are streamed through an external decompressor
    subprocess and its stdout pipe is returned; any other filename is
    opened directly as a regular (text-mode) file.
    """
    cmd = None
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    # uncompressed input: no decompressor to spawn
    if cmd is None:
        return open(input_filename, 'r')

    return Popen(cmd, stdout=PIPE).stdout
+\r
def open_output_file(input_filename):
    """Derive the .tsv output path from the input dump path and open it.

    "dump.xml.bz2" -> "dump.tsv" (likewise for .7z/.gz and plain .xml).

    FIX: both substitutions are now anchored to the end of the path.
    Previously the '.xml' substitution was unanchored, so a path containing
    '.xml' in a directory component (e.g. 'notes.xml.d/dump.xml') was
    corrupted; the compression pattern's optional group could also strip a
    bare trailing dot.
    """
    output_filename = re.sub(r'\.(7z|gz|bz2)$', '', input_filename)
    output_filename = re.sub(r'\.xml$', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file
+\r
# Command-line interface. Only the user-facing help strings are corrected
# here (typos: "delimitted", "defualt", "namspace", "may by slow",
# "persistent", "user a user"); every flag, dest, type, and default is
# unchanged.
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="Id number of namespace to include. Can be specified more than once.")

parser.add_argument('-rr',
                    '--revert-radius',
                    dest="revert_radius",
                    type=int,
                    action='store',
                    default=15,
                    help="Number of edits to check when looking for reverts (default: 15)")

parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
                    help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")

parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
                    help="The label for the outputted column based on matching the regex in revision text.")

parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
                    help="The regular expression to search for in comments of revisions.")

parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
                    help="The label for the outputted column based on matching the regex in comments.")
+\r
args = parser.parse_args()

# Map the -p/--persistence flag onto a PersistMethod value. A bare -p (the
# empty-string const), -p=sequence, or any other accepted choice falls
# through to the default sequence algorithm; absence of the flag disables
# persistence entirely.
if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

# argparse already leaves namespace_filter as None when -n is never given,
# so the original `if ... is not None` branch was a no-op and is removed.
namespaces = args.namespace_filter
+\r
if len(args.dumpfiles) > 0:
    # Dump files were named on the command line: process each one in turn.
    # The output directory is the same for every file, so resolve it once.
    output_dir = args.output_dir[0] if args.output_dir else "."

    for dump_path in args.dumpfiles:
        input_file = open_input_file(dump_path)

        print("Processing file: %s" % dump_path, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            # Name the output after the dump's basename, inside output_dir.
            local_name = os.path.join(output_dir, os.path.basename(dump_path))
            output_file = open_output_file(local_name)

        wikiq = WikiqParser(input_file,
                            output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
                            regex_match_revision = args.regex_match_revision,
                            regex_revision_label = args.regex_revision_label,
                            regex_match_comment = args.regex_match_comment,
                            regex_comment_label = args.regex_comment_label)

        wikiq.process()

        input_file.close()
        output_file.close()
else:
    # No files given: act as a filter from stdin to stdout.
    wikiq = WikiqParser(sys.stdin,
                        sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces=namespaces,
                        revert_radius=args.revert_radius,
                        regex_match_revision = args.regex_match_revision,
                        regex_revision_label = args.regex_revision_label,
                        regex_match_comment = args.regex_match_comment,
                        regex_comment_label = args.regex_comment_label)

    wikiq.process()
+\r
+# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"\r
+# stop_words = stop_words.split(",")\r

Community Data Science Collective || Want to submit a patch?