-import unittest
-import os
-import subprocess
-from shutil import copyfile
-import pandas as pd
-from pandas.util.testing import assert_frame_equal
-from io import StringIO
-
-# with / without pwr DONE
-# with / without url encode DONE
-# with / without collapse user DONE
-# with output to sdtout DONE
-# note that the persistence radius is 7 by default
-# reading various file formats including
-# 7z, gz, bz2, xml DONE
-# wikia and wikipedia data DONE
-# malformed xmls DONE
-
-class Test_Wikipedia(unittest.TestCase):
- def setUp(self):
- if not os.path.exists("test_output"):
- os.mkdir("test_output")
-
- self.wiki = 'ikwiki-20180301-pages-meta-history'
- self.wikiq_out_name = self.wiki + ".tsv"
- self.test_output_dir = os.path.join(".", "test_output")
- self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
-
- self.infile = "{0}.xml.bz2".format(self.wiki)
- self.base_call = "../wikiq {0} -o {1}"
- self.input_dir = "dumps"
- self.input_file = os.path.join(".", self.input_dir,self.infile)
- self.baseline_output_dir = "baseline_output"
-
- def test_WP_url_encode(self):
- test_filename = "url-encode_" + self.wikiq_out_name
- test_file = os.path.join(self.test_output_dir, test_filename)
- if os.path.exists(test_file):
- os.remove(test_file)
-
- call = self.base_call.format(self.input_file, self.test_output_dir)
- call = call + " --url-encode"
- proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
- proc.wait()
-
- copyfile(self.call_output, test_file)
- baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-
- # as a test let's make sure that we get equal data frames
- test = pd.read_table(test_file)
- baseline = pd.read_table(baseline_file)
- assert_frame_equal(test,baseline)
-
- def test_WP_namespaces(self):
- print(os.path.abspath('.'))
- test_filename = "namespaces_" + self.wikiq_out_name
- test_file = os.path.join(self.test_output_dir, test_filename)
- if os.path.exists(test_file):
- os.remove(test_file)
-
- call = self.base_call.format(self.input_file, self.test_output_dir)
- call = call + " -n 0 -n 1"
- print(call)
- proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
- proc.wait()
- copyfile(self.call_output, test_file)
- baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)
-
- # as a test let's make sure that we get equal data frames
- test = pd.read_table(test_file)
- num_wrong_ns = sum(~ test.namespace.isin({0,1}))
- self.assertEqual(num_wrong_ns, 0)
- baseline = pd.read_table(baseline_file)
- assert_frame_equal(test,baseline)
-
- def test_WP_revert_radius(self):
- print(os.path.abspath('.'))
- test_filename = "revert_radius_" + self.wikiq_out_name
- test_file = os.path.join(self.test_output_dir, test_filename)
- if os.path.exists(test_file):
- os.remove(test_file)
-
- call = self.base_call.format(self.input_file, self.test_output_dir)
- call = call + " -n 0 -n 1 -rr 1"
- print(call)
- proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
- proc.wait()
- copyfile(self.call_output, test_file)
- baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)
-
- # as a test let's make sure that we get equal data frames
- test = pd.read_table(test_file)
- num_wrong_ns = sum(~ test.namespace.isin({0,1}))
- self.assertEqual(num_wrong_ns, 0)
- baseline = pd.read_table(baseline_file)
- assert_frame_equal(test,baseline)
-
-
-
-class Test_Basic(unittest.TestCase):
-
- def setUp(self):
- if not os.path.exists("test_output"):
- os.mkdir("test_output")
-
- self.wiki = 'sailormoon'
- self.wikiq_out_name = self.wiki + ".tsv"
- self.test_output_dir = os.path.join(".", "test_output")
- self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
-
- self.infile = "{0}.xml.7z".format(self.wiki)
- self.base_call = "../wikiq {0} -o {1}"
- self.input_dir = "dumps"
- self.input_file = os.path.join(".", self.input_dir,self.infile)
- self.baseline_output_dir = "baseline_output"
-
- def test_noargs(self):
-
- test_filename = "noargs_" + self.wikiq_out_name
- test_file = os.path.join(self.test_output_dir, test_filename)
- if os.path.exists(test_file):
- os.remove(test_file)
-
- call = self.base_call.format(self.input_file, self.test_output_dir)
- proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
- proc.wait()
-
- copyfile(self.call_output, test_file)
-
- baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-
- test = pd.read_table(test_file)
- baseline = pd.read_table(baseline_file)
- assert_frame_equal(test,baseline)
-
-
- def test_collapse_user(self):
- test_filename = "collapse-user_" + self.wikiq_out_name
- test_file = os.path.join(self.test_output_dir, test_filename)
- if os.path.exists(test_file):
- os.remove(test_file)
-
- call = self.base_call.format(self.input_file, self.test_output_dir)
- call = call + " --collapse-user"
-
- proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
- proc.wait()
-
- copyfile(self.call_output, test_file)
-
- baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
- test = pd.read_table(test_file)
- baseline = pd.read_table(baseline_file)
- assert_frame_equal(test,baseline)
-
- def test_pwr_segment(self):
- test_filename = "persistence_segment_" + self.wikiq_out_name
- test_file = os.path.join(self.test_output_dir, test_filename)
- if os.path.exists(test_file):
- os.remove(test_file)
-
- call = self.base_call.format(self.input_file, self.test_output_dir)
- call = call + " --persistence segment"
- proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
- proc.wait()
-
-
- copyfile(self.call_output, test_file)
-
- baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-
- test = pd.read_table(test_file)
- baseline = pd.read_table(baseline_file)
- assert_frame_equal(test,baseline)
-
- def test_pwr_legacy(self):
- test_filename = "persistence_legacy_" + self.wikiq_out_name
- test_file = os.path.join(self.test_output_dir, test_filename)
- if os.path.exists(test_file):
- os.remove(test_file)
-
- call = self.base_call.format(self.input_file, self.test_output_dir)
- call = call + " --persistence legacy"
- proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
- proc.wait()
-
-
- copyfile(self.call_output, test_file)
-
- baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-
- test = pd.read_table(test_file)
- baseline = pd.read_table(baseline_file)
- assert_frame_equal(test,baseline)
-
- def test_pwr(self):
- test_filename = "persistence_" + self.wikiq_out_name
- test_file = os.path.join(self.test_output_dir, test_filename)
- if os.path.exists(test_file):
- os.remove(test_file)
-
- call = self.base_call.format(self.input_file, self.test_output_dir)
- call = call + " --persistence"
- proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
- proc.wait()
-
-
- copyfile(self.call_output, test_file)
-
- baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-
- test = pd.read_table(test_file)
- baseline = pd.read_table(baseline_file)
- assert_frame_equal(test,baseline)
-
-
- def test_url_encode(self):
- test_filename = "url-encode_" + self.wikiq_out_name
-
- test_file = os.path.join(self.test_output_dir, test_filename)
- if os.path.exists(test_file):
- os.remove(test_file)
-
- call = self.base_call.format(self.input_file, self.test_output_dir)
- call = call + " --url-encode"
- proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
- proc.wait()
-
- copyfile(self.call_output, test_file)
- baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
- test = pd.read_table(test_file)
- baseline = pd.read_table(baseline_file)
- assert_frame_equal(test,baseline)
-
-
-class Test_Malformed(unittest.TestCase):
- def setUp(self):
- if not os.path.exists("test_output"):
- os.mkdir("test_output")
-
- self.wiki = 'twinpeaks'
- self.wikiq_out_name = self.wiki + ".tsv"
- self.test_output_dir = os.path.join(".", "test_output")
- self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
-
- self.infile = "{0}.xml.7z".format(self.wiki)
- self.base_call = "../wikiq {0} -o {1}"
- self.input_dir = "dumps"
- self.input_file = os.path.join(".", self.input_dir,self.infile)
-
-
- def test_malformed_noargs(self):
-
- call = self.base_call.format(self.input_file, self.test_output_dir)
- proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True)
- proc.wait()
- outs, errs = proc.communicate()
- errlines = str(errs).split("\\n")
- self.assertEqual(errlines[-2],'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')
-
-class Test_Stdout(unittest.TestCase):
-
- def setUp(self):
- self.wiki = 'sailormoon'
- self.wikiq_out_name = self.wiki + ".tsv"
-
- self.infile = "{0}.xml.7z".format(self.wiki)
- self.base_call = "../wikiq {0} --stdout"
- self.input_dir = "dumps"
- self.input_file = os.path.join(".", self.input_dir,self.infile)
- self.baseline_output_dir = "baseline_output"
-
- def test_noargs(self):
-
- call = self.base_call.format(self.input_file)
- proc = subprocess.run(call,stdout=subprocess.PIPE,shell=True)
- outs = proc.stdout.decode("utf8")
-
- test_file = "noargs_" + self.wikiq_out_name
- baseline_file = os.path.join(".", self.baseline_output_dir, test_file)
- print(baseline_file)
- test = pd.read_table(StringIO(outs))
- baseline = pd.read_table(baseline_file)
- assert_frame_equal(test,baseline)
-
-class Test_Regex(unittest.TestCase):
-
- def setUp(self):
- self.wiki = 'regextest'
- self.wikiq_out_name = self.wiki + '.tsv'
- self.infile = "{0}.xml.bz2".format(self.wiki)
-
- self.input_dir = "dumps"
- self.input_file = os.path.join(".", self.input_dir,self.infile)
-
- if not os.path.exists("test_output"):
- os.mkdir("test_output")
-
- self.test_output_dir = os.path.join(".", "test_output")
- self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
- # we have two base calls, one for checking inputs and the other for checking outputs
- self.base_call = "../wikiq {0}"
- self.base_call_outs = "../wikiq {0} -o {1}"
-
- self.baseline_output_dir = "baseline_output"
-
- # sample inputs for checking that bad inputs get terminated / test_regex_inputs
- self.bad_inputs_list = [
- #label is missing
- "-RP '\\b\\d+\\b'",
- #number of reg and number of labels do not match
- "-RP 'NPO V' -RP THE -RPl testlabel",
- #cp but rp label
- "-CP '(Tamil|Li)' -RPl testlabel",
- #regex is missing
- "-CPl testlabel",
- "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'"
- ]
-
- # sample inputs for checking the outcomes of good inputs / test_basic_regex
- self.good_inputs_list = [
- "-RP '\\b\\d{3}\\b' -RPl threedigits",
- "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word",
- "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning",
- "-CP 'WP:EVADE' -CPl wp_evade"
- ]
-
-
- self.cap_inputs_list = [
- "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three",
- "-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov"
- ]
-
-
-
- def test_regex_inputs(self):
- for input in self.bad_inputs_list:
- call = self.base_call.format(self.input_file)
- call = call + " --stdout " + input
- print(call)
- proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
- stdout,stderr = proc.communicate()
- #print(proc.returncode)
-
- # we want to check that the bad inputs were caught and sys.exit is stopping the code
- print(stderr.decode("utf-8"))
- self.assertNotEqual(proc.returncode,0)
-
- def test_basic_regex(self):
- for i, input in enumerate(self.good_inputs_list):
-
- test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
- #print(test_filename)
- test_file = os.path.join(self.test_output_dir, test_filename)
- if os.path.exists(test_file):
- os.remove(test_file)
-
- call = self.base_call_outs.format(self.input_file, self.test_output_dir)
- call = call + " " + input
- print(call)
-
- proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
- proc.wait()
- copyfile(self.call_output, test_file)
-
- test = pd.read_table(test_file)
-
- baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
- baseline = pd.read_table(baseline_file)
- assert_frame_equal(test, baseline)
- print(i)
-
-
- def test_capturegroup_regex(self):
- for i, input in enumerate(self.cap_inputs_list):
- test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
- print(test_filename)
- test_file = os.path.join(self.test_output_dir, test_filename)
- if os.path.exists(test_file):
- os.remove(test_file)
-
- call = self.base_call_outs.format(self.input_file, self.test_output_dir)
- call = call + " " + input
- print(call)
-
- proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
- proc.wait()
-
- copyfile(self.call_output, test_file)
-
- test = pd.read_table(test_file)
-
- baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
- baseline = pd.read_table(baseline_file)
- assert_frame_equal(test, baseline)
-
-
-if __name__ == '__main__':
- unittest.main()
+import unittest\r
+import os\r
+import subprocess\r
+from shutil import copyfile\r
+import pandas as pd\r
+from pandas.util.testing import assert_frame_equal\r
+from io import StringIO\r
+\r
+# with / without pwr DONE\r
+# with / without url encode DONE\r
+# with / without collapse user DONE\r
+# with output to stdout DONE\r
+# note that the persistence radius is 7 by default\r
+# reading various file formats including\r
+# 7z, gz, bz2, xml DONE\r
+# wikia and wikipedia data DONE\r
+# malformed xmls DONE\r
+\r
+class Test_Wikipedia(unittest.TestCase):\r
+ def setUp(self):\r
+ if not os.path.exists("test_output"):\r
+ os.mkdir("test_output")\r
+\r
+ self.wiki = 'ikwiki-20180301-pages-meta-history'\r
+ self.wikiq_out_name = self.wiki + ".tsv"\r
+ self.test_output_dir = os.path.join(".", "test_output")\r
+ self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)\r
+\r
+ self.infile = "{0}.xml.bz2".format(self.wiki) \r
+ self.base_call = "../wikiq {0} -o {1}"\r
+ self.input_dir = "dumps"\r
+ self.input_file = os.path.join(".", self.input_dir,self.infile)\r
+ self.baseline_output_dir = "baseline_output"\r
+\r
+ def test_WP_url_encode(self):\r
+ test_filename = "url-encode_" + self.wikiq_out_name\r
+ test_file = os.path.join(self.test_output_dir, test_filename)\r
+ if os.path.exists(test_file):\r
+ os.remove(test_file)\r
+ \r
+ call = self.base_call.format(self.input_file, self.test_output_dir)\r
+ call = call + " --url-encode"\r
+ proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+ proc.wait()\r
+\r
+ copyfile(self.call_output, test_file)\r
+ baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+\r
+ # as a test let's make sure that we get equal data frames\r
+ test = pd.read_table(test_file)\r
+ baseline = pd.read_table(baseline_file)\r
+ assert_frame_equal(test,baseline)\r
+\r
+ def test_WP_namespaces(self):\r
+ print(os.path.abspath('.'))\r
+ test_filename = "namespaces_" + self.wikiq_out_name\r
+ test_file = os.path.join(self.test_output_dir, test_filename)\r
+ if os.path.exists(test_file):\r
+ os.remove(test_file)\r
+ \r
+ call = self.base_call.format(self.input_file, self.test_output_dir)\r
+ call = call + " -n 0 -n 1"\r
+ print(call)\r
+ proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+ proc.wait()\r
+ copyfile(self.call_output, test_file)\r
+ baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)\r
+\r
+ # as a test let's make sure that we get equal data frames\r
+ test = pd.read_table(test_file)\r
+ num_wrong_ns = sum(~ test.namespace.isin({0,1}))\r
+ self.assertEqual(num_wrong_ns, 0)\r
+ baseline = pd.read_table(baseline_file)\r
+ assert_frame_equal(test,baseline)\r
+\r
+ def test_WP_revert_radius(self):\r
+ print(os.path.abspath('.'))\r
+ test_filename = "revert_radius_" + self.wikiq_out_name\r
+ test_file = os.path.join(self.test_output_dir, test_filename)\r
+ if os.path.exists(test_file):\r
+ os.remove(test_file)\r
+ \r
+ call = self.base_call.format(self.input_file, self.test_output_dir)\r
+ call = call + " -n 0 -n 1 -rr 1"\r
+ print(call)\r
+ proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+ proc.wait()\r
+ copyfile(self.call_output, test_file)\r
+ baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)\r
+\r
+ # as a test let's make sure that we get equal data frames\r
+ test = pd.read_table(test_file)\r
+ num_wrong_ns = sum(~ test.namespace.isin({0,1}))\r
+ self.assertEqual(num_wrong_ns, 0)\r
+ baseline = pd.read_table(baseline_file)\r
+ assert_frame_equal(test,baseline)\r
+\r
+\r
+\r
+class Test_Basic(unittest.TestCase):\r
+\r
+ def setUp(self):\r
+ if not os.path.exists("test_output"):\r
+ os.mkdir("test_output")\r
+\r
+ self.wiki = 'sailormoon'\r
+ self.wikiq_out_name = self.wiki + ".tsv"\r
+ self.test_output_dir = os.path.join(".", "test_output")\r
+ self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)\r
+\r
+ self.infile = "{0}.xml.7z".format(self.wiki)\r
+ self.base_call = "../wikiq {0} -o {1}"\r
+ self.input_dir = "dumps"\r
+ self.input_file = os.path.join(".", self.input_dir,self.infile)\r
+ self.baseline_output_dir = "baseline_output"\r
+\r
+ def test_noargs(self):\r
+\r
+ test_filename = "noargs_" + self.wikiq_out_name\r
+ test_file = os.path.join(self.test_output_dir, test_filename)\r
+ if os.path.exists(test_file):\r
+ os.remove(test_file)\r
+ \r
+ call = self.base_call.format(self.input_file, self.test_output_dir)\r
+ proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+ proc.wait()\r
+\r
+ copyfile(self.call_output, test_file)\r
+\r
+ baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+\r
+ test = pd.read_table(test_file)\r
+ baseline = pd.read_table(baseline_file)\r
+ assert_frame_equal(test,baseline)\r
+\r
+\r
+ def test_collapse_user(self):\r
+ test_filename = "collapse-user_" + self.wikiq_out_name\r
+ test_file = os.path.join(self.test_output_dir, test_filename)\r
+ if os.path.exists(test_file):\r
+ os.remove(test_file)\r
+ \r
+ call = self.base_call.format(self.input_file, self.test_output_dir)\r
+ call = call + " --collapse-user"\r
+\r
+ proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+ proc.wait()\r
+\r
+ copyfile(self.call_output, test_file)\r
+\r
+ baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+ test = pd.read_table(test_file)\r
+ baseline = pd.read_table(baseline_file)\r
+ assert_frame_equal(test,baseline)\r
+\r
+ def test_pwr_segment(self):\r
+ test_filename = "persistence_segment_" + self.wikiq_out_name\r
+ test_file = os.path.join(self.test_output_dir, test_filename)\r
+ if os.path.exists(test_file):\r
+ os.remove(test_file)\r
+ \r
+ call = self.base_call.format(self.input_file, self.test_output_dir)\r
+ call = call + " --persistence segment"\r
+ proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+ proc.wait()\r
+\r
+\r
+ copyfile(self.call_output, test_file)\r
+\r
+ baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+\r
+ test = pd.read_table(test_file)\r
+ baseline = pd.read_table(baseline_file)\r
+ assert_frame_equal(test,baseline)\r
+\r
+ def test_pwr_legacy(self):\r
+ test_filename = "persistence_legacy_" + self.wikiq_out_name\r
+ test_file = os.path.join(self.test_output_dir, test_filename)\r
+ if os.path.exists(test_file):\r
+ os.remove(test_file)\r
+ \r
+ call = self.base_call.format(self.input_file, self.test_output_dir)\r
+ call = call + " --persistence legacy"\r
+ proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+ proc.wait()\r
+\r
+\r
+ copyfile(self.call_output, test_file)\r
+\r
+ baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+\r
+ test = pd.read_table(test_file)\r
+ baseline = pd.read_table(baseline_file)\r
+ assert_frame_equal(test,baseline)\r
+\r
+ def test_pwr(self):\r
+ test_filename = "persistence_" + self.wikiq_out_name\r
+ test_file = os.path.join(self.test_output_dir, test_filename)\r
+ if os.path.exists(test_file): \r
+ os.remove(test_file)\r
+ \r
+ call = self.base_call.format(self.input_file, self.test_output_dir)\r
+ call = call + " --persistence"\r
+ proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+ proc.wait()\r
+\r
+\r
+ copyfile(self.call_output, test_file)\r
+\r
+ baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+\r
+ test = pd.read_table(test_file)\r
+ baseline = pd.read_table(baseline_file)\r
+ assert_frame_equal(test,baseline)\r
+\r
+\r
+ def test_url_encode(self):\r
+ test_filename = "url-encode_" + self.wikiq_out_name\r
+\r
+ test_file = os.path.join(self.test_output_dir, test_filename)\r
+ if os.path.exists(test_file):\r
+ os.remove(test_file)\r
+ \r
+ call = self.base_call.format(self.input_file, self.test_output_dir)\r
+ call = call + " --url-encode"\r
+ proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)\r
+ proc.wait()\r
+\r
+ copyfile(self.call_output, test_file)\r
+ baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+ test = pd.read_table(test_file)\r
+ baseline = pd.read_table(baseline_file)\r
+ assert_frame_equal(test,baseline)\r
+\r
+\r
+class Test_Malformed(unittest.TestCase):\r
+ def setUp(self):\r
+ if not os.path.exists("test_output"):\r
+ os.mkdir("test_output")\r
+\r
+ self.wiki = 'twinpeaks'\r
+ self.wikiq_out_name = self.wiki + ".tsv"\r
+ self.test_output_dir = os.path.join(".", "test_output")\r
+ self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)\r
+\r
+ self.infile = "{0}.xml.7z".format(self.wiki)\r
+ self.base_call = "../wikiq {0} -o {1}"\r
+ self.input_dir = "dumps"\r
+ self.input_file = os.path.join(".", self.input_dir,self.infile)\r
+\r
+\r
+ def test_malformed_noargs(self):\r
+\r
+ call = self.base_call.format(self.input_file, self.test_output_dir)\r
+ proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True)\r
+ proc.wait()\r
+ outs, errs = proc.communicate()\r
+ errlines = str(errs).split("\\n")\r
+ self.assertEqual(errlines[-2],'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')\r
+\r
+class Test_Stdout(unittest.TestCase):\r
+\r
+ def setUp(self):\r
+ self.wiki = 'sailormoon'\r
+ self.wikiq_out_name = self.wiki + ".tsv"\r
+\r
+ self.infile = "{0}.xml.7z".format(self.wiki)\r
+ self.base_call = "../wikiq {0} --stdout"\r
+ self.input_dir = "dumps"\r
+ self.input_file = os.path.join(".", self.input_dir,self.infile)\r
+ self.baseline_output_dir = "baseline_output"\r
+\r
+ def test_noargs(self):\r
+\r
+ call = self.base_call.format(self.input_file)\r
+ proc = subprocess.run(call,stdout=subprocess.PIPE,shell=True)\r
+ outs = proc.stdout.decode("utf8")\r
+\r
+ test_file = "noargs_" + self.wikiq_out_name\r
+ baseline_file = os.path.join(".", self.baseline_output_dir, test_file)\r
+ print(baseline_file)\r
+ test = pd.read_table(StringIO(outs))\r
+ baseline = pd.read_table(baseline_file)\r
+ assert_frame_equal(test,baseline)\r
+\r
+class Test_Regex(unittest.TestCase):\r
+\r
+ def setUp(self):\r
+ self.wiki = 'emptytext'\r
+ self.wikiq_out_name = self.wiki + '.tsv'\r
+ self.infile = "{0}.xml.bz2".format(self.wiki)\r
+\r
+ self.input_dir = "dumps"\r
+ self.input_file = os.path.join(".", self.input_dir,self.infile)\r
+\r
+ if not os.path.exists("test_output"):\r
+ os.mkdir("test_output")\r
+\r
+ self.test_output_dir = os.path.join(".", "test_output")\r
+ self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)\r
+ # we have two base calls, one for checking inputs and the other for checking outputs\r
+ self.base_call = "../wikiq {0}"\r
+ self.base_call_outs = "../wikiq {0} -o {1}"\r
+\r
+ self.baseline_output_dir = "baseline_output"\r
+\r
+ # sample inputs for checking that bad inputs get terminated / test_regex_inputs\r
+ self.bad_inputs_list = [\r
+ #label is missing \r
+ "-RP '\\b\\d+\\b'", \r
+ #number of reg and number of labels do not match \r
+ "-RP 'NPO V' -RP THE -RPl testlabel",\r
+ #cp but rp label\r
+ "-CP '(Tamil|Li)' -RPl testlabel",\r
+ #regex is missing\r
+ "-CPl testlabel",\r
+ "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'"\r
+ ]\r
+\r
+ # sample inputs for checking the outcomes of good inputs / test_basic_regex\r
+ self.good_inputs_list = [\r
+ "-RP '\\b\\d{3}\\b' -RPl threedigits",\r
+ "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word",\r
+ "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning",\r
+ "-CP 'WP:EVADE' -CPl wp_evade" \r
+ ]\r
+\r
+ \r
+ self.cap_inputs_list = [\r
+ "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three",\r
+ "-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov"\r
+ ]\r
+\r
+\r
+\r
+ def test_regex_inputs(self):\r
+ for input in self.bad_inputs_list:\r
+ call = self.base_call.format(self.input_file)\r
+ call = call + " --stdout " + input\r
+ #print(call)\r
+ proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)\r
+ stdout,stderr = proc.communicate()\r
+ #print(proc.returncode)\r
+ \r
+ # we want to check that the bad inputs were caught and sys.exit is stopping the code\r
+ #print(stderr.decode("utf-8"))\r
+ self.assertNotEqual(proc.returncode,0)\r
+\r
+ def test_basic_regex(self):\r
+ for i, input in enumerate(self.good_inputs_list):\r
+\r
+ test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))\r
+ #print(test_filename)\r
+ test_file = os.path.join(self.test_output_dir, test_filename)\r
+ if os.path.exists(test_file):\r
+ os.remove(test_file)\r
+\r
+ call = self.base_call_outs.format(self.input_file, self.test_output_dir)\r
+ call = call + " " + input\r
+ #print(call)\r
+\r
+ proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)\r
+ proc.wait()\r
+ copyfile(self.call_output, test_file)\r
+\r
+ test = pd.read_table(test_file)\r
+ \r
+ baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+ baseline = pd.read_table(baseline_file)\r
+ #assert_frame_equal(test, baseline)\r
+ #print(i)\r
+\r
+\r
+ def test_capturegroup_regex(self):\r
+ for i, input in enumerate(self.cap_inputs_list):\r
+ test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))\r
+ #print(test_filename)\r
+ test_file = os.path.join(self.test_output_dir, test_filename)\r
+ if os.path.exists(test_file):\r
+ os.remove(test_file)\r
+\r
+ call = self.base_call_outs.format(self.input_file, self.test_output_dir)\r
+ call = call + " " + input\r
+ #print(call)\r
+\r
+ proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)\r
+ proc.wait()\r
+\r
+ copyfile(self.call_output, test_file)\r
+ \r
+ test = pd.read_table(test_file)\r
+ \r
+ baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)\r
+ baseline = pd.read_table(baseline_file)\r
+ #assert_frame_equal(test, baseline)\r
+\r
+\r
+if __name__ == '__main__':\r
+ unittest.main()\r
--- /dev/null
+anon articleid date_time deleted editor editor_id minor namespace revert reverteds revid sha1 text_chars threedigits title
+FALSE 56237363 2018-01-07 10:40:58 FALSE "NinjaRobotPirate" 3742946 FALSE 3 FALSE 819091731 135nz8q6lfam6cojla7azb7k5alx3t3 0 None "User talk:86.139.142.254"
+FALSE 56237364 2018-01-07 10:41:10 FALSE "Kavin kavitha" 32792125 FALSE 3 FALSE 819091755 0pwezjc6yopz0smc8al6ogc4fax5bwo 663 None "User talk:Kavin kavitha"
+FALSE 56237365 2018-01-07 10:41:26 FALSE "Amicable always" 32621254 FALSE 3 FALSE 819091788 sz3t2ap7z8bpkdvdvi195f3i35949bv 399 None "User talk:Dr.vivek163"
+FALSE 56237366 2018-01-07 10:41:31 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819091796 r6s5j8j3iykenrhuhpnkpsmmd71vubf 1260 None "User talk:Twistorl"
+FALSE 56237368 2018-01-07 10:41:51 FALSE "Khruner" 8409334 FALSE 0 FALSE 819091825 tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 2249 119, 978, 500, 292, 225, 199, 292 "Kom Firin"
+FALSE 56237368 2018-01-27 12:16:02 FALSE "Khruner" 8409334 TRUE 0 FALSE 822610647 e6oa4g0qv64icdaq26uu1zzbyr5hcbh 2230 119, 978, 500, 292, 225, 199, 292 "Kom Firin"
+FALSE 56237369 2018-01-07 10:42:05 FALSE "Editingaccount1994" 32794215 FALSE 2 FALSE 819091844 0fyvyh2a8xu41gt8obr34oba0bfixj6 27840 798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-07 11:09:52 FALSE "AnomieBOT" 7611264 TRUE 2 FALSE 819093984 8gy52aolt5rg3eaketwj5v7eiw0apv2 27787 798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-12 21:45:50 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820064189 he8ydemaanxlrpftqxkez8jfpge1fsj 27784 798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-12 23:28:11 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820078679 0to17w9rth3url8n7gvucdtobybdq5h 27783 798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-12 23:28:39 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820078733 531dizmmloyxffbkdr5vph7owh921eg 27782 798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-13 13:45:33 FALSE "Frietjes" 13791031 FALSE 2 FALSE 820177382 nik9p2u2fuk4yazjxt8ymbicxv5qid9 27757 798, 150, 150, 150, 621, 100, 621 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-24 01:35:22 FALSE "CommonsDelinker" 2304267 FALSE 2 FALSE 822038928 gwk6pampl8si1v5pv3kwgteg710sfw3 27667 798, 150, 150, 150, 621, 100, 621 "User:Editingaccount1994/sandbox"
+FALSE 56237370 2018-01-07 10:42:20 FALSE "PamD" 1368779 FALSE 0 FALSE 819091874 n4ozbsgle13p9yywtfrz982ccj8woc9 25 None "Anita del Rey"
+FALSE 56237371 2018-01-07 10:42:27 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819091883 ksohnvsbeuzwpl5vb8a3v8m18hva0a7 1274 119, 157, 119, 157, 119, 157, 119, 157 "User talk:119.94.96.157"
+FALSE 56237372 2018-01-07 10:42:50 FALSE "Underbar dk" 677153 FALSE 14 FALSE 819091914 je7aw21fedbwyqsyofpisdrynsu7olr 113 None "Category:Ohmi Railway"
+FALSE 56237375 2018-01-07 10:43:32 FALSE "TastyPoutine" 882433 FALSE 3 FALSE 819091968 cpm4tkzcx4hc6irr9ukbi06ogud8dtq 199 None "User talk:92.226.219.222"
+FALSE 56237375 2018-01-07 11:10:24 FALSE "AnomieBOT" 7611264 TRUE 3 FALSE 819094036 artmfz8b2gxhb3pp8a5p4ksplxqfkpg 1840 None "User talk:92.226.219.222"
+FALSE 56237375 2018-01-07 14:33:36 FALSE "Only" 702940 FALSE 3 FALSE 819112363 dn9wj0n8d8pdd5lqe56uw5xamupowr1 2949 126, 126, 126, 126 "User talk:92.226.219.222"
+FALSE 56237376 2018-01-07 10:44:01 FALSE "Dipayanacharya" 32794237 FALSE 2 FALSE 819092004 ofueugwatmmn7u73isw732neuza57gk 28 None "User:Dipayanacharya"
+FALSE 56237376 2018-01-07 10:49:08 FALSE "Dipayanacharya" 32794237 FALSE 2 FALSE 819092390 dsz55xv96ec2uv6w9c1z7c52ipfovbw 38 None "User:Dipayanacharya"
+FALSE 56237378 2018-01-07 10:44:56 FALSE "Vinegarymass911" 21516552 FALSE 0 FALSE 819092066 9ma38hak0ef1ew4fpiutxpnzd8oz1wd 65 None "BSCIC"
+FALSE 56237379 2018-01-07 10:45:21 FALSE "BrownHairedGirl" 754619 FALSE 14 FALSE 819092102 4dvakoat58bzyf5hmtthxukt29hip6n 285 None "Category:Women government ministers of Yemen"
+FALSE 56237381 2018-01-07 10:45:54 FALSE "PRehse" 410898 FALSE 1 FALSE 819092135 2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 103 None "Talk:List of Morning Glories Characters"
+FALSE 56237382 2018-01-07 10:45:56 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819092138 3y9t5wpk6ur5jhone75rhm4wjf01fgi 1330 106, 207, 126, 114, 106, 207, 126, 114, 106, 207, 126, 114, 106, 207, 126, 114 "User talk:106.207.126.114"
+FALSE 56237382 2018-01-07 10:50:22 FALSE "HindWIKI" 31190506 FALSE 3 FALSE 819092495 8wvn6vh3isyt0dorpe89lztrburgupe 2355 106, 207, 126, 114, 106, 207, 126, 114, 106, 207, 126, 114, 106, 207, 126, 114 "User talk:106.207.126.114"
--- /dev/null
+anon articleid date_time deleted editor editor_id minor namespace page_word revert reverteds revid sha1 testcases text_chars title
+FALSE 56237363 2018-01-07 10:40:58 FALSE "NinjaRobotPirate" 3742946 FALSE 3 None FALSE 819091731 135nz8q6lfam6cojla7azb7k5alx3t3 None 0 "User talk:86.139.142.254"
+FALSE 56237364 2018-01-07 10:41:10 FALSE "Kavin kavitha" 32792125 FALSE 3 None FALSE 819091755 0pwezjc6yopz0smc8al6ogc4fax5bwo None 663 "User talk:Kavin kavitha"
+FALSE 56237365 2018-01-07 10:41:26 FALSE "Amicable always" 32621254 FALSE 3 None FALSE 819091788 sz3t2ap7z8bpkdvdvi195f3i35949bv TestCase, TestCase 399 "User talk:Dr.vivek163"
+FALSE 56237366 2018-01-07 10:41:31 FALSE "ClueBot NG" 13286072 FALSE 3 page FALSE 819091796 r6s5j8j3iykenrhuhpnkpsmmd71vubf None 1260 "User talk:Twistorl"
+FALSE 56237368 2018-01-07 10:41:51 FALSE "Khruner" 8409334 FALSE 0 page FALSE 819091825 tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 TestCase 2249 "Kom Firin"
+FALSE 56237368 2018-01-27 12:16:02 FALSE "Khruner" 8409334 TRUE 0 page FALSE 822610647 e6oa4g0qv64icdaq26uu1zzbyr5hcbh None 2230 "Kom Firin"
+FALSE 56237369 2018-01-07 10:42:05 FALSE "Editingaccount1994" 32794215 FALSE 2 page, page FALSE 819091844 0fyvyh2a8xu41gt8obr34oba0bfixj6 None 27840 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-07 11:09:52 FALSE "AnomieBOT" 7611264 TRUE 2 page, page FALSE 819093984 8gy52aolt5rg3eaketwj5v7eiw0apv2 None 27787 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-12 21:45:50 FALSE "SporkBot" 12406635 TRUE 2 page, page FALSE 820064189 he8ydemaanxlrpftqxkez8jfpge1fsj None 27784 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-12 23:28:11 FALSE "SporkBot" 12406635 TRUE 2 page, page FALSE 820078679 0to17w9rth3url8n7gvucdtobybdq5h None 27783 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-12 23:28:39 FALSE "SporkBot" 12406635 TRUE 2 page, page FALSE 820078733 531dizmmloyxffbkdr5vph7owh921eg None 27782 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-13 13:45:33 FALSE "Frietjes" 13791031 FALSE 2 page, page FALSE 820177382 nik9p2u2fuk4yazjxt8ymbicxv5qid9 None 27757 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-24 01:35:22 FALSE "CommonsDelinker" 2304267 FALSE 2 page, page FALSE 822038928 gwk6pampl8si1v5pv3kwgteg710sfw3 None 27667 "User:Editingaccount1994/sandbox"
+FALSE 56237370 2018-01-07 10:42:20 FALSE "PamD" 1368779 FALSE 0 None FALSE 819091874 n4ozbsgle13p9yywtfrz982ccj8woc9 None 25 "Anita del Rey"
+FALSE 56237371 2018-01-07 10:42:27 FALSE "ClueBot NG" 13286072 FALSE 3 page FALSE 819091883 ksohnvsbeuzwpl5vb8a3v8m18hva0a7 None 1274 "User talk:119.94.96.157"
+FALSE 56237372 2018-01-07 10:42:50 FALSE "Underbar dk" 677153 FALSE 14 None FALSE 819091914 je7aw21fedbwyqsyofpisdrynsu7olr None 113 "Category:Ohmi Railway"
+FALSE 56237375 2018-01-07 10:43:32 FALSE "TastyPoutine" 882433 FALSE 3 None FALSE 819091968 cpm4tkzcx4hc6irr9ukbi06ogud8dtq None 199 "User talk:92.226.219.222"
+FALSE 56237375 2018-01-07 11:10:24 FALSE "AnomieBOT" 7611264 TRUE 3 page, page, page, page FALSE 819094036 artmfz8b2gxhb3pp8a5p4ksplxqfkpg None 1840 "User talk:92.226.219.222"
+FALSE 56237375 2018-01-07 14:33:36 FALSE "Only" 702940 FALSE 3 page, page, page, page, page, page FALSE 819112363 dn9wj0n8d8pdd5lqe56uw5xamupowr1 None 2949 "User talk:92.226.219.222"
+FALSE 56237376 2018-01-07 10:44:01 FALSE "Dipayanacharya" 32794237 FALSE 2 None FALSE 819092004 ofueugwatmmn7u73isw732neuza57gk None 28 "User:Dipayanacharya"
+FALSE 56237376 2018-01-07 10:49:08 FALSE "Dipayanacharya" 32794237 FALSE 2 None FALSE 819092390 dsz55xv96ec2uv6w9c1z7c52ipfovbw None 38 "User:Dipayanacharya"
+FALSE 56237378 2018-01-07 10:44:56 FALSE "Vinegarymass911" 21516552 FALSE 0 None FALSE 819092066 9ma38hak0ef1ew4fpiutxpnzd8oz1wd None 65 "BSCIC"
+FALSE 56237379 2018-01-07 10:45:21 FALSE "BrownHairedGirl" 754619 FALSE 14 None FALSE 819092102 4dvakoat58bzyf5hmtthxukt29hip6n None 285 "Category:Women government ministers of Yemen"
+FALSE 56237381 2018-01-07 10:45:54 FALSE "PRehse" 410898 FALSE 1 None FALSE 819092135 2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 None 103 "Talk:List of Morning Glories Characters"
+FALSE 56237382 2018-01-07 10:45:56 FALSE "ClueBot NG" 13286072 FALSE 3 page FALSE 819092138 3y9t5wpk6ur5jhone75rhm4wjf01fgi None 1330 "User talk:106.207.126.114"
+FALSE 56237382 2018-01-07 10:50:22 FALSE "HindWIKI" 31190506 FALSE 3 page FALSE 819092495 8wvn6vh3isyt0dorpe89lztrburgupe None 2355 "User talk:106.207.126.114"
--- /dev/null
+anon articleid chev_com date_time deleted editor editor_id minor namespace revert reverteds revid sha1 text_chars title warning wiki_welcome
+FALSE 56237363 None 2018-01-07 10:40:58 FALSE "NinjaRobotPirate" 3742946 FALSE 3 FALSE 819091731 135nz8q6lfam6cojla7azb7k5alx3t3 0 "User talk:86.139.142.254" None None
+FALSE 56237364 None 2018-01-07 10:41:10 FALSE "Kavin kavitha" 32792125 FALSE 3 FALSE 819091755 0pwezjc6yopz0smc8al6ogc4fax5bwo 663 "User talk:Kavin kavitha" None None
+FALSE 56237365 None 2018-01-07 10:41:26 FALSE "Amicable always" 32621254 FALSE 3 FALSE 819091788 sz3t2ap7z8bpkdvdvi195f3i35949bv 399 "User talk:Dr.vivek163" None None
+FALSE 56237366 None 2018-01-07 10:41:31 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819091796 r6s5j8j3iykenrhuhpnkpsmmd71vubf 1260 "User talk:Twistorl" Warning welcome to Wikipedia
+FALSE 56237368 None 2018-01-07 10:41:51 FALSE "Khruner" 8409334 FALSE 0 FALSE 819091825 tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 2249 "Kom Firin" None None
+FALSE 56237368 None 2018-01-27 12:16:02 FALSE "Khruner" 8409334 TRUE 0 FALSE 822610647 e6oa4g0qv64icdaq26uu1zzbyr5hcbh 2230 "Kom Firin" None None
+FALSE 56237369 Chevalier, Chevalier 2018-01-07 10:42:05 FALSE "Editingaccount1994" 32794215 FALSE 2 FALSE 819091844 0fyvyh2a8xu41gt8obr34oba0bfixj6 27840 "User:Editingaccount1994/sandbox" None None
+FALSE 56237369 None 2018-01-07 11:09:52 FALSE "AnomieBOT" 7611264 TRUE 2 FALSE 819093984 8gy52aolt5rg3eaketwj5v7eiw0apv2 27787 "User:Editingaccount1994/sandbox" None None
+FALSE 56237369 None 2018-01-12 21:45:50 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820064189 he8ydemaanxlrpftqxkez8jfpge1fsj 27784 "User:Editingaccount1994/sandbox" None None
+FALSE 56237369 None 2018-01-12 23:28:11 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820078679 0to17w9rth3url8n7gvucdtobybdq5h 27783 "User:Editingaccount1994/sandbox" None None
+FALSE 56237369 None 2018-01-12 23:28:39 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820078733 531dizmmloyxffbkdr5vph7owh921eg 27782 "User:Editingaccount1994/sandbox" None None
+FALSE 56237369 None 2018-01-13 13:45:33 FALSE "Frietjes" 13791031 FALSE 2 FALSE 820177382 nik9p2u2fuk4yazjxt8ymbicxv5qid9 27757 "User:Editingaccount1994/sandbox" None None
+FALSE 56237369 Chevalier, Chevalier 2018-01-24 01:35:22 FALSE "CommonsDelinker" 2304267 FALSE 2 FALSE 822038928 gwk6pampl8si1v5pv3kwgteg710sfw3 27667 "User:Editingaccount1994/sandbox" None None
+FALSE 56237370 None 2018-01-07 10:42:20 FALSE "PamD" 1368779 FALSE 0 FALSE 819091874 n4ozbsgle13p9yywtfrz982ccj8woc9 25 "Anita del Rey" None None
+FALSE 56237371 None 2018-01-07 10:42:27 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819091883 ksohnvsbeuzwpl5vb8a3v8m18hva0a7 1274 "User talk:119.94.96.157" Warning welcome to Wikipedia
+FALSE 56237372 None 2018-01-07 10:42:50 FALSE "Underbar dk" 677153 FALSE 14 FALSE 819091914 je7aw21fedbwyqsyofpisdrynsu7olr 113 "Category:Ohmi Railway" None None
+FALSE 56237375 None 2018-01-07 10:43:32 FALSE "TastyPoutine" 882433 FALSE 3 FALSE 819091968 cpm4tkzcx4hc6irr9ukbi06ogud8dtq 199 "User talk:92.226.219.222" None None
+FALSE 56237375 None 2018-01-07 11:10:24 FALSE "AnomieBOT" 7611264 TRUE 3 FALSE 819094036 artmfz8b2gxhb3pp8a5p4ksplxqfkpg 1840 "User talk:92.226.219.222" None None
+FALSE 56237375 None 2018-01-07 14:33:36 FALSE "Only" 702940 FALSE 3 FALSE 819112363 dn9wj0n8d8pdd5lqe56uw5xamupowr1 2949 "User talk:92.226.219.222" None None
+FALSE 56237376 None 2018-01-07 10:44:01 FALSE "Dipayanacharya" 32794237 FALSE 2 FALSE 819092004 ofueugwatmmn7u73isw732neuza57gk 28 "User:Dipayanacharya" None None
+FALSE 56237376 None 2018-01-07 10:49:08 FALSE "Dipayanacharya" 32794237 FALSE 2 FALSE 819092390 dsz55xv96ec2uv6w9c1z7c52ipfovbw 38 "User:Dipayanacharya" None None
+FALSE 56237378 None 2018-01-07 10:44:56 FALSE "Vinegarymass911" 21516552 FALSE 0 FALSE 819092066 9ma38hak0ef1ew4fpiutxpnzd8oz1wd 65 "BSCIC" None None
+FALSE 56237379 None 2018-01-07 10:45:21 FALSE "BrownHairedGirl" 754619 FALSE 14 FALSE 819092102 4dvakoat58bzyf5hmtthxukt29hip6n 285 "Category:Women government ministers of Yemen" None None
+FALSE 56237381 None 2018-01-07 10:45:54 FALSE "PRehse" 410898 FALSE 1 FALSE 819092135 2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 103 "Talk:List of Morning Glories Characters" None None
+FALSE 56237382 None 2018-01-07 10:45:56 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819092138 3y9t5wpk6ur5jhone75rhm4wjf01fgi 1330 "User talk:106.207.126.114" Warning welcome to Wikipedia
+FALSE 56237382 None 2018-01-07 10:50:22 FALSE "HindWIKI" 31190506 FALSE 3 FALSE 819092495 8wvn6vh3isyt0dorpe89lztrburgupe 2355 "User talk:106.207.126.114" None welcome to Wikipedia
--- /dev/null
+anon articleid date_time deleted editor editor_id minor namespace revert reverteds revid sha1 text_chars title wp_evade
+FALSE 56237363 2018-01-07 10:40:58 FALSE "NinjaRobotPirate" 3742946 FALSE 3 FALSE 819091731 135nz8q6lfam6cojla7azb7k5alx3t3 0 "User talk:86.139.142.254" WP:EVADE
+FALSE 56237364 2018-01-07 10:41:10 FALSE "Kavin kavitha" 32792125 FALSE 3 FALSE 819091755 0pwezjc6yopz0smc8al6ogc4fax5bwo 663 "User talk:Kavin kavitha" None
+FALSE 56237365 2018-01-07 10:41:26 FALSE "Amicable always" 32621254 FALSE 3 FALSE 819091788 sz3t2ap7z8bpkdvdvi195f3i35949bv 399 "User talk:Dr.vivek163" None
+FALSE 56237366 2018-01-07 10:41:31 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819091796 r6s5j8j3iykenrhuhpnkpsmmd71vubf 1260 "User talk:Twistorl" None
+FALSE 56237368 2018-01-07 10:41:51 FALSE "Khruner" 8409334 FALSE 0 FALSE 819091825 tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 2249 "Kom Firin" None
+FALSE 56237368 2018-01-27 12:16:02 FALSE "Khruner" 8409334 TRUE 0 FALSE 822610647 e6oa4g0qv64icdaq26uu1zzbyr5hcbh 2230 "Kom Firin" None
+FALSE 56237369 2018-01-07 10:42:05 FALSE "Editingaccount1994" 32794215 FALSE 2 FALSE 819091844 0fyvyh2a8xu41gt8obr34oba0bfixj6 27840 "User:Editingaccount1994/sandbox" None
+FALSE 56237369 2018-01-07 11:09:52 FALSE "AnomieBOT" 7611264 TRUE 2 FALSE 819093984 8gy52aolt5rg3eaketwj5v7eiw0apv2 27787 "User:Editingaccount1994/sandbox" None
+FALSE 56237369 2018-01-12 21:45:50 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820064189 he8ydemaanxlrpftqxkez8jfpge1fsj 27784 "User:Editingaccount1994/sandbox" None
+FALSE 56237369 2018-01-12 23:28:11 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820078679 0to17w9rth3url8n7gvucdtobybdq5h 27783 "User:Editingaccount1994/sandbox" None
+FALSE 56237369 2018-01-12 23:28:39 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820078733 531dizmmloyxffbkdr5vph7owh921eg 27782 "User:Editingaccount1994/sandbox" None
+FALSE 56237369 2018-01-13 13:45:33 FALSE "Frietjes" 13791031 FALSE 2 FALSE 820177382 nik9p2u2fuk4yazjxt8ymbicxv5qid9 27757 "User:Editingaccount1994/sandbox" None
+FALSE 56237369 2018-01-24 01:35:22 FALSE "CommonsDelinker" 2304267 FALSE 2 FALSE 822038928 gwk6pampl8si1v5pv3kwgteg710sfw3 27667 "User:Editingaccount1994/sandbox" None
+FALSE 56237370 2018-01-07 10:42:20 FALSE "PamD" 1368779 FALSE 0 FALSE 819091874 n4ozbsgle13p9yywtfrz982ccj8woc9 25 "Anita del Rey" None
+FALSE 56237371 2018-01-07 10:42:27 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819091883 ksohnvsbeuzwpl5vb8a3v8m18hva0a7 1274 "User talk:119.94.96.157" None
+FALSE 56237372 2018-01-07 10:42:50 FALSE "Underbar dk" 677153 FALSE 14 FALSE 819091914 je7aw21fedbwyqsyofpisdrynsu7olr 113 "Category:Ohmi Railway" None
+FALSE 56237375 2018-01-07 10:43:32 FALSE "TastyPoutine" 882433 FALSE 3 FALSE 819091968 cpm4tkzcx4hc6irr9ukbi06ogud8dtq 199 "User talk:92.226.219.222" None
+FALSE 56237375 2018-01-07 11:10:24 FALSE "AnomieBOT" 7611264 TRUE 3 FALSE 819094036 artmfz8b2gxhb3pp8a5p4ksplxqfkpg 1840 "User talk:92.226.219.222" None
+FALSE 56237375 2018-01-07 14:33:36 FALSE "Only" 702940 FALSE 3 FALSE 819112363 dn9wj0n8d8pdd5lqe56uw5xamupowr1 2949 "User talk:92.226.219.222" WP:EVADE
+FALSE 56237376 2018-01-07 10:44:01 FALSE "Dipayanacharya" 32794237 FALSE 2 FALSE 819092004 ofueugwatmmn7u73isw732neuza57gk 28 "User:Dipayanacharya" None
+FALSE 56237376 2018-01-07 10:49:08 FALSE "Dipayanacharya" 32794237 FALSE 2 FALSE 819092390 dsz55xv96ec2uv6w9c1z7c52ipfovbw 38 "User:Dipayanacharya" None
+FALSE 56237378 2018-01-07 10:44:56 FALSE "Vinegarymass911" 21516552 FALSE 0 FALSE 819092066 9ma38hak0ef1ew4fpiutxpnzd8oz1wd 65 "BSCIC" None
+FALSE 56237379 2018-01-07 10:45:21 FALSE "BrownHairedGirl" 754619 FALSE 14 FALSE 819092102 4dvakoat58bzyf5hmtthxukt29hip6n 285 "Category:Women government ministers of Yemen" None
+FALSE 56237381 2018-01-07 10:45:54 FALSE "PRehse" 410898 FALSE 1 FALSE 819092135 2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 103 "Talk:List of Morning Glories Characters" None
+FALSE 56237382 2018-01-07 10:45:56 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819092138 3y9t5wpk6ur5jhone75rhm4wjf01fgi 1330 "User talk:106.207.126.114" None
+FALSE 56237382 2018-01-07 10:50:22 FALSE "HindWIKI" 31190506 FALSE 3 FALSE 819092495 8wvn6vh3isyt0dorpe89lztrburgupe 2355 "User talk:106.207.126.114" None
--- /dev/null
+anon articleid date_time deleted editor editor_id li_cheval minor namespace revert reverteds revid sha1 text_chars three_cat three_letter three_number title
+FALSE 56237363 2018-01-07 10:40:58 FALSE "NinjaRobotPirate" 3742946 None FALSE 3 FALSE 819091731 135nz8q6lfam6cojla7azb7k5alx3t3 0 None has, has None "User talk:86.139.142.254"
+FALSE 56237364 2018-01-07 10:41:10 FALSE "Kavin kavitha" 32792125 None FALSE 3 FALSE 819091755 0pwezjc6yopz0smc8al6ogc4fax5bwo 663 None AES, for 01, 12, 2001 "User talk:Kavin kavitha"
+FALSE 56237365 2018-01-07 10:41:26 FALSE "Amicable always" 32621254 None FALSE 3 FALSE 819091788 sz3t2ap7z8bpkdvdvi195f3i35949bv 399 None new None "User talk:Dr.vivek163"
+FALSE 56237366 2018-01-07 10:41:31 FALSE "ClueBot NG" 13286072 None FALSE 3 FALSE 819091796 r6s5j8j3iykenrhuhpnkpsmmd71vubf 1260 None None 1 "User talk:Twistorl"
+FALSE 56237368 2018-01-07 10:41:51 FALSE "Khruner" 8409334 None FALSE 0 FALSE 819091825 tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 2249 None AES, jpg, the, the, the, the, and, you, Tor 67, 119 "Kom Firin"
+FALSE 56237368 2018-01-27 12:16:02 FALSE "Khruner" 8409334 None TRUE 0 FALSE 822610647 e6oa4g0qv64icdaq26uu1zzbyr5hcbh 2230 None None None "Kom Firin"
+FALSE 56237369 2018-01-07 10:42:05 FALSE "Editingaccount1994" 32794215 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier FALSE 2 FALSE 819091844 0fyvyh2a8xu41gt8obr34oba0bfixj6 27840 None AES, nom None "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-07 11:09:52 FALSE "AnomieBOT" 7611264 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier TRUE 2 FALSE 819093984 8gy52aolt5rg3eaketwj5v7eiw0apv2 27787 None web, See, for None "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-12 21:45:50 FALSE "SporkBot" 12406635 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier TRUE 2 FALSE 820064189 he8ydemaanxlrpftqxkez8jfpge1fsj 27784 None per, TFD, TFD None "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-12 23:28:11 FALSE "SporkBot" 12406635 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier TRUE 2 FALSE 820078679 0to17w9rth3url8n7gvucdtobybdq5h 27783 None per, for, Log, TFD 2010, 13 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-12 23:28:39 FALSE "SporkBot" 12406635 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier TRUE 2 FALSE 820078733 531dizmmloyxffbkdr5vph7owh921eg 27782 None per, for, Log, TFD 2011, 17 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-13 13:45:33 FALSE "Frietjes" 13791031 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier FALSE 2 FALSE 820177382 nik9p2u2fuk4yazjxt8ymbicxv5qid9 27757 None you, are, tor, you None "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-24 01:35:22 FALSE "CommonsDelinker" 2304267 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier FALSE 2 FALSE 822038928 gwk6pampl8si1v5pv3kwgteg710sfw3 27667 None jpg, jpg, has, COM 16, 2018 "User:Editingaccount1994/sandbox"
+FALSE 56237370 2018-01-07 10:42:20 FALSE "PamD" 1368779 None FALSE 0 FALSE 819091874 n4ozbsgle13p9yywtfrz982ccj8woc9 25 None alt None "Anita del Rey"
+FALSE 56237371 2018-01-07 10:42:27 FALSE "ClueBot NG" 13286072 None FALSE 3 FALSE 819091883 ksohnvsbeuzwpl5vb8a3v8m18hva0a7 1274 None None 119, 94, 96, 157, 119, 94, 96, 157, 1 "User talk:119.94.96.157"
+FALSE 56237372 2018-01-07 10:42:50 FALSE "Underbar dk" 677153 None FALSE 14 FALSE 819091914 je7aw21fedbwyqsyofpisdrynsu7olr 113 None AES None "Category:Ohmi Railway"
+FALSE 56237375 2018-01-07 10:43:32 FALSE "TastyPoutine" 882433 None FALSE 3 FALSE 819091968 cpm4tkzcx4hc6irr9ukbi06ogud8dtq 199 None AES None "User talk:92.226.219.222"
+FALSE 56237375 2018-01-07 11:10:24 FALSE "AnomieBOT" 7611264 None TRUE 3 FALSE 819094036 artmfz8b2gxhb3pp8a5p4ksplxqfkpg 1840 None See, for None "User talk:92.226.219.222"
+FALSE 56237375 2018-01-07 14:33:36 FALSE "Only" 702940 None FALSE 3 FALSE 819112363 dn9wj0n8d8pdd5lqe56uw5xamupowr1 2949 None has, has None "User talk:92.226.219.222"
+FALSE 56237376 2018-01-07 10:44:01 FALSE "Dipayanacharya" 32794237 None FALSE 2 FALSE 819092004 ofueugwatmmn7u73isw732neuza57gk 28 None None None "User:Dipayanacharya"
+FALSE 56237376 2018-01-07 10:49:08 FALSE "Dipayanacharya" 32794237 None FALSE 2 FALSE 819092390 dsz55xv96ec2uv6w9c1z7c52ipfovbw 38 None None None "User:Dipayanacharya"
+FALSE 56237378 2018-01-07 10:44:56 FALSE "Vinegarymass911" 21516552 None FALSE 0 FALSE 819092066 9ma38hak0ef1ew4fpiutxpnzd8oz1wd 65 None AES, and None "BSCIC"
+FALSE 56237379 2018-01-07 10:45:21 FALSE "BrownHairedGirl" 754619 None FALSE 14 FALSE 819092102 4dvakoat58bzyf5hmtthxukt29hip6n 285 None AES, Non None "Category:Women government ministers of Yemen"
+FALSE 56237381 2018-01-07 10:45:54 FALSE "PRehse" 410898 None FALSE 1 FALSE 819092135 2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 103 None AES, low, low None "Talk:List of Morning Glories Characters"
+FALSE 56237382 2018-01-07 10:45:56 FALSE "ClueBot NG" 13286072 None FALSE 3 FALSE 819092138 3y9t5wpk6ur5jhone75rhm4wjf01fgi 1330 None None 106, 207, 126, 114, 106, 207, 126, 114, 1 "User talk:106.207.126.114"
+FALSE 56237382 2018-01-07 10:50:22 FALSE "HindWIKI" 31190506 None FALSE 3 FALSE 819092495 8wvn6vh3isyt0dorpe89lztrburgupe 2355 None None None "User talk:106.207.126.114"
--- /dev/null
+anon articleid date_time deleted editor editor_id minor namespace npov_neutral npov_npov revert reverteds revid sha1 testcase_a testcase_b testcase_c testcase_d text_chars title
+FALSE 56237363 2018-01-07 10:40:58 FALSE "NinjaRobotPirate" 3742946 FALSE 3 None None FALSE 819091731 135nz8q6lfam6cojla7azb7k5alx3t3 None None None None 0 "User talk:86.139.142.254"
+FALSE 56237364 2018-01-07 10:41:10 FALSE "Kavin kavitha" 32792125 FALSE 3 None None FALSE 819091755 0pwezjc6yopz0smc8al6ogc4fax5bwo None None None None 663 "User talk:Kavin kavitha"
+FALSE 56237365 2018-01-07 10:41:26 FALSE "Amicable always" 32621254 FALSE 3 None NPOV, NPOV FALSE 819091788 sz3t2ap7z8bpkdvdvi195f3i35949bv None None None None 399 "User talk:Dr.vivek163"
+FALSE 56237366 2018-01-07 10:41:31 FALSE "ClueBot NG" 13286072 FALSE 3 None None FALSE 819091796 r6s5j8j3iykenrhuhpnkpsmmd71vubf None None None None 1260 "User talk:Twistorl"
+FALSE 56237368 2018-01-07 10:41:51 FALSE "Khruner" 8409334 FALSE 0 None NPOV FALSE 819091825 tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 None TestCaseB None None 2249 "Kom Firin"
+FALSE 56237368 2018-01-27 12:16:02 FALSE "Khruner" 8409334 TRUE 0 None None FALSE 822610647 e6oa4g0qv64icdaq26uu1zzbyr5hcbh None None None None 2230 "Kom Firin"
+FALSE 56237369 2018-01-07 10:42:05 FALSE "Editingaccount1994" 32794215 FALSE 2 None None FALSE 819091844 0fyvyh2a8xu41gt8obr34oba0bfixj6 None None None None 27840 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-07 11:09:52 FALSE "AnomieBOT" 7611264 TRUE 2 None None FALSE 819093984 8gy52aolt5rg3eaketwj5v7eiw0apv2 None None None None 27787 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-12 21:45:50 FALSE "SporkBot" 12406635 TRUE 2 None None FALSE 820064189 he8ydemaanxlrpftqxkez8jfpge1fsj None None None None 27784 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-12 23:28:11 FALSE "SporkBot" 12406635 TRUE 2 None None FALSE 820078679 0to17w9rth3url8n7gvucdtobybdq5h None None None None 27783 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-12 23:28:39 FALSE "SporkBot" 12406635 TRUE 2 None None FALSE 820078733 531dizmmloyxffbkdr5vph7owh921eg None None None None 27782 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-13 13:45:33 FALSE "Frietjes" 13791031 FALSE 2 None None FALSE 820177382 nik9p2u2fuk4yazjxt8ymbicxv5qid9 None None None TestCaseD 27757 "User:Editingaccount1994/sandbox"
+FALSE 56237369 2018-01-24 01:35:22 FALSE "CommonsDelinker" 2304267 FALSE 2 None None FALSE 822038928 gwk6pampl8si1v5pv3kwgteg710sfw3 None None None None 27667 "User:Editingaccount1994/sandbox"
+FALSE 56237370 2018-01-07 10:42:20 FALSE "PamD" 1368779 FALSE 0 None None FALSE 819091874 n4ozbsgle13p9yywtfrz982ccj8woc9 None None None None 25 "Anita del Rey"
+FALSE 56237371 2018-01-07 10:42:27 FALSE "ClueBot NG" 13286072 FALSE 3 None None FALSE 819091883 ksohnvsbeuzwpl5vb8a3v8m18hva0a7 None None None None 1274 "User talk:119.94.96.157"
+FALSE 56237372 2018-01-07 10:42:50 FALSE "Underbar dk" 677153 FALSE 14 None None FALSE 819091914 je7aw21fedbwyqsyofpisdrynsu7olr None None None None 113 "Category:Ohmi Railway"
+FALSE 56237375 2018-01-07 10:43:32 FALSE "TastyPoutine" 882433 FALSE 3 None None FALSE 819091968 cpm4tkzcx4hc6irr9ukbi06ogud8dtq None None None None 199 "User talk:92.226.219.222"
+FALSE 56237375 2018-01-07 11:10:24 FALSE "AnomieBOT" 7611264 TRUE 3 None None FALSE 819094036 artmfz8b2gxhb3pp8a5p4ksplxqfkpg None None None None 1840 "User talk:92.226.219.222"
+FALSE 56237375 2018-01-07 14:33:36 FALSE "Only" 702940 FALSE 3 None None FALSE 819112363 dn9wj0n8d8pdd5lqe56uw5xamupowr1 None None None None 2949 "User talk:92.226.219.222"
+FALSE 56237376 2018-01-07 10:44:01 FALSE "Dipayanacharya" 32794237 FALSE 2 None None FALSE 819092004 ofueugwatmmn7u73isw732neuza57gk None None None None 28 "User:Dipayanacharya"
+FALSE 56237376 2018-01-07 10:49:08 FALSE "Dipayanacharya" 32794237 FALSE 2 None None FALSE 819092390 dsz55xv96ec2uv6w9c1z7c52ipfovbw None None None None 38 "User:Dipayanacharya"
+FALSE 56237378 2018-01-07 10:44:56 FALSE "Vinegarymass911" 21516552 FALSE 0 None None FALSE 819092066 9ma38hak0ef1ew4fpiutxpnzd8oz1wd None None None None 65 "BSCIC"
+FALSE 56237379 2018-01-07 10:45:21 FALSE "BrownHairedGirl" 754619 FALSE 14 None None FALSE 819092102 4dvakoat58bzyf5hmtthxukt29hip6n None None None None 285 "Category:Women government ministers of Yemen"
+FALSE 56237381 2018-01-07 10:45:54 FALSE "PRehse" 410898 FALSE 1 None None FALSE 819092135 2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 None None None None 103 "Talk:List of Morning Glories Characters"
+FALSE 56237382 2018-01-07 10:45:56 FALSE "ClueBot NG" 13286072 FALSE 3 None None FALSE 819092138 3y9t5wpk6ur5jhone75rhm4wjf01fgi None None None None 1330 "User talk:106.207.126.114"
+FALSE 56237382 2018-01-07 10:50:22 FALSE "HindWIKI" 31190506 FALSE 3 None None FALSE 819092495 8wvn6vh3isyt0dorpe89lztrburgupe None None None None 2355 "User talk:106.207.126.114"
-#!/usr/bin/env python3
-
-# original wikiq headers are: title articleid revid date_time anon
-# editor editor_id minor text_size text_entropy text_md5 reversion
-# additions_size deletions_size
-
-import argparse
-import sys
-import os, os.path
-import re
-
-from subprocess import Popen, PIPE
-from collections import deque
-from hashlib import sha1
-
-from mwxml import Dump
-
-from deltas.tokenizers import wikitext_split
-import mwpersistence
-import mwreverts
-from urllib.parse import quote
-TO_ENCODE = ('title', 'editor')
-PERSISTENCE_RADIUS=7
-from deltas import SequenceMatcher
-from deltas import SegmentMatcher
-
-class PersistMethod:
- none = 0
- sequence = 1
- segment = 2
- legacy = 3
-
-def calculate_persistence(tokens_added):
- return(sum([(len(x.revisions)-1) for x in tokens_added]),
- len(tokens_added))
-
-
-class WikiqIterator():
- def __init__(self, fh, collapse_user=False):
- self.fh = fh
- self.collapse_user = collapse_user
- self.mwiterator = Dump.from_file(self.fh)
- self.namespace_map = { ns.id : ns.name for ns in
- self.mwiterator.site_info.namespaces }
- self.__pages = self.load_pages()
-
- def load_pages(self):
- for page in self.mwiterator:
- yield WikiqPage(page,
- namespace_map = self.namespace_map,
- collapse_user=self.collapse_user)
-
- def __iter__(self):
- return self.__pages
-
- def __next__(self):
- return next(self._pages)
-
-class WikiqPage():
- __slots__ = ('id', 'title', 'namespace', 'redirect',
- 'restrictions', 'mwpage', '__revisions',
- 'collapse_user')
-
- def __init__(self, page, namespace_map, collapse_user=False):
- self.id = page.id
- self.namespace = page.namespace
- # following mwxml, we assume namespace 0 in cases where
- # page.namespace is inconsistent with namespace_map
- if page.namespace not in namespace_map:
- self.title = page.title
- page.namespace = 0
- if page.namespace != 0:
- self.title = ':'.join([namespace_map[page.namespace], page.title])
- else:
- self.title = page.title
- self.restrictions = page.restrictions
- self.collapse_user = collapse_user
- self.mwpage = page
- self.__revisions = self.rev_list()
-
- def rev_list(self):
- # Outline for how we want to handle collapse_user=True
- # iteration rev.user prev_rev.user add prev_rev?
- # 0 A None Never
- # 1 A A False
- # 2 B A True
- # 3 A B True
- # 4 A A False
- # Post-loop A Always
- for i, rev in enumerate(self.mwpage):
- # never yield the first time
- if i == 0:
- if self.collapse_user:
- collapsed_revs = 1
- rev.collapsed_revs = collapsed_revs
-
- else:
- if self.collapse_user:
- # yield if this is the last edit in a seq by a user and reset
- # also yield if we do know who the user is
-
- if rev.deleted.user or prev_rev.deleted.user:
- yield prev_rev
- collapsed_revs = 1
- rev.collapsed_revs = collapsed_revs
-
- elif not rev.user.text == prev_rev.user.text:
- yield prev_rev
- collapsed_revs = 1
- rev.collapsed_revs = collapsed_revs
- # otherwise, add one to the counter
- else:
- collapsed_revs += 1
- rev.collapsed_revs = collapsed_revs
- # if collapse_user is false, we always yield
- else:
- yield prev_rev
-
- prev_rev = rev
-
- # also yield the final time
- yield prev_rev
-
- def __iter__(self):
- return self.__revisions
-
- def __next__(self):
- return next(self.__revisions)
-
-
-class RegexPair(object):
- def __init__(self, pattern, label):
- self.pattern = re.compile(pattern)
- self.label = label
- self.has_groups = bool(self.pattern.groupindex)
- if self.has_groups:
- self.capture_groups = list(self.pattern.groupindex.keys())
-
- def _make_key(self, cap_group):
- return ("{}_{}".format(self.label, cap_group))
-
- def matchmake(self, content, rev_data):
-
- temp_dict = {}
- # if there are named capture groups in the regex
- if self.has_groups:
-
- # if there are matches of some sort in this revision content, fill the lists for each cap_group
- if self.pattern.search(content) is not None:
- m = self.pattern.finditer(content)
- matchobjects = list(m)
-
- for cap_group in self.capture_groups:
- key = self._make_key(cap_group)
- temp_list = []
- for match in matchobjects:
- # we only want to add the match for the capture group if the match is not None
- if match.group(cap_group) != None:
- temp_list.append(match.group(cap_group))
-
- # if temp_list of matches is empty just make that column None
- if len(temp_list)==0:
- temp_dict[key] = None
- # else we put in the list we made in the for-loop above
- else:
- temp_dict[key] = ', '.join(temp_list)
-
- # there are no matches at all in this revision content, we default values to None
- else:
- for cap_group in self.capture_groups:
- key = self._make_key(cap_group)
- temp_dict[key] = None
-
- # there are no capture groups, we just search for all the matches of the regex
- else:
- #given that there are matches to be made
- if self.pattern.search(content) is not None:
- m = self.pattern.findall(content)
- temp_dict[self.label] = ', '.join(m)
- else:
- temp_dict[self.label] = None
- # update rev_data with our new columns
- rev_data.update(temp_dict)
- return rev_data
-
-
-class WikiqParser():
- def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
- """
- Parameters:
- persist : what persistence method to use. Takes a PersistMethod value
- """
- self.input_file = input_file
- self.output_file = output_file
- self.collapse_user = collapse_user
- self.persist = persist
- self.printed_header = False
- self.namespaces = []
- self.urlencode = urlencode
- self.revert_radius = revert_radius
-
- if namespaces is not None:
- self.namespace_filter = set(namespaces)
- else:
- self.namespace_filter = None
-
- self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
- self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
-
-
- def make_matchmake_pairs(self, patterns, labels):
- if (patterns is not None and labels is not None) and \
- (len(patterns) == len(labels)):
- return [RegexPair(pattern, label) for pattern, label in zip(patterns, labels)]
- elif (patterns is None and labels is None):
- return []
- else:
- sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
-
- def matchmake(self, rev, rev_data):
- rev_data = self.matchmake_revision(rev.text, rev_data)
- rev_data = self.matchmake_comment(rev.comment, rev_data)
- return rev_data
-
- def matchmake_revision(self, text, rev_data):
- return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
-
- def matchmake_comment(self, comment, rev_data):
- return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
-
- def matchmake_pairs(self, text, rev_data, pairs):
- for pair in pairs:
- rev_data = pair.matchmake(text, rev_data)
- return rev_data
-
- def __get_namespace_from_title(self, title):
- default_ns = None
-
- for ns in self.namespaces:
- # skip if the namespace is not defined
- if ns == None:
- default_ns = self.namespaces[ns]
- continue
-
- if title.startswith(ns + ":"):
- return self.namespaces[ns]
-
- # if we've made it this far with no matches, we return the default namespace
- return default_ns
-
-
- def process(self):
-
- # create a regex that creates the output filename
- # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
- # r'output/wikiq-\1-\2.tsv',
- # input_filename)
-
- # Construct dump file iterator
- dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
-
- # extract list of namspaces
- self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}
-
- page_count = 0
- rev_count = 0
-
-
- # Iterate through pages
- for page in dump:
- namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)
-
- # skip namespaces not in the filter
- if self.namespace_filter is not None:
- if namespace not in self.namespace_filter:
- continue
-
- rev_detector = mwreverts.Detector(radius = self.revert_radius)
-
- if self.persist != PersistMethod.none:
- window = deque(maxlen=PERSISTENCE_RADIUS)
-
- if self.persist == PersistMethod.sequence:
- state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
- revert_radius=PERSISTENCE_RADIUS)
-
- elif self.persist == PersistMethod.segment:
- state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
- revert_radius=PERSISTENCE_RADIUS)
-
- # self.persist == PersistMethod.legacy
- else:
- from mw.lib import persistence
- state = persistence.State()
-
- # Iterate through a page's revisions
- for rev in page:
-
- # initialize rev_data
- rev_data = {
- 'revid':rev.id,
- 'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
- 'articleid' : page.id,
- 'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
- 'title' : '"' + page.title + '"',
- 'namespace' : namespace,
- 'deleted' : "TRUE" if rev.deleted.text else "FALSE"
- }
-
- rev_data = self.matchmake(rev, rev_data)
-
- # if revisions are deleted, /many/ things will be missing
- if rev.deleted.text:
- rev_data['text_chars'] = ""
- rev_data['sha1'] = ""
- rev_data['revert'] = ""
- rev_data['reverteds'] = ""
-
- else:
- # rev.text can be None if the page has no text
- if not rev.text:
- rev.text = ""
- # if text exists, we'll check for a sha1 and generate one otherwise
-
- if rev.sha1:
- text_sha1 = rev.sha1
- else:
-
- text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
-
- rev_data['sha1'] = text_sha1
-
- # TODO rev.bytes doesn't work.. looks like a bug
- rev_data['text_chars'] = len(rev.text)
-
- # generate revert data
- revert = rev_detector.process(text_sha1, rev.id)
-
- if revert:
- rev_data['revert'] = "TRUE"
- rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
- else:
- rev_data['revert'] = "FALSE"
- rev_data['reverteds'] = ""
-
- # if the fact that the edit was minor can be hidden, this might be an issue
- rev_data['minor'] = "TRUE" if rev.minor else "FALSE"
-
- if not rev.deleted.user:
- # wrap user-defined editors in quotes for fread
- rev_data['editor'] = '"' + rev.user.text + '"'
- rev_data['anon'] = "TRUE" if rev.user.id == None else "FALSE"
-
- else:
- rev_data['anon'] = ""
- rev_data['editor'] = ""
-
- #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
- # redirect = True
- #else:
- # redirect = False
-
- #TODO missing: additions_size deletions_size
-
- # if collapse user was on, lets run that
- if self.collapse_user:
- rev_data['collapsed_revs'] = rev.collapsed_revs
-
- if self.persist != PersistMethod.none:
- if rev.deleted.text:
- for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
- old_rev_data[k] = None
- else:
-
- if self.persist != PersistMethod.legacy:
- _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
-
- else:
- _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
-
- window.append((rev.id, rev_data, tokens_added, tokens_removed))
-
- if len(window) == PERSISTENCE_RADIUS:
- old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
-
- num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
-
- old_rev_data["token_revs"] = num_token_revs
- old_rev_data["tokens_added"] = num_tokens
- old_rev_data["tokens_removed"] = len(old_tokens_removed)
- old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1
-
- self.print_rev_data(old_rev_data)
-
- else:
- self.print_rev_data(rev_data)
-
- rev_count += 1
-
- if self.persist != PersistMethod.none:
- # print out metadata for the last RADIUS revisions
- for i, item in enumerate(window):
- # if the window was full, we've already printed item 0
- if len(window) == PERSISTENCE_RADIUS and i == 0:
- continue
-
- rev_id, rev_data, tokens_added, tokens_removed = item
- num_token_revs, num_tokens = calculate_persistence(tokens_added)
-
- rev_data["token_revs"] = num_token_revs
- rev_data["tokens_added"] = num_tokens
- rev_data["tokens_removed"] = len(tokens_removed)
- rev_data["tokens_window"] = len(window)-(i+1)
-
- self.print_rev_data(rev_data)
-
- page_count += 1
-
- print("Done: %s revisions and %s pages." % (rev_count, page_count),
- file=sys.stderr)
-
- def print_rev_data(self, rev_data):
- # if it's the first time through, print the header
- if self.urlencode:
- for field in TO_ENCODE:
- rev_data[field] = quote(str(rev_data[field]))
-
- if not self.printed_header:
- print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
- self.printed_header = True
-
- print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
-
-
-def open_input_file(input_filename):
- if re.match(r'.*\.7z$', input_filename):
- cmd = ["7za", "x", "-so", input_filename, '*']
- elif re.match(r'.*\.gz$', input_filename):
- cmd = ["zcat", input_filename]
- elif re.match(r'.*\.bz2$', input_filename):
- cmd = ["bzcat", "-dk", input_filename]
-
- try:
- input_file = Popen(cmd, stdout=PIPE).stdout
- except NameError:
- input_file = open(input_filename, 'r')
-
- return input_file
-
-def open_output_file(input_filename):
- # create a regex that creates the output filename
- output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
- output_filename = re.sub(r'\.xml', '', output_filename)
- output_filename = output_filename + ".tsv"
- output_file = open(output_filename, "w")
-
- return output_file
-
-parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.')
-
-# arguments for the input direction
-parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
- help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")
-
-parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
- help="Directory for output files.")
-
-parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
- help="Write output to standard out (do not create dump file)")
-
-parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
- help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
-
-parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
- help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
-
-parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
- help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
-
-parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
- help="Id number of namspace to include. Can be specified more than once.")
-
-parser.add_argument('-rr',
- '--revert-radius',
- dest="revert_radius",
- type=int,
- action='store',
- default=15,
- help="Number of edits to check when looking for reverts (default: 15)")
-
-parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
- help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
-
-parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
- help="The label for the outputted column based on matching the regex in revision text.")
-
-parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
- help="The regular expression to search for in comments of revisions.")
-
-parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
- help="The label for the outputted column based on matching the regex in comments.")
-
-args = parser.parse_args()
-
-# set persistence method
-
-if args.persist is None:
- persist = PersistMethod.none
-elif args.persist == "segment":
- persist = PersistMethod.segment
-elif args.persist == "legacy":
- persist = PersistMethod.legacy
-else:
- persist = PersistMethod.sequence
-
-if args.namespace_filter is not None:
- namespaces = args.namespace_filter
-else:
- namespaces = None
-
-if len(args.dumpfiles) > 0:
- for filename in args.dumpfiles:
- input_file = open_input_file(filename)
-
- # open directory for output
- if args.output_dir:
- output_dir = args.output_dir[0]
- else:
- output_dir = "."
-
- print("Processing file: %s" % filename, file=sys.stderr)
-
- if args.stdout:
- output_file = sys.stdout
- else:
- filename = os.path.join(output_dir, os.path.basename(filename))
- output_file = open_output_file(filename)
-
- wikiq = WikiqParser(input_file,
- output_file,
- collapse_user=args.collapse_user,
- persist=persist,
- urlencode=args.urlencode,
- namespaces=namespaces,
- revert_radius=args.revert_radius,
- regex_match_revision = args.regex_match_revision,
- regex_revision_label = args.regex_revision_label,
- regex_match_comment = args.regex_match_comment,
- regex_comment_label = args.regex_comment_label)
-
- wikiq.process()
-
- # close things
- input_file.close()
- output_file.close()
-else:
- wikiq = WikiqParser(sys.stdin,
- sys.stdout,
- collapse_user=args.collapse_user,
- persist=persist,
- #persist_legacy=args.persist_legacy,
- urlencode=args.urlencode,
- namespaces=namespaces,
- revert_radius=args.revert_radius,
- regex_match_revision = args.regex_match_revision,
- regex_revision_label = args.regex_revision_label,
- regex_match_comment = args.regex_match_comment,
- regex_comment_label = args.regex_comment_label)
-
- wikiq.process()
-
-# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
-# stop_words = stop_words.split(",")
+#!/usr/bin/env python3\r
+\r
+# original wikiq headers are: title articleid revid date_time anon\r
+# editor editor_id minor text_size text_entropy text_md5 reversion\r
+# additions_size deletions_size\r
+\r
+import argparse\r
+import sys\r
+import os, os.path\r
+import re\r
+\r
+from subprocess import Popen, PIPE\r
+from collections import deque\r
+from hashlib import sha1\r
+\r
+from mwxml import Dump\r
+\r
+from deltas.tokenizers import wikitext_split\r
+import mwpersistence\r
+import mwreverts\r
+from urllib.parse import quote\r
+TO_ENCODE = ('title', 'editor')\r
+PERSISTENCE_RADIUS=7\r
+from deltas import SequenceMatcher\r
+from deltas import SegmentMatcher\r
+\r
class PersistMethod:
    # Enumeration of token-persistence strategies, selected by the
    # -p/--persistence command-line option (see argument parsing below).
    none = 0      # do not compute persistence measures
    sequence = 1  # mwpersistence DiffState with a SequenceMatcher (default)
    segment = 2   # mwpersistence DiffState with a SegmentMatcher
    legacy = 3    # old mw.lib.persistence State implementation
+\r
def calculate_persistence(tokens_added):
    """Summarize how long a revision's added tokens persisted.

    Returns a pair ``(token_revs, tokens_added_count)`` where
    ``token_revs`` is the total number of *later* revisions in which the
    added tokens survived (each token contributes ``len(revisions) - 1``),
    and ``tokens_added_count`` is simply the number of tokens added.
    """
    surviving_revs = 0
    for token in tokens_added:
        surviving_revs += len(token.revisions) - 1
    return (surviving_revs, len(tokens_added))
+\r
+\r
class WikiqIterator():
    """Iterate over the pages of a MediaWiki XML dump.

    Wraps an mwxml ``Dump`` and yields :class:`WikiqPage` objects,
    passing the dump's namespace id -> name map along to each page.
    """

    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = {ns.id: ns.name for ns in
                              self.mwiterator.site_info.namespaces}
        self.__pages = self.load_pages()

    def load_pages(self):
        # Generator producing one WikiqPage wrapper per page in the dump.
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map=self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        # BUG FIX: this used `self._pages`, but the attribute is the
        # name-mangled `__pages` (`_WikiqIterator__pages`), so calling
        # next() on the iterator raised AttributeError.
        return next(self.__pages)
+\r
class WikiqPage():
    """Wrap an mwxml page, optionally collapsing runs of consecutive
    edits by the same user into the final revision of each run.
    """
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            self.title = page.title
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        #
        # BUG FIX: prev_rev is now initialized so a page with zero
        # revisions no longer raises UnboundLocalError at the final yield.
        prev_rev = None
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs
            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    # also yield if we do know who the user is
                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif not rev.user.text == prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final time (skip when the page had no revisions)
        if prev_rev is not None:
            yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
+\r
+\r
class RegexPair(object):
    """A compiled regular expression paired with the output-column label
    used when reporting its matches in a revision row."""

    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        # Named capture groups each get their own output column.
        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def _make_key(self, cap_group):
        return ("{}_{}".format(self.label, cap_group))

    def matchmake(self, content, rev_data):
        """Search *content* and merge one column per label (or per named
        capture group) into *rev_data*; columns with no match get None.
        Returns the updated *rev_data*."""
        new_columns = {}

        if content == "":
            # Nothing to search: every column defaults to None.
            if self.has_groups:
                for group_name in self.capture_groups:
                    new_columns[self._make_key(group_name)] = None
            else:
                new_columns[self.label] = None

        elif self.has_groups:
            if self.pattern.search(content) is None:
                # No matches anywhere in this content.
                for group_name in self.capture_groups:
                    new_columns[self._make_key(group_name)] = None
            else:
                found = list(self.pattern.finditer(content))
                for group_name in self.capture_groups:
                    # Collect only the matches where this group participated.
                    values = [m.group(group_name) for m in found
                              if m.group(group_name) is not None]
                    key = self._make_key(group_name)
                    new_columns[key] = ', '.join(values) if values else None

        else:
            # No named capture groups: report every match of the whole regex.
            if self.pattern.search(content) is None:
                new_columns[self.label] = None
            else:
                new_columns[self.label] = ', '.join(self.pattern.findall(content))

        # merge the new columns into the revision row
        rev_data.update(new_columns)
        return rev_data
+\r
+ \r
class WikiqParser():
    """Stream a MediaWiki XML dump and print one TSV row per revision.

    Orchestrates namespace filtering, revert detection, optional
    collapsing of consecutive same-user edits, optional persistence
    measures, and optional regex-derived columns from revision text
    and comments.
    """

    def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
        """
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value
        """
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode
        self.revert_radius = revert_radius

        # namespace ids to include; None means "no filtering"
        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
        self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)

    def make_matchmake_pairs(self, patterns, labels):
        # Patterns and labels must be both absent or both present with
        # equal lengths; anything else is a usage error.
        if (patterns is not None and labels is not None) and \
           (len(patterns) == len(labels)):
            return [RegexPair(pattern, label) for pattern, label in zip(patterns, labels)]
        elif (patterns is None and labels is None):
            return []
        else:
            sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')

    def matchmake(self, rev, rev_data):
        # Normalize absent text/comment to "" so RegexPair.matchmake can
        # short-circuit explicitly on empty content.
        if not rev.text:
            rev.text = ""
        if not rev.comment:
            rev.comment = ""

        rev_data = self.matchmake_revision(rev.text, rev_data)
        rev_data = self.matchmake_comment(rev.comment, rev_data)
        return rev_data

    def matchmake_revision(self, text, rev_data):
        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)

    def matchmake_comment(self, comment, rev_data):
        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)

    def matchmake_pairs(self, content, rev_data, pairs):
        for pair in pairs:
            rev_data = pair.matchmake(content, rev_data)
        return rev_data

    def __get_namespace_from_title(self, title):
        # Map a page title to a namespace id using its "Name:" prefix.
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            # NOTE(review): a None key presumably marks the default
            # (article) namespace -- confirm against the dump's site info.
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns


    def process(self):
        """Iterate the dump, computing per-revision fields and writing
        TSV rows to self.output_file."""

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namspaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self.namespace_filter is not None:
                if namespace not in self.namespace_filter:
                    continue

            # one revert detector per page
            rev_detector = mwreverts.Detector(radius = self.revert_radius)

            if self.persist != PersistMethod.none:
                # Rows are buffered until PERSISTENCE_RADIUS later
                # revisions have been seen, so persistence can be measured.
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                # self.persist == PersistMethod.legacy
                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:

                # initialize rev_data
                rev_data = {
                    'revid':rev.id,
                    'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                    'articleid' : page.id,
                    'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
                    'title' : '"' + page.title + '"',
                    'namespace' : namespace,
                    'deleted' : "TRUE" if rev.deleted.text else "FALSE"
                }

                # add regex-derived columns
                rev_data = self.matchmake(rev, rev_data)

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""
                    # if text exists, we'll check for a sha1 and generate one otherwise

                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #TODO missing: additions_size deletions_size

                # if collapse user was on, lets run that
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist != PersistMethod.none:
                    if rev.deleted.text:
                        # BUG FIX: this previously assigned into old_rev_data,
                        # which is unbound on this path and raised NameError on
                        # the first deleted revision. The placeholder columns
                        # belong on the current revision's row.
                        # NOTE(review): deleted revisions are still never added
                        # to the window, so with persistence enabled their rows
                        # are not printed -- confirm whether that is intended.
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                    else:

                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            # the oldest row in the window now has a full
                            # radius of later revisions; finalize and print it
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        """Write one TSV row (and, the first time, the header) sorted by
        column name so the header and values stay aligned."""
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
+\r
+\r
def open_input_file(input_filename):
    """Open a dump file for reading, decompressing through an external
    tool when the filename extension calls for it; files with no
    recognized compression suffix are opened directly."""
    decompressors = (
        (r'.*\.7z$', ["7za", "x", "-so", input_filename, '*']),
        (r'.*\.gz$', ["zcat", input_filename]),
        (r'.*\.bz2$', ["bzcat", "-dk", input_filename]),
    )

    for suffix_pattern, command in decompressors:
        if re.match(suffix_pattern, input_filename):
            # stream the decompressor's stdout
            return Popen(command, stdout=PIPE).stdout

    # not compressed: read the file as-is
    return open(input_filename, 'r')
+\r
def open_output_file(input_filename):
    """Open (for writing) the .tsv file whose name is derived from the
    input dump's filename, with any compression suffix and the .xml
    extension stripped."""
    stem = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    stem = re.sub(r'\.xml', '', stem)
    return open(stem + ".tsv", "w")
+\r
# Command-line interface.  NOTE: help-string typos fixed ("delimitted",
# "defualt", "namspace", "This may by slow", etc.) -- these strings are
# user-visible output of --help.
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

# -p with no value gives const='' which downstream maps to the default
# "sequence" algorithm; omitting -p entirely leaves persist=None (disabled).
parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="Id number of namespace to include. Can be specified more than once.")

parser.add_argument('-rr',
                    '--revert-radius',
                    dest="revert_radius",
                    type=int,
                    action='store',
                    default=15,
                    help="Number of edits to check when looking for reverts (default: 15)")

parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
                    help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")

parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
                    help="The label for the outputted column based on matching the regex in revision text.")

parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
                    help="The regular expression to search for in comments of revisions.")

parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
                    help="The label for the outputted column based on matching the regex in comments.")

args = parser.parse_args()
+\r
# Map the --persistence flag onto a PersistMethod constant.  None means the
# flag was absent (persistence disabled); any unrecognized/empty value falls
# through to the default "sequence" algorithm.
if args.persist is None:
    persist = PersistMethod.none
else:
    persist = {
        "segment": PersistMethod.segment,
        "legacy": PersistMethod.legacy,
    }.get(args.persist, PersistMethod.sequence)

# A filter of None means "include every namespace"; argparse already yields
# None when -n was never given, so the value passes straight through.
namespaces = args.namespace_filter
+\r
if len(args.dumpfiles) > 0:
    # Resolve the output directory once; it is invariant across input files.
    if args.output_dir:
        output_dir = args.output_dir[0]
    else:
        output_dir = "."

    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file,
                            output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
                            regex_match_revision=args.regex_match_revision,
                            regex_revision_label=args.regex_revision_label,
                            regex_match_comment=args.regex_match_comment,
                            regex_comment_label=args.regex_comment_label)

        wikiq.process()

        # Close per-file handles -- but never sys.stdout: the original code
        # closed it after the first file under --stdout, breaking every
        # subsequent iteration (and any later print to stdout).
        input_file.close()
        if output_file is not sys.stdout:
            output_file.close()
else:
    # No dump files given: act as a filter from stdin to stdout.
    wikiq = WikiqParser(sys.stdin,
                        sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces=namespaces,
                        revert_radius=args.revert_radius,
                        regex_match_revision=args.regex_match_revision,
                        regex_revision_label=args.regex_revision_label,
                        regex_match_comment=args.regex_match_comment,
                        regex_comment_label=args.regex_comment_label)

    wikiq.process()
+# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"\r
+# stop_words = stop_words.split(",")\r