X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/dba793c6ac595e7a5c0ac795575c28231f06f8cb..0c2d72b881b174621ef811a7c8cef4e5c1103e97:/test/Wikiq_Unit_Test.py?ds=sidebyside diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index e43ee7d..e893718 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -16,6 +16,45 @@ from io import StringIO # wikia and wikipedia data DONE # malformed xmls DONE +# class Test_Persistence_Bug(unittest.TestCase): + +# def setUp(self): +# if not os.path.exists("test_output"): +# os.mkdir("test_output") + +# self.wiki = 'enwiki-test' +# self.wikiq_out_name = self.wiki + ".tsv" +# self.test_output_dir = os.path.join(".", "test_output") +# self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) + +# self.infile = "{0}.xml".format(self.wiki) +# self.base_call = "../wikiq {0} -o {1}" +# self.input_dir = "dumps" +# self.input_file = os.path.join(".", self.input_dir,self.infile) +# self.baseline_output_dir = "baseline_output" + +# def test_segment_persistence(self): +# test_filename = "sequence-" + self.wikiq_out_name +# test_file = os.path.join(self.test_output_dir, test_filename) +# if os.path.exists(test_file): +# os.remove(test_file) + +# call = self.base_call.format(self.input_file, self.test_output_dir) +# call = call + " --url-encode --persistence sequence --collapse-user" +# print(os.path.abspath('.')) +# print(call) +# proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) +# proc.wait() + +# copyfile(self.call_output, test_file) +# baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) + +# # as a test let's make sure that we get equal data frames +# test = pd.read_table(test_file) +# baseline = pd.read_table(baseline_file) +# assert_frame_equal(test,baseline) + + class Test_Wikipedia(unittest.TestCase): def setUp(self): if not os.path.exists("test_output"): @@ -47,177 +86,220 @@ class Test_Wikipedia(unittest.TestCase): baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) # as a test let's make sure that we get equal data frames + test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) assert_frame_equal(test,baseline) - -class Test_Basic(unittest.TestCase): - - def setUp(self): - if not os.path.exists("test_output"): - os.mkdir("test_output") - - self.wiki = 'sailormoon' - self.wikiq_out_name = self.wiki + ".tsv" - self.test_output_dir = os.path.join(".", "test_output") - self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) - - self.infile = "{0}.xml.7z".format(self.wiki) - self.base_call = "../wikiq {0} -o {1}" - self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir,self.infile) - self.baseline_output_dir = "baseline_output" - - def test_noargs(self): - - test_filename = "noargs_" + self.wikiq_out_name + def test_WP_namespaces(self): + print(os.path.abspath('.')) + test_filename = "namespaces_" + self.wikiq_out_name test_file = os.path.join(self.test_output_dir, test_filename) if os.path.exists(test_file): os.remove(test_file) call = self.base_call.format(self.input_file, self.test_output_dir) + call = call + " -n 0 -n 1" + print(call) proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) proc.wait() +# copyfile(self.call_output, test_file) + baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename) - copyfile(self.call_output, test_file) - - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) - + # as a test let's make sure that we get equal data frames test = pd.read_table(test_file) + num_wrong_ns = sum(~ test.namespace.isin({0,1})) + self.assertEqual(num_wrong_ns, 0) baseline = pd.read_table(baseline_file) assert_frame_equal(test,baseline) - def test_collapse_user(self): - test_filename = "collapse-user_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) - if os.path.exists(test_file): - os.remove(test_file) - - call = self.base_call.format(self.input_file, self.test_output_dir) - call = call + " --collapse-user" +# class Test_Basic(unittest.TestCase): - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) - proc.wait() +# def setUp(self): +# if not os.path.exists("test_output"): +# os.mkdir("test_output") - copyfile(self.call_output, test_file) +# self.wiki = 'sailormoon' +# self.wikiq_out_name = self.wiki + ".tsv" +# self.test_output_dir = os.path.join(".", "test_output") +# self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline) +# self.infile = "{0}.xml.7z".format(self.wiki) +# self.base_call = "../wikiq {0} -o {1}" +# self.input_dir = "dumps" +# self.input_file = os.path.join(".", self.input_dir,self.infile) +# self.baseline_output_dir = "baseline_output" - def test_pwr_legacy(self): - test_filename = "persistence_legacy_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) - if os.path.exists(test_file): - os.remove(test_file) +# def test_noargs(self): + +# test_filename = "noargs_" + self.wikiq_out_name +# test_file = os.path.join(self.test_output_dir, test_filename) +# if os.path.exists(test_file): +# os.remove(test_file) - call = self.base_call.format(self.input_file, self.test_output_dir) - call = call + " --persistence-legacy" - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) - proc.wait() +# call = self.base_call.format(self.input_file, self.test_output_dir) +# proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) +# proc.wait() +# copyfile(self.call_output, test_file) - copyfile(self.call_output, test_file) +# baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) +# test = pd.read_table(test_file) +# baseline = pd.read_table(baseline_file) +# assert_frame_equal(test,baseline) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline) - def test_pwr(self): - test_filename = "persistence_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) - if os.path.exists(test_file): - os.remove(test_file) +# def test_collapse_user(self): +# test_filename = "collapse-user_" + self.wikiq_out_name +# test_file = os.path.join(self.test_output_dir, test_filename) +# if os.path.exists(test_file): +# os.remove(test_file) - call = self.base_call.format(self.input_file, self.test_output_dir) - call = call + " --persistence" - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) - proc.wait() +# call = self.base_call.format(self.input_file, self.test_output_dir) +# call = call + " --collapse-user" +# proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) +# proc.wait() - copyfile(self.call_output, test_file) - - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) +# copyfile(self.call_output, test_file) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline) +# baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) +# test = pd.read_table(test_file) +# baseline = pd.read_table(baseline_file) +# assert_frame_equal(test,baseline) - - def test_url_encode(self): - test_filename = "url-encode_" + self.wikiq_out_name - - test_file = os.path.join(self.test_output_dir, test_filename) - if os.path.exists(test_file): - os.remove(test_file) +# def test_pwr_segment(self): +# test_filename = "persistence_segment_" + self.wikiq_out_name +# test_file = os.path.join(self.test_output_dir, test_filename) +# if os.path.exists(test_file): +# os.remove(test_file) - call = self.base_call.format(self.input_file, self.test_output_dir) - call = call + " --url-encode" - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) - proc.wait() +# call = self.base_call.format(self.input_file, self.test_output_dir) +# call = call + " --persistence segment" +# print(call) +# proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) +# proc.wait() - copyfile(self.call_output, test_file) - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline) +# copyfile(self.call_output, test_file) -class Test_Malformed(unittest.TestCase): +# baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) - def setUp(self): - if not os.path.exists("test_output"): - os.mkdir("test_output") +# test = pd.read_table(test_file) +# print(test) +# baseline = pd.read_table(baseline_file) +# assert_frame_equal(test,baseline) - self.wiki = 'twinpeaks' - self.wikiq_out_name = self.wiki + ".tsv" - self.test_output_dir = os.path.join(".", "test_output") - self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) +# def test_pwr_legacy(self): +# test_filename = "persistence_legacy_" + self.wikiq_out_name +# test_file = os.path.join(self.test_output_dir, test_filename) +# if os.path.exists(test_file): +# os.remove(test_file) + +# call = self.base_call.format(self.input_file, self.test_output_dir) +# call = call + " --persistence legacy" +# proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) +# proc.wait() - self.infile = "{0}.xml.7z".format(self.wiki) - self.base_call = "../wikiq {0} -o {1}" - self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir,self.infile) +# copyfile(self.call_output, test_file) +# baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) - def test_malformed_noargs(self): +# test = pd.read_table(test_file) +# baseline = pd.read_table(baseline_file) +# assert_frame_equal(test,baseline) - call = self.base_call.format(self.input_file, self.test_output_dir) - proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True) - proc.wait() - outs, errs = proc.communicate() - errlines = str(errs).split("\\n") - self.assertEqual(errlines[-2],'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0') +# def test_pwr(self): +# test_filename = "persistence_" + self.wikiq_out_name +# test_file = os.path.join(self.test_output_dir, test_filename) +# if os.path.exists(test_file): +# os.remove(test_file) + +# call = self.base_call.format(self.input_file, self.test_output_dir) +# call = call + " --persistence" +# proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) +# proc.wait() -class Test_Stdout(unittest.TestCase): - def setUp(self): - self.wiki = 'sailormoon' - self.wikiq_out_name = self.wiki + ".tsv" +# copyfile(self.call_output, test_file) - self.infile = "{0}.xml.7z".format(self.wiki) - self.base_call = "../wikiq {0} --stdout" - self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir,self.infile) - self.baseline_output_dir = "baseline_output" +# baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) - def test_noargs(self): +# test = pd.read_table(test_file) +# baseline = pd.read_table(baseline_file) +# assert_frame_equal(test,baseline) - call = self.base_call.format(self.input_file) - proc = subprocess.run(call,stdout=subprocess.PIPE,shell=True) - outs = proc.stdout.decode("utf8") - test_file = "noargs_" + self.wikiq_out_name - baseline_file = os.path.join(".", self.baseline_output_dir, test_file) - print(baseline_file) - test = pd.read_table(StringIO(outs)) - baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline) +# def test_url_encode(self): +# test_filename = "url-encode_" + self.wikiq_out_name + +# test_file = os.path.join(self.test_output_dir, test_filename) +# if os.path.exists(test_file): +# os.remove(test_file) + +# call = self.base_call.format(self.input_file, self.test_output_dir) +# call = call + " --url-encode" +# proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) +# proc.wait() + +# copyfile(self.call_output, test_file) +# baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) +# test = pd.read_table(test_file) +# baseline = pd.read_table(baseline_file) +# assert_frame_equal(test,baseline) + + +# class Test_Malformed(unittest.TestCase): +# def setUp(self): +# if not os.path.exists("test_output"): +# os.mkdir("test_output") + +# self.wiki = 'twinpeaks' +# self.wikiq_out_name = self.wiki + ".tsv" +# self.test_output_dir = os.path.join(".", "test_output") +# self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) + +# self.infile = "{0}.xml.7z".format(self.wiki) +# self.base_call = "../wikiq {0} -o {1}" +# self.input_dir = "dumps" +# self.input_file = os.path.join(".", self.input_dir,self.infile) + + +# def test_malformed_noargs(self): + +# call = self.base_call.format(self.input_file, self.test_output_dir) +# proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True) +# proc.wait() +# outs, errs = proc.communicate() +# errlines = str(errs).split("\\n") +# self.assertEqual(errlines[-2],'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0') + +# class Test_Stdout(unittest.TestCase): + +# def setUp(self): +# self.wiki = 'sailormoon' +# self.wikiq_out_name = self.wiki + ".tsv" + +# self.infile = "{0}.xml.7z".format(self.wiki) +# self.base_call = "../wikiq {0} --stdout" +# self.input_dir = "dumps" +# self.input_file = os.path.join(".", self.input_dir,self.infile) +# self.baseline_output_dir = "baseline_output" + +# def test_noargs(self): + +# call = self.base_call.format(self.input_file) +# proc = subprocess.run(call,stdout=subprocess.PIPE,shell=True) +# outs = proc.stdout.decode("utf8") + +# test_file = "noargs_" + self.wikiq_out_name +# baseline_file = os.path.join(".", self.baseline_output_dir, test_file) +# print(baseline_file) +# test = pd.read_table(StringIO(outs)) +# baseline = pd.read_table(baseline_file) +# assert_frame_equal(test,baseline) if __name__ == '__main__': unittest.main()