4 from shutil import copyfile
6 from pandas.util.testing import assert_frame_equal
7 from io import StringIO
9 # with / without pwr DONE
10 # with / without url encode DONE
11 # with / without collapse user DONE
12 # with output to stdout DONE
13 # note that the persistence radius is 7 by default
14 # reading various file formats including
15 # 7z, gz, bz2, xml DONE
16 # wikia and wikipedia data DONE
class Test_Wikiq(unittest.TestCase):
    """Shared path and comparison helpers for the wikiq command-line tests.

    A subclass sets ``self.wiki`` (the base name of a dump), calls
    :meth:`setuptoutputfiles` to derive the input and output paths, builds a
    shell command line, and passes it to :meth:`run_and_check_output`.
    """

    def mkoutputdir(self) -> None:
        """Create the ``test_output`` directory unless it already exists."""
        # exist_ok avoids the window between an exists() check and mkdir().
        os.makedirs("test_output", exist_ok=True)

    def setuptoutputfiles(self, suffix: str = "xml.7z") -> None:
        """Derive every path attribute from ``self.wiki``.

        Args:
            suffix: Extension appended to ``self.wiki`` to form the name of
                the input file inside the ``dumps`` directory.
        """
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
        self.infile = "{0}.{1}".format(self.wiki, suffix)
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def run_and_check_output(self, call: str, test_filename: str) -> None:
        """Run ``call`` in a shell and compare its output with the baseline.

        The file the command is expected to leave at ``self.call_output`` is
        copied to ``test_filename`` inside the test output directory, and
        that copy is compared, as a data frame, with the file of the same
        name in the baseline directory.

        Args:
            call: Complete command line, executed with ``shell=True``.
            test_filename: File name used for both the copied result and the
                baseline it is compared against.

        Raises:
            AssertionError: If the command exits with a non-zero status or
                the two data frames differ.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # NOTE(review): the body of this check was not visible in the reviewed
        # extract; removing the stale copy is the evident intent, so a result
        # left by an earlier run can never satisfy the comparison.
        if os.path.exists(test_file):
            os.remove(test_file)

        with subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) as proc:
            # communicate() drains the pipe while waiting, so a chatty child
            # cannot block on a full stdout buffer, and the output file is
            # complete before it is copied.
            proc.communicate()
        self.assertEqual(
            proc.returncode, 0,
            "command exited with status {0}: {1}".format(proc.returncode, call))

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        # as a test let's make sure that we get equal data frames
        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
class Test_Wikipedia(Test_Wikiq):
    """Runs wikiq on the bz2-compressed ``ikwiki`` dump."""

    def setUp(self):
        """Prepare the paths and the command template for every test.

        NOTE(review): the ``def setUp`` header and the ``mkoutputdir()`` call
        were not visible in the reviewed extract. They are restored here
        because the statements below use ``self`` and the tests write into
        the output directory. Confirm against the full file.
        """
        # The paths below are relative, so show where the tests are run from.
        print(os.path.abspath("."))
        self.mkoutputdir()
        self.wiki = 'ikwiki-20180301-pages-meta-history'
        self.setuptoutputfiles(suffix="xml.bz2")
        self.base_call = "../bin/wikiq {0} -o {1}"

    def test_WP_url_encode(self):
        """Compare the ``--url-encode`` output with its stored baseline."""
        test_filename = "url-encode_" + self.wikiq_out_name
        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + " --url-encode"
        self.run_and_check_output(call, test_filename)
class Test_Basic(Test_Wikiq):
    """Runs wikiq on the ``sailormoon`` dump, once per command-line option."""

    def setUp(self):
        """Prepare the paths and the command template for every test.

        NOTE(review): the ``def setUp`` header and the ``mkoutputdir()`` call
        were not visible in the reviewed extract. They are restored here
        because the statements below use ``self`` and the tests write into
        the output directory. Confirm against the full file.
        """
        self.mkoutputdir()
        self.wiki = "sailormoon"
        self.setuptoutputfiles()
        self.base_call = "../bin/wikiq {0} -o {1}"

    def _run_variant(self, prefix, flag=None):
        """Run wikiq with an optional extra flag and check ``prefix`` + output name."""
        test_filename = prefix + self.wikiq_out_name
        call = self.base_call.format(self.input_file, self.test_output_dir)
        if flag is not None:
            call = call + " " + flag
        self.run_and_check_output(call, test_filename)

    def test_noargs(self):
        """Check the output produced without any extra option."""
        self._run_variant("noargs_")

    def test_collapse_user(self):
        """Check the output produced with ``--collapse-user``."""
        self._run_variant("collapse-user_", "--collapse-user")

    def test_pwr_legacy(self):
        """Check the output produced with ``--persistence-legacy``."""
        self._run_variant("persistence_legacy_", "--persistence-legacy")

    # NOTE(review): in the reviewed extract the ``--persistence`` case followed
    # test_pwr_legacy with no ``def`` line of its own. It is given its own test
    # so the two options are reported separately; the method name is assumed.
    def test_pwr(self):
        """Check the output produced with ``--persistence``."""
        self._run_variant("persistence_", "--persistence")

    def test_url_encode(self):
        """Check the output produced with ``--url-encode``."""
        self._run_variant("url-encode_", "--url-encode")
class Test_Malformed(Test_Wikiq):
    """Runs wikiq on the ``twinpeaks`` dump and checks the reported parse error."""

    def setUp(self):
        """Prepare the paths and the command template for every test.

        NOTE(review): the ``def setUp`` header and the ``mkoutputdir()`` call
        were not visible in the reviewed extract. They are restored here
        because the statements below use ``self``. Confirm against the full
        file.
        """
        self.mkoutputdir()
        self.wiki = "twinpeaks"
        self.setuptoutputfiles()
        self.base_call = "../bin/wikiq {0} -o {1}"

    def test_malformed_noargs(self):
        """Expect the last line written to stderr to be the XML parse error."""
        call = self.base_call.format(self.input_file, self.test_output_dir)
        with subprocess.Popen(call, stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE, shell=True) as proc:
            _, errs = proc.communicate()

        # Decode the bytes instead of splitting their repr(): the repr form
        # changes with quotes or non-ASCII bytes in the message, and indexing
        # it with [-2] only works when stderr ends in exactly one newline.
        errlines = errs.decode("utf-8", errors="replace").strip().splitlines()
        self.assertTrue(errlines, "wikiq wrote nothing to stderr")
        self.assertEqual(errlines[-1], 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')
class Test_Stdout(Test_Wikiq):
    """Runs wikiq with ``--stdout`` on the ``sailormoon`` dump."""

    def setUp(self):
        """Prepare the paths for every test.

        NOTE(review): the ``def setUp`` header and the ``mkoutputdir()`` call
        were not visible in the reviewed extract. They are restored here
        because the statements below use ``self``. Confirm against the full
        file.
        """
        self.mkoutputdir()
        self.wiki = 'sailormoon'
        self.setuptoutputfiles()

    def test_noargs(self):
        """Compare the table written to stdout with the ``noargs`` baseline."""
        self.base_call = ["../bin/wikiq", self.input_file, "--stdout"]
        with subprocess.Popen(self.base_call, stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE, encoding='utf-8') as proc:
            # communicate() reads both pipes, so output on stderr cannot
            # stall the child while stdout is being collected.
            captured, _ = proc.communicate()

        # NOTE(review): the assignment of ``outs`` was not visible in the
        # reviewed extract. Wrapping the captured text in StringIO lets
        # read_table consume it like a file.
        outs = StringIO(captured)

        test_file = "noargs_" + self.wikiq_out_name
        baseline_file = os.path.join(".", self.baseline_output_dir, test_file)
        test = pd.read_table(outs)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
143 if __name__ == '__main__':