import os
import subprocess
import unittest
from io import StringIO
from shutil import copyfile

import pandas as pd

try:
    from pandas.testing import assert_frame_equal
except ImportError:  # pandas releases that predate the pandas.testing module
    from pandas.util.testing import assert_frame_equal
# with / without pwr DONE
# with / without url encode DONE
# with / without collapse user DONE
# with output to stdout DONE
# note that the persistence radius is 7 by default
# reading various file formats including
# 7z, gz, bz2, xml DONE
# wikia and wikipedia data DONE
class Test_Persistence(unittest.TestCase):
    """Check wikiq's token-persistence columns on the ``pwr-test`` dump.

    Each test runs ``../wikiq`` through the shell, keeps a renamed copy of
    the TSV it writes under ``test_output`` and compares that copy with
    hard-coded expected values and with the stored file in
    ``baseline_output``.
    """

    def setUp(self):
        """Create the output directory and derive the paths the tests share."""
        self.wiki = 'pwr-test'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _run_wikiq(self, test_filename, extra_args=""):
        """Run wikiq with ``extra_args`` and return the path of the kept copy.

        Raises:
            subprocess.CalledProcessError: if the command exits non-zero.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove a copy left by an earlier run so it cannot be read by mistake.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir) + extra_args
        # run() blocks until wikiq has exited, so the TSV is complete before
        # it is copied; check=True surfaces a failed run right here.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True, check=True)

        copyfile(self.call_output, test_file)
        return test_file

    def _check_persistence(self, mode, expected):
        """Run ``--persistence <mode>`` and verify the output.

        Args:
            mode: value passed to ``--persistence``; also the filename prefix.
            expected: maps a column name to its expected first five values.
        """
        test_filename = mode + "-" + self.wikiq_out_name
        print(os.path.abspath('.'))
        test_file = self._run_wikiq(
            test_filename, " --url-encode --persistence " + mode)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        test = pd.read_table(test_file)
        for column, values in expected.items():
            self.assertEqual(test[column].iloc[:5].tolist(), values, column)

        # as a test let's make sure that we get equal data frames
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_sequence_persistence(self):
        """The ``sequence`` mode yields the expected token counts."""
        self._check_persistence("sequence", {
            'tokens_added': [7, 10, 0, 8, 0],
            'tokens_removed': [0, 0, 0, 4, 0],
            'token_revs': [8 * 3, 0, 0, 0, 0],
        })

    def test_legacy_persistence(self):
        """The ``legacy`` mode yields the expected token counts."""
        self._check_persistence("legacy", {
            'tokens_added': [7, 10, 0, 11, 0],
            'tokens_removed': [0, 0, 0, 7, 0],
            'token_revs': [7 * 3, 0, 0, 0, 0],
        })
class Test_Persistence_Bug(unittest.TestCase):
    """Regression test: sequence persistence with collapsed users on ``enwiki-test``."""

    def setUp(self):
        """Create the output directory and derive the paths the test uses."""
        self.wiki = 'enwiki-test'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def test_sequence_persistence(self):
        """Output of ``--persistence sequence --collapse-user`` matches the baseline."""
        test_filename = "sequence-" + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove a copy left by an earlier run so it cannot be read by mistake.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + " --url-encode --persistence sequence --collapse-user"
        print(os.path.abspath('.'))

        # run() blocks until wikiq has exited, so the TSV is complete before
        # it is copied; check=True raises CalledProcessError on a failed run.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True, check=True)

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        # as a test let's make sure that we get equal data frames
        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
class Test_Wikipedia(unittest.TestCase):
    """Run wikiq on a bz2-compressed Wikipedia dump and compare with baselines."""

    def setUp(self):
        """Create the output directory and derive the paths the tests share."""
        self.wiki = 'ikwiki-20180301-pages-meta-history'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.bz2".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _run_wikiq(self, test_filename, extra_args=""):
        """Run wikiq with ``extra_args`` and return the path of the kept copy.

        Raises:
            subprocess.CalledProcessError: if the command exits non-zero.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove a copy left by an earlier run so it cannot be read by mistake.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir) + extra_args
        # run() blocks until wikiq has exited, so the TSV is complete before
        # it is copied; check=True surfaces a failed run right here.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True, check=True)

        copyfile(self.call_output, test_file)
        return test_file

    def test_WP_url_encode(self):
        """Output of ``--url-encode`` matches the stored baseline."""
        test_filename = "url-encode_" + self.wikiq_out_name
        test_file = self._run_wikiq(test_filename, " --url-encode")
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        # as a test let's make sure that we get equal data frames
        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_WP_namespaces(self):
        """``-n 0 -n 1`` keeps only namespaces 0 and 1 and matches the baseline."""
        print(os.path.abspath('.'))
        test_filename = "namespaces_" + self.wikiq_out_name
        test_file = self._run_wikiq(test_filename, " -n 0 -n 1")
        baseline_file = os.path.join(
            os.path.abspath("."), self.baseline_output_dir, test_filename)

        # as a test let's make sure that we get equal data frames
        test = pd.read_table(test_file)
        num_wrong_ns = sum(~test.namespace.isin({0, 1}))
        self.assertEqual(num_wrong_ns, 0)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
class Test_Basic(unittest.TestCase):
    """Run wikiq on the 7z-compressed ``sailormoon`` dump with various options.

    Every test runs ``../wikiq`` through the shell, keeps a renamed copy of
    the TSV it writes under ``test_output`` and compares that copy with the
    file of the same name in ``baseline_output``.
    """

    def setUp(self):
        """Create the output directory and derive the paths the tests share."""
        self.wiki = 'sailormoon'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _check_against_baseline(self, prefix, extra_args=""):
        """Run wikiq with ``extra_args`` and compare its output with the baseline.

        Args:
            prefix: prepended to the output name to form the name of both the
                kept copy and the baseline file.
            extra_args: text appended verbatim to the wikiq command line.

        Raises:
            subprocess.CalledProcessError: if the command exits non-zero.
        """
        test_filename = prefix + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove a copy left by an earlier run so it cannot be read by mistake.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir) + extra_args
        # run() blocks until wikiq has exited, so the TSV is complete before
        # it is copied; check=True surfaces a failed run right here.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True, check=True)

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_noargs(self):
        """No extra options."""
        self._check_against_baseline("noargs_")

    def test_collapse_user(self):
        """``--collapse-user``."""
        self._check_against_baseline("collapse-user_", " --collapse-user")

    def test_pwr_segment(self):
        """``--persistence segment``."""
        self._check_against_baseline("persistence_segment_", " --persistence segment")

    def test_pwr_legacy(self):
        """``--persistence legacy``."""
        self._check_against_baseline("persistence_legacy_", " --persistence legacy")

    def test_pwr(self):
        """``--persistence`` with no explicit mode."""
        self._check_against_baseline("persistence_", " --persistence")

    def test_url_encode(self):
        """``--url-encode``."""
        self._check_against_baseline("url-encode_", " --url-encode")
class Test_Malformed(unittest.TestCase):
    """Check how wikiq reports a parse failure on the ``twinpeaks`` dump."""

    def setUp(self):
        """Create the output directory and derive the paths the test uses."""
        self.wiki = 'twinpeaks'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)

    def test_malformed_noargs(self):
        """The last line wikiq writes to stderr is the expected ParseError."""
        call = self.base_call.format(self.input_file, self.test_output_dir)

        # No check=True: the stderr text is what is asserted on, whatever
        # the exit status is.
        proc = subprocess.run(
            call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)

        # Decode the bytes instead of splitting their repr(), which escapes
        # non-ASCII characters and depends on a trailing newline being present.
        errlines = proc.stderr.decode("utf8").splitlines()
        last_line = errlines[-1] if errlines else ""
        self.assertEqual(last_line, 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')
class Test_Stdout(unittest.TestCase):
    """Check that ``--stdout`` emits the same table as the stored baseline."""

    def setUp(self):
        """Derive the command template and fixture paths the test uses."""
        self.wiki = 'sailormoon'
        self.wikiq_out_name = self.wiki + ".tsv"

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} --stdout"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def test_noargs(self):
        """The table printed to stdout equals the ``noargs_`` baseline."""
        call = self.base_call.format(self.input_file)

        # check=True raises CalledProcessError on a failed run instead of
        # letting empty output reach the parser below.
        proc = subprocess.run(call, stdout=subprocess.PIPE, shell=True, check=True)
        outs = proc.stdout.decode("utf8")

        test_file = "noargs_" + self.wikiq_out_name
        baseline_file = os.path.join(".", self.baseline_output_dir, test_file)

        test = pd.read_table(StringIO(outs))
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()