4 from shutil import copyfile
6 from pandas.util.testing import assert_frame_equal
7 from io import StringIO
9 # with / without pwr DONE
10 # with / without url encode DONE
11 # with / without collapse user DONE
12 # with output to stdout DONE
13 # note that the persistence radius is 7 by default
14 # reading various file formats including
15 # 7z, gz, bz2, xml DONE
16 # wikia and wikipedia data DONE
class Test_Persistence(unittest.TestCase):
    """Regression tests for wikiq's ``--persistence`` modes on the ``pwr-test`` dump.

    Every test runs ``../wikiq`` through the shell with ``-o test_output``,
    keeps a copy of the resulting TSV under a test-specific name, spot-checks
    the first five rows of the token columns and finally compares the whole
    table with the baseline file of the same name.
    """

    def setUp(self):
        """Create the output directory and derive every path the tests use."""
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'pwr-test'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _run_wikiq(self, test_filename, extra_args):
        """Run wikiq and return its output as a DataFrame.

        Args:
            test_filename: name under ``self.test_output_dir`` that the
                output found at ``self.call_output`` is copied to.
            extra_args: text appended verbatim to the base command line
                (include the leading space).

        Raises:
            subprocess.CalledProcessError: if wikiq exits with a non-zero
                status.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # NOTE(review): the reviewed text showed this guard without a body;
        # deleting the copy left by an earlier run is the assumed intent --
        # confirm.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir) + extra_args
        # run() waits for wikiq to exit and drains the pipe; check=True turns
        # a failed run into an immediate error instead of letting copyfile()
        # pick up a missing or stale output file.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True, check=True)

        copyfile(self.call_output, test_file)
        return pd.read_table(test_file)

    def _assert_leading_values(self, frame, expected):
        """Assert that each column in ``expected`` starts with the given values.

        Args:
            frame: DataFrame read from the wikiq output.
            expected: mapping of column name to the list of values the
                first ``len(list)`` rows must hold.
        """
        for column, values in expected.items():
            actual = frame[column].iloc[:len(values)].tolist()
            self.assertEqual(actual, values, column)

    def _assert_matches_baseline(self, frame, test_filename):
        """Compare ``frame`` with the baseline table named ``test_filename``."""
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(frame, baseline)

    def test_sequence_persistence(self):
        """``--persistence sequence`` with URL encoding."""
        test_filename = "sequence-" + self.wikiq_out_name
        test = self._run_wikiq(test_filename, " --url-encode --persistence sequence")

        self._assert_leading_values(test, {
            'tokens_added': [7, 10, 0, 8, 0],
            'tokens_removed': [0, 0, 0, 4, 0],
            'token_revs': [8 * 3, 0, 0, 0, 0],
        })
        self._assert_matches_baseline(test, test_filename)

    def test_legacy_persistence(self):
        """``--persistence legacy`` with URL encoding."""
        test_filename = "legacy-" + self.wikiq_out_name
        test = self._run_wikiq(test_filename, " --url-encode --persistence legacy")

        self._assert_leading_values(test, {
            'tokens_added': [7, 10, 0, 8, 0],
            'tokens_removed': [0, 0, 10, 4, 0],
            'token_revs': [8 * 3, 0, 0, 0, 0],
        })
        self._assert_matches_baseline(test, test_filename)

    def test_segment_persistence_exclude_ws(self):
        """``--persistence segment --exclude-whitespace`` with URL encoding."""
        test_filename = "segment_excludews_" + self.wikiq_out_name
        test = self._run_wikiq(
            test_filename,
            " --url-encode --persistence segment --exclude-whitespace",
        )

        self._assert_leading_values(test, {
            'tokens_added': [4, 5, 0, 6, 0],
            'tokens_removed': [0, 0, 0, 4, 0],
            'token_revs': [4 * 3, 0, 0, 0, 0],
        })
        self._assert_matches_baseline(test, test_filename)
class Test_Persistence_Bug(unittest.TestCase):
    """Baseline comparison for sequence persistence on the ``enwiki-test`` dump.

    NOTE(review): the class name suggests a guard against a past persistence
    bug; the bug itself is not described in this file.
    """

    def setUp(self):
        """Create the output directory and derive every path the test uses."""
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'enwiki-test'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def test_sequence_persistence(self):
        """``--persistence sequence --collapse-user`` must match the baseline.

        Raises:
            subprocess.CalledProcessError: if wikiq exits with a non-zero
                status.
        """
        test_filename = "sequence-" + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # NOTE(review): the reviewed text showed this guard without a body;
        # deleting the copy left by an earlier run is the assumed intent --
        # confirm.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + " --url-encode --persistence sequence --collapse-user"
        # run() waits for wikiq to exit and drains the pipe; check=True turns
        # a failed run into an immediate error instead of letting copyfile()
        # pick up a missing or stale output file.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True, check=True)

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
class Test_Wikipedia(unittest.TestCase):
    """Baseline comparisons on the bz2-compressed ``ikwiki`` history dump."""

    def setUp(self):
        """Create the output directory and derive every path the tests use."""
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'ikwiki-20180301-pages-meta-history'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.bz2".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _run_wikiq(self, test_filename, extra_args):
        """Run wikiq and return its output as a DataFrame.

        Args:
            test_filename: name under ``self.test_output_dir`` that the
                output found at ``self.call_output`` is copied to.
            extra_args: text appended verbatim to the base command line
                (include the leading space).

        Raises:
            subprocess.CalledProcessError: if wikiq exits with a non-zero
                status.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # NOTE(review): the reviewed text showed this guard without a body;
        # deleting the copy left by an earlier run is the assumed intent --
        # confirm.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir) + extra_args
        # run() waits for wikiq to exit and drains the pipe; check=True turns
        # a failed run into an immediate error instead of letting copyfile()
        # pick up a missing or stale output file.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True, check=True)

        copyfile(self.call_output, test_file)
        return pd.read_table(test_file)

    def _assert_matches_baseline(self, frame, test_filename):
        """Compare ``frame`` with the baseline table named ``test_filename``."""
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(frame, baseline)

    def test_WP_url_encode(self):
        """``--url-encode`` output must match the baseline."""
        test_filename = "url-encode_" + self.wikiq_out_name
        test = self._run_wikiq(test_filename, " --url-encode")
        self._assert_matches_baseline(test, test_filename)

    def test_WP_namespaces(self):
        """``-n 0 -n 1`` must emit only rows from namespaces 0 and 1."""
        test_filename = "namespaces_" + self.wikiq_out_name
        test = self._run_wikiq(test_filename, " -n 0 -n 1")

        # Count the rows whose namespace is outside the requested set.
        num_wrong_ns = int((~test.namespace.isin({0, 1})).sum())
        self.assertEqual(num_wrong_ns, 0)
        self._assert_matches_baseline(test, test_filename)
class Test_Basic(unittest.TestCase):
    """Baseline comparisons on the 7z-compressed ``sailormoon`` dump.

    Each test runs ``../wikiq`` with one combination of command-line flags
    and compares the resulting table with the stored baseline of the same
    name.
    """

    def setUp(self):
        """Create the output directory and derive every path the tests use."""
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'sailormoon'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _check_against_baseline(self, prefix, extra_args=""):
        """Run wikiq with ``extra_args`` and compare its output to the baseline.

        Args:
            prefix: text prepended to ``self.wikiq_out_name`` to form the
                name shared by the kept output copy and the baseline file.
            extra_args: text appended verbatim to the base command line
                (include the leading space).

        Raises:
            subprocess.CalledProcessError: if wikiq exits with a non-zero
                status.
        """
        test_filename = prefix + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # NOTE(review): the reviewed text showed this guard without a body;
        # deleting the copy left by an earlier run is the assumed intent --
        # confirm.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir) + extra_args
        # run() waits for wikiq to exit and drains the pipe; check=True turns
        # a failed run into an immediate error instead of letting copyfile()
        # pick up a missing or stale output file.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True, check=True)

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_noargs(self):
        """No extra flags."""
        self._check_against_baseline("noargs_")

    def test_collapse_user(self):
        """``--collapse-user``."""
        self._check_against_baseline("collapse-user_", " --collapse-user")

    def test_pwr_segment(self):
        """``--persistence segment``."""
        self._check_against_baseline("persistence_segment_", " --persistence segment")

    def test_pwr_segment_collapse(self):
        """``--persistence segment --collapse-user``."""
        self._check_against_baseline(
            "persistence_segment_collapse_",
            " --persistence segment --collapse-user",
        )

    def test_pwr_legacy(self):
        """``--persistence legacy``."""
        self._check_against_baseline("persistence_legacy_", " --persistence legacy")

    # NOTE(review): the ``def`` line of this test was not visible in the
    # reviewed text; the name follows the sibling ``test_pwr_*`` tests --
    # confirm.
    def test_pwr(self):
        """``--persistence`` without an explicit mode."""
        self._check_against_baseline("persistence_", " --persistence")

    def test_url_encode(self):
        """``--url-encode``."""
        self._check_against_baseline("url-encode_", " --url-encode")
class Test_Malformed(unittest.TestCase):
    """Check how wikiq reports a truncated XML dump (``twinpeaks``)."""

    def setUp(self):
        """Create the output directory and derive every path the test uses."""
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'twinpeaks'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)

    def test_malformed_noargs(self):
        """The last stderr line must be the ElementTree parse error."""
        call = self.base_call.format(self.input_file, self.test_output_dir)

        # No check=True here: the assertion below expects wikiq to report a
        # parse error, so a failing exit status is not treated as a test error.
        proc = subprocess.run(
            call,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=True,
        )

        # Decode the captured bytes instead of parsing their repr(), which
        # broke on quotes, non-ASCII bytes and a missing trailing newline.
        errlines = proc.stderr.decode("utf8").splitlines()
        self.assertTrue(errlines, "wikiq wrote nothing to stderr")
        self.assertEqual(errlines[-1],'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')
class Test_Stdout(unittest.TestCase):
    """Check that ``wikiq --stdout`` emits the same table as the file baseline."""

    def setUp(self):
        """Derive the command line and the paths the test uses."""
        self.wiki = 'sailormoon'
        self.wikiq_out_name = self.wiki + ".tsv"

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} --stdout"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def test_noargs(self):
        """Parse wikiq's stdout as a table and compare it with the baseline.

        Raises:
            subprocess.CalledProcessError: if wikiq exits with a non-zero
                status.
        """
        call = self.base_call.format(self.input_file)

        # check=True reports a failed wikiq run directly instead of as a
        # confusing parse or comparison error on partial output.
        proc = subprocess.run(call, stdout=subprocess.PIPE, shell=True, check=True)
        outs = proc.stdout.decode("utf8")

        test_file = "noargs_" + self.wikiq_out_name
        baseline_file = os.path.join(".", self.baseline_output_dir, test_file)

        test = pd.read_table(StringIO(outs))
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
469 if __name__ == '__main__':