]> code.communitydata.science - mediawiki_dump_tools.git/blob - test/Wikiq_Unit_Test.py
add flag for excluding whitespace and punctuation
[mediawiki_dump_tools.git] / test / Wikiq_Unit_Test.py
import os
import subprocess
import unittest
from io import StringIO
from shutil import copyfile

import pandas as pd

try:
    # Public location of the testing helpers; pandas.util.testing was
    # deprecated in pandas 1.0 and removed in pandas 2.0.
    from pandas.testing import assert_frame_equal
except ImportError:
    # Very old pandas releases only provide the legacy module path.
    from pandas.util.testing import assert_frame_equal
8
# with / without pwr DONE
# with / without url encode DONE
# with / without collapse user DONE
# with output to stdout DONE
# note that the persistence radius is 7 by default
# reading various file formats including
#        7z, gz, bz2, xml  DONE
# wikia and wikipedia data DONE
# malformed XML DONE
18
class Test_Persistence(unittest.TestCase):
    """Check wikiq's persistence columns on the ``pwr-test`` dump.

    Each test runs ``../wikiq`` with one ``--persistence`` mode, spot-checks
    the first five rows of ``tokens_added``, ``tokens_removed`` and
    ``token_revs``, and then compares the whole table with the stored
    baseline.  All paths are relative to the current working directory.
    """

    def setUp(self):
        """Create the output directory and derive the paths used by the tests."""
        self.wiki = 'pwr-test'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        # The tests expect wikiq to write "<wiki>.tsv" into the -o directory.
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _run_wikiq(self, extra_args, test_filename):
        """Run wikiq and return its output as a DataFrame.

        *extra_args* is appended verbatim to the base command line.  The
        file wikiq produces is copied to *test_filename* inside the output
        directory and that copy is what gets parsed.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove leftovers first: every test in this class shares
        # self.call_output, so a failed run must not fall back to the file
        # written by an earlier test.
        for stale in (test_file, self.call_output):
            if os.path.exists(stale):
                os.remove(stale)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + extra_args
        print(os.path.abspath('.'))
        print(call)
        # subprocess.run() drains the pipe while waiting; Popen.wait() with
        # stdout=PIPE can deadlock once the child fills the pipe buffer.
        proc = subprocess.run(call, stdout=subprocess.PIPE, shell=True)
        self.assertEqual(
            proc.returncode, 0,
            "wikiq exited with status {0}".format(proc.returncode))

        copyfile(self.call_output, test_file)
        return pd.read_table(test_file)

    def _assert_leading_values(self, test, column, expected):
        """Assert that *column* of *test* starts with the values in *expected*."""
        for row, value in enumerate(expected):
            self.assertEqual(
                test[column][row], value,
                "{0}[{1}]".format(column, row))

    def _assert_matches_baseline(self, test, test_filename):
        """Compare *test* with the baseline table stored under *test_filename*."""
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_sequence_persistence(self):
        """``--persistence sequence`` yields the expected token counts."""
        test_filename = "sequence-" + self.wikiq_out_name
        test = self._run_wikiq(" --url-encode --persistence sequence", test_filename)

        self._assert_leading_values(test, 'tokens_added', [7, 10, 0, 8, 0])
        self._assert_leading_values(test, 'tokens_removed', [0, 0, 0, 4, 0])
        self._assert_leading_values(test, 'token_revs', [8*3, 0, 0, 0, 0])

        # as a test let's make sure that we get equal data frames
        self._assert_matches_baseline(test, test_filename)

    def test_legacy_persistence(self):
        """``--persistence legacy`` yields the expected token counts."""
        test_filename = "legacy-" + self.wikiq_out_name
        test = self._run_wikiq(" --url-encode --persistence legacy", test_filename)

        self._assert_leading_values(test, 'tokens_added', [7, 10, 0, 8, 0])
        self._assert_leading_values(test, 'tokens_removed', [0, 0, 10, 4, 0])
        self._assert_leading_values(test, 'token_revs', [8*3, 0, 0, 0, 0])

        # as a test let's make sure that we get equal data frames
        self._assert_matches_baseline(test, test_filename)

    def test_segment_persistence_exclude_ws(self):
        """``--persistence segment --exclude-whitespace`` yields the expected counts."""
        test_filename = "segment_excludews_" + self.wikiq_out_name
        test = self._run_wikiq(
            " --url-encode --persistence segment --exclude-whitespace",
            test_filename)

        self._assert_leading_values(test, 'tokens_added', [4, 5, 0, 6, 0])
        self._assert_leading_values(test, 'tokens_removed', [0, 0, 0, 4, 0])
        self._assert_leading_values(test, 'token_revs', [4*3, 0, 0, 0, 0])

        # as a test let's make sure that we get equal data frames
        self._assert_matches_baseline(test, test_filename)
149
150
151
class Test_Persistence_Bug(unittest.TestCase):
    """Baseline comparison for sequence persistence on the ``enwiki-test`` dump.

    Runs ``../wikiq`` with ``--persistence sequence --collapse-user`` and
    compares the resulting table with the stored baseline.  All paths are
    relative to the current working directory.
    """

    def setUp(self):
        """Create the output directory and derive the paths used by the test."""
        self.wiki = 'enwiki-test'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        # The test expects wikiq to write "<wiki>.tsv" into the -o directory.
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def test_sequence_persistence(self):
        """Output of the sequence/collapse-user run equals the baseline table."""
        test_filename = "sequence-" + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove leftovers so a failed wikiq run cannot be masked by files
        # from a previous run.
        for stale in (test_file, self.call_output):
            if os.path.exists(stale):
                os.remove(stale)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + " --url-encode --persistence sequence --collapse-user"
        print(os.path.abspath('.'))
        print(call)
        # subprocess.run() drains the pipe while waiting; Popen.wait() with
        # stdout=PIPE can deadlock once the child fills the pipe buffer.
        proc = subprocess.run(call, stdout=subprocess.PIPE, shell=True)
        self.assertEqual(
            proc.returncode, 0,
            "wikiq exited with status {0}".format(proc.returncode))

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        # as a test let's make sure that we get equal data frames
        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
189
190
class Test_Wikipedia(unittest.TestCase):
    """Baseline comparisons on the bz2-compressed ``ikwiki`` history dump.

    Covers the ``--url-encode`` flag and namespace filtering with ``-n``.
    All paths are relative to the current working directory.
    """

    def setUp(self):
        """Create the output directory and derive the paths used by the tests."""
        self.wiki = 'ikwiki-20180301-pages-meta-history'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        # The tests expect wikiq to write "<wiki>.tsv" into the -o directory.
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.bz2".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _run_wikiq(self, extra_args, test_filename):
        """Run wikiq and return its output as a DataFrame.

        *extra_args* is appended verbatim to the base command line.  The
        file wikiq produces is copied to *test_filename* inside the output
        directory and that copy is what gets parsed.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Both tests share self.call_output; delete leftovers so a failed
        # run cannot silently reuse the other test's output.
        for stale in (test_file, self.call_output):
            if os.path.exists(stale):
                os.remove(stale)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + extra_args
        print(call)
        # subprocess.run() drains the pipe while waiting; Popen.wait() with
        # stdout=PIPE can deadlock once the child fills the pipe buffer.
        proc = subprocess.run(call, stdout=subprocess.PIPE, shell=True)
        self.assertEqual(
            proc.returncode, 0,
            "wikiq exited with status {0}".format(proc.returncode))

        copyfile(self.call_output, test_file)
        return pd.read_table(test_file)

    def _assert_matches_baseline(self, test, test_filename):
        """Compare *test* with the baseline table stored under *test_filename*."""
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_WP_url_encode(self):
        """``--url-encode`` output equals the baseline table."""
        test_filename = "url-encode_" + self.wikiq_out_name
        test = self._run_wikiq(" --url-encode", test_filename)

        # as a test let's make sure that we get equal data frames
        self._assert_matches_baseline(test, test_filename)

    def test_WP_namespaces(self):
        """``-n 0 -n 1`` keeps only namespaces 0 and 1 and equals the baseline."""
        print(os.path.abspath('.'))
        test_filename = "namespaces_" + self.wikiq_out_name
        test = self._run_wikiq(" -n 0 -n 1", test_filename)

        # No row may belong to a namespace other than the two requested.
        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
        self.assertEqual(num_wrong_ns, 0)

        # as a test let's make sure that we get equal data frames
        self._assert_matches_baseline(test, test_filename)
249
250
class Test_Basic(unittest.TestCase):
    """Baseline comparisons on the 7z-compressed ``sailormoon`` dump.

    Every test runs ``../wikiq`` with a different set of flags and compares
    the resulting table with the matching file in ``baseline_output``.  All
    paths are relative to the current working directory.
    """

    def setUp(self):
        """Create the output directory and derive the paths used by the tests."""
        self.wiki = 'sailormoon'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        # The tests expect wikiq to write "<wiki>.tsv" into the -o directory.
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _run_wikiq(self, extra_args, test_filename):
        """Run wikiq and return its output as a DataFrame.

        *extra_args* is appended verbatim to the base command line.  The
        file wikiq produces is copied to *test_filename* inside the output
        directory and that copy is what gets parsed.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # All tests share self.call_output; delete leftovers so a failed run
        # cannot silently reuse the output of an earlier test.
        for stale in (test_file, self.call_output):
            if os.path.exists(stale):
                os.remove(stale)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + extra_args
        print(call)
        # subprocess.run() drains the pipe while waiting; Popen.wait() with
        # stdout=PIPE can deadlock once the child fills the pipe buffer.
        proc = subprocess.run(call, stdout=subprocess.PIPE, shell=True)
        self.assertEqual(
            proc.returncode, 0,
            "wikiq exited with status {0}".format(proc.returncode))

        copyfile(self.call_output, test_file)
        return pd.read_table(test_file)

    def _assert_matches_baseline(self, test, test_filename):
        """Compare *test* with the baseline table stored under *test_filename*."""
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_noargs(self):
        """Output without extra flags equals the baseline."""
        test_filename = "noargs_" + self.wikiq_out_name
        test = self._run_wikiq("", test_filename)
        self._assert_matches_baseline(test, test_filename)

    def test_collapse_user(self):
        """``--collapse-user`` output equals the baseline."""
        test_filename = "collapse-user_" + self.wikiq_out_name
        test = self._run_wikiq(" --collapse-user", test_filename)
        self._assert_matches_baseline(test, test_filename)

    def test_pwr_segment(self):
        """``--persistence segment`` output equals the baseline."""
        test_filename = "persistence_segment_" + self.wikiq_out_name
        test = self._run_wikiq(" --persistence segment", test_filename)
        print(test)
        self._assert_matches_baseline(test, test_filename)

    def test_pwr_segment_collapse(self):
        """``--persistence segment --collapse-user`` output equals the baseline."""
        test_filename = "persistence_segment_collapse_" + self.wikiq_out_name
        test = self._run_wikiq(" --persistence segment --collapse-user", test_filename)
        print(test)
        self._assert_matches_baseline(test, test_filename)

    def test_pwr_legacy(self):
        """``--persistence legacy`` output equals the baseline."""
        test_filename = "persistence_legacy_" + self.wikiq_out_name
        test = self._run_wikiq(" --persistence legacy", test_filename)
        self._assert_matches_baseline(test, test_filename)

    def test_pwr(self):
        """Bare ``--persistence`` output equals the baseline."""
        test_filename = "persistence_" + self.wikiq_out_name
        test = self._run_wikiq(" --persistence", test_filename)
        self._assert_matches_baseline(test, test_filename)

    def test_url_encode(self):
        """``--url-encode`` output equals the baseline."""
        test_filename = "url-encode_" + self.wikiq_out_name
        test = self._run_wikiq(" --url-encode", test_filename)
        self._assert_matches_baseline(test, test_filename)
415
416
class Test_Malformed(unittest.TestCase):
    """Check how wikiq reports a truncated dump (``twinpeaks.xml.7z``).

    The test expects the last line wikiq writes to stderr to be the
    ``xml.etree.ElementTree.ParseError`` message for the broken input.
    All paths are relative to the current working directory.
    """

    def setUp(self):
        """Create the output directory and derive the paths used by the test."""
        self.wiki = 'twinpeaks'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)

    @staticmethod
    def _last_stderr_line(errs):
        """Return the last line of the captured stderr bytes, or '' if empty."""
        # Decode the bytes instead of splitting their repr(): the repr-based
        # approach depends on how Python escapes the content and raises
        # IndexError when nothing was written to stderr.
        text = errs.decode("utf8", errors="replace").rstrip("\r\n")
        lines = text.splitlines()
        return lines[-1] if lines else ""

    def test_malformed_noargs(self):
        """A malformed dump makes wikiq end with the expected ParseError line."""
        call = self.base_call.format(self.input_file, self.test_output_dir)
        print(call)
        # subprocess.run() reads both pipes while waiting; calling wait()
        # before communicate() can deadlock once either pipe buffer fills.
        proc = subprocess.run(
            call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        self.assertEqual(
            self._last_stderr_line(proc.stderr),
            'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')
442
class Test_Stdout(unittest.TestCase):
    """Check that ``--stdout`` emits the same table as the file output.

    The table printed by ``../wikiq ... --stdout`` for the ``sailormoon``
    dump is compared with the ``noargs_`` baseline.  All paths are relative
    to the current working directory.
    """

    def setUp(self):
        """Derive the command line and paths used by the test."""
        self.wiki = 'sailormoon'
        self.wikiq_out_name = self.wiki + ".tsv"

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} --stdout"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def test_noargs(self):
        """Table written to stdout equals the ``noargs_`` baseline."""
        call = self.base_call.format(self.input_file)
        print(call)
        proc = subprocess.run(call, stdout=subprocess.PIPE, shell=True)
        # Fail here with a clear message instead of letting pandas choke on
        # empty or partial output when wikiq itself failed.
        self.assertEqual(
            proc.returncode, 0,
            "wikiq exited with status {0}".format(proc.returncode))
        outs = proc.stdout.decode("utf8")

        test_file = "noargs_" + self.wikiq_out_name
        baseline_file = os.path.join(".", self.baseline_output_dir, test_file)
        print(baseline_file)
        test = pd.read_table(StringIO(outs))
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
468         
# Run every TestCase defined in this module when executed as a script.
if '__main__' == __name__:
    unittest.main()

Community Data Science Collective || Want to submit a patch?