]> code.communitydata.science - mediawiki_dump_tools.git/blob - test/Wikiq_Unit_Test.py
make sailormoon smaller
[mediawiki_dump_tools.git] / test / Wikiq_Unit_Test.py
1 import unittest
2 import os
3 import subprocess
4 from shutil import copyfile
5 import pandas as pd
6 from pandas.util.testing import assert_frame_equal
7 from io import StringIO
8
9 # with / without pwr DONE
10 # with / without url encode DONE
11 # with / without collapse user DONE
12 # with output to stdout DONE
13 # note that the persistence radius is 7 by default  
14 # reading various file formats including
15 #        7z, gz, bz2, xml  DONE
16 # wikia and wikipedia data DONE
17 # malformed xmls DONE
18
class Test_Persistence(unittest.TestCase):
    """Run wikiq's persistence measures on the small ``pwr-test`` dump.

    Each test spot-checks the first five rows of the ``tokens_added``,
    ``tokens_removed`` and ``token_revs`` columns and then compares the whole
    table with the stored baseline.  All paths are relative, so the tests
    have to be started from the directory that holds ``dumps`` and
    ``baseline_output``.
    """

    def setUp(self):
        """Build the relative paths and the command template for ``pwr-test``."""
        self.wiki = 'pwr-test'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _run_wikiq(self, test_filename, extra_args=""):
        """Run wikiq with ``extra_args`` and return its output as a DataFrame.

        The file wikiq writes (``self.call_output``) is copied to
        ``test_filename`` inside the test output directory so every test
        keeps its own result file.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Delete the raw wikiq output as well as our copy: otherwise a failed
        # wikiq run would leave an older file in place and the test would
        # silently compare stale data.
        for stale in (self.call_output, test_file):
            if os.path.exists(stale):
                os.remove(stale)

        call = self.base_call.format(self.input_file, self.test_output_dir) + extra_args
        print(os.path.abspath('.'))
        print(call)
        # run() drains and closes the stdout pipe; Popen(...).wait() with
        # stdout=PIPE can block forever once the OS pipe buffer fills up.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True)

        copyfile(self.call_output, test_file)
        return pd.read_table(test_file)

    def _assert_first_rows(self, test, expected):
        """Check leading cells of ``test``; ``expected`` maps column -> values."""
        for column, values in expected.items():
            for row, value in enumerate(values):
                self.assertEqual(test[column][row], value,
                                 "{0}[{1}]".format(column, row))

    def _assert_matches_baseline(self, test, test_filename):
        """Compare ``test`` with the baseline table of the same file name."""
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_sequence_persistence(self):
        """``--persistence sequence`` with URL encoding."""
        test_filename = "sequence-" + self.wikiq_out_name
        test = self._run_wikiq(test_filename, " --url-encode --persistence sequence")

        self._assert_first_rows(test, {
            'tokens_added': [7, 10, 0, 8, 0],
            'tokens_removed': [0, 0, 0, 4, 0],
            'token_revs': [8 * 3, 0, 0, 0, 0],
        })
        self._assert_matches_baseline(test, test_filename)

    def test_legacy_persistence(self):
        """``--persistence legacy`` with URL encoding."""
        test_filename = "legacy-" + self.wikiq_out_name
        test = self._run_wikiq(test_filename, " --url-encode --persistence legacy")

        self._assert_first_rows(test, {
            'tokens_added': [7, 10, 0, 11, 0],
            'tokens_removed': [0, 0, 0, 7, 0],
            'token_revs': [7 * 3, 0, 0, 0, 0],
        })
        self._assert_matches_baseline(test, test_filename)
110
111
112
class Test_Persistence_Bug(unittest.TestCase):
    """Compare sequence persistence output for ``enwiki-test`` with its baseline.

    wikiq is run with ``--url-encode --persistence sequence --collapse-user``.
    Paths are relative, so the test has to be started from the directory
    that holds ``dumps`` and ``baseline_output``.
    """

    def setUp(self):
        """Build the relative paths and the command template for ``enwiki-test``."""
        self.wiki = 'enwiki-test'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def test_sequence_persistence(self):
        """The wikiq output table must equal the stored baseline table."""
        test_filename = "sequence-" + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Delete the raw wikiq output as well as our copy: otherwise a failed
        # wikiq run would leave an older file in place and the test would
        # silently compare stale data.
        for stale in (self.call_output, test_file):
            if os.path.exists(stale):
                os.remove(stale)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + " --url-encode --persistence sequence --collapse-user"
        print(os.path.abspath('.'))
        print(call)
        # run() drains and closes the stdout pipe; Popen(...).wait() with
        # stdout=PIPE can block forever once the OS pipe buffer fills up.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True)

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
150
151
class Test_Wikipedia(unittest.TestCase):
    """Run wikiq on the bz2-compressed ``ikwiki-20180301`` history dump.

    Every test compares the table wikiq writes with a stored baseline.
    Paths are relative, so the tests have to be started from the directory
    that holds ``dumps`` and ``baseline_output``.
    """

    def setUp(self):
        """Build the relative paths and the command template for the dump."""
        self.wiki = 'ikwiki-20180301-pages-meta-history'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.bz2".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _run_wikiq(self, test_filename, extra_args=""):
        """Run wikiq with ``extra_args`` and return its output as a DataFrame.

        The file wikiq writes (``self.call_output``) is copied to
        ``test_filename`` inside the test output directory so every test
        keeps its own result file.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Delete the raw wikiq output as well as our copy: otherwise a failed
        # wikiq run would leave an older file in place and the test would
        # silently compare stale data.
        for stale in (self.call_output, test_file):
            if os.path.exists(stale):
                os.remove(stale)

        call = self.base_call.format(self.input_file, self.test_output_dir) + extra_args
        print(call)
        # run() drains and closes the stdout pipe; Popen(...).wait() with
        # stdout=PIPE can block forever once the OS pipe buffer fills up.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True)

        copyfile(self.call_output, test_file)
        return pd.read_table(test_file)

    def _assert_matches_baseline(self, test, test_filename):
        """Compare ``test`` with the baseline table of the same file name."""
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_WP_url_encode(self):
        """``--url-encode`` output must equal the baseline."""
        test_filename = "url-encode_" + self.wikiq_out_name
        test = self._run_wikiq(test_filename, " --url-encode")
        self._assert_matches_baseline(test, test_filename)

    def test_WP_namespaces(self):
        """``-n 0 -n 1`` must only emit rows from namespaces 0 and 1."""
        print(os.path.abspath('.'))
        test_filename = "namespaces_" + self.wikiq_out_name
        test = self._run_wikiq(test_filename, " -n 0 -n 1")

        # Count rows whose namespace is outside the two requested ones.
        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
        self.assertEqual(num_wrong_ns, 0)
        self._assert_matches_baseline(test, test_filename)
210
211
class Test_Basic(unittest.TestCase):
    """Run wikiq with various options on the 7z-compressed ``sailormoon`` dump.

    Every test compares the table wikiq writes with a stored baseline.
    Paths are relative, so the tests have to be started from the directory
    that holds ``dumps`` and ``baseline_output``.
    """

    def setUp(self):
        """Build the relative paths and the command template for ``sailormoon``."""
        self.wiki = 'sailormoon'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _run_wikiq(self, test_filename, extra_args=""):
        """Run wikiq with ``extra_args`` and return its output as a DataFrame.

        The file wikiq writes (``self.call_output``) is copied to
        ``test_filename`` inside the test output directory so every test
        keeps its own result file.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Delete the raw wikiq output as well as our copy: otherwise a failed
        # wikiq run would leave an older file in place and the test would
        # silently compare stale data.
        for stale in (self.call_output, test_file):
            if os.path.exists(stale):
                os.remove(stale)

        call = self.base_call.format(self.input_file, self.test_output_dir) + extra_args
        print(call)
        # run() drains and closes the stdout pipe; Popen(...).wait() with
        # stdout=PIPE can block forever once the OS pipe buffer fills up.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True)

        copyfile(self.call_output, test_file)
        return pd.read_table(test_file)

    def _assert_matches_baseline(self, test, test_filename):
        """Compare ``test`` with the baseline table of the same file name."""
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_noargs(self):
        """No extra options."""
        test_filename = "noargs_" + self.wikiq_out_name
        test = self._run_wikiq(test_filename)
        self._assert_matches_baseline(test, test_filename)

    def test_collapse_user(self):
        """``--collapse-user``."""
        test_filename = "collapse-user_" + self.wikiq_out_name
        test = self._run_wikiq(test_filename, " --collapse-user")
        self._assert_matches_baseline(test, test_filename)

    def test_pwr_segment(self):
        """``--persistence segment``."""
        test_filename = "persistence_segment_" + self.wikiq_out_name
        test = self._run_wikiq(test_filename, " --persistence segment")
        print(test)
        self._assert_matches_baseline(test, test_filename)

    def test_pwr_legacy(self):
        """``--persistence legacy``."""
        test_filename = "persistence_legacy_" + self.wikiq_out_name
        test = self._run_wikiq(test_filename, " --persistence legacy")
        self._assert_matches_baseline(test, test_filename)

    def test_pwr(self):
        """``--persistence`` without an explicit method."""
        test_filename = "persistence_" + self.wikiq_out_name
        test = self._run_wikiq(test_filename, " --persistence")
        self._assert_matches_baseline(test, test_filename)

    def test_url_encode(self):
        """``--url-encode``."""
        test_filename = "url-encode_" + self.wikiq_out_name
        test = self._run_wikiq(test_filename, " --url-encode")
        self._assert_matches_baseline(test, test_filename)
353
354
class Test_Malformed(unittest.TestCase):
    """Check how wikiq reports the truncated XML in the ``twinpeaks`` dump.

    Paths are relative, so the test has to be started from the directory
    that holds ``dumps``.
    """

    def setUp(self):
        """Build the relative paths and the command template for ``twinpeaks``."""
        self.wiki = 'twinpeaks'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.test_output_dir, exist_ok=True)
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)

    def test_malformed_noargs(self):
        """The last stderr line must be the expected ``ParseError`` message."""
        call = self.base_call.format(self.input_file, self.test_output_dir)
        print(call)
        # run() reads both pipes while waiting.  Calling wait() before
        # communicate() with stdout and stderr piped can deadlock as soon as
        # the child fills one of the OS pipe buffers.
        proc = subprocess.run(call, stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE, shell=True)

        # Decode the bytes instead of splitting their repr() on a literal
        # backslash-n; splitlines() drops the trailing newline, so the final
        # element is the last line that was written.
        errlines = proc.stderr.decode("utf8", errors="replace").splitlines()
        self.assertTrue(errlines, "wikiq wrote nothing to stderr")
        self.assertEqual(errlines[-1], 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')
380
class Test_Stdout(unittest.TestCase):
    """Check that ``wikiq --stdout`` prints the table stored as the baseline.

    Paths are relative, so the test has to be started from the directory
    that holds ``dumps`` and ``baseline_output``.
    """

    def setUp(self):
        """Build the relative paths and the command template for ``sailormoon``."""
        self.wiki = 'sailormoon'
        self.base_call = "../wikiq {0} --stdout"
        self.input_dir = "dumps"
        self.baseline_output_dir = "baseline_output"

        self.wikiq_out_name = self.wiki + ".tsv"
        self.infile = "{0}.xml.7z".format(self.wiki)
        self.input_file = os.path.join(".", self.input_dir, self.infile)

    def test_noargs(self):
        """Parse wikiq's standard output and compare it with the baseline."""
        command = self.base_call.format(self.input_file)
        print(command)
        completed = subprocess.run(command, stdout=subprocess.PIPE, shell=True)
        printed_table = completed.stdout.decode("utf8")

        baseline_name = "noargs_" + self.wikiq_out_name
        baseline_path = os.path.join(".", self.baseline_output_dir, baseline_name)
        print(baseline_path)

        # The table is read from memory; wikiq wrote no output file here.
        actual = pd.read_table(StringIO(printed_table))
        expected = pd.read_table(baseline_path)
        assert_frame_equal(actual, expected)
406         
if __name__ == "__main__":
    # Executed as a script: let unittest find and run the test cases
    # defined in this module.
    unittest.main()

Community Data Science Collective || Want to submit a patch?