]> code.communitydata.science - mediawiki_dump_tools.git/blob - test/Wikiq_Unit_Test.py
wikiq mostly functional, but reverters take all the credit for the content they restore.
[mediawiki_dump_tools.git] / test / Wikiq_Unit_Test.py
1 import unittest
2 import os
3 import subprocess
4 from shutil import copyfile
5 import pandas as pd
6 from pandas.util.testing import assert_frame_equal
7 from io import StringIO
8
9 # with / without pwr DONE
10 # with / without url encode DONE
11 # with / without collapse user DONE
12 # with output to stdout DONE
13 # note that the persistence radius is 7 by default  
14 # reading various file formats including
15 #        7z, gz, bz2, xml  DONE
16 # wikia and wikipedia data DONE
17 # malformed xmls DONE
18
class Test_Persistence(unittest.TestCase):
    """End-to-end check of ``--persistence sequence`` on the ``pwr-test`` dump.

    The test shells out to ``../wikiq`` and reads ``dumps/`` and
    ``baseline_output/`` through relative paths, so it must be started from
    the directory that contains those folders.
    """

    def setUp(self):
        """Create the output directory and derive the paths used by the test."""
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'pwr-test'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # Path where the test expects wikiq to leave its raw output.
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def test_sequence_persistence(self):
        """Run wikiq with sequence persistence and verify counts and baseline."""
        test_filename = "sequence-" + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove both our copy and wikiq's raw output: if only the copy were
        # removed, a failed run could silently re-use the previous run's output.
        for stale in (test_file, self.call_output):
            if os.path.exists(stale):
                os.remove(stale)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + " --url-encode --persistence sequence --collapse-user"
        print(os.path.abspath('.'))
        print(call)
        # run() drains the stdout pipe while waiting; Popen(...).wait() with an
        # unread pipe can block forever once the pipe buffer fills up.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True)

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        test = pd.read_table(test_file)

        # Expected values for the first five rows of each checked column.
        expected_by_column = {
            'tokens_added': [4, 5, 0, 6, 4],
            'tokens_removed': [0, 0, 5, 4, 6],
            'token_revs': [4 * 3, 0, 0, 0, 0],
        }
        for column, expected_values in expected_by_column.items():
            for row, expected in enumerate(expected_values):
                self.assertEqual(
                    test[column][row],
                    expected,
                    "{0}[{1}]".format(column, row),
                )

        # Finally the whole frame has to match the stored baseline.
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
72         
73
74
class Test_Persistence_Bug(unittest.TestCase):
    """Baseline comparison of ``--persistence sequence`` on the ``enwiki-test`` dump.

    NOTE(review): judging by the class name this is a regression test for a
    persistence bug; the bug itself is not described in this file.

    The test shells out to ``../wikiq`` and uses relative paths, so it must be
    started from the directory containing ``dumps/`` and ``baseline_output/``.
    """

    def setUp(self):
        """Create the output directory and derive the paths used by the test."""
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'enwiki-test'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # Path where the test expects wikiq to leave its raw output.
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def test_sequence_persistence(self):
        """Run wikiq with sequence persistence and compare against the baseline."""
        test_filename = "sequence-" + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove both our copy and wikiq's raw output: if only the copy were
        # removed, a failed run could silently re-use the previous run's output.
        for stale in (test_file, self.call_output):
            if os.path.exists(stale):
                os.remove(stale)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + " --url-encode --persistence sequence --collapse-user"
        print(os.path.abspath('.'))
        print(call)
        # run() drains the stdout pipe while waiting; Popen(...).wait() with an
        # unread pipe can block forever once the pipe buffer fills up.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True)

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        # The produced table must be identical to the stored baseline.
        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
112
113
class Test_Wikipedia(unittest.TestCase):
    """Baseline tests on the bz2-compressed ``ikwiki-20180301`` history dump.

    The tests shell out to ``../wikiq`` and use relative paths, so they must be
    started from the directory containing ``dumps/`` and ``baseline_output/``.
    """

    def setUp(self):
        """Create the output directory and derive the paths used by the tests."""
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'ikwiki-20180301-pages-meta-history'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # Path where the tests expect wikiq to leave its raw output.
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.bz2".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def test_WP_url_encode(self):
        """Run wikiq with ``--url-encode`` and compare against the baseline."""
        test_filename = "url-encode_" + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove both our copy and wikiq's raw output so a failed run cannot
        # pass by re-using a file left behind by an earlier test or run.
        for stale in (test_file, self.call_output):
            if os.path.exists(stale):
                os.remove(stale)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + " --url-encode"
        # run() drains the stdout pipe while waiting; Popen(...).wait() with an
        # unread pipe can block forever once the pipe buffer fills up.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True)

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        # The produced table must be identical to the stored baseline.
        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_WP_namespaces(self):
        """Run wikiq with ``-n 0 -n 1`` and check only those namespaces appear."""
        print(os.path.abspath('.'))
        test_filename = "namespaces_" + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Same stale-file protection as in the other tests of this class.
        for stale in (test_file, self.call_output):
            if os.path.exists(stale):
                os.remove(stale)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + " -n 0 -n 1"
        print(call)
        # See test_WP_url_encode for why run() replaces Popen + wait().
        subprocess.run(call, stdout=subprocess.PIPE, shell=True)
        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)

        test = pd.read_table(test_file)
        # Count rows whose namespace is neither 0 nor 1; there must be none.
        num_wrong_ns = (~test.namespace.isin({0, 1})).sum()
        self.assertEqual(num_wrong_ns, 0)

        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
171
172
class Test_Basic(unittest.TestCase):
    """Baseline tests for the main wikiq options on the 7z ``sailormoon`` dump.

    Each test runs ``../wikiq`` with one option set, copies the produced TSV to
    a test-specific name and compares it with the file of the same name in
    ``baseline_output/``. Paths are relative, so the tests must be started from
    the directory containing ``dumps/`` and ``baseline_output/``.
    """

    def setUp(self):
        """Create the output directory and derive the paths used by the tests."""
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'sailormoon'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # Path where the tests expect wikiq to leave its raw output.
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _check_against_baseline(self, test_filename, extra_args="", *, verbose=False):
        """Run wikiq with ``extra_args`` and compare its output to the baseline.

        Args:
            test_filename: file name used both for the copy placed in the test
                output directory and for the baseline it is compared with.
            extra_args: text appended verbatim to the base wikiq command line
                (include the leading space).
            verbose: when true, print the command line and the parsed output.

        Raises:
            AssertionError: if the produced frame differs from the baseline.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # All tests in this class share one raw output path. Remove it as well
        # as our copy so a failed run cannot pass on another test's leftovers.
        for stale in (test_file, self.call_output):
            if os.path.exists(stale):
                os.remove(stale)

        call = self.base_call.format(self.input_file, self.test_output_dir) + extra_args
        if verbose:
            print(call)
        # run() drains the stdout pipe while waiting; Popen(...).wait() with an
        # unread pipe can block forever once the pipe buffer fills up.
        subprocess.run(call, stdout=subprocess.PIPE, shell=True)

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        test = pd.read_table(test_file)
        if verbose:
            print(test)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_noargs(self):
        """wikiq without any extra options."""
        self._check_against_baseline("noargs_" + self.wikiq_out_name)

    def test_collapse_user(self):
        """wikiq with ``--collapse-user``."""
        self._check_against_baseline(
            "collapse-user_" + self.wikiq_out_name, " --collapse-user"
        )

    def test_pwr_segment(self):
        """wikiq with ``--persistence segment``."""
        self._check_against_baseline(
            "persistence_segment_" + self.wikiq_out_name,
            " --persistence segment",
            verbose=True,
        )

    def test_pwr_legacy(self):
        """wikiq with ``--persistence legacy``."""
        self._check_against_baseline(
            "persistence_legacy_" + self.wikiq_out_name, " --persistence legacy"
        )

    def test_pwr(self):
        """wikiq with a bare ``--persistence`` flag."""
        self._check_against_baseline(
            "persistence_" + self.wikiq_out_name, " --persistence"
        )

    def test_url_encode(self):
        """wikiq with ``--url-encode``."""
        self._check_against_baseline(
            "url-encode_" + self.wikiq_out_name, " --url-encode"
        )
308
309
class Test_Malformed(unittest.TestCase):
    """Checks how wikiq reports a malformed dump (``twinpeaks.xml.7z``).

    The test shells out to ``../wikiq`` and uses relative paths, so it must be
    started from the directory containing ``dumps/``.
    """

    def setUp(self):
        """Create the output directory and derive the paths used by the test."""
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'twinpeaks'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)

    def test_malformed_noargs(self):
        """The last stderr line must be the expected XML ``ParseError``."""
        call = self.base_call.format(self.input_file, self.test_output_dir)
        # run() reads both pipes while the child is running. Calling wait()
        # before communicate() can deadlock once either pipe buffer fills up.
        proc = subprocess.run(
            call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
        )
        # Decode the bytes instead of splitting their repr(): the repr carries
        # a leading "b'" and escapes quotes and non-ASCII characters.
        errlines = proc.stderr.decode("utf8").split("\n")
        # stderr ends with a newline, so the final message is the element
        # before the trailing empty string.
        self.assertEqual(errlines[-2], 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')
334
class Test_Stdout(unittest.TestCase):
    """Checks that ``--stdout`` output matches the ``noargs`` baseline file.

    The test shells out to ``../wikiq`` and uses relative paths, so it must be
    started from the directory containing ``dumps/`` and ``baseline_output/``.
    """

    def setUp(self):
        """Derive the command template and the paths used by the test."""
        self.wiki = 'sailormoon'
        self.input_dir = "dumps"
        self.baseline_output_dir = "baseline_output"
        self.base_call = "../wikiq {0} --stdout"

        self.wikiq_out_name = self.wiki + ".tsv"
        self.infile = self.wiki + ".xml.7z"
        self.input_file = os.path.join(".", self.input_dir, self.infile)

    def test_noargs(self):
        """Parse wikiq's stdout as a table and compare it with the baseline."""
        completed = subprocess.run(
            self.base_call.format(self.input_file),
            stdout=subprocess.PIPE,
            shell=True,
        )
        captured_text = completed.stdout.decode("utf8")

        baseline_path = os.path.join(
            ".", self.baseline_output_dir, "noargs_" + self.wikiq_out_name
        )
        print(baseline_path)

        # The captured text is parsed straight from memory; no file is written.
        produced = pd.read_table(StringIO(captured_text))
        expected = pd.read_table(baseline_path)
        assert_frame_equal(produced, expected)
359         
# Run every test case in this module when the file is executed as a script.
if "__main__" == __name__:
    unittest.main()

Community Data Science Collective || Want to submit a patch?