4 from shutil import copyfile
6 from pandas.util.testing import assert_frame_equal
7 from io import StringIO
9 # with / without pwr DONE
10 # with / without url encode DONE
11 # with / without collapse user DONE
# with output to stdout DONE
13 # note that the persistence radius is 7 by default
14 # reading various file formats including
15 # 7z, gz, bz2, xml DONE
16 # wikia and wikipedia data DONE
class Test_Persistence(unittest.TestCase):
    """Regression test for ``--persistence sequence`` on the ``pwr-test`` dump.

    The test shells out to ``../wikiq``, copies the TSV it is expected to
    write into ``test_output`` under a test-specific name, and compares the
    result both against hand-written token counts and a stored baseline.
    """

    def setUp(self):
        """Create the output directory and derive every path the test uses."""
        # exist_ok avoids the race between an exists() check and mkdir().
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'pwr-test'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # File the test expects wikiq to produce inside the output directory.
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def test_sequence_persistence(self):
        """Sequence persistence output matches known counts and the baseline."""
        test_filename = "sequence-" + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove a copy left over from an earlier run so only fresh output
        # is compared below.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + " --url-encode --persistence sequence --collapse-user"
        print(os.path.abspath('.'))

        proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True)
        # communicate() drains the pipe and waits for exit; a bare wait()
        # can deadlock once the child fills the stdout pipe buffer.
        proc.communicate()

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        test = pd.read_table(test_file)

        # Expected per-row values for the first five rows of each column.
        expected = {
            'tokens_added': [4, 5, 0, 6, 4],
            'tokens_removed': [0, 0, 5, 4, 6],
            'token_revs': [4 * 3, 0, 0, 0, 0],
        }
        for column, values in expected.items():
            for row, value in enumerate(values):
                self.assertEqual(test[column][row], value)

        # Finally make sure the whole data frame equals the stored baseline.
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
class Test_Persistence_Bug(unittest.TestCase):
    """Regression test for ``--persistence sequence`` on the ``enwiki-test`` dump.

    Runs ``../wikiq`` and compares the resulting TSV with a stored baseline.
    """

    def setUp(self):
        """Create the output directory and derive every path the test uses."""
        # exist_ok avoids the race between an exists() check and mkdir().
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'enwiki-test'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # File the test expects wikiq to produce inside the output directory.
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def test_sequence_persistence(self):
        """Sequence persistence output equals the stored baseline frame."""
        test_filename = "sequence-" + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove a copy left over from an earlier run so only fresh output
        # is compared below.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + " --url-encode --persistence sequence --collapse-user"
        print(os.path.abspath('.'))

        proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True)
        # communicate() drains the pipe and waits for exit; a bare wait()
        # can deadlock once the child fills the stdout pipe buffer.
        proc.communicate()

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        # As a test let's make sure that we get equal data frames.
        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
class Test_Wikipedia(unittest.TestCase):
    """Regression tests against the bz2-compressed ``ikwiki`` history dump.

    Each test runs ``../wikiq`` with a different set of flags and compares
    the resulting TSV with a stored baseline.
    """

    def setUp(self):
        """Create the output directory and derive every path the tests use."""
        # exist_ok avoids the race between an exists() check and mkdir().
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'ikwiki-20180301-pages-meta-history'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # File the tests expect wikiq to produce inside the output directory.
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.bz2".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _run_wikiq(self, test_filename, extra_args=""):
        """Run wikiq with ``extra_args`` and keep its output as ``test_filename``.

        Returns a ``(test_file, baseline_file)`` tuple of paths: the fresh
        copy of the output and the stored baseline of the same name.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove a copy left over from an earlier run so only fresh output
        # is compared by the caller.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + extra_args

        proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True)
        # communicate() drains the pipe and waits for exit; a bare wait()
        # can deadlock once the child fills the stdout pipe buffer.
        proc.communicate()

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
        return test_file, baseline_file

    def test_WP_url_encode(self):
        """``--url-encode`` output equals the stored baseline frame."""
        test_file, baseline_file = self._run_wikiq(
            "url-encode_" + self.wikiq_out_name, " --url-encode")

        # As a test let's make sure that we get equal data frames.
        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_WP_namespaces(self):
        """``-n 0 -n 1`` output has only namespaces 0 and 1 and matches baseline."""
        print(os.path.abspath('.'))
        test_file, baseline_file = self._run_wikiq(
            "namespaces_" + self.wikiq_out_name, " -n 0 -n 1")

        test = pd.read_table(test_file)
        # Count rows whose namespace is outside the two requested ones.
        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
        self.assertEqual(num_wrong_ns, 0)

        # As a test let's make sure that we get equal data frames.
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
class Test_Basic(unittest.TestCase):
    """Regression tests against the 7z-compressed ``sailormoon`` dump.

    Every test runs ``../wikiq`` with one combination of flags and compares
    the resulting TSV with the stored baseline of the same name.
    """

    def setUp(self):
        """Create the output directory and derive every path the tests use."""
        # exist_ok avoids the race between an exists() check and mkdir().
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'sailormoon'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # File the tests expect wikiq to produce inside the output directory.
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _run_and_compare(self, test_filename, extra_args=""):
        """Run wikiq with ``extra_args`` and compare its output to the baseline.

        The output is copied to ``test_filename`` inside the test output
        directory and must equal the baseline file of the same name.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove a copy left over from an earlier run so only fresh output
        # is compared below.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + extra_args

        proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True)
        # communicate() drains the pipe and waits for exit; a bare wait()
        # can deadlock once the child fills the stdout pipe buffer.
        proc.communicate()

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_noargs(self):
        """Output with no extra flags matches the baseline."""
        self._run_and_compare("noargs_" + self.wikiq_out_name)

    def test_collapse_user(self):
        """``--collapse-user`` output matches the baseline."""
        self._run_and_compare("collapse-user_" + self.wikiq_out_name,
                              " --collapse-user")

    def test_pwr_segment(self):
        """``--persistence segment`` output matches the baseline."""
        self._run_and_compare("persistence_segment_" + self.wikiq_out_name,
                              " --persistence segment")

    def test_pwr_legacy(self):
        """``--persistence legacy`` output matches the baseline."""
        self._run_and_compare("persistence_legacy_" + self.wikiq_out_name,
                              " --persistence legacy")

    def test_pwr(self):
        """Bare ``--persistence`` output matches the baseline."""
        self._run_and_compare("persistence_" + self.wikiq_out_name,
                              " --persistence")

    def test_url_encode(self):
        """``--url-encode`` output matches the baseline."""
        self._run_and_compare("url-encode_" + self.wikiq_out_name,
                              " --url-encode")
class Test_Malformed(unittest.TestCase):
    """Check how ``../wikiq`` reports the malformed ``twinpeaks`` dump."""

    def setUp(self):
        """Create the output directory and derive every path the test uses."""
        # exist_ok avoids the race between an exists() check and mkdir().
        os.makedirs("test_output", exist_ok=True)

        self.wiki = 'twinpeaks'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)

    def test_malformed_noargs(self):
        """The last stderr line is the expected ElementTree ParseError."""
        call = self.base_call.format(self.input_file, self.test_output_dir)
        proc = subprocess.Popen(call, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE, shell=True)

        _, errs = proc.communicate()
        # Decode the captured bytes instead of splitting their repr(), so the
        # check does not depend on how bytes objects are printed.
        errlines = errs.decode("utf8", errors="replace").splitlines()
        self.assertTrue(errlines, "wikiq wrote nothing to stderr")
        self.assertEqual(errlines[-1], 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')
class Test_Stdout(unittest.TestCase):
    """Check that ``../wikiq --stdout`` emits the same table as the baseline."""

    def setUp(self):
        """Derive the command template and the paths the test uses."""
        self.wiki = 'sailormoon'
        self.wikiq_out_name = self.wiki + ".tsv"

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} --stdout"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def test_noargs(self):
        """Table read from stdout equals the stored ``noargs_`` baseline."""
        call = self.base_call.format(self.input_file)
        # subprocess.run waits for completion and captures all of stdout.
        proc = subprocess.run(call, stdout=subprocess.PIPE, shell=True)
        outs = proc.stdout.decode("utf8")

        test_file = "noargs_" + self.wikiq_out_name
        baseline_file = os.path.join(".", self.baseline_output_dir, test_file)

        # Parse the captured text in memory rather than via a temporary file.
        test = pd.read_table(StringIO(outs))
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()