4 from shutil import copyfile
6 from pandas.util.testing import assert_frame_equal
7 from io import StringIO
9 # with / without pwr DONE
10 # with / without url encode DONE
11 # with / without collapse user DONE
# with output to stdout DONE
13 # note that the persistence radius is 7 by default
14 # reading various file formats including
15 # 7z, gz, bz2, xml DONE
16 # wikia and wikipedia data DONE
19 # class Test_Persistence_Bug(unittest.TestCase):
22 # if not os.path.exists("test_output"):
23 # os.mkdir("test_output")
25 # self.wiki = 'enwiki-test'
26 # self.wikiq_out_name = self.wiki + ".tsv"
27 # self.test_output_dir = os.path.join(".", "test_output")
28 # self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
30 # self.infile = "{0}.xml".format(self.wiki)
31 # self.base_call = "../wikiq {0} -o {1}"
32 # self.input_dir = "dumps"
33 # self.input_file = os.path.join(".", self.input_dir,self.infile)
34 # self.baseline_output_dir = "baseline_output"
36 # def test_segment_persistence(self):
37 # test_filename = "sequence-" + self.wikiq_out_name
38 # test_file = os.path.join(self.test_output_dir, test_filename)
39 # if os.path.exists(test_file):
40 # os.remove(test_file)
42 # call = self.base_call.format(self.input_file, self.test_output_dir)
43 # call = call + " --url-encode --persistence sequence --collapse-user"
44 # print(os.path.abspath('.'))
46 # proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
49 # copyfile(self.call_output, test_file)
50 # baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
52 # # as a test let's make sure that we get equal data frames
53 # test = pd.read_table(test_file)
54 # baseline = pd.read_table(baseline_file)
55 # assert_frame_equal(test,baseline)
class Test_Wikipedia(unittest.TestCase):
    """Regression tests that run wikiq against a bz2-compressed Wikipedia dump.

    Each test shells out to ``../wikiq`` with a different set of flags, copies
    the TSV found at ``self.call_output`` to a test-specific file name inside
    ``test_output`` and compares it with the baseline file of the same name
    in ``baseline_output``.
    """

    def setUp(self):
        """Create the output directory and compute the paths shared by the tests."""
        if not os.path.exists("test_output"):
            os.mkdir("test_output")

        self.wiki = 'ikwiki-20180301-pages-meta-history'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.bz2".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _run_wikiq(self, extra_args, test_filename):
        """Run wikiq with ``extra_args`` and keep its output as ``test_filename``.

        Args:
            extra_args: Text appended verbatim to the base command line
                (include the leading space, e.g. ``" --url-encode"``).
            test_filename: File name, inside ``self.test_output_dir``, under
                which the produced TSV is stored.

        Returns:
            The path of the stored copy of the output.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Drop the result of an earlier run so a stale file can never be
        # mistaken for fresh output.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + extra_args
        proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True)
        # communicate() drains the stdout pipe and blocks until the process
        # exits, so the output file is complete before it is copied.
        proc.communicate()
        # Both tests share self.call_output; without this check a failed run
        # could silently reuse the file left behind by another test.
        self.assertEqual(proc.returncode, 0,
                         "wikiq exited with a non-zero status: " + call)

        copyfile(self.call_output, test_file)
        return test_file

    def test_WP_url_encode(self):
        """Output produced with --url-encode must equal the stored baseline."""
        test_filename = "url-encode_" + self.wikiq_out_name
        test_file = self._run_wikiq(" --url-encode", test_filename)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        # as a test let's make sure that we get equal data frames
        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_WP_namespaces(self):
        """With -n 0 -n 1 only namespaces 0 and 1 may appear, matching the baseline."""
        print(os.path.abspath('.'))
        test_filename = "namespaces_" + self.wikiq_out_name
        test_file = self._run_wikiq(" -n 0 -n 1", test_filename)
        baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)

        # as a test let's make sure that we get equal data frames
        test = pd.read_table(test_file)
        # Count the rows whose namespace is neither 0 nor 1; there must be none.
        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
        self.assertEqual(num_wrong_ns, 0)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
117 # class Test_Basic(unittest.TestCase):
120 # if not os.path.exists("test_output"):
121 # os.mkdir("test_output")
123 # self.wiki = 'sailormoon'
124 # self.wikiq_out_name = self.wiki + ".tsv"
125 # self.test_output_dir = os.path.join(".", "test_output")
126 # self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
128 # self.infile = "{0}.xml.7z".format(self.wiki)
129 # self.base_call = "../wikiq {0} -o {1}"
130 # self.input_dir = "dumps"
131 # self.input_file = os.path.join(".", self.input_dir,self.infile)
132 # self.baseline_output_dir = "baseline_output"
134 # def test_noargs(self):
136 # test_filename = "noargs_" + self.wikiq_out_name
137 # test_file = os.path.join(self.test_output_dir, test_filename)
138 # if os.path.exists(test_file):
139 # os.remove(test_file)
141 # call = self.base_call.format(self.input_file, self.test_output_dir)
142 # proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
145 # copyfile(self.call_output, test_file)
147 # baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
149 # test = pd.read_table(test_file)
150 # baseline = pd.read_table(baseline_file)
151 # assert_frame_equal(test,baseline)
154 # def test_collapse_user(self):
155 # test_filename = "collapse-user_" + self.wikiq_out_name
156 # test_file = os.path.join(self.test_output_dir, test_filename)
157 # if os.path.exists(test_file):
158 # os.remove(test_file)
160 # call = self.base_call.format(self.input_file, self.test_output_dir)
161 # call = call + " --collapse-user"
163 # proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
166 # copyfile(self.call_output, test_file)
168 # baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
169 # test = pd.read_table(test_file)
170 # baseline = pd.read_table(baseline_file)
171 # assert_frame_equal(test,baseline)
173 # def test_pwr_segment(self):
174 # test_filename = "persistence_segment_" + self.wikiq_out_name
175 # test_file = os.path.join(self.test_output_dir, test_filename)
176 # if os.path.exists(test_file):
177 # os.remove(test_file)
179 # call = self.base_call.format(self.input_file, self.test_output_dir)
180 # call = call + " --persistence segment"
182 # proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
186 # copyfile(self.call_output, test_file)
188 # baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
190 # test = pd.read_table(test_file)
192 # baseline = pd.read_table(baseline_file)
193 # assert_frame_equal(test,baseline)
195 # def test_pwr_legacy(self):
196 # test_filename = "persistence_legacy_" + self.wikiq_out_name
197 # test_file = os.path.join(self.test_output_dir, test_filename)
198 # if os.path.exists(test_file):
199 # os.remove(test_file)
201 # call = self.base_call.format(self.input_file, self.test_output_dir)
202 # call = call + " --persistence legacy"
203 # proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
206 # copyfile(self.call_output, test_file)
208 # baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
210 # test = pd.read_table(test_file)
211 # baseline = pd.read_table(baseline_file)
212 # assert_frame_equal(test,baseline)
214 # def test_pwr(self):
215 # test_filename = "persistence_" + self.wikiq_out_name
216 # test_file = os.path.join(self.test_output_dir, test_filename)
217 # if os.path.exists(test_file):
218 # os.remove(test_file)
220 # call = self.base_call.format(self.input_file, self.test_output_dir)
221 # call = call + " --persistence"
222 # proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
226 # copyfile(self.call_output, test_file)
228 # baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
230 # test = pd.read_table(test_file)
231 # baseline = pd.read_table(baseline_file)
232 # assert_frame_equal(test,baseline)
235 # def test_url_encode(self):
236 # test_filename = "url-encode_" + self.wikiq_out_name
238 # test_file = os.path.join(self.test_output_dir, test_filename)
239 # if os.path.exists(test_file):
240 # os.remove(test_file)
242 # call = self.base_call.format(self.input_file, self.test_output_dir)
243 # call = call + " --url-encode"
244 # proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
247 # copyfile(self.call_output, test_file)
248 # baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
249 # test = pd.read_table(test_file)
250 # baseline = pd.read_table(baseline_file)
251 # assert_frame_equal(test,baseline)
254 # class Test_Malformed(unittest.TestCase):
256 # if not os.path.exists("test_output"):
257 # os.mkdir("test_output")
259 # self.wiki = 'twinpeaks'
260 # self.wikiq_out_name = self.wiki + ".tsv"
261 # self.test_output_dir = os.path.join(".", "test_output")
262 # self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
264 # self.infile = "{0}.xml.7z".format(self.wiki)
265 # self.base_call = "../wikiq {0} -o {1}"
266 # self.input_dir = "dumps"
267 # self.input_file = os.path.join(".", self.input_dir,self.infile)
270 # def test_malformed_noargs(self):
272 # call = self.base_call.format(self.input_file, self.test_output_dir)
273 # proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True)
275 # outs, errs = proc.communicate()
276 # errlines = str(errs).split("\\n")
277 # self.assertEqual(errlines[-2],'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')
279 # class Test_Stdout(unittest.TestCase):
282 # self.wiki = 'sailormoon'
283 # self.wikiq_out_name = self.wiki + ".tsv"
285 # self.infile = "{0}.xml.7z".format(self.wiki)
286 # self.base_call = "../wikiq {0} --stdout"
287 # self.input_dir = "dumps"
288 # self.input_file = os.path.join(".", self.input_dir,self.infile)
289 # self.baseline_output_dir = "baseline_output"
291 # def test_noargs(self):
293 # call = self.base_call.format(self.input_file)
294 # proc = subprocess.run(call,stdout=subprocess.PIPE,shell=True)
295 # outs = proc.stdout.decode("utf8")
297 # test_file = "noargs_" + self.wikiq_out_name
298 # baseline_file = os.path.join(".", self.baseline_output_dir, test_file)
299 # print(baseline_file)
300 # test = pd.read_table(StringIO(outs))
301 # baseline = pd.read_table(baseline_file)
302 # assert_frame_equal(test,baseline)
304 if __name__ == '__main__':