4 from shutil import copyfile
6 from pandas.util.testing import assert_frame_equal
7 from io import StringIO
9 # with / without pwr DONE
10 # with / without url encode DONE
11 # with / without collapse user DONE
12 # with output to stdout DONE
13 # note that the persistence radius is 7 by default
14 # reading various file formats including
15 # 7z, gz, bz2, xml DONE
16 # wikia and wikipedia data DONE
class Test_Wikipedia(unittest.TestCase):
    """Regression tests that run wikiq on the ikwiki ``.xml.bz2`` dump.

    Each test runs ``../wikiq`` through the shell with ``-o test_output``,
    copies the file found at ``self.call_output`` to a test-specific name
    and compares it, as a data frame, with the file of the same name in
    ``baseline_output``.
    """

    def setUp(self):
        """Create the output directory and record the paths the tests use."""
        self.wiki = 'ikwiki-20180301-pages-meta-history'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok makes this safe whether or not an earlier test already
        # created the directory.
        os.makedirs(self.test_output_dir, exist_ok=True)
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.bz2".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _run_wikiq(self, test_filename, extra_args=""):
        """Run wikiq and return the path of the test-specific output copy.

        ``test_filename`` is the name the output is copied to inside
        ``self.test_output_dir``; ``extra_args`` is appended verbatim to the
        shell command line.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove the copy left by a previous run so it can never be mistaken
        # for the result of this one.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        if extra_args:
            call = call + " " + extra_args

        # subprocess.run blocks until wikiq exits, so the output file is
        # complete before it is copied.
        proc = subprocess.run(call, stdout=subprocess.PIPE, shell=True)
        # self.call_output is shared by every test; without this check a
        # failed run could silently compare a stale file from another test.
        self.assertEqual(
            proc.returncode, 0,
            "wikiq exited with status {0}: {1}".format(proc.returncode, call))

        copyfile(self.call_output, test_file)
        return test_file

    def _assert_matches_baseline(self, test, test_filename):
        """Assert that frame ``test`` equals the baseline named ``test_filename``."""
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_WP_url_encode(self):
        """Output produced with ``--url-encode`` matches the baseline."""
        test_filename = "url-encode_" + self.wikiq_out_name
        test_file = self._run_wikiq(test_filename, "--url-encode")

        # as a test let's make sure that we get equal data frames
        test = pd.read_table(test_file)
        self._assert_matches_baseline(test, test_filename)

    def test_WP_namespaces(self):
        """Output produced with ``-n 0 -n 1`` only has namespaces 0 and 1."""
        test_filename = "namespaces_" + self.wikiq_out_name
        test_file = self._run_wikiq(test_filename, "-n 0 -n 1")

        test = pd.read_table(test_file)
        # No row may carry a namespace other than the two requested.
        num_wrong_ns = int((~test.namespace.isin([0, 1])).sum())
        self.assertEqual(num_wrong_ns, 0)

        # as a test let's make sure that we get equal data frames
        self._assert_matches_baseline(test, test_filename)
class Test_Basic(unittest.TestCase):
    """Regression tests that run wikiq on the sailormoon ``.xml.7z`` dump.

    Each test runs ``../wikiq`` through the shell with ``-o test_output``
    and one set of command-line options, copies the file found at
    ``self.call_output`` to a test-specific name and compares it, as a data
    frame, with the file of the same name in ``baseline_output``.
    """

    def setUp(self):
        """Create the output directory and record the paths the tests use."""
        self.wiki = 'sailormoon'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok makes this safe whether or not an earlier test already
        # created the directory.
        os.makedirs(self.test_output_dir, exist_ok=True)
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def _check_against_baseline(self, prefix, extra_args=""):
        """Run wikiq with ``extra_args`` and compare its output to the baseline.

        ``prefix`` is prepended to ``self.wikiq_out_name`` to name both the
        copy kept in ``self.test_output_dir`` and the baseline file it is
        compared with. ``extra_args`` is appended verbatim to the shell
        command line.
        """
        test_filename = prefix + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove the copy left by a previous run so it can never be mistaken
        # for the result of this one.
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        if extra_args:
            call = call + " " + extra_args

        # subprocess.run blocks until wikiq exits, so the output file is
        # complete before it is copied.
        proc = subprocess.run(call, stdout=subprocess.PIPE, shell=True)
        # self.call_output is shared by every test; without this check a
        # failed run could silently compare a stale file from another test.
        self.assertEqual(
            proc.returncode, 0,
            "wikiq exited with status {0}: {1}".format(proc.returncode, call))

        copyfile(self.call_output, test_file)

        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)

    def test_noargs(self):
        """Output produced without extra options matches the baseline."""
        self._check_against_baseline("noargs_")

    def test_collapse_user(self):
        """Output produced with ``--collapse-user`` matches the baseline."""
        self._check_against_baseline("collapse-user_", "--collapse-user")

    def test_pwr_segment(self):
        """Output produced with ``--persistence segment`` matches the baseline."""
        self._check_against_baseline("persistence_segment_", "--persistence segment")

    def test_pwr_legacy(self):
        """Output produced with ``--persistence legacy`` matches the baseline."""
        self._check_against_baseline("persistence_legacy_", "--persistence legacy")

    def test_pwr(self):
        """Output produced with a bare ``--persistence`` matches the baseline."""
        self._check_against_baseline("persistence_", "--persistence")

    def test_url_encode(self):
        """Output produced with ``--url-encode`` matches the baseline."""
        self._check_against_baseline("url-encode_", "--url-encode")
class Test_Malformed(unittest.TestCase):
    """Checks how wikiq reports the malformed twinpeaks ``.xml.7z`` dump."""

    def setUp(self):
        """Create the output directory and record the paths the test uses."""
        self.wiki = 'twinpeaks'
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        # exist_ok makes this safe whether or not an earlier test already
        # created the directory.
        os.makedirs(self.test_output_dir, exist_ok=True)
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} -o {1}"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)

    def test_malformed_noargs(self):
        """The last line wikiq writes to stderr is the expected ParseError."""
        call = self.base_call.format(self.input_file, self.test_output_dir)
        proc = subprocess.run(
            call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)

        # Decode the captured bytes instead of splitting their repr(); the
        # strip drops the trailing newline so the final entry is the last
        # line of real output.
        errlines = proc.stderr.decode("utf8").strip().splitlines()
        self.assertTrue(errlines, "wikiq wrote nothing to stderr")
        self.assertEqual(
            errlines[-1],
            'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')
class Test_Stdout(unittest.TestCase):
    """Checks that ``wikiq --stdout`` prints the same table as the baseline."""

    def setUp(self):
        """Record the command template and the paths the test uses."""
        self.wiki = 'sailormoon'
        self.wikiq_out_name = self.wiki + ".tsv"

        self.infile = "{0}.xml.7z".format(self.wiki)
        self.base_call = "../wikiq {0} --stdout"
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def test_noargs(self):
        """The table printed to stdout equals the ``noargs_`` baseline file."""
        call = self.base_call.format(self.input_file)
        proc = subprocess.run(call, stdout=subprocess.PIPE, shell=True)
        # Fail on the exit status rather than on a confusing parse or
        # comparison error further down.
        self.assertEqual(
            proc.returncode, 0,
            "wikiq exited with status {0}: {1}".format(proc.returncode, call))
        outs = proc.stdout.decode("utf8")

        test_file = "noargs_" + self.wikiq_out_name
        baseline_file = os.path.join(".", self.baseline_output_dir, test_file)

        # Parse the captured text as if it were a file on disk.
        test = pd.read_table(StringIO(outs))
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
263 if __name__ == '__main__':