]> code.communitydata.science - mediawiki_dump_tools.git/blob - tests/Wikiq_Test.py
Use dask to parallelize and scale user level datasets
[mediawiki_dump_tools.git] / tests / Wikiq_Test.py
1 import unittest
2 import os
3 import subprocess
4 from shutil import copyfile
5 import pandas as pd
6 from pandas.util.testing import assert_frame_equal
7 from io import StringIO
8
9 # with / without pwr DONE
10 # with / without url encode DONE
11 # with / without collapse user DONE
12 # with output to stdout DONE
13 # note that the persistence radius is 7 by default
14 # reading various file formats including
15 #        7z, gz, bz2, xml  DONE
16 # wikia and wikipedia data DONE
17 # malformed xmls DONE
18
class Test_Wikiq(unittest.TestCase):
    """Shared helpers for the wikiq regression tests.

    Subclasses set ``self.wiki`` (the dump's base name) and then call
    ``setuptoutputfiles`` to derive every path the tests use.
    """

    def mkoutputdir(self):
        """Create the ``test_output`` directory if it does not exist yet."""
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs("test_output", exist_ok=True)

    def setuptoutputfiles(self, suffix="xml.7z"):
        """Derive input/output paths from ``self.wiki``.

        Args:
            suffix: extension of the dump file under ``dumps/``.

        Sets ``wikiq_out_name``, ``test_output_dir``, ``call_output``,
        ``infile``, ``input_dir``, ``input_file`` and
        ``baseline_output_dir`` on the instance.
        """
        self.wikiq_out_name = self.wiki + ".tsv"
        self.test_output_dir = os.path.join(".", "test_output")
        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
        self.infile = "{0}.{1}".format(self.wiki, suffix)
        self.input_dir = "dumps"
        self.input_file = os.path.join(".", self.input_dir, self.infile)
        self.baseline_output_dir = "baseline_output"

    def run_and_check_output(self, call, test_filename):
        """Run ``call`` in a shell and compare its TSV output to the baseline.

        Args:
            call: shell command line expected to write ``self.call_output``.
            test_filename: name under which the output is stored in
                ``self.test_output_dir`` and looked up in
                ``self.baseline_output_dir``.

        Raises:
            AssertionError: if the command exits with a non-zero status or
                the resulting data frame differs from the baseline.
        """
        test_file = os.path.join(self.test_output_dir, test_filename)
        # Remove leftovers from earlier runs so that a failed command can
        # never be "verified" against output produced by a previous test.
        for stale in (test_file, self.call_output):
            if os.path.exists(stale):
                os.remove(stale)

        proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True)
        # communicate() drains the pipe while waiting; wait() alone can
        # deadlock once the child has filled the OS pipe buffer.
        proc.communicate()
        self.assertEqual(
            proc.returncode, 0,
            "command exited with status {0}: {1}".format(proc.returncode, call))

        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        # as a test let's make sure that we get equal data frames
        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
49
class Test_Wikipedia(Test_Wikiq):
    """Runs wikiq against the bz2-compressed ``ikwiki`` history dump."""

    def setUp(self):
        """Point the shared helpers at the ``ikwiki`` dump before each test."""
        # Show the working directory; all paths used below are relative to it.
        print(os.path.abspath(os.curdir))
        self.base_call = "../bin/wikiq {0} -o {1}"
        self.wiki = 'ikwiki-20180301-pages-meta-history'
        self.mkoutputdir()
        self.setuptoutputfiles(suffix="xml.bz2")

    def test_WP_url_encode(self):
        # Output produced with --url-encode must match the stored baseline.
        command = self.base_call.format(self.input_file, self.test_output_dir) + " --url-encode"
        self.run_and_check_output(command, "url-encode_" + self.wikiq_out_name)
63
64
class Test_Basic(Test_Wikiq):
    """Runs wikiq on the ``sailormoon`` dump with each supported option."""

    def setUp(self):
        """Prepare output paths and the command template for every test."""
        self.base_call = "../bin/wikiq {0} -o {1}"
        self.wiki = "sailormoon"
        self.mkoutputdir()
        self.setuptoutputfiles()

    def test_noargs(self):
        # Plain invocation; the command line is echoed for the test log.
        expected_name = "noargs_" + self.wikiq_out_name
        command = self.base_call.format(self.input_file, self.test_output_dir)
        print(command)
        self.run_and_check_output(command, expected_name)

    def test_collapse_user(self):
        # Invocation with --collapse-user.
        expected_name = "collapse-user_" + self.wikiq_out_name
        command = self.base_call.format(self.input_file, self.test_output_dir) + " --collapse-user"
        self.run_and_check_output(command, expected_name)

    def test_pwr_legacy(self):
        # Invocation with --persistence-legacy.
        expected_name = "persistence_legacy_" + self.wikiq_out_name
        command = self.base_call.format(self.input_file, self.test_output_dir) + " --persistence-legacy"
        self.run_and_check_output(command, expected_name)

    def test_pwr(self):
        # Invocation with --persistence.
        expected_name = "persistence_" + self.wikiq_out_name
        command = self.base_call.format(self.input_file, self.test_output_dir) + " --persistence"
        self.run_and_check_output(command, expected_name)

    def test_url_encode(self):
        # Invocation with --url-encode.
        expected_name = "url-encode_" + self.wikiq_out_name
        command = self.base_call.format(self.input_file, self.test_output_dir) + " --url-encode"
        self.run_and_check_output(command, expected_name)
108
class Test_Malformed(Test_Wikiq):
    """Checks the error wikiq reports for the malformed ``twinpeaks`` dump."""

    def setUp(self):
        """Prepare output paths and the command template for the test."""
        self.mkoutputdir()
        self.wiki = "twinpeaks"
        self.setuptoutputfiles()
        self.base_call = "../bin/wikiq {0} -o {1}"

    def test_malformed_noargs(self):
        # The last line written to stderr must be the XML parse error.
        call = self.base_call.format(self.input_file, self.test_output_dir)
        proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        # communicate() both drains the pipes and waits for exit; calling
        # wait() first can deadlock once either pipe buffer fills up.
        _, errs = proc.communicate()
        # Decode the captured bytes instead of splitting their repr(), then
        # drop the trailing newline so the final element is the last line.
        errlines = errs.decode("utf-8", errors="replace").rstrip("\n").split("\n")
        self.assertEqual(errlines[-1], 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')
124
class Test_Stdout(Test_Wikiq):
    """Checks that ``--stdout`` output matches the ``noargs`` baseline."""

    def setUp(self):
        """Prepare the paths for the ``sailormoon`` dump."""
        self.mkoutputdir()
        self.wiki = 'sailormoon'
        self.setuptoutputfiles()

    def test_noargs(self):
        # Run wikiq without a shell and capture what it prints to stdout.
        self.base_call = ["../bin/wikiq", self.input_file, "--stdout"]
        proc = subprocess.Popen(self.base_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8')
        # communicate() drains stdout and stderr together and reaps the
        # process; reading only stdout leaves stderr unread, which can block
        # the child, and never waits for it to exit.
        outs, errs = proc.communicate()
        self.assertEqual(proc.returncode, 0, errs)

        test_file = "noargs_" + self.wikiq_out_name
        baseline_file = os.path.join(".", self.baseline_output_dir, test_file)
        # The captured text is parsed from memory rather than from the pipe.
        test = pd.read_table(StringIO(outs))
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline)
142         
# Run the whole suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()

Community Data Science Collective || Want to submit a patch?