4 from shutil import copyfile
 
   6 from pandas.util.testing import assert_frame_equal
 
   7 from io import StringIO
 
# with / without pwr DONE
# with / without url encode DONE
# with / without collapse user DONE
# with output to stdout DONE
# note that the persistence radius is 7 by default
# reading various file formats including
#        7z, gz, bz2, xml  DONE
# wikia and wikipedia data DONE
 
  19 class Test_Persistence(unittest.TestCase):
 
  23         if not os.path.exists("test_output"):
 
  24             os.mkdir("test_output")
 
  26         self.wiki = 'pwr-test'
 
  27         self.wikiq_out_name =  self.wiki + ".tsv"
 
  28         self.test_output_dir = os.path.join(".", "test_output")
 
  29         self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
 
  31         self.infile = "{0}.xml".format(self.wiki)    
 
  32         self.base_call = "../wikiq {0} -o {1}"
 
  33         self.input_dir = "dumps"
 
  34         self.input_file = os.path.join(".", self.input_dir,self.infile)
 
  35         self.baseline_output_dir = "baseline_output"
 
  37     def test_sequence_persistence(self):
 
  38         test_filename =  "sequence-" + self.wikiq_out_name
 
  39         test_file = os.path.join(self.test_output_dir, test_filename)
 
  40         if os.path.exists(test_file):
 
  43         call = self.base_call.format(self.input_file, self.test_output_dir)
 
  44         call = call + " --url-encode --persistence sequence"
 
  45         print(os.path.abspath('.'))
 
  47         proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
 
  50         copyfile(self.call_output, test_file)
 
  51         baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
 
  53         # as a test let's make sure that we get equal data frames
 
  54         test = pd.read_table(test_file)
 
  55         self.assertEqual(test['tokens_added'][0],7)
 
  56         self.assertEqual(test['tokens_added'][1],10)
 
  57         self.assertEqual(test['tokens_added'][2],0)
 
  58         self.assertEqual(test['tokens_added'][3],8)
 
  59         self.assertEqual(test['tokens_added'][4],0)
 
  60         self.assertEqual(test['tokens_removed'][0],0)
 
  61         self.assertEqual(test['tokens_removed'][1],0)
 
  62         self.assertEqual(test['tokens_removed'][2],0)
 
  63         self.assertEqual(test['tokens_removed'][3],4)
 
  64         self.assertEqual(test['tokens_removed'][4],0)
 
  65         self.assertEqual(test['token_revs'][0],8*3)
 
  66         self.assertEqual(test['token_revs'][1],0)
 
  67         self.assertEqual(test['token_revs'][2],0)
 
  68         self.assertEqual(test['token_revs'][3],0)
 
  69         self.assertEqual(test['token_revs'][4],0)
 
  71         baseline = pd.read_table(baseline_file)
 
  72         assert_frame_equal(test,baseline)
 
  74     def test_legacy_persistence(self):
 
  75         test_filename =  "legacy-" + self.wikiq_out_name
 
  76         test_file = os.path.join(self.test_output_dir, test_filename)
 
  77         if os.path.exists(test_file):
 
  80         call = self.base_call.format(self.input_file, self.test_output_dir)
 
  81         call = call + " --url-encode --persistence legacy"
 
  82         print(os.path.abspath('.'))
 
  84         proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
 
  87         copyfile(self.call_output, test_file)
 
  88         baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
 
  90         # as a test let's make sure that we get equal data frames
 
  91         test = pd.read_table(test_file)
 
  92         self.assertEqual(test['tokens_added'][0],7)
 
  93         self.assertEqual(test['tokens_added'][1],10)
 
  94         self.assertEqual(test['tokens_added'][2],0)
 
  95         self.assertEqual(test['tokens_added'][3],11)
 
  96         self.assertEqual(test['tokens_added'][4],0)
 
  97         self.assertEqual(test['tokens_removed'][0],0)
 
  98         self.assertEqual(test['tokens_removed'][1],0)
 
  99         self.assertEqual(test['tokens_removed'][2],0)
 
 100         self.assertEqual(test['tokens_removed'][3],7)
 
 101         self.assertEqual(test['tokens_removed'][4],0)
 
 102         self.assertEqual(test['token_revs'][0],7*3)
 
 103         self.assertEqual(test['token_revs'][1],0)
 
 104         self.assertEqual(test['token_revs'][2],0)
 
 105         self.assertEqual(test['token_revs'][3],0)
 
 106         self.assertEqual(test['token_revs'][4],0)
 
 108         baseline = pd.read_table(baseline_file)
 
 109         assert_frame_equal(test,baseline)
 
 113 class Test_Persistence_Bug(unittest.TestCase):
 
 116         if not os.path.exists("test_output"):
 
 117             os.mkdir("test_output")
 
 119         self.wiki = 'enwiki-test'
 
 120         self.wikiq_out_name =  self.wiki + ".tsv"
 
 121         self.test_output_dir = os.path.join(".", "test_output")
 
 122         self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
 
 124         self.infile = "{0}.xml".format(self.wiki)    
 
 125         self.base_call = "../wikiq {0} -o {1}"
 
 126         self.input_dir = "dumps"
 
 127         self.input_file = os.path.join(".", self.input_dir,self.infile)
 
 128         self.baseline_output_dir = "baseline_output"
 
 130     def test_sequence_persistence(self):
 
 131         test_filename =  "sequence-" + self.wikiq_out_name
 
 132         test_file = os.path.join(self.test_output_dir, test_filename)
 
 133         if os.path.exists(test_file):
 
 136         call = self.base_call.format(self.input_file, self.test_output_dir)
 
 137         call = call + " --url-encode --persistence sequence --collapse-user"
 
 138         print(os.path.abspath('.'))
 
 140         proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
 
 143         copyfile(self.call_output, test_file)
 
 144         baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
 
 146         # as a test let's make sure that we get equal data frames
 
 147         test = pd.read_table(test_file)
 
 148         baseline = pd.read_table(baseline_file)
 
 149         assert_frame_equal(test,baseline)
 
 152 class Test_Wikipedia(unittest.TestCase):
 
 154         if not os.path.exists("test_output"):
 
 155             os.mkdir("test_output")
 
 157         self.wiki = 'ikwiki-20180301-pages-meta-history'
 
 158         self.wikiq_out_name =  self.wiki + ".tsv"
 
 159         self.test_output_dir = os.path.join(".", "test_output")
 
 160         self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
 
 162         self.infile = "{0}.xml.bz2".format(self.wiki)    
 
 163         self.base_call = "../wikiq {0} -o {1}"
 
 164         self.input_dir = "dumps"
 
 165         self.input_file = os.path.join(".", self.input_dir,self.infile)
 
 166         self.baseline_output_dir = "baseline_output"
 
 168     def test_WP_url_encode(self):
 
 169         test_filename =  "url-encode_" + self.wikiq_out_name
 
 170         test_file = os.path.join(self.test_output_dir, test_filename)
 
 171         if os.path.exists(test_file):
 
 174         call = self.base_call.format(self.input_file, self.test_output_dir)
 
 175         call = call + " --url-encode"
 
 177         proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
 
 180         copyfile(self.call_output, test_file)
 
 181         baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
 
 183 #        as a test let's make sure that we get equal data frames
 
 185         test = pd.read_table(test_file)
 
 186         baseline = pd.read_table(baseline_file)
 
 187         assert_frame_equal(test,baseline)
 
 189     def test_WP_namespaces(self):
 
 190         print(os.path.abspath('.'))
 
 191         test_filename =  "namespaces_" + self.wikiq_out_name
 
 192         test_file = os.path.join(self.test_output_dir, test_filename)
 
 193         if os.path.exists(test_file):
 
 196         call = self.base_call.format(self.input_file, self.test_output_dir)
 
 197         call = call + " -n 0 -n 1"
 
 199         proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
 
 201         copyfile(self.call_output, test_file)
 
 202         baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)
 
 204 #        as a test let's make sure that we get equal data frames
 
 205         test = pd.read_table(test_file)
 
 206         num_wrong_ns = sum(~ test.namespace.isin({0,1}))
 
 207         self.assertEqual(num_wrong_ns, 0)
 
 208         baseline = pd.read_table(baseline_file)
 
 209         assert_frame_equal(test,baseline)
 
 212 class Test_Basic(unittest.TestCase):
 
 215         if not os.path.exists("test_output"):
 
 216             os.mkdir("test_output")
 
 218         self.wiki = 'sailormoon'
 
 219         self.wikiq_out_name =  self.wiki + ".tsv"
 
 220         self.test_output_dir = os.path.join(".", "test_output")
 
 221         self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
 
 223         self.infile = "{0}.xml.7z".format(self.wiki)
 
 224         self.base_call = "../wikiq {0} -o {1}"
 
 225         self.input_dir = "dumps"
 
 226         self.input_file = os.path.join(".", self.input_dir,self.infile)
 
 227         self.baseline_output_dir = "baseline_output"
 
 229     def test_noargs(self):
 
 231         test_filename =  "noargs_" + self.wikiq_out_name
 
 232         test_file = os.path.join(self.test_output_dir, test_filename)
 
 233         if os.path.exists(test_file):
 
 236         call = self.base_call.format(self.input_file, self.test_output_dir)
 
 238         proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
 
 241         copyfile(self.call_output, test_file)
 
 243         baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
 
 245         test = pd.read_table(test_file)
 
 246         baseline = pd.read_table(baseline_file)
 
 247         assert_frame_equal(test,baseline)
 
 250     def test_collapse_user(self):
 
 251         test_filename =  "collapse-user_" + self.wikiq_out_name
 
 252         test_file = os.path.join(self.test_output_dir, test_filename)
 
 253         if os.path.exists(test_file):
 
 256         call = self.base_call.format(self.input_file, self.test_output_dir)
 
 257         call = call + " --collapse-user"
 
 260         proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
 
 263         copyfile(self.call_output, test_file)
 
 265         baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
 
 266         test = pd.read_table(test_file)
 
 267         baseline = pd.read_table(baseline_file)
 
 268         assert_frame_equal(test,baseline)
 
 270     def test_pwr_segment(self):
 
 271         test_filename =  "persistence_segment_" + self.wikiq_out_name
 
 272         test_file = os.path.join(self.test_output_dir, test_filename)
 
 273         if os.path.exists(test_file):
 
 276         call = self.base_call.format(self.input_file, self.test_output_dir)
 
 277         call = call + " --persistence segment"
 
 279         proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
 
 283         copyfile(self.call_output, test_file)
 
 285         baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
 
 287         test = pd.read_table(test_file)
 
 289         baseline = pd.read_table(baseline_file)
 
 290         assert_frame_equal(test,baseline)
 
 292     def test_pwr_legacy(self):
 
 293         test_filename =  "persistence_legacy_" + self.wikiq_out_name
 
 294         test_file = os.path.join(self.test_output_dir, test_filename)
 
 295         if os.path.exists(test_file):
 
 298         call = self.base_call.format(self.input_file, self.test_output_dir)
 
 299         call = call + " --persistence legacy"
 
 301         proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
 
 304         copyfile(self.call_output, test_file)
 
 306         baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
 
 308         test = pd.read_table(test_file)
 
 309         baseline = pd.read_table(baseline_file)
 
 310         assert_frame_equal(test,baseline)
 
 313         test_filename =  "persistence_" + self.wikiq_out_name
 
 314         test_file = os.path.join(self.test_output_dir, test_filename)
 
 315         if os.path.exists(test_file): 
 
 318         call = self.base_call.format(self.input_file, self.test_output_dir)
 
 319         call = call + " --persistence"
 
 321         proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
 
 325         copyfile(self.call_output, test_file)
 
 327         baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
 
 329         test = pd.read_table(test_file)
 
 330         baseline = pd.read_table(baseline_file)
 
 331         assert_frame_equal(test,baseline)
 
 334     def test_url_encode(self):
 
 335         test_filename =  "url-encode_" + self.wikiq_out_name
 
 337         test_file = os.path.join(self.test_output_dir, test_filename)
 
 338         if os.path.exists(test_file):
 
 341         call = self.base_call.format(self.input_file, self.test_output_dir)
 
 342         call = call + " --url-encode"
 
 344         proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
 
 348         copyfile(self.call_output, test_file)
 
 349         baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
 
 350         test = pd.read_table(test_file)
 
 351         baseline = pd.read_table(baseline_file)
 
 352         assert_frame_equal(test,baseline)
 
 355 class Test_Malformed(unittest.TestCase):
 
 357         if not os.path.exists("test_output"):
 
 358             os.mkdir("test_output")
 
 360         self.wiki = 'twinpeaks'
 
 361         self.wikiq_out_name =  self.wiki + ".tsv"
 
 362         self.test_output_dir = os.path.join(".", "test_output")
 
 363         self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
 
 365         self.infile = "{0}.xml.7z".format(self.wiki)
 
 366         self.base_call = "../wikiq {0} -o {1}"
 
 367         self.input_dir = "dumps"
 
 368         self.input_file = os.path.join(".", self.input_dir,self.infile)
 
 371     def test_malformed_noargs(self):
 
 373         call = self.base_call.format(self.input_file, self.test_output_dir)
 
 375         proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True)
 
 377         outs, errs = proc.communicate()
 
 378         errlines = str(errs).split("\\n")
 
 379         self.assertEqual(errlines[-2],'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')
 
 381 class Test_Stdout(unittest.TestCase):
 
 384         self.wiki = 'sailormoon'
 
 385         self.wikiq_out_name =  self.wiki + ".tsv"
 
 387         self.infile = "{0}.xml.7z".format(self.wiki)
 
 388         self.base_call = "../wikiq {0} --stdout"
 
 389         self.input_dir = "dumps"
 
 390         self.input_file = os.path.join(".", self.input_dir,self.infile)
 
 391         self.baseline_output_dir = "baseline_output"
 
 393     def test_noargs(self):
 
 395         call = self.base_call.format(self.input_file)
 
 397         proc = subprocess.run(call,stdout=subprocess.PIPE,shell=True)
 
 398         outs = proc.stdout.decode("utf8")
 
 400         test_file = "noargs_" + self.wikiq_out_name
 
 401         baseline_file = os.path.join(".", self.baseline_output_dir, test_file)
 
 403         test = pd.read_table(StringIO(outs))
 
 404         baseline = pd.read_table(baseline_file)
 
 405         assert_frame_equal(test,baseline)
 
 407 if __name__ == '__main__':