code.communitydata.science - mediawiki_dump_tools.git/blobdiff - test/Wikiq_Unit_Test.py
add flag for excluding whitespace and punctuation
[mediawiki_dump_tools.git] / test / Wikiq_Unit_Test.py
index c04bbd8d4d92f41a36db97066587e5e45207f537..02a0769c8745cb4deaf5a252fea8888ee2cdabe6 100644 (file)
@@ -92,14 +92,53 @@ class Test_Persistence(unittest.TestCase):
         self.assertEqual(test['tokens_added'][0],7)
         self.assertEqual(test['tokens_added'][1],10)
         self.assertEqual(test['tokens_added'][2],0)
-        self.assertEqual(test['tokens_added'][3],11)
+        self.assertEqual(test['tokens_added'][3],8)
+        self.assertEqual(test['tokens_added'][4],0)
+        self.assertEqual(test['tokens_removed'][0],0)
+        self.assertEqual(test['tokens_removed'][1],0)
+        self.assertEqual(test['tokens_removed'][2],10)
+        self.assertEqual(test['tokens_removed'][3],4)
+        self.assertEqual(test['tokens_removed'][4],0)
+        self.assertEqual(test['token_revs'][0],8*3)
+        self.assertEqual(test['token_revs'][1],0)
+        self.assertEqual(test['token_revs'][2],0)
+        self.assertEqual(test['token_revs'][3],0)
+        self.assertEqual(test['token_revs'][4],0)
+
+        baseline = pd.read_table(baseline_file)
+        assert_frame_equal(test,baseline)
+
+
+        
+    def test_segment_persistence_exclude_ws(self):
+        test_filename =  "segment_excludews_" + self.wikiq_out_name
+        test_file = os.path.join(self.test_output_dir, test_filename)
+        if os.path.exists(test_file):
+            os.remove(test_file)
+        
+        call = self.base_call.format(self.input_file, self.test_output_dir)
+        call = call + " --url-encode --persistence segment --exclude-whitespace"
+        print(os.path.abspath('.'))
+        print(call)
+        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
+        proc.wait()
+
+        copyfile(self.call_output, test_file)
+        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
+
+        # as a test let's make sure that we get equal data frames
+        test = pd.read_table(test_file)
+        self.assertEqual(test['tokens_added'][0],4)
+        self.assertEqual(test['tokens_added'][1],5)
+        self.assertEqual(test['tokens_added'][2],0)
+        self.assertEqual(test['tokens_added'][3],6)
         self.assertEqual(test['tokens_added'][4],0)
         self.assertEqual(test['tokens_removed'][0],0)
         self.assertEqual(test['tokens_removed'][1],0)
         self.assertEqual(test['tokens_removed'][2],0)
-        self.assertEqual(test['tokens_removed'][3],7)
+        self.assertEqual(test['tokens_removed'][3],4)
         self.assertEqual(test['tokens_removed'][4],0)
-        self.assertEqual(test['token_revs'][0],7*3)
+        self.assertEqual(test['token_revs'][0],4*3)
         self.assertEqual(test['token_revs'][1],0)
         self.assertEqual(test['token_revs'][2],0)
         self.assertEqual(test['token_revs'][3],0)
@@ -289,6 +328,29 @@ class Test_Basic(unittest.TestCase):
         baseline = pd.read_table(baseline_file)
         assert_frame_equal(test,baseline)
 
+    def test_pwr_segment_collapse(self):
+        """Run base_call with --persistence segment --collapse-user; compare output to the baseline."""
+        test_filename =  "persistence_segment_collapse_" + self.wikiq_out_name
+        test_file = os.path.join(self.test_output_dir, test_filename)
+        if os.path.exists(test_file):
+            os.remove(test_file)
+
+        call = self.base_call.format(self.input_file, self.test_output_dir)
+        call = call + " --persistence segment --collapse-user"
+        print(call)
+        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
+        # communicate() drains the stdout pipe; wait() alone can deadlock when the pipe buffer fills.
+        proc.communicate()
+
+        copyfile(self.call_output, test_file)
+        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
+
+        test = pd.read_table(test_file)
+        print(test)
+        baseline = pd.read_table(baseline_file)
+        assert_frame_equal(test,baseline)
+
+
     def test_pwr_legacy(self):
         test_filename =  "persistence_legacy_" + self.wikiq_out_name
         test_file = os.path.join(self.test_output_dir, test_filename)

Community Data Science Collective || Want to submit a patch?