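Added counting functionality to regex code: new -RPc/--revision-pattern-count and -CPc/--comment-pattern-count flags output the number of matches instead of the matched text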

author Benjamin Mako Hill <mako@atdot.cc>

Sat, 29 Apr 2023 18:40:03 +0000 (11:40 -0700)

committer Benjamin Mako Hill <mako@atdot.cc>

Sat, 29 Apr 2023 18:40:03 +0000 (11:40 -0700)
author Benjamin Mako Hill <mako@atdot.cc>
Sat, 29 Apr 2023 18:40:03 +0000 (11:40 -0700)
committer Benjamin Mako Hill <mako@atdot.cc>
Sat, 29 Apr 2023 18:40:03 +0000 (11:40 -0700)
diff --git a/wikiq b/wikiq

index 2c1ef3857e9cd5c02e09000fae3ab9125853cf64..91b86d758bfc96191fc3b2660dc911b86652f8ff 100755 (executable)
--- a/wikiq
+++ b/wikiq
@@ -139,8 +139,7 @@ class RegexPair(object):
      def _make_key(self, cap_group):
          return ("{}_{}".format(self.label, cap_group))
  
-    def matchmake(self, content, rev_data):
-        
+    def matchmake(self, content, rev_data, count_only=False):
          temp_dict = {}
          # if there are named capture groups in the regex
          if self.has_groups:
@@ -163,29 +162,41 @@ class RegexPair(object):
                          temp_dict[key] = None
                      # else we put in the list we made in the for-loop above
                      else:
-                        temp_dict[key] = ', '.join(temp_list)
+                        if count_only:
+                            temp_dict[key] = len(temp_list)
+                        else:
+                            temp_dict[key] = ', '.join(temp_list)
  
              # there are no matches at all in this revision content, we default values to None
              else:
                  for cap_group in self.capture_groups:
                      key = self._make_key(cap_group)
-                    temp_dict[key] = None
+                    if count_only:
+                        temp_dict[key] = 0
+                    else:
+                        temp_dict[key] = None
  
          # there are no capture groups, we just search for all the matches of the regex
          else:
              #given that there are matches to be made
              if content is not None and self.pattern.search(content) is not None:
                  m = self.pattern.findall(content)
-                temp_dict[self.label] = ', '.join(m)
+                if count_only:
+                    temp_dict[self.label] = len(m)
+                else:
+                    temp_dict[self.label] = ', '.join(m)
              else:
-                temp_dict[self.label] = None    
+                if count_only:
+                    temp_dict[self.label] = 0
+                else:
+                    temp_dict[self.label] = None
          # update rev_data with our new columns
          rev_data.update(temp_dict)
          return rev_data
  
          
  class WikiqParser():
-    def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
+    def __init__(self, input_file, output_file, regex_match_revision, regex_revision_label, regex_revision_output_count, regex_match_comment, regex_comment_label, regex_comment_output_count, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
          """ 
          Parameters:
             persist : what persistence method to use. Takes a PersistMethod value
@@ -205,8 +216,10 @@ class WikiqParser():
              self.namespace_filter = None
  
          self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
+        self.regex_revision_output_count = regex_revision_output_count
+
          self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
-        
+        self.regex_comment_output_count = regex_comment_output_count
  
      def make_matchmake_pairs(self, patterns, labels):
          if (patterns is not None and labels is not None) and \
@@ -223,14 +236,14 @@ class WikiqParser():
          return rev_data
  
      def matchmake_revision(self, text, rev_data):
-        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
+        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs, self.regex_revision_output_count)
  
      def matchmake_comment(self, comment, rev_data):
-        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
+        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs, self.regex_comment_output_count)
  
-    def matchmake_pairs(self, text, rev_data, pairs):
+    def matchmake_pairs(self, text, rev_data, pairs, count_only):
          for pair in pairs:
-            rev_data = pair.matchmake(text, rev_data)
+            rev_data = pair.matchmake(text, rev_data, count_only)
          return rev_data
  
      def __get_namespace_from_title(self, title):
@@ -496,12 +509,18 @@ parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", de
  parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
                      help="The label for the outputted column based on matching the regex in revision text.")
  
+parser.add_argument('-RPc', '--revision-pattern-count', dest="regex_revision_output_count", action='store_true',
+                    help="If present, this will cause the revision patterns to return counts of the number of matches instead of the text of the matches themselves.  It will affect all revision patterns.")
+
  parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
                      help="The regular expression to search for in comments of revisions.")
  
  parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
                      help="The label for the outputted column based on matching the regex in comments.")
  
+parser.add_argument('-CPc', '--comment-pattern-count', dest="regex_comment_output_count", action='store_true',
+                    help="If present, this will cause the comments patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all comment patterns.")
+
  args = parser.parse_args()
  
  # set persistence method
@@ -547,8 +566,10 @@ if len(args.dumpfiles) > 0:
                              revert_radius=args.revert_radius,
                              regex_match_revision = args.regex_match_revision,
                              regex_revision_label = args.regex_revision_label,
+                            regex_revision_output_count = args.regex_revision_output_count,
                              regex_match_comment = args.regex_match_comment,
-                            regex_comment_label = args.regex_comment_label)
+                            regex_comment_label = args.regex_comment_label,
+                            regex_comment_output_count = args.regex_comment_output_count)
  
          wikiq.process()
  
@@ -566,8 +587,11 @@ else:
                          revert_radius=args.revert_radius,
                          regex_match_revision = args.regex_match_revision,
                          regex_revision_label = args.regex_revision_label,
+                        regex_revision_output_count = args.regex_revision_output_count,
                          regex_match_comment = args.regex_match_comment,
-                        regex_comment_label = args.regex_comment_label)
+                        regex_comment_label = args.regex_comment_label,
+                        regex_comment_output_count = args.regex_comment_output_count)
+
  
      wikiq.process()
author	Benjamin Mako Hill <mako@atdot.cc>
	Sat, 29 Apr 2023 18:40:03 +0000 (11:40 -0700)
committer	Benjamin Mako Hill <mako@atdot.cc>
	Sat, 29 Apr 2023 18:40:03 +0000 (11:40 -0700)