def _make_key(self, cap_group):
    """Return the output key for one capture group.

    The key is ``self.label`` and ``cap_group`` joined by an underscore;
    both values are rendered with ``str.format``, so ``cap_group`` does not
    have to be a string.
    """
    return "{}_{}".format(self.label, cap_group)
- def matchmake(self, content, rev_data):
-
+ def matchmake(self, content, rev_data, count_only=False):
temp_dict = {}
# if there are named capture groups in the regex
if self.has_groups:
temp_dict[key] = None
# else we put in the list we made in the for-loop above
else:
- temp_dict[key] = ', '.join(temp_list)
+ if count_only:
+ temp_dict[key] = len(temp_list)
+ else:
+ temp_dict[key] = ', '.join(temp_list)
# there are no matches at all in this revision content, we default values to None
else:
for cap_group in self.capture_groups:
key = self._make_key(cap_group)
- temp_dict[key] = None
+ if count_only:
+ temp_dict[key] = 0
+ else:
+ temp_dict[key] = None
# there are no capture groups, we just search for all the matches of the regex
else:
#given that there are matches to be made
if content is not None and self.pattern.search(content) is not None:
m = self.pattern.findall(content)
- temp_dict[self.label] = ', '.join(m)
+ if count_only:
+ temp_dict[self.label] = len(m)
+ else:
+ temp_dict[self.label] = ', '.join(m)
else:
- temp_dict[self.label] = None
+ if count_only:
+ temp_dict[self.label] = 0
+ else:
+ temp_dict[self.label] = None
# update rev_data with our new columns
rev_data.update(temp_dict)
return rev_data
class WikiqParser():
- def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
+ def __init__(self, input_file, output_file, regex_match_revision, regex_revision_label, regex_revision_output_count, regex_match_comment, regex_comment_label, regex_comment_output_count, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
"""
Parameters:
persist : what persistence method to use. Takes a PersistMethod value
self.namespace_filter = None
self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
+ self.regex_revision_output_count = regex_revision_output_count
+
self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
-
+ self.regex_comment_output_count = regex_comment_output_count
def make_matchmake_pairs(self, patterns, labels):
if (patterns is not None and labels is not None) and \
return rev_data
def matchmake_revision(self, text, rev_data):
    """Run the revision-text pattern pairs against ``text``.

    Delegates to ``self.matchmake_pairs`` using ``self.regex_revision_pairs``
    and forwards ``self.regex_revision_output_count`` as the count-only
    flag. Returns whatever ``matchmake_pairs`` returns.
    """
    return self.matchmake_pairs(
        text,
        rev_data,
        self.regex_revision_pairs,
        self.regex_revision_output_count,
    )
def matchmake_comment(self, comment, rev_data):
    """Run the comment pattern pairs against ``comment``.

    Delegates to ``self.matchmake_pairs`` using ``self.regex_comment_pairs``
    and forwards ``self.regex_comment_output_count`` as the count-only
    flag. Returns whatever ``matchmake_pairs`` returns.
    """
    return self.matchmake_pairs(
        comment,
        rev_data,
        self.regex_comment_pairs,
        self.regex_comment_output_count,
    )
def matchmake_pairs(self, text, rev_data, pairs, count_only=False):
    """Apply every pair in ``pairs`` to ``text``, threading ``rev_data`` through.

    Parameters:
       text : the content handed to each pair's ``matchmake``.
       rev_data : the value passed to the first pair; each pair's return
           value becomes the input of the next one.
       pairs : iterable of objects exposing
           ``matchmake(content, rev_data, count_only)``.
       count_only : forwarded unchanged to each ``matchmake`` call.
           Defaults to False so that three-argument callers keep working
           and the default agrees with ``matchmake`` itself.

    Returns the value returned by the last pair, or ``rev_data`` unchanged
    when ``pairs`` is empty.
    """
    for pair in pairs:
        rev_data = pair.matchmake(text, rev_data, count_only)
    return rev_data
def __get_namespace_from_title(self, title):
parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
                    help="The label for the outputted column based on matching the regex in revision text.")

# Boolean switch (store_true: False unless the flag is given); applies to every revision pattern.
parser.add_argument('-RPc', '--revision-pattern-count', dest="regex_revision_output_count", action='store_true',
                    help="If present, this will cause the revision patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all revision patterns.")

parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
                    help="The regular expression to search for in comments of revisions.")

parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
                    help="The label for the outputted column based on matching the regex in comments.")

# Boolean switch (store_true: False unless the flag is given); applies to every comment pattern.
parser.add_argument('-CPc', '--comment-pattern-count', dest="regex_comment_output_count", action='store_true',
                    help="If present, this will cause the comment patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all comment patterns.")

args = parser.parse_args()
# set persistence method
revert_radius=args.revert_radius,
regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label,
+ regex_revision_output_count = args.regex_revision_output_count,
regex_match_comment = args.regex_match_comment,
- regex_comment_label = args.regex_comment_label)
+ regex_comment_label = args.regex_comment_label,
+ regex_comment_output_count = args.regex_comment_output_count)
wikiq.process()
revert_radius=args.revert_radius,
regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label,
+ regex_revision_output_count = args.regex_revision_output_count,
regex_match_comment = args.regex_match_comment,
- regex_comment_label = args.regex_comment_label)
+ regex_comment_label = args.regex_comment_label,
+ regex_comment_output_count = args.regex_comment_output_count)
+
wikiq.process()