X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/556285b198db65f81f3fbd9d5eec59e94c2720a0..2ff4d6061399c22eb539f1fd609e7046aa44dba9:/wikiq diff --git a/wikiq b/wikiq index 4a5c129..91b86d7 100755 --- a/wikiq +++ b/wikiq @@ -139,14 +139,13 @@ class RegexPair(object): def _make_key(self, cap_group): return ("{}_{}".format(self.label, cap_group)) - def matchmake(self, content, rev_data): - + def matchmake(self, content, rev_data, count_only=False): temp_dict = {} # if there are named capture groups in the regex if self.has_groups: # if there are matches of some sort in this revision content, fill the lists for each cap_group - if self.pattern.search(content) is not None: + if content is not None and self.pattern.search(content) is not None: m = self.pattern.finditer(content) matchobjects = list(m) @@ -163,29 +162,41 @@ class RegexPair(object): temp_dict[key] = None # else we put in the list we made in the for-loop above else: - temp_dict[key] = ', '.join(temp_list) + if count_only: + temp_dict[key] = len(temp_list) + else: + temp_dict[key] = ', '.join(temp_list) # there are no matches at all in this revision content, we default values to None else: for cap_group in self.capture_groups: key = self._make_key(cap_group) - temp_dict[key] = None + if count_only: + temp_dict[key] = 0 + else: + temp_dict[key] = None # there are no capture groups, we just search for all the matches of the regex else: #given that there are matches to be made - if self.pattern.search(content) is not None: + if content is not None and self.pattern.search(content) is not None: m = self.pattern.findall(content) - temp_dict[self.label] = ', '.join(m) + if count_only: + temp_dict[self.label] = len(m) + else: + temp_dict[self.label] = ', '.join(m) else: - temp_dict[self.label] = None + if count_only: + temp_dict[self.label] = 0 + else: + temp_dict[self.label] = None # update rev_data with our new columns rev_data.update(temp_dict) return rev_data class WikiqParser(): - def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15): + def __init__(self, input_file, output_file, regex_match_revision, regex_revision_label, regex_revision_output_count, regex_match_comment, regex_comment_label, regex_comment_output_count, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15): """ Parameters: persist : what persistence method to use. Takes a PersistMethod value @@ -205,8 +216,10 @@ class WikiqParser(): self.namespace_filter = None self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label) + self.regex_revision_output_count = regex_revision_output_count + self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label) - + self.regex_comment_output_count = regex_comment_output_count def make_matchmake_pairs(self, patterns, labels): if (patterns is not None and labels is not None) and \ @@ -223,14 +236,14 @@ class WikiqParser(): return rev_data def matchmake_revision(self, text, rev_data): - return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs) + return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs, self.regex_revision_output_count) def matchmake_comment(self, comment, rev_data): - return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs) + return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs, self.regex_comment_output_count) - def matchmake_pairs(self, text, rev_data, pairs): + def matchmake_pairs(self, text, rev_data, pairs, count_only): for pair in pairs: - rev_data = pair.matchmake(text, rev_data) + rev_data = pair.matchmake(text, rev_data, count_only) return rev_data def __get_namespace_from_title(self, title): @@ -496,12 +509,18 @@ parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", de parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append', help="The label for the outputted column based on matching the regex in revision text.") +parser.add_argument('-RPc', '--revision-pattern-count', dest="regex_revision_output_count", action='store_true', + help="If present, this will cause the revision patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all revision patterns.") + parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append', help="The regular expression to search for in comments of revisions.") parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append', help="The label for the outputted column based on matching the regex in comments.") +parser.add_argument('-CPc', '--comment-pattern-count', dest="regex_comment_output_count", action='store_true', + help="If present, this will cause the comments patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all comment patterns.") + args = parser.parse_args() # set persistence method @@ -547,8 +566,10 @@ if len(args.dumpfiles) > 0: revert_radius=args.revert_radius, regex_match_revision = args.regex_match_revision, regex_revision_label = args.regex_revision_label, + regex_revision_output_count = args.regex_revision_output_count, regex_match_comment = args.regex_match_comment, - regex_comment_label = args.regex_comment_label) + regex_comment_label = args.regex_comment_label, + regex_comment_output_count = args.regex_comment_output_count) wikiq.process() @@ -566,8 +587,11 @@ else: revert_radius=args.revert_radius, regex_match_revision = args.regex_match_revision, regex_revision_label = args.regex_revision_label, + regex_revision_output_count = args.regex_revision_output_count, regex_match_comment = args.regex_match_comment, - regex_comment_label = args.regex_comment_label) + regex_comment_label = args.regex_comment_label, + regex_comment_output_count = args.regex_comment_output_count) + wikiq.process()