X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/2ff4d6061399c22eb539f1fd609e7046aa44dba9..933ca753ede98a783c935951130458eb2c895eef:/wikiq?ds=inline diff --git a/wikiq b/wikiq index 91b86d7..e8c1247 100755 --- a/wikiq +++ b/wikiq @@ -145,29 +145,26 @@ class RegexPair(object): if self.has_groups: # if there are matches of some sort in this revision content, fill the lists for each cap_group - if content is not None and self.pattern.search(content) is not None: - m = self.pattern.finditer(content) - matchobjects = list(m) - + if content is not None and len(matchobjects := list(self.pattern.finditer(content))) > 0: for cap_group in self.capture_groups: key = self._make_key(cap_group) temp_list = [] for match in matchobjects: # we only want to add the match for the capture group if the match is not None - if match.group(cap_group) != None: - temp_list.append(match.group(cap_group)) + if (group := match.group(cap_group)) is not None: + temp_list.append(group) - # if temp_list of matches is empty just make that column None - if len(temp_list)==0: - temp_dict[key] = None - # else we put in the list we made in the for-loop above - else: - if count_only: - temp_dict[key] = len(temp_list) + # if temp_list of matches is empty just make that column None + if len(temp_list)==0: + temp_dict[key] = None + # else we put in the list we made in the for-loop above else: - temp_dict[key] = ', '.join(temp_list) + if count_only: + temp_dict[key] = len(temp_list) + else: + temp_dict[key] = ', '.join(temp_list) - # there are no matches at all in this revision content, we default values to None + # there are no matches at all in this revision content, we default values to None else: for cap_group in self.capture_groups: key = self._make_key(cap_group) @@ -196,7 +193,7 @@ class RegexPair(object): class WikiqParser(): - def __init__(self, input_file, output_file, regex_match_revision, regex_revision_label, regex_revision_output_count, regex_match_comment, regex_comment_label, regex_comment_output_count, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15): + def __init__(self, input_file, output_file, regex_revision_match, regex_revision_label, regex_revision_output_count, regex_comment_match, regex_comment_label, regex_comment_output_count, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15): """ Parameters: persist : what persistence method to use. Takes a PersistMethod value @@ -215,10 +212,10 @@ class WikiqParser(): else: self.namespace_filter = None - self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label) + self.regex_revision_pairs = self.make_matchmake_pairs(regex_revision_match, regex_revision_label) self.regex_revision_output_count = regex_revision_output_count - self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label) + self.regex_comment_pairs = self.make_matchmake_pairs(regex_comment_match, regex_comment_label) self.regex_comment_output_count = regex_comment_output_count def make_matchmake_pairs(self, patterns, labels): @@ -503,7 +500,7 @@ parser.add_argument('-rr', default=15, help="Number of edits to check when looking for reverts (default: 15)") -parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append', +parser.add_argument('-RP', '--revision-pattern', dest="regex_revision_match", default=None, type=str, action='append', help="The regular expression to search for in revision text. The regex must be surrounded by quotes.") parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append', @@ -512,7 +509,7 @@ parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_lab parser.add_argument('-RPc', '--revision-pattern-count', dest="regex_revision_output_count", action='store_true', help="If present, this will cause the revision patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all revision patterns.") -parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append', +parser.add_argument('-CP', '--comment-pattern', dest="regex_comment_match", default=None, type=str, action='append', help="The regular expression to search for in comments of revisions.") parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append', @@ -564,10 +561,10 @@ if len(args.dumpfiles) > 0: urlencode=args.urlencode, namespaces=namespaces, revert_radius=args.revert_radius, - regex_match_revision = args.regex_match_revision, + regex_revision_match = args.regex_revision_match, regex_revision_label = args.regex_revision_label, regex_revision_output_count = args.regex_revision_output_count, - regex_match_comment = args.regex_match_comment, + regex_comment_match = args.regex_comment_match, regex_comment_label = args.regex_comment_label, regex_comment_output_count = args.regex_comment_output_count) @@ -585,10 +582,10 @@ else: urlencode=args.urlencode, namespaces=namespaces, revert_radius=args.revert_radius, - regex_match_revision = args.regex_match_revision, + regex_revision_match = args.regex_revision_match, regex_revision_label = args.regex_revision_label, regex_revision_output_count = args.regex_revision_output_count, - regex_match_comment = args.regex_match_comment, + regex_comment_match = args.regex_comment_match, regex_comment_label = args.regex_comment_label, regex_comment_output_count = args.regex_comment_output_count)