X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/02b3250a36a9a412ad5fe7f336caee861c2565b2..556285b198db65f81f3fbd9d5eec59e94c2720a0:/wikiq?ds=sidebyside diff --git a/wikiq b/wikiq index 7e75eda..4a5c129 100755 --- a/wikiq +++ b/wikiq @@ -144,7 +144,7 @@ class RegexPair(object): temp_dict = {} # if there are named capture groups in the regex if self.has_groups: - # initialize the {capture_group_name:list} for each capture group + # if there are matches of some sort in this revision content, fill the lists for each cap_group if self.pattern.search(content) is not None: m = self.pattern.finditer(content) @@ -183,7 +183,6 @@ class RegexPair(object): rev_data.update(temp_dict) return rev_data - class WikiqParser(): def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15): @@ -368,6 +367,8 @@ class WikiqParser(): rev_data['collapsed_revs'] = rev.collapsed_revs if self.persist != PersistMethod.none: + # initialize an empty dictionary before assigning things into it. this catches bugs if the first revision is deleted + old_rev_data = {} if rev.deleted.text: for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]: old_rev_data[k] = None