X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/02b3250a36a9a412ad5fe7f336caee861c2565b2..4729371d5ab057b5d028d5ca9b7aeafd7bc40478:/wikiq

diff --git a/wikiq b/wikiq
index 7e75eda..2c1ef38 100755
--- a/wikiq
+++ b/wikiq
@@ -144,9 +144,9 @@ class RegexPair(object):
         temp_dict = {}
         # if there are named capture groups in the regex
         if self.has_groups:
 
-            # initialize the {capture_group_name:list} for each capture group
+            # if there are matches of some sort in this revision content, fill the lists for each cap_group
-            if self.pattern.search(content) is not None:
+            if content is not None and self.pattern.search(content) is not None:
                 m = self.pattern.finditer(content)
                 matchobjects = list(m)
@@ -174,7 +174,7 @@ class RegexPair(object):
         # there are no capture groups, we just search for all the matches of the regex
         else:
             #given that there are matches to be made
-            if self.pattern.search(content) is not None:
+            if content is not None and self.pattern.search(content) is not None:
                 m = self.pattern.findall(content)
                 temp_dict[self.label] = ', '.join(m)
             else:
@@ -183,7 +183,6 @@ class RegexPair(object):
         rev_data.update(temp_dict)
         return rev_data
 
-
 class WikiqParser():
     def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
@@ -368,6 +367,8 @@ class WikiqParser():
                 rev_data['collapsed_revs'] = rev.collapsed_revs
 
             if self.persist != PersistMethod.none:
+                # initialize an empty dictionary before assigning things into it. this catches bugs if the first revision is deleted
+                old_rev_data = {}
                 if rev.deleted.text:
                     for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                         old_rev_data[k] = None
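
For context, the behavioral change in the RegexPair hunks is the added `content is not None and ...` guard: revision or comment text can be None (for example when a revision's text is deleted), and calling re methods on None raises a TypeError. The sketch below illustrates that guarded pattern in a self-contained form. It reuses the names visible in the diff (pattern, label, has_groups, temp_dict, rev_data), but the constructor and the method name matchmake are assumptions for illustration, not necessarily wikiq's exact interface.

    import re

    class RegexPair(object):
        # illustrative constructor; wikiq's real __init__ may differ
        def __init__(self, pattern, label):
            self.pattern = re.compile(pattern)
            self.label = label
            # True when the regex declares named capture groups
            self.has_groups = bool(self.pattern.groupindex)

        # hypothetical method name; demonstrates the None guard from the diff
        def matchmake(self, content, rev_data):
            temp_dict = {}
            # content can be None for deleted revisions; searching None would raise TypeError
            if content is not None and self.pattern.search(content) is not None:
                if self.has_groups:
                    # collect each named capture group's matches into one comma-separated field
                    for name in self.pattern.groupindex:
                        hits = [m.group(name) for m in self.pattern.finditer(content)
                                if m.group(name) is not None]
                        temp_dict["{}_{}".format(self.label, name)] = ', '.join(hits)
                else:
                    temp_dict[self.label] = ', '.join(self.pattern.findall(content))
            else:
                # no usable content or no match: emit empty columns
                if self.has_groups:
                    for name in self.pattern.groupindex:
                        temp_dict["{}_{}".format(self.label, name)] = None
                else:
                    temp_dict[self.label] = None
            rev_data.update(temp_dict)
            return rev_data

    # usage: matching against a deleted (None) revision no longer raises
    rp = RegexPair(r"\[\[Image:(?P<img>[^\]|]+)", "images")
    print(rp.matchmake(None, {"revid": 1}))   # -> {'revid': 1, 'images_img': None}

The WikiqParser hunk is the same defensive idea at the caller level: old_rev_data is initialized to an empty dict before the deleted-revision branch assigns into it, so the first revision being deleted no longer hits an unbound name.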