X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/cdfa77d66ddc5b0af9fad3b30da7bbe2530ce3bc..4729371d5ab057b5d028d5ca9b7aeafd7bc40478:/wikiq diff --git a/wikiq b/wikiq index 0543a33..2c1ef38 100755 --- a/wikiq +++ b/wikiq @@ -146,7 +146,7 @@ class RegexPair(object): if self.has_groups: # if there are matches of some sort in this revision content, fill the lists for each cap_group - if self.pattern.search(content) is not None: + if content is not None and self.pattern.search(content) is not None: m = self.pattern.finditer(content) matchobjects = list(m) @@ -174,7 +174,7 @@ class RegexPair(object): # there are no capture groups, we just search for all the matches of the regex else: #given that there are matches to be made - if self.pattern.search(content) is not None: + if content is not None and self.pattern.search(content) is not None: m = self.pattern.findall(content) temp_dict[self.label] = ', '.join(m) else: @@ -367,6 +367,8 @@ class WikiqParser(): rev_data['collapsed_revs'] = rev.collapsed_revs if self.persist != PersistMethod.none: + # initialize an empty dictionary before assigning things into it. this catches bugs if the first revision is deleted + old_rev_data = {} if rev.deleted.text: for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]: old_rev_data[k] = None