temp_dict = {}
# if there are named capture groups in the regex
if self.has_groups:
- # initialize the {capture_group_name:list} for each capture group
+
# if there are matches of some sort in this revision content, fill the lists for each cap_group
- if self.pattern.search(content) is not None:
+ if content is not None and self.pattern.search(content) is not None:
m = self.pattern.finditer(content)
matchobjects = list(m)
# there are no capture groups, we just search for all the matches of the regex
else:
# only when content is present and the pattern matches somewhere in it
- if self.pattern.search(content) is not None:
+ if content is not None and self.pattern.search(content) is not None:
m = self.pattern.findall(content)
temp_dict[self.label] = ', '.join(m)
else:
rev_data.update(temp_dict)
return rev_data
-
class WikiqParser():
def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
rev_data['collapsed_revs'] = rev.collapsed_revs
if self.persist != PersistMethod.none:
+ # initialize an empty dictionary before assigning into it; this prevents a failure when the first revision is deleted
+ old_rev_data = {}
if rev.deleted.text:
for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
old_rev_data[k] = None