updated README file

[mediawiki_dump_tools.git] / wikiq
diff --git a/wikiq b/wikiq

index 7e75eda22228cd5c44c248c264c8435b3268a948..2c1ef3857e9cd5c02e09000fae3ab9125853cf64 100755 (executable)
--- a/wikiq
+++ b/wikiq
@@ -144,9 +144,9 @@ class RegexPair(object):
          temp_dict = {}
          # if there are named capture groups in the regex
          if self.has_groups:
-            # initialize the {capture_group_name:list} for each capture group
+
              # if there are matches of some sort in this revision content, fill the lists for each cap_group
-            if self.pattern.search(content) is not None:
+            if content is not None and self.pattern.search(content) is not None:
                  m = self.pattern.finditer(content)
                  matchobjects = list(m)
  
@@ -174,7 +174,7 @@ class RegexPair(object):
          # there are no capture groups, we just search for all the matches of the regex
          else:
              #given that there are matches to be made
-            if self.pattern.search(content) is not None:
+            if content is not None and self.pattern.search(content) is not None:
                  m = self.pattern.findall(content)
                  temp_dict[self.label] = ', '.join(m)
              else:
@@ -183,7 +183,6 @@ class RegexPair(object):
          rev_data.update(temp_dict)
          return rev_data
  
-
          
  class WikiqParser():
      def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
@@ -368,6 +367,8 @@ class WikiqParser():
                      rev_data['collapsed_revs'] = rev.collapsed_revs
  
                  if self.persist != PersistMethod.none:
+                    # Initialize an empty dictionary before assigning into it; this guards against a failure when the first revision is deleted.
+                    old_rev_data = {}
                      if rev.deleted.text:
                          for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                              old_rev_data[k] = None