+    def _make_key(self, cap_group):
+        return "{}_{}".format(self.label, cap_group)
+
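+    # Run the configured regex over the revision content and attach the results to rev_data:
+    # one column per named capture group (keyed label_group), or a single column named after the label.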
+    def matchmake(self, content, rev_data):
+        
+        temp_dict = {}
+        # if there are named capture groups in the regex
+        if self.has_groups:
+
+            # collect all matches in this revision content; if there are any, fill the lists for each cap_group
+            matchobjects = list(self.pattern.finditer(content))
+            if len(matchobjects) > 0:
+
+                for cap_group in self.capture_groups:
+                    key = self._make_key(cap_group)
+                    temp_list = []
+                    for match in matchobjects:
+                        # only add the match for this capture group if the group actually matched
+                        group_match = match.group(cap_group)
+                        if group_match is not None:
+                            temp_list.append(group_match)
+
+                    # if temp_list of matches is empty just make that column None
+                    if len(temp_list) == 0:
+                        temp_dict[key] = None
+                    # else we put in the list we made in the for-loop above
+                    else:
+                        temp_dict[key] = ', '.join(temp_list)
+
+            # there are no matches at all in this revision content, we default values to None
+            else:
+                for cap_group in self.capture_groups:
+                    key = self._make_key(cap_group)
+                    temp_dict[key] = None
+
+        # there are no capture groups, we just search for all the matches of the regex
+        else:
+            # only attempt a match if the content is actually text
+            if isinstance(content, (str, bytes)):
+                m = self.pattern.findall(content)
+                if len(m) > 0:
+                    temp_dict[self.label] = ', '.join(m)
+                else:
+                    temp_dict[self.label] = None
+
+        # update rev_data with our new columns
+        for k, v in temp_dict.items():
+            setattr(rev_data, k, v)
+
+        return rev_data
+
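+# Base dataclass for one output row per revision. Fields with defaults are optional columns;
+# pa_schema_fields defines the corresponding pyarrow schema in field declaration order.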
+@dataclass()
+class RevDataBase:
+    revid: int
+    date_time: datetime
+    articleid: int
+    editorid: int
+    title: str
+    namespace: int
+    deleted: bool
+    text_chars: int = None
+    revert: bool = None
+    reverteds: list[int] = None
+    sha1: str = None
+    minor: bool = None
+    editor: str = None
+    anon: bool = None
+
+    urlencode = False
+    pa_schema_fields = [
+        pa.field("revid", pa.int64()),
+        pa.field("date_time", pa.timestamp('ms')),
+        pa.field("articleid", pa.int64()),
+        pa.field("editorid", pa.int64()),
+        pa.field("title", pa.string()),
+        pa.field("namespace", pa.int32()),
+        pa.field("deleted", pa.bool_()),
+        pa.field("text_chars", pa.int32()),
+        pa.field("revert", pa.bool_()),
+        pa.field("reverteds", pa.list_(pa.int64())),
+        pa.field("sha1", pa.string()),
+        pa.field("minor", pa.bool_()),
+        pa.field("editor", pa.string()),
+        pa.field("anon", pa.bool_())
+    ]
+
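+    # Return the field values as a tuple in declaration order, matching pa_schema_fields.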
+    def to_pyarrow(self):
+        return dc.astuple(self)
+
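+    # Serialize the row as one tab-separated line: None becomes an empty string, booleans become
+    # TRUE/FALSE, datetimes are formatted, editor/title are quoted, and TO_ENCODE fields may be url-encoded.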
+    def to_tsv_row(self):
+        
+        row = []
+        for f in dc.fields(self):
+            val = getattr(self, f.name)
+            if val is None:
+                row.append("")
+            elif f.type == bool:
+                row.append("TRUE" if val else "FALSE")
+
+            elif f.type == datetime:
+                row.append(val.strftime('%Y-%m-%d %H:%M:%S'))
+
+            elif f.name in {'editor', 'title'}:
+                s = '"' + val + '"'
+                if self.urlencode and f.name in TO_ENCODE:
+                    row.append(quote(str(s)))
+                else:
+                    row.append(s)
+
+            elif f.type == list[int]:
+                row.append('"' + ",".join([str(x) for x in val]) + '"')
+
+            elif f.type == str:
+                if self.urlencode and f.name in TO_ENCODE:
+                    row.append(quote(str(val)))
+                else:
+                    row.append(val)
+            else:
+                row.append(val)
+
+        return '\t'.join(map(str,row))
+
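+    # Produce the TSV header line from the dataclass field names.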
+    def header_row(self):
+        return '\t'.join(map(lambda f: f.name, dc.fields(self)))
+
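+# Row variant that adds a collapsed_revs count column to the base schema.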
+@dataclass()
+class RevDataCollapse(RevDataBase):
+    collapsed_revs: int = None
+    pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
+    pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
+
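+# Row variant that adds token persistence statistics (token_revs, tokens_added, tokens_removed, tokens_window).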
+@dataclass()
+class RevDataPersistence(RevDataBase):
+    token_revs: int = None
+    tokens_added: int = None
+    tokens_removed: int = None
+    tokens_window: int = None
+
+    pa_persistence_schema_fields = [
+        pa.field("token_revs", pa.int64()),
+        pa.field("tokens_added", pa.int64()),
+        pa.field("tokens_removed", pa.int64()),
+        pa.field("tokens_window", pa.int64())]