class RegexPair(object):
    """A compiled regex plus a label, used to extract matches from revision
    content into named output columns on a rev_data record.

    If the pattern has named capture groups, one column per group is produced
    (named "<label>_<group>"); otherwise a single column named <label> holds
    all matches joined with ", ".
    """

    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        # Named capture groups decide whether we emit per-group columns.
        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def _make_key(self, cap_group):
        # Column name for one capture group: "<label>_<group>".
        return "{}_{}".format(self.label, cap_group)

    def matchmake(self, content, rev_data):
        """Search `content` and set one attribute per output column on
        `rev_data`; a column is the ", "-joined matches or None when nothing
        matched. Returns `rev_data`.
        """
        temp_dict = {}
        if self.has_groups:
            # There is at least one match: collect values per capture group.
            if self.pattern.search(content) is not None:
                matchobjects = list(self.pattern.finditer(content))
                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    # Only keep matches where this group actually participated.
                    temp_list = [m.group(cap_group) for m in matchobjects
                                 if m.group(cap_group) is not None]
                    # Empty list means this group never matched: column is None.
                    temp_dict[key] = ', '.join(temp_list) if temp_list else None
            # No matches at all in this content: default every column to None.
            else:
                for cap_group in self.capture_groups:
                    temp_dict[self._make_key(cap_group)] = None

        # No capture groups: a single column with all matches of the regex.
        else:
            # Guard against non-text content (regex search would raise).
            if isinstance(content, (str, bytes)):
                if self.pattern.search(content) is not None:
                    temp_dict[self.label] = ', '.join(self.pattern.findall(content))
                else:
                    temp_dict[self.label] = None

        # Update rev_data with our new columns.
        # BUG FIX: the original iterated the dict directly (yields keys only,
        # so `for k, v in temp_dict` mis-unpacks) and called the nonexistent
        # method `rev_data.setattr`; use .items() and the builtin setattr.
        for k, v in temp_dict.items():
            setattr(rev_data, k, v)

        return rev_data
+
@dataclass()
class RevDataBase():
    # One output row per wiki revision. Required identification fields come
    # first; optional per-revision measurements follow, defaulting to None.
    revid: int
    date_time: datetime
    articleid: int
    editorid: int
    title: str
    namespace: int
    deleted: bool
    text_chars: int = None
    revert: bool = None
    reverteds: list[int] = None
    sha1: str = None
    minor: bool = None
    editor: str = None
    anon: bool = None
    collapsed_revs:int = None

    # PyArrow field list describing the columnar output schema.
    # NOTE(review): pa.int64 on "revid" is missing its call parentheses —
    # every other numeric field uses pa.int64()/pa.int32(); confirm and fix.
    # NOTE(review): "test_chars" looks like a typo for the text_chars
    # attribute declared above — verify against downstream column names.
    # NOTE(review): collapsed_revs has no corresponding schema field; confirm
    # whether it is intentionally excluded from the pyarrow output.
    pa_schema_fields = [
        pa.field("revid", pa.int64),
        pa.field("date_time",pa.timestamp('ms')),
        pa.field("articleid",pa.int64()),
        pa.field("editorid",pa.int64()),
        pa.field("title",pa.string()),
        pa.field("namespace",pa.int32()),
        pa.field("deleted",pa.binary()),
        pa.field("test_chars",pa.int32()),
        pa.field("revert",pa.binary()),
        pa.field("reverteds",pa.list_(pa.int64())),
        pa.field("sha1",pa.string()),
        pa.field("minor",pa.binary()),
        pa.field("editor",pa.string()),
        pa.field("anon",pa.binary())
    ]
+
    def to_pyarrow(self):
        # Convert this row's values to a pyarrow array using pa_schema_fields.
        # NOTE(review): dataclass instances have no .astuple() method — this
        # raises AttributeError; dataclasses.astuple(self) is presumably
        # intended. Verify.
        # NOTE(review): map()'s first argument must be a callable — the
        # arguments here look reversed, and pa.field.type is not a callable
        # accessor; likely intended something like
        # [f.type for f in self.pa_schema_fields]. Confirm against the
        # pa.array() signature expected by callers.
        return pa.array(self.astuple(), map(self.pa_schema_fields, pa.field.type))
+
+
    def to_tsv_row(self):
        # Serialize this revision's fields to TSV-ready strings, one entry per
        # dataclass field, in declaration order. (Method continues beyond this
        # view — the join/return is not visible here.)
        # NOTE(review): dataclass instances have no .fields() method — this
        # will raise AttributeError; dataclasses.fields(self) is presumably
        # intended. Verify.
        row = []
        for f in self.fields():
            val = getattr(self, f.name)
            # None serializes as the empty string.
            if getattr(self, f.name) is None:
                row.append("")
            elif f.type == bool:
                row.append("TRUE" if val else "FALSE")

            elif f.type == datetime:
                row.append(val.strftime('%Y-%m-%d %H:%M:%S'))

            # NOTE(review): `s` is computed but never used, and when f.name is
            # 'editor'/'title' but not in TO_ENCODE nothing is appended at all,
            # which would misalign the TSV row — confirm intended behavior.
            elif f.name in {'editor','title'}:
                s = '"' + val + '"'
                if f.name in TO_ENCODE:
                    row.append(quote(str(val)))

            # int lists are quoted, comma-joined strings.
            elif f.type == list[int]:
                row.append('"' + ",".join([str(x) for x in val]) + '"')

            elif f.type == str:
                # TO_ENCODE fields get URL-quoted (quote is presumably
                # urllib.parse.quote — defined outside this view).
                if f.name in TO_ENCODE:
                    row.append(quote(str(val)))
                else:
                    row.append(val)