@dataclass()
class RevDataBase():
    """One revision's output row, with TSV and pyarrow-oriented serializers.

    The dataclass fields are the row's columns, in output order.
    ``pa_schema_fields`` is the matching list of pyarrow fields.

    NOTE: the annotations are deliberately left bare (``int`` rather than
    ``Optional[int]``) even where the default is ``None``:
    ``to_tsv_row`` dispatches on ``field.type == bool`` / ``datetime`` /
    ``list[int]`` / ``str``, so changing an annotation changes formatting.
    """

    revid: int
    date_time: datetime
    articleid: int
    editorid: int
    title: str
    namespace: int
    deleted: bool
    text_chars: int = None
    revert: bool = None
    reverteds: list[int] = None
    sha1: str = None
    minor: bool = None
    editor: str = None
    anon: bool = None
    # NOTE(review): this field is redeclared by RevDataCollapse and has no
    # entry in pa_schema_fields below; it is kept here only so the existing
    # constructor signature stays unchanged. Confirm whether it belongs on
    # the base class at all.
    collapsed_revs: int = None

    # Left unannotated on purpose: @dataclass only turns *annotated* names
    # into fields, so this stays a plain class attribute.
    pa_schema_fields = [
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('ms')),
        pa.field("articleid", pa.int64()),
        pa.field("editorid", pa.int64()),
        pa.field("title", pa.string()),
        pa.field("namespace", pa.int32()),
        pa.field("deleted", pa.bool_()),
        pa.field("text_chars", pa.int32()),
        pa.field("revert", pa.bool_()),
        pa.field("reverteds", pa.list_(pa.int64())),
        pa.field("sha1", pa.string()),
        pa.field("minor", pa.bool_()),
        pa.field("editor", pa.string()),
        pa.field("anon", pa.bool_()),
    ]

    def to_pyarrow(self):
        """Return this row's values as a tuple ordered like ``pa_schema_fields``.

        A single row mixes types, so it cannot be one typed ``pa.array``.
        The values are returned in schema order instead, looked up by
        column name so that attributes without a schema entry are skipped.
        """
        return tuple(getattr(self, column.name) for column in self.pa_schema_fields)

    def to_tsv_row(self):
        """Format every dataclass field as one tab-separated line (no newline).

        Formatting rules, checked in this order:
          * ``None``                -> empty string
          * ``bool`` fields         -> ``TRUE`` / ``FALSE``
          * ``datetime`` fields     -> ``%Y-%m-%d %H:%M:%S``
          * ``editor`` / ``title``  -> URL-quoted when listed in ``TO_ENCODE``,
                                       otherwise wrapped in double quotes
          * ``list[int]`` fields    -> double-quoted, comma-joined
          * other ``str`` fields    -> URL-quoted when listed in ``TO_ENCODE``
          * anything else           -> ``str(value)``
        """
        # Imported locally: only `dataclass` is known to be imported at file level.
        from dataclasses import fields

        row = []
        for f in fields(self):
            val = getattr(self, f.name)
            if val is None:
                row.append("")
            elif f.type == bool:
                row.append("TRUE" if val else "FALSE")
            elif f.type == datetime:
                row.append(val.strftime('%Y-%m-%d %H:%M:%S'))
            elif f.name in {'editor', 'title'}:
                if f.name in TO_ENCODE:
                    row.append(quote(str(val)))
                else:
                    # Every field must emit exactly one cell, or the columns
                    # after it shift left.
                    row.append('"' + val + '"')
            elif f.type == list[int]:
                row.append('"' + ",".join(str(x) for x in val) + '"')
            elif f.type == str and f.name in TO_ENCODE:
                row.append(quote(str(val)))
            else:
                # Plain strings and all integer columns; str() is required
                # because '\t'.join() only accepts strings.
                row.append(str(val))

        return '\t'.join(row)
+
+
+
@dataclass()
class RevDataCollapse(RevDataBase):
    """RevDataBase row extended with a ``collapsed_revs`` integer column."""

    collapsed_revs: int = None

    # Unannotated, so these stay class attributes rather than dataclass fields.
    pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
    # The extra field must be wrapped in a list: `list + pa.Field` raises
    # TypeError while the class body is being executed.
    pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
    pa_schema = pa.schema(pa_schema_fields)
+
@dataclass()
class RevDataPersistence(RevDataBase):
    """RevDataBase row extended with four token-persistence integer columns."""

    token_revs: int = None
    tokens_added: int = None
    tokens_removed: int = None
    tokens_window: int = None

    # Field names must be string literals. Inside the class body the bare
    # names refer to the attributes above, which are all None.
    pa_persistence_schema_fields = [
        pa.field("token_revs", pa.int64()),
        pa.field("tokens_added", pa.int64()),
        pa.field("tokens_removed", pa.int64()),
        pa.field("tokens_window", pa.int64()),
    ]