    # remaining nullable revision attributes; None marks "not computed / not present"
    text_chars: int = None    # presumably the character count of the revision text -- confirm upstream
    revert: bool = None       # NOTE(review): appears to flag that this edit is a revert -- confirm with detector
    reverteds: list[int] = None   # presumably the revision ids undone when revert is set -- confirm
    sha1: str = None          # content hash carried from the dump -- confirm
    minor: bool = None        # the dump's "minor edit" flag -- confirm
    editor: str = None
    anon: bool = None         # presumably whether the editor is anonymous -- confirm

    # toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation
    urlencode = False
+
    # defines pyarrow schema.
    # each field in the data class needs an entry in this array.
    # the names should match and be in the same order.
    # this isn't a dataclass field since it doesn't have a type annotation
    # NOTE(review): pa.field() defaults to nullable=True, so the explicit
    # nullable=True flags below are redundant but harmless -- confirm intent.
    pa_schema_fields = [
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('ms')),
        pa.field("articleid",pa.int64()),
        pa.field("editorid",pa.int64(), nullable=True),
        pa.field("title",pa.string()),
        pa.field("namespace",pa.int32()),
        pa.field("deleted",pa.bool_()),
        pa.field("text_chars",pa.int32()),
        pa.field("revert",pa.bool_(), nullable=True),
        pa.field("reverteds",pa.list_(pa.int64()), nullable=True),
        pa.field("sha1",pa.string()),
        pa.field("minor",pa.bool_()),
        pa.field("editor",pa.string()),
        pa.field("anon",pa.bool_())
    ]
+
    # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function
    def to_pyarrow(self):
        """Return this revision's values as a tuple in dataclass field order.

        The order is expected to line up column-for-column with
        ``pa_schema_fields`` (see the note on that attribute).
        """
        return dc.astuple(self)
+
+ # logic to convert each field into the wikiq tsv format goes here.
+ def to_tsv_row(self):
+
+ row = []
+ for f in dc.fields(self):
+ val = getattr(self, f.name)
+ if getattr(self, f.name) is None:
+ row.append("")
+ elif f.type == bool:
+ row.append("TRUE" if val else "FALSE")
+
+ elif f.type == datetime:
+ row.append(val.strftime('%Y-%m-%d %H:%M:%S'))
+
+ elif f.name in {'editor','title'}:
+ s = '"' + val + '"'
+ if self.urlencode and f.name in TO_ENCODE:
+ row.append(quote(str(s)))
+ else:
+ row.append(s)
+
+ elif f.type == list[int]:
+ row.append('"' + ",".join([str(x) for x in val]) + '"')
+
+ elif f.type == str:
+ if self.urlencode and f.name in TO_ENCODE:
+ row.append(quote(str(val)))
+ else:
+ row.append(val)
+ else:
+ row.append(val)
+
+ return '\t'.join(map(str,row))
+
+ def header_row(self):
+ return '\t'.join(map(lambda f: f.name, dc.fields(self)))
+
+"""
+
+If collapse=True we'll use a RevDataCollapse dataclass.
+This class inherits from RevDataBase. This means that it has all the same fields and functions.
+
+It just adds a new field and updates the pyarrow schema.
+
+"""
@dataclass()
class RevDataCollapse(RevDataBase):
    # extra field: how many revisions were collapsed into this row (presumably) -- confirm
    collapsed_revs: int = None

    # pyarrow column for the extra field, exposed separately so other
    # schema-building code can reference it on its own
    pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())

    # full schema = everything from the base class plus the new column
    pa_schema_fields = [*RevDataBase.pa_schema_fields, pa_collapsed_revs_schema]
+
+"""
+
+If persistence data is to be computed we'll need the fields added by RevDataPersistence.
+
+"""
@dataclass()
class RevDataPersistence(RevDataBase):
    # token-persistence statistics for this revision (all nullable)
    token_revs: int = None
    tokens_added: int = None
    tokens_removed: int = None
    tokens_window: int = None

    # schema entries for just the persistence columns, kept separate so
    # other classes can splice them into their own schemas
    pa_persistence_schema_fields = [
        pa.field(colname, pa.int64())
        for colname in ("token_revs", "tokens_added", "tokens_removed", "tokens_window")
    ]

    # full schema = base columns followed by the persistence columns
    pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
+
+"""
class RevDataCollapsePersistence uses multiple inheritance to make a class that has both persistence and collapse fields.
+
+"""
@dataclass()
class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
    """Revision data with both the collapse and the persistence fields."""

    # BUGFIX: dataclass fields are collected in reverse-MRO order
    # (object -> RevDataBase -> RevDataPersistence -> RevDataCollapse), so the
    # persistence fields come BEFORE collapsed_revs in dc.fields()/astuple().
    # The old expression (RevDataCollapse.pa_schema_fields +
    # RevDataPersistence.pa_persistence_schema_fields) listed collapsed_revs
    # before the persistence columns, misaligning the schema with the tuples
    # produced by to_pyarrow(). The schema must mirror the field order:
    # base columns, persistence columns, then collapsed_revs last.
    pa_schema_fields = RevDataPersistence.pa_schema_fields + [RevDataCollapse.pa_collapsed_revs_schema]