+"""
+
+We used to use a dictionary to collect fields for the output.
+Now we use dataclasses. Compared to a dictionary, this should help:
+- prevent some bugs
+- make it easier to output parquet data
+- use class attribute '.' syntax instead of dictionary syntax
+- improve support for tooling (autocomplete, type hints)
+- use type information to define formatting rules
+
+Depending on the parameters passed into Wikiq, the output schema can be different.
+Therefore, we end up constructing a dataclass with the correct output schema.
+It also needs to have the correct pyarrow schema so we can write parquet files
+(one way to build such a schema variant is sketched in the comment below).
+
+The RevDataBase type has all the fields that will be output no matter how wikiq is invoked.
+"""
+@dataclass()
+class RevDataBase():
+    revid: int
+    date_time: datetime
+    articleid: int
+    editorid: int
+    title: str
+    namespace: int
+    deleted: bool
+    text_chars: int = None
+    revert: bool = None
+    reverteds: list[int] = None
+    sha1: str = None
+    minor: bool = None
+    editor: str = None
+    anon: bool = None
+
+    # toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation.
+    urlencode = False
+
+    # defines the pyarrow schema.
+    # each field in the dataclass needs an entry in this array.
+    # the names should match and be in the same order
+    # (see the consistency-check sketch below).
+    # this isn't a dataclass field since it doesn't have a type annotation.
+    pa_schema_fields = [
+        pa.field("revid", pa.int64()),
+        pa.field("date_time", pa.timestamp('ms')),
+        pa.field("articleid", pa.int64()),
+        pa.field("editorid", pa.int64(), nullable=True),
+        pa.field("title", pa.string()),
+        pa.field("namespace", pa.int32()),
+        pa.field("deleted", pa.bool_()),
+        pa.field("text_chars", pa.int32()),
+        pa.field("revert", pa.bool_(), nullable=True),
+        pa.field("reverteds", pa.list_(pa.int64()), nullable=True),
+        pa.field("sha1", pa.string()),
+        pa.field("minor", pa.bool_()),
+        pa.field("editor", pa.string()),
+        pa.field("anon", pa.bool_())
+    ]
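+
+    # Illustrative consistency check for the invariant described above (a sketch of
+    # one way to verify it, not code that wikiq necessarily runs):
+    #
+    #     assert [f.name for f in RevDataBase.pa_schema_fields] == \
+    #            [f.name for f in dc.fields(RevDataBase)]
+    #
+    # pa.schema(RevDataBase.pa_schema_fields) can then be used to build the parquet schema.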
+
+    # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function.
+    def to_pyarrow(self):
+        return dc.astuple(self)
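+
+    # Illustrative sketch of the columnar flush step. flush_parquet_buffer itself is
+    # not shown here, so row_buffer, the table assembly, and the pq.write_table call
+    # are assumptions about how the tuples returned by to_pyarrow() could be written:
+    #
+    #     import pyarrow.parquet as pq
+    #     schema = pa.schema(RevDataBase.pa_schema_fields)
+    #     columns = list(zip(*[rev.to_pyarrow() for rev in row_buffer]))
+    #     arrays = [pa.array(col, type=field.type) for col, field in zip(columns, schema)]
+    #     pq.write_table(pa.Table.from_arrays(arrays, schema=schema), "output.parquet")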
+
+    # logic to convert each field into the wikiq tsv format goes here.
+    def to_tsv_row(self):
+        row = []
+        for f in dc.fields(self):
+            val = getattr(self, f.name)
+            if val is None:
+                row.append("")
+            elif f.type == bool:
+                row.append("TRUE" if val else "FALSE")
+
+            elif f.type == datetime:
+                row.append(val.strftime('%Y-%m-%d %H:%M:%S'))
+
+            elif f.name in {'editor', 'title'}:
+                s = '"' + val + '"'
+                if self.urlencode and f.name in TO_ENCODE:
+                    row.append(quote(str(s)))
+                else:
+                    row.append(s)
+
+            elif f.type == list[int]:
+                row.append('"' + ",".join([str(x) for x in val]) + '"')
+
+            elif f.type == str:
+                if self.urlencode and f.name in TO_ENCODE:
+                    row.append(quote(str(val)))
+                else:
+                    row.append(val)
+            else:
+                row.append(val)
+
+        return '\t'.join(map(str, row))
+
+    def header_row(self):
+        return '\t'.join(f.name for f in dc.fields(self))
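+
+    # Example usage (illustrative; the values below are made up):
+    #
+    #     rev = RevDataBase(revid=1, date_time=datetime(2020, 1, 1), articleid=2,
+    #                       editorid=3, title="Example", namespace=0, deleted=False)
+    #     rev.header_row()   # tab-separated field names, starting with "revid"
+    #     rev.to_tsv_row()   # tab-separated values; fields left as None become empty strings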
+
+"""