pa.field("title",pa.string()),
pa.field("namespace",pa.int32()),
pa.field("deleted",pa.bool_()),
- pa.field("test_chars",pa.int32()),
+ pa.field("text_chars",pa.int32()),
pa.field("revert",pa.bool_()),
pa.field("reverteds",pa.list_(pa.int64())),
pa.field("sha1",pa.string()),
namespace = namespace
)
- rev_data = self.matchmake(rev, rev_data)
+ rev_data = self.matchmake_revision(rev, rev_data)
if not rev.deleted.text:
# rev.text can be None if the page has no text
outtable = rows_to_table(self.parquet_buffer, self.schema)
if self.pq_writer is None:
- self.pq_writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
+ self.pq_writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark')
self.pq_writer.write_table(outtable)
self.parquet_buffer = []