X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/bb83d62b745d61e761a1be011e814bbb6aa241aa..refs/heads/parquet_support:/wikiq diff --git a/wikiq b/wikiq index bffbbf4..75c1af8 100755 --- a/wikiq +++ b/wikiq @@ -250,13 +250,13 @@ class RevDataBase(): pa.field("revid", pa.int64()), pa.field("date_time", pa.timestamp('ms')), pa.field("articleid",pa.int64()), - pa.field("editorid",pa.int64()), + pa.field("editorid",pa.int64(), nullable=True), pa.field("title",pa.string()), pa.field("namespace",pa.int32()), pa.field("deleted",pa.bool_()), - pa.field("test_chars",pa.int32()), - pa.field("revert",pa.bool_()), - pa.field("reverteds",pa.list_(pa.int64())), + pa.field("text_chars",pa.int32()), + pa.field("revert",pa.bool_(), nullable=True), + pa.field("reverteds",pa.list_(pa.int64()), nullable=True), pa.field("sha1",pa.string()), pa.field("minor",pa.bool_()), pa.field("editor",pa.string()), @@ -429,12 +429,12 @@ class WikiqParser(): else: sys.exit('Each regular expression *must* come with a corresponding label and vice versa.') - def matchmake(self, rev, rev_data): - rev_data = self.matchmake_revision(rev.text, rev_data) + def matchmake_revision(self, rev, rev_data): + rev_data = self.matchmake_text(rev.text, rev_data) rev_data = self.matchmake_comment(rev.comment, rev_data) return rev_data - def matchmake_revision(self, text, rev_data): + def matchmake_text(self, text, rev_data): return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs) def matchmake_comment(self, comment, rev_data): @@ -518,7 +518,7 @@ class WikiqParser(): namespace = namespace ) - rev_data = self.matchmake(rev, rev_data) + rev_data = self.matchmake_revision(rev, rev_data) if not rev.deleted.text: # rev.text can be None if the page has no text