+ temp_dict = {}
+ # if there are named capture groups in the regex
+ if self.has_groups:
+
+ # if there are matches in this revision's content, fill a list of captured values for each cap_group
+ if self.pattern.search(content) is not None:
+ matchobjects = list(self.pattern.finditer(content))
+
+ for cap_group in self.capture_groups:
+ key = self._make_key(cap_group)
+ temp_list = []
+ for match in matchobjects:
+ # only add the match for this capture group if it is not None
+ if match.group(cap_group) is not None:
+ temp_list.append(match.group(cap_group))
+
+ # if the list of matches is empty, just make that column None
+ if len(temp_list) == 0:
+ temp_dict[key] = None
+ # otherwise join the matches into a single comma-separated string
+ else:
+ temp_dict[key] = ', '.join(temp_list)
+
+ # there are no matches at all in this revision's content, so default the values to None
+ else:
+ for cap_group in self.capture_groups:
+ key = self._make_key(cap_group)
+ temp_dict[key] = None
+
+ # there are no capture groups, so just find all matches of the regex
+ else:
+ # there may be no matches at all; also guard against non-string content
+ if isinstance(content, (str, bytes)):
+ if self.pattern.search(content) is not None:
+ m = self.pattern.findall(content)
+ temp_dict[self.label] = ', '.join(m)
+ else:
+ temp_dict[self.label] = None
+
+ # update rev_data with our new columns
+ for k, v in temp_dict.items():
+ setattr(rev_data, k, v)
+
+ return rev_data
+
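+# a hypothetical illustration of the capture-group logic above: with a pattern
+# like re.compile(r'(?P<wp>WP:\w+)') containing the named group 'wp', a revision
+# whose content includes "see WP:NPOV and WP:V" sets the column keyed by
+# self._make_key('wp') to 'WP:NPOV, WP:V', while a revision with no matches
+# leaves that column None.
+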
+"""
+
+We used to use a dictionary to collect fields for the output.
+Now we use dataclasses. Compared to a dictionary, this should help:
+- prevent some bugs
+- make it easier to output parquet data
+- use class attribute '.' syntax instead of dictionary syntax (see the example below)
+- improve support for tooling (autocomplete, type hints)
+- use type information to define formatting rules
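+
+For example, where the dictionary version did something like (names illustrative):
+
+    rev_data = {}
+    rev_data['revid'] = rev.id
+
+the dataclass version constructs an instance and uses attribute access:
+
+    rev_data = RevDataBase(revid=rev.id, ...)
+    rev_data.revid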
+
+Depending on the parameters passed into Wikiq, the output schema can be different.
+Therefore, we need to end up constructing a dataclass with the correct output schema.
+It also needs to have the correct pyarrow schema so we can write parquet files.
+
+The RevDataBase type has all the fields that will be output no matter how wikiq is invoked.
+"""
+@dataclass
+class RevDataBase:
+ revid: int
+ date_time: datetime
+ articleid: int
+ editorid: int
+ title: str
+ namespace: int
+ deleted: bool
+ text_chars: int = None
+ revert: bool = None
+ reverteds: list[int] = None
+ sha1: str = None
+ minor: bool = None
+ editor: str = None
+ anon: bool = None
+
+ # toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation
+ urlencode = False
+
+ # defines pyarrow schema.
+ # each field in the data class needs an entry in this array.
+ # the names should match and be in the same order.
+ # this isn't a dataclass field since it doesn't have a type annotation
+ pa_schema_fields = [
+ pa.field("revid", pa.int64()),
+ pa.field("date_time", pa.timestamp('ms')),
+ pa.field("articleid",pa.int64()),
+ pa.field("editorid",pa.int64(), nullable=True),
+ pa.field("title",pa.string()),
+ pa.field("namespace",pa.int32()),
+ pa.field("deleted",pa.bool_()),
+ pa.field("text_chars",pa.int32()),
+ pa.field("revert",pa.bool_(), nullable=True),
+ pa.field("reverteds",pa.list_(pa.int64()), nullable=True),
+ pa.field("sha1",pa.string()),
+ pa.field("minor",pa.bool_()),
+ pa.field("editor",pa.string()),
+ pa.field("anon",pa.bool_())
+ ]
+
+ # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function
+ def to_pyarrow(self):
+ return dc.astuple(self)
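+
+ # a minimal sketch of what happens to these tuples downstream
+ # (illustrative; the real logic lives in flush_parquet_buffer):
+ #   rows = [r.to_pyarrow() for r in buffer]
+ #   cols = list(zip(*rows))  # transpose row tuples into columns
+ #   schema = pa.schema(RevDataBase.pa_schema_fields)
+ #   table = pa.Table.from_arrays([pa.array(c, f.type) for c, f in zip(cols, schema)], schema=schema)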
+
+ # logic to convert each field into the wikiq tsv format goes here.
+ def to_tsv_row(self):
+
+ row = []
+ for f in dc.fields(self):
+ val = getattr(self, f.name)
+ if val is None:
+ row.append("")
+ elif f.type == bool:
+ row.append("TRUE" if val else "FALSE")
+
+ elif f.type == datetime:
+ row.append(val.strftime('%Y-%m-%d %H:%M:%S'))
+
+ # editor and title are wrapped in double quotes
+ elif f.name in {'editor', 'title'}:
+ s = '"' + val + '"'
+ if self.urlencode and f.name in TO_ENCODE:
+ row.append(quote(str(s)))
+ else:
+ row.append(s)
+
+ elif f.type == list[int]:
+ row.append('"' + ",".join([str(x) for x in val]) + '"')
+
+ elif f.type == str:
+ if self.urlencode and f.name in TO_ENCODE:
+ row.append(quote(str(val)))
+ else:
+ row.append(val)
+ else:
+ row.append(val)
+
+ return '\t'.join(map(str, row))
+
+ def header_row(self):
+ return '\t'.join(f.name for f in dc.fields(self))
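+
+ # sketch of how these methods are used when writing tsv output
+ # (illustrative; the file handling lives elsewhere in wikiq):
+ #   out.write(rev_data.header_row() + '\n')   # once, before the first revision
+ #   out.write(rev_data.to_tsv_row() + '\n')   # once per revision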
+
+"""
+
+If collapse=True we'll use a RevDataCollapse dataclass.
+This class inherits from RevDataBase. This means that it has all the same fields and functions.
+
+It just adds a new field and updates the pyarrow schema.
+
+"""
+@dataclass
+class RevDataCollapse(RevDataBase):
+ collapsed_revs: int = None
+
+ pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
+ pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
+
+"""
+
+If persistence data is to be computed we'll need the fields added by RevDataPersistence.
+
+"""
+@dataclass
+class RevDataPersistence(RevDataBase):
+ token_revs: int = None
+ tokens_added: int = None
+ tokens_removed: int = None
+ tokens_window: int = None
+
+ pa_persistence_schema_fields = [
+ pa.field("token_revs", pa.int64()),
+ pa.field("tokens_added", pa.int64()),
+ pa.field("tokens_removed", pa.int64()),
+ pa.field("tokens_window", pa.int64())]
+
+ pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
+
+"""
+RevDataCollapsePersistence uses multiple inheritance to make a class that has both the persistence and the collapse fields.
+
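+Python collects dataclass fields in reverse MRO order (object, then
+RevDataBase, then RevDataPersistence, then RevDataCollapse), so the
+persistence fields come before collapsed_revs in the combined field order;
+the pyarrow schema below is assembled to match.
+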
+"""
+@dataclass
+class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
+ # dataclass fields are collected in reverse MRO order (base fields, then
+ # persistence fields, then collapsed_revs), so the pyarrow schema must be
+ # assembled in the same order to stay aligned with to_pyarrow()
+ pa_schema_fields = RevDataPersistence.pa_schema_fields + [RevDataCollapse.pa_collapsed_revs_schema]
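+
+# sketch of how the output dataclass could be selected from wikiq's parameters
+# (illustrative; the actual dispatch happens inside WikiqParser below):
+#   if collapse_user and persist is not None:
+#       revdata_type = RevDataCollapsePersistence
+#   elif collapse_user:
+#       revdata_type = RevDataCollapse
+#   elif persist is not None:
+#       revdata_type = RevDataPersistence
+#   else:
+#       revdata_type = RevDataBase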
+
+class WikiqParser():
+ def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment,
+              regex_revision_label, regex_comment_label, collapse_user=False, persist=None,
+              urlencode=False, namespaces=None, revert_radius=15, output_parquet=True,
+              parquet_buffer_size=2000):
+ """
+ Parameters:
+ persist : what persistence method to use. Takes a PersistMethod value
+ """