+ if namespaces is not None:
+ self.namespace_filter = set(namespaces)
+ else:
+ self.namespace_filter = None
+
+ self.regex_schemas = []
+ self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
+ self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
+
+ if self.collapse_user is True:
+ if self.persist == PersistMethod.none:
+ revdata_type = RevDataCollapse
+ else:
+ revdata_type = RevDataCollapsePersistence
+ elif self.persist != PersistMethod.none:
+ revdata_type = RevDataPersistence
+ else:
+ revdata_type = RevDataBase
+
+ regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas]
+
+ self.revdata_type = make_dataclass('RevData_Parser',
+ fields=regex_fields,
+ bases=(revdata_type,))
+
+ self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
+
+
+ # print(list(map(lambda d: d.name, dc.fields(self.revdata_type))))
+ # print(self.revdata_type.pa_schema_fields)
+
+ if output_parquet is True:
+ self.output_parquet = True
+ self.pq_writer = None
+ self.output_file = output_file
+ self.parquet_buffer = []
+ self.parquet_buffer_size = parquet_buffer_size
+ else:
+ self.output_file = open(output_file,'w')
+
+
def make_matchmake_pairs(self, patterns, labels):
    """Build one ``RegexPair`` per (pattern, label) and register its column.

    For every pattern/label couple, a ``RegexPair(pattern, label)`` is
    created and a ``pa.field(label, pa.list_(pa.string()))`` entry is
    appended to ``self.regex_schemas``.

    Returns the list of created ``RegexPair`` objects, or an empty list
    when both ``patterns`` and ``labels`` are ``None``.

    If only one of the two arguments is ``None``, or their lengths differ,
    the process is terminated through ``sys.exit`` with an error message.
    """
    # Nothing was requested at all: no pairs, no extra schema fields.
    if patterns is None and labels is None:
        return []

    # Exactly one argument is missing, or the two sequences do not line up.
    if patterns is None or labels is None or len(patterns) != len(labels):
        sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')

    pairs = []
    for regex, name in zip(patterns, labels):
        pairs.append(RegexPair(regex, name))
        # Each label also becomes a list-of-strings field in the schema list.
        self.regex_schemas.append(pa.field(name, pa.list_(pa.string())))
    return pairs
+
def matchmake(self, rev, rev_data):
    """Run the revision-text matchers, then the comment matchers.

    ``rev.text`` is passed to ``matchmake_revision`` first; whatever that
    returns is then passed, together with ``rev.comment``, to
    ``matchmake_comment``. The result of the second call is returned.
    """
    after_text = self.matchmake_revision(rev.text, rev_data)
    return self.matchmake_comment(rev.comment, after_text)
+
def matchmake_revision(self, text, rev_data):
    """Apply the pairs in ``self.regex_revision_pairs`` to ``text``.

    Delegates to ``matchmake_pairs`` and returns its result.
    """
    revision_pairs = self.regex_revision_pairs
    return self.matchmake_pairs(text, rev_data, revision_pairs)
+
def matchmake_comment(self, comment, rev_data):
    """Apply the pairs in ``self.regex_comment_pairs`` to ``comment``.

    Delegates to ``matchmake_pairs`` and returns its result.
    """
    comment_pairs = self.regex_comment_pairs
    return self.matchmake_pairs(comment, rev_data, comment_pairs)
+
def matchmake_pairs(self, text, rev_data, pairs):
    """Thread ``rev_data`` through every pair's ``matchmake`` in order.

    Each element of ``pairs`` is called as ``pair.matchmake(text, current)``
    and its return value becomes the input for the next pair. With no
    pairs, ``rev_data`` is returned unchanged.
    """
    current = rev_data
    for regex_pair in pairs:
        current = regex_pair.matchmake(text, current)
    return current
+