+ if magicwords:
+ redirect_config = list(filter(lambda obj: obj.get("name") == "redirect", magicwords))
+ redirect_aliases = chain(* map(lambda obj: obj.get("aliases"), redirect_config))
+ redirect_aliases = list(map(lambda s: s.lstrip('#'), redirect_aliases))
+ redirect_aliases.append('REDIRECT') # just in case
+
+ # this regular expression is copied from pywikibot
+ pattern = '(?:' + '|'.join(redirect_aliases) + ')'
+ redirect_regex = re.compile(r'\s*#{pattern}\s*:?\s*\[\[(.+?)(?:\|.*?)?\]\]'
+ .format(pattern=pattern), re.IGNORECASE | re.DOTALL)
+
+ self.regex_revision_pairs.extend(self.make_matchmake_pairs([redirect_regex], ["redirect"]))
+
+ # This is where we set the type for revdata.
+
+ if self.collapse_user is True:
+ if self.persist == PersistMethod.none:
+ revdata_type = RevDataCollapse
+ else:
+ revdata_type = RevDataCollapsePersistence
+ elif self.persist != PersistMethod.none:
+ revdata_type = RevDataPersistence
+ else:
+ revdata_type = RevDataBase
+
+ # if there are regex fields, we need to add them to the revdata type.
+ regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas]
+
+ # make_dataclass is a function that defines a new dataclass type.
+ # here we extend the type we have already chosen and add the regular expression types
+ self.revdata_type = dc.make_dataclass('RevData_Parser',
+ fields=regex_fields,
+ bases=(revdata_type,))
+
+ # we also need to make sure that we have the right pyarrow schema
+ self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
+
+ self.revdata_type.urlencode = self.urlencode
+
+ self.schema = pa.schema(self.revdata_type.pa_schema_fields)
+
+ # here we initialize the variables we need for output.
+ if output_parquet is True:
+ self.output_parquet = True
+ self.pq_writer = None
+ self.output_file = output_file
+ self.parquet_buffer = []
+ self.parquet_buffer_size = parquet_buffer_size
+ else:
+ self.print_header = True
+ if output_file == sys.stdout:
+
+ self.output_file = output_file
+ else:
+ self.output_file = open(output_file,'w')
+ self.output_parquet = False
+
def make_matchmake_pairs(self, patterns, labels):
    """Pair each pattern with its label and wrap the pair in a RegexPair.

    As a side effect, whatever each new pair's ``get_pyarrow_fields()``
    returns is concatenated onto ``self.regex_schemas`` (the attribute is
    rebound to a new list; the previous list object is not mutated).

    Returns the list of RegexPair objects in input order, or an empty list
    when both ``patterns`` and ``labels`` are None.

    Terminates the program through ``sys.exit`` when exactly one of the two
    arguments is None or when their lengths differ.
    """
    # Nothing was requested at all.
    if patterns is None and labels is None:
        return []

    # Exactly one side is missing, or the two sides do not line up.
    if patterns is None or labels is None or len(patterns) != len(labels):
        sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')

    built = []
    for regex, name in zip(patterns, labels):
        regex_pair = RegexPair(regex, name)
        built.append(regex_pair)
        # Deliberately rebinding (not extending in place) to keep the
        # original aliasing behaviour of regex_schemas.
        self.regex_schemas = self.regex_schemas + regex_pair.get_pyarrow_fields()
    return built
+
def matchmake_revision(self, rev, rev_data):
    """Run the text matchers and then the comment matchers for one revision.

    ``rev.text`` is passed to ``matchmake_text`` first; the value it returns
    is then handed, together with ``rev.comment``, to ``matchmake_comment``.
    The result of that second call is returned.
    """
    after_text = self.matchmake_text(rev.text, rev_data)
    return self.matchmake_comment(rev.comment, after_text)
+
def matchmake_text(self, text, rev_data):
    """Apply every pair in ``self.regex_revision_pairs`` to ``text``.

    Delegates to ``matchmake_pairs`` and returns its result.
    """
    revision_pairs = self.regex_revision_pairs
    return self.matchmake_pairs(text, rev_data, revision_pairs)
+
def matchmake_comment(self, comment, rev_data):
    """Apply every pair in ``self.regex_comment_pairs`` to ``comment``.

    Delegates to ``matchmake_pairs`` and returns its result.
    """
    comment_pairs = self.regex_comment_pairs
    return self.matchmake_pairs(comment, rev_data, comment_pairs)
+
def matchmake_pairs(self, text, rev_data, pairs):
    """Thread ``rev_data`` through ``matchmake`` of each item in ``pairs``.

    Each item's ``matchmake(text, rev_data)`` is called in order, and its
    return value becomes the ``rev_data`` given to the next item. The value
    returned by the last call is returned; with no pairs, ``rev_data`` comes
    back unchanged.
    """
    current = rev_data
    for regex_pair in pairs:
        current = regex_pair.matchmake(text, current)
    return current
+