if type(content) in(str, bytes):
if self.pattern.search(content) is not None:
m = self.pattern.findall(content)
- temp_dict[self.label] = ', '.join(m)
+ temp_dict[self.label] = m
else:
temp_dict[self.label] = None
elif f.type == list[int]:
row.append('"' + ",".join([str(x) for x in val]) + '"')
+ elif f.type == list[str]:
+ row.append('"' + ",".join([(x) for x in val]) + '"')
+
elif f.type == str:
if self.urlencode and f.name in TO_ENCODE:
row.append(quote(str(val)))
redirect_aliases = chain(* map(lambda obj: obj.get("aliases"), redirect_config))
redirect_aliases = list(map(lambda s: s.lstrip('#'), redirect_aliases))
redirect_aliases.append('REDIRECT') # just in case
+
+ # this regular expression is copied from pywikibot
pattern = '(?:' + '|'.join(redirect_aliases) + ')'
redirect_regex = re.compile(r'\s*#{pattern}\s*:?\s*\[\[(.+?)(?:\|.*?)?\]\]'
.format(pattern=pattern), re.IGNORECASE | re.DOTALL)
# Depending on whether we are configured to write TSV or Parquet, we call a different function.
def print_rev_data(self, rev_data):
+
if self.output_parquet is False:
printfunc = self.write_tsv_row
else:
filename = os.path.join(output_dir, os.path.basename(filename))
output_file = get_output_filename(filename, parquet = output_parquet)
- print(args.siteinfo)
+ print(args.siteinfo, file=sys.stderr)
wikiq = WikiqParser(input_file,
output_file,
collapse_user=args.collapse_user,