X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/595728d8da09105c78e9e279cc1c258534ec10ae..b124f9c7c891b8b98441ef1b185ba1a1a4a32179:/wikiq diff --git a/wikiq b/wikiq index 6d2e7fa..be90a8b 100755 --- a/wikiq +++ b/wikiq @@ -202,7 +202,7 @@ class RegexPair(object): if type(content) in(str, bytes): if self.pattern.search(content) is not None: m = self.pattern.findall(content) - temp_dict[self.label] = ', '.join(m) + temp_dict[self.label] = m else: temp_dict[self.label] = None @@ -297,6 +297,9 @@ class RevDataBase(): elif f.type == list[int]: row.append('"' + ",".join([str(x) for x in val]) + '"') + elif f.type == list[str]: + row.append('"' + ",".join([(x) for x in val]) + '"') + elif f.type == str: if self.urlencode and f.name in TO_ENCODE: row.append(quote(str(val))) @@ -389,6 +392,8 @@ class WikiqParser(): redirect_aliases = chain(* map(lambda obj: obj.get("aliases"), redirect_config)) redirect_aliases = list(map(lambda s: s.lstrip('#'), redirect_aliases)) redirect_aliases.append('REDIRECT') # just in case + + # this regular expression is copied from pywikibot pattern = '(?:' + '|'.join(redirect_aliases) + ')' redirect_regex = re.compile(r'\s*#{pattern}\s*:?\s*\[\[(.+?)(?:\|.*?)?\]\]' .format(pattern=pattern), re.IGNORECASE | re.DOTALL) @@ -686,6 +691,7 @@ class WikiqParser(): # depending on if we are configured to write tsv or parquet, we'll call a different function. def print_rev_data(self, rev_data): + if self.output_parquet is False: printfunc = self.write_tsv_row else: @@ -838,7 +844,7 @@ if len(args.dumpfiles) > 0: filename = os.path.join(output_dir, os.path.basename(filename)) output_file = get_output_filename(filename, parquet = output_parquet) - print(args.siteinfo) + print(args.siteinfo, file=sys.stderr) wikiq = WikiqParser(input_file, output_file, collapse_user=args.collapse_user,