X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/blobdiff_plain/32283aa4da2eb256af9bec2e2d42481a1ca19d0b..refs/heads/redirects:/wikiq diff --git a/wikiq b/wikiq index a9b8f2e..be90a8b 100755 --- a/wikiq +++ b/wikiq @@ -202,7 +202,7 @@ class RegexPair(object): if type(content) in(str, bytes): if self.pattern.search(content) is not None: m = self.pattern.findall(content) - temp_dict[self.label] = ', '.join(m) + temp_dict[self.label] = m else: temp_dict[self.label] = None @@ -297,6 +297,9 @@ class RevDataBase(): elif f.type == list[int]: row.append('"' + ",".join([str(x) for x in val]) + '"') + elif f.type == list[str]: + row.append('"' + ",".join([(x) for x in val]) + '"') + elif f.type == str: if self.urlencode and f.name in TO_ENCODE: row.append(quote(str(val))) @@ -688,6 +691,7 @@ class WikiqParser(): # depending on if we are configured to write tsv or parquet, we'll call a different function. def print_rev_data(self, rev_data): + if self.output_parquet is False: printfunc = self.write_tsv_row else: @@ -840,7 +844,7 @@ if len(args.dumpfiles) > 0: filename = os.path.join(output_dir, os.path.basename(filename)) output_file = get_output_filename(filename, parquet = output_parquet) - print(args.siteinfo) + print(args.siteinfo, file=sys.stderr) wikiq = WikiqParser(input_file, output_file, collapse_user=args.collapse_user,