code.communitydata.science - mediawiki_dump_tools.git/blobdiff - wikiq
write regex captures to parquet arrays.
[mediawiki_dump_tools.git] / wikiq
diff --git a/wikiq b/wikiq
index 6d2e7fa16645c82e8e9640959834922506bb9c8c..be90a8b6b3bf8ef201f324731c73134162f757de 100755 (executable)
--- a/wikiq
+++ b/wikiq
@@ -202,7 +202,7 @@ class RegexPair(object):
             if type(content) in(str, bytes):
                 if self.pattern.search(content) is not None:
                     m = self.pattern.findall(content)
-                    temp_dict[self.label] = ', '.join(m)
+                    temp_dict[self.label] = m
                 else:
                     temp_dict[self.label] = None
 
@@ -297,6 +297,9 @@ class RevDataBase():
             elif f.type == list[int]:
                 row.append('"' + ",".join([str(x) for x in val]) + '"')
 
+            elif f.type == list[str]:
+                row.append('"' + ",".join([(x) for x in val]) + '"')
+
             elif f.type == str:
                 if self.urlencode and f.name in TO_ENCODE:
                     row.append(quote(str(val)))
@@ -389,6 +392,8 @@ class WikiqParser():
                 redirect_aliases = chain(* map(lambda obj: obj.get("aliases"), redirect_config))
                 redirect_aliases = list(map(lambda s: s.lstrip('#'), redirect_aliases))
                 redirect_aliases.append('REDIRECT') # just in case
+
+                # this regular expression is copied from pywikibot
                 pattern = '(?:' + '|'.join(redirect_aliases) + ')'
                 redirect_regex =  re.compile(r'\s*#{pattern}\s*:?\s*\[\[(.+?)(?:\|.*?)?\]\]'
                                              .format(pattern=pattern), re.IGNORECASE | re.DOTALL)
@@ -686,6 +691,7 @@ class WikiqParser():
         
     # depending on if we are configured to write tsv or parquet, we'll call a different function.
     def print_rev_data(self, rev_data):
+
         if self.output_parquet is False:
             printfunc = self.write_tsv_row
         else:
@@ -838,7 +844,7 @@ if len(args.dumpfiles) > 0:
             filename = os.path.join(output_dir, os.path.basename(filename))
             output_file = get_output_filename(filename, parquet = output_parquet)
 
-        print(args.siteinfo)
+        print(args.siteinfo, file=sys.stderr)
         wikiq = WikiqParser(input_file,
                             output_file,
                             collapse_user=args.collapse_user,

Community Data Science Collective || Want to submit a patch?