#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size

import argparse
import sys
import os, os.path
import re
from datetime import datetime

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1
from urllib.parse import quote

from mwxml import Dump

from deltas.tokenizers import wikitext_split
from deltas import SequenceMatcher
from deltas import SegmentMatcher
import mwpersistence
import mwreverts

import dataclasses as dc
from dataclasses import dataclass, make_dataclass
import pyarrow as pa
import pyarrow.parquet as pq

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7

class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3

def calculate_persistence(tokens_added):
    return (sum([(len(x.revisions) - 1) for x in tokens_added]),
            len(tokens_added))
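
# For example (hypothetical values): if tokens_added holds three tokens that
# appear in 4, 2, and 1 revisions respectively, calculate_persistence returns
# ((4-1) + (2-1) + (1-1), 3) == (4, 3): four persistent token revisions
# across three added tokens.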

class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)

class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
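        #
        # For example, the sequence above (A, A, B, A, A) yields three rows:
        # the second A edit (collapsed_revs=2), the B edit (collapsed_revs=1),
        # and the final A edit (collapsed_revs=2).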
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a sequence by a user and reset;
                    # also yield if we don't know who the user is

                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif rev.user.text != prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final revision
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)


class RegexPair(object):
    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def _make_key(self, cap_group):
        return "{}_{}".format(self.label, cap_group)

    def matchmake(self, content, rev_data):

        temp_dict = {}
        # if there are named capture groups in the regex
        if self.has_groups:

            # if there are matches of some sort in this revision content, fill the lists for each cap_group
            if self.pattern.search(content) is not None:
                m = self.pattern.finditer(content)
                matchobjects = list(m)

                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_list = []
                    for match in matchobjects:
                        # we only want to add the match for the capture group if the match is not None
                        if match.group(cap_group) is not None:
                            temp_list.append(match.group(cap_group))

                    # if temp_list of matches is empty, just make that column None
                    if len(temp_list) == 0:
                        temp_dict[key] = None
                    # else we put in the list we made in the for-loop above
                    else:
                        temp_dict[key] = ', '.join(temp_list)

            # there are no matches at all in this revision content, so we default the values to None
            else:
                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_dict[key] = None

        # there are no capture groups, so we just search for all the matches of the regex
        else:
            # given that there are matches to be made
            if isinstance(content, (str, bytes)):
                if self.pattern.search(content) is not None:
                    m = self.pattern.findall(content)
                    temp_dict[self.label] = ', '.join(m)
                else:
                    temp_dict[self.label] = None

        # update rev_data with our new columns
        for k, v in temp_dict.items():
            setattr(rev_data, k, v)

        return rev_data

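# For example (hypothetical pattern): RegexPair(r"(?P<url>https?://\S+)", "link")
# fills a column named "link_url" with a comma-separated list of the URLs
# matched in the content, or None when nothing matches.
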
@dataclass()
class RevDataBase():
    revid: int
    date_time: datetime
    articleid: int
    editorid: int
    title: str
    namespace: int
    deleted: bool
    text_chars: int = None
    revert: bool = None
    reverteds: list[int] = None
    sha1: str = None
    minor: bool = None
    editor: str = None
    anon: bool = None

    pa_schema_fields = [
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('ms')),
        pa.field("articleid", pa.int64()),
        pa.field("editorid", pa.int64()),
        pa.field("title", pa.string()),
        pa.field("namespace", pa.int32()),
        pa.field("deleted", pa.bool_()),
        pa.field("text_chars", pa.int32()),
        pa.field("revert", pa.bool_()),
        pa.field("reverteds", pa.list_(pa.int64())),
        pa.field("sha1", pa.string()),
        pa.field("minor", pa.bool_()),
        pa.field("editor", pa.string()),
        pa.field("anon", pa.bool_())
    ]

    def to_pyarrow(self):
        return dc.astuple(self)

    def to_tsv_row(self):

        row = []
        for f in dc.fields(self):
            val = getattr(self, f.name)
            if val is None:
                row.append("")
            elif f.type == bool:
                row.append("TRUE" if val else "FALSE")

            elif f.type == datetime:
                row.append(val.strftime('%Y-%m-%d %H:%M:%S'))

            elif f.name in {'editor', 'title'}:
                if f.name in TO_ENCODE:
                    row.append(quote(str(val)))
                else:
                    # wrap in quotes so embedded delimiters don't break readers like fread
                    row.append('"' + val + '"')

            elif f.type == list[int]:
                row.append('"' + ",".join([str(x) for x in val]) + '"')

            elif f.type == str:
                if f.name in TO_ENCODE:
                    row.append(quote(str(val)))
                else:
                    row.append(val)
            else:
                row.append(str(val))

        return '\t'.join(row)
@dataclass()
class RevDataCollapse(RevDataBase):
    collapsed_revs: int = None
    pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
    pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]

@dataclass()
class RevDataPersistence(RevDataBase):
    token_revs: int = None
    tokens_added: int = None
    tokens_removed: int = None
    tokens_window: int = None

    pa_persistence_schema_fields = [
        pa.field("token_revs", pa.int64()),
        pa.field("tokens_added", pa.int64()),
        pa.field("tokens_removed", pa.int64()),
        pa.field("tokens_window", pa.int64())]

    pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields

@dataclass()
class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
    pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields

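# For example, with --collapse-user plus a persistence method the parquet
# schema is the base fields, then collapsed_revs, then the four token_*
# persistence fields, in that order.
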
class WikiqParser():
    def __init__(self, input_file, output_file,
                 regex_match_revision, regex_match_comment,
                 regex_revision_label, regex_comment_label,
                 collapse_user=False, persist=None, urlencode=False,
                 namespaces=None, revert_radius=15,
                 output_parquet=True, parquet_buffer_size=2000):
        """
        Parameters:
           persist : which persistence method to use. Takes a PersistMethod value.
        """
        self.input_file = input_file

        self.collapse_user = collapse_user
        self.persist = persist
        self.namespaces = []
        self.urlencode = urlencode
        self.revert_radius = revert_radius

        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        self.regex_schemas = []
        self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
        self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)

        # pick the dataclass that matches the requested output columns
        if self.collapse_user is True:
            if self.persist == PersistMethod.none:
                revdata_type = RevDataCollapse
            else:
                revdata_type = RevDataCollapsePersistence
        elif self.persist != PersistMethod.none:
            revdata_type = RevDataPersistence
        else:
            revdata_type = RevDataBase

        # extend the dataclass with one list-of-strings column per regex label
        regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas]

        self.revdata_type = make_dataclass('RevData_Parser',
                                           fields=regex_fields,
                                           bases=(revdata_type,))

        self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas

        if output_parquet is True:
            self.output_parquet = True
            self.pq_writer = None
            self.output_file = output_file
            self.parquet_buffer = []
            self.parquet_buffer_size = parquet_buffer_size
        else:
            self.output_parquet = False
            self.output_file = open(output_file, 'w')

    def make_matchmake_pairs(self, patterns, labels):
        if (patterns is not None and labels is not None) and \
           (len(patterns) == len(labels)):
            result = []
            for pattern, label in zip(patterns, labels):
                result.append(RegexPair(pattern, label))
                self.regex_schemas.append(pa.field(label, pa.list_(pa.string())))

            return result
        elif (patterns is None and labels is None):
            return []
        else:
            sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')

    def matchmake(self, rev, rev_data):
        rev_data = self.matchmake_revision(rev.text, rev_data)
        rev_data = self.matchmake_comment(rev.comment, rev_data)
        return rev_data

    def matchmake_revision(self, text, rev_data):
        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)

    def matchmake_comment(self, comment, rev_data):
        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)

    def matchmake_pairs(self, text, rev_data, pairs):
        for pair in pairs:
            rev_data = pair.matchmake(text, rev_data)
        return rev_data

    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # an unnamed namespace is the default namespace
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

    def process(self):

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract the list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self.namespace_filter is not None:
                if namespace not in self.namespace_filter:
                    continue

            rev_detector = mwreverts.Detector(radius=self.revert_radius)

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                # self.persist == PersistMethod.legacy
                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:

                rev_data = self.revdata_type(revid=rev.id,
                                             date_time=datetime.fromtimestamp(rev.timestamp.unix()),
                                             articleid=page.id,
                                             editorid=None if rev.deleted.user or rev.user.id is None else rev.user.id,
                                             title=page.title,
                                             deleted=rev.deleted.text,
                                             namespace=namespace
                                             )

                rev_data = self.matchmake(rev, rev_data)

                if not rev.deleted.text:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""
                    # if the dump includes a sha1, use it; otherwise generate one
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data.sha1 = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data.text_chars = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data.revert = True
                        rev_data.reverteds = revert.reverteds
                    else:
                        rev_data.revert = False

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data.minor = rev.minor

                if not rev.deleted.user:
                    # editors are quoted or url-encoded in to_tsv_row for fread
                    rev_data.editor = rev.user.text
                    rev_data.anon = rev.user.id is None

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse user was on, let's run that
                if self.collapse_user:
                    rev_data.collapsed_revs = rev.collapsed_revs

                if self.persist != PersistMethod.none:

                    if not rev.deleted.text:

                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            # the oldest revision in the window can now be scored and printed
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data.token_revs = num_token_revs
                            old_rev_data.tokens_added = num_tokens
                            old_rev_data.tokens_removed = len(old_tokens_removed)
                            old_rev_data.tokens_window = PERSISTENCE_RADIUS - 1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data.token_revs = num_token_revs
                    rev_data.tokens_added = num_tokens
                    rev_data.tokens_removed = len(tokens_removed)
                    rev_data.tokens_window = len(window) - (i + 1)
                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

        if self.output_parquet is True:
            self.flush_parquet_buffer()
            if self.pq_writer is not None:
                self.pq_writer.close()
        else:
            self.output_file.close()


    def write_parquet_row(self, rev_data):
        padata = rev_data.to_pyarrow()
        self.parquet_buffer.append(padata)

        if len(self.parquet_buffer) >= self.parquet_buffer_size:
            self.flush_parquet_buffer()

    def flush_parquet_buffer(self):
        # nothing to flush
        if len(self.parquet_buffer) == 0:
            return

        schema = pa.schema(self.revdata_type.pa_schema_fields)

        # transpose the buffered rows into one list per column, then build
        # a typed pyarrow array for each column
        def row_to_col(rg, types):
            cols = []
            first = rg[0]
            for col in first:
                cols.append([col])

            for row in rg[1:]:
                for j in range(len(cols)):
                    cols[j].append(row[j])

            arrays = []
            for col, typ in zip(cols, types):
                arrays.append(pa.array(col, typ))
            return arrays

        outtable = pa.Table.from_arrays(row_to_col(self.parquet_buffer, schema.types), schema=schema)
        if self.pq_writer is None:
            self.pq_writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')

        self.pq_writer.write_table(outtable)
        self.parquet_buffer = []

    def print_rev_data(self, rev_data):
        if self.output_parquet is False:
            printfunc = self.write_tsv_row
        else:
            printfunc = self.write_parquet_row

        printfunc(rev_data)

    def write_tsv_row(self, rev_data):
        line = rev_data.to_tsv_row()
        print(line, file=self.output_file)


def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, "*.xml"]
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # cmd is undefined for unrecognized extensions; assume an
        # uncompressed XML dump and open it directly
        input_file = open(input_filename, 'r')

    return input_file

def get_output_filename(input_filename, parquet = False):
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    if parquet is False:
        output_filename = output_filename + ".tsv"
    else:
        output_filename = output_filename + ".parquet"
    return output_filename
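
# For example, get_output_filename("dump.xml.bz2") returns "dump.tsv", while
# get_output_filename("dump.xml.bz2", parquet=True) returns "dump.parquet".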

def open_output_file(input_filename):
    # build the output filename from the input filename
    output_filename = get_output_filename(input_filename, parquet = False)
    output_file = open(output_filename, "w")
    return output_file

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab-delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files. If it ends with .parquet output will be in parquet format.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision in each sequence of consecutive edits by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for the old behavior used in older research projects. Use -p=segment for an advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url-encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="Id number of namespace to include. Can be specified more than once.")

parser.add_argument('-rr',
                    '--revert-radius',
                    dest="revert_radius",
                    type=int,
                    action='store',
                    default=15,
                    help="Number of edits to check when looking for reverts (default: 15)")

parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
                    help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")

parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
                    help="The label for the output column based on matching the regex in revision text.")

parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
                    help="The regular expression to search for in comments of revisions.")

parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
                    help="The label for the output column based on matching the regex in comments.")

args = parser.parse_args()

# set persistence method

if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None

if len(args.dumpfiles) > 0:
    output_parquet = False
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        if output_dir.endswith(".parquet"):
            output_parquet = True

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = get_output_filename(filename, parquet = output_parquet)

        wikiq = WikiqParser(input_file,
                            output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
                            regex_match_revision = args.regex_match_revision,
                            regex_revision_label = args.regex_revision_label,
                            regex_match_comment = args.regex_match_comment,
                            regex_comment_label = args.regex_comment_label,
                            output_parquet=output_parquet)

        wikiq.process()

        # close things
        input_file.close()

else:
    wikiq = WikiqParser(sys.stdin,
                        sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces=namespaces,
                        revert_radius=args.revert_radius,
                        regex_match_revision = args.regex_match_revision,
                        regex_revision_label = args.regex_revision_label,
                        regex_match_comment = args.regex_match_comment,
                        regex_comment_label = args.regex_comment_label,
                        output_parquet=False)

    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
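
# A minimal usage sketch (hypothetical dump filename):
#
#   python3 wikiq enwiki-20230101-pages-meta-history1.xml.bz2 -o out/
#
# writes out/enwiki-20230101-pages-meta-history1.tsv. Pass a directory name
# ending in .parquet (e.g. -o out.parquet) to get parquet output instead,
# --collapse-user to merge runs of consecutive edits by the same user, and
# -p sequence to compute token persistence measures.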
