#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size

import argparse
import sys
import os, os.path
import re
import dataclasses
from datetime import datetime

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump

from deltas.tokenizers import wikitext_split
import mwpersistence
import mwreverts
from urllib.parse import quote

from deltas import SequenceMatcher
from deltas import SegmentMatcher

from dataclasses import dataclass
import pyarrow as pa
import pyarrow.parquet as pq

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7

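# enum-like constants selecting which token persistence algorithm to use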
class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3

def calculate_persistence(tokens_added):
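    """Return (token_revs, tokens_added): the total number of later
    revisions that the added tokens survived, and the number of tokens
    added."""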
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))


class WikiqIterator():
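    """Iterate over the pages of a MediaWiki XML dump, wrapping each page
    from mwxml in a WikiqPage."""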
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)

class WikiqPage():
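    """A single page from the dump. Iterating yields its revisions,
    optionally collapsing consecutive edits by the same user."""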
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a sequence by a user and reset;
                    # also yield if we don't know who the user is

                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif not rev.user.text == prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)


class RegexPair(object):
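    """Compile a regular expression and pair it with an output label.
    matchmake() records matches against revision text or comments as new
    columns on a rev_data object."""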
    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def _make_key(self, cap_group):
        return ("{}_{}".format(self.label, cap_group))

    def matchmake(self, content, rev_data):

        temp_dict = {}
        # if there are named capture groups in the regex
        if self.has_groups:

            # if there are matches of some sort in this revision content, fill the lists for each cap_group
            if self.pattern.search(content) is not None:
                m = self.pattern.finditer(content)
                matchobjects = list(m)

                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_list = []
                    for match in matchobjects:
                        # we only want to add the match for the capture group if the match is not None
                        if match.group(cap_group) is not None:
                            temp_list.append(match.group(cap_group))

                    # if temp_list of matches is empty just make that column None
                    if len(temp_list) == 0:
                        temp_dict[key] = None
                    # else we put in the list we made in the for-loop above
                    else:
                        temp_dict[key] = ', '.join(temp_list)

            # there are no matches at all in this revision content, we default values to None
            else:
                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_dict[key] = None

        # there are no capture groups, we just search for all the matches of the regex
        else:
            # given that there are matches to be made
            if type(content) in (str, bytes):
                if self.pattern.search(content) is not None:
                    m = self.pattern.findall(content)
                    temp_dict[self.label] = ', '.join(m)
                else:
                    temp_dict[self.label] = None

        # update rev_data with our new columns
        for k, v in temp_dict.items():
            setattr(rev_data, k, v)

        return rev_data

@dataclass()
class RevDataBase():
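    """Base row type for one revision of output. Subclasses add columns
    for the --collapse-user and --persistence options; pa_schema_fields
    must list one pyarrow field per dataclass field, in order."""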
    revid: int
    date_time: datetime
    articleid: int
    editorid: int
    title: str
    namespace: int
    deleted: bool
    text_chars: int = None
    revert: bool = None
    reverteds: list[int] = None
    sha1: str = None
    minor: bool = None
    editor: str = None
    anon: bool = None

    pa_schema_fields = [
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('ms')),
        pa.field("articleid", pa.int64()),
        pa.field("editorid", pa.int64()),
        pa.field("title", pa.string()),
        pa.field("namespace", pa.int32()),
        pa.field("deleted", pa.bool_()),
        pa.field("text_chars", pa.int32()),
        pa.field("revert", pa.bool_()),
        pa.field("reverteds", pa.list_(pa.int64())),
        pa.field("sha1", pa.string()),
        pa.field("minor", pa.bool_()),
        pa.field("editor", pa.string()),
        pa.field("anon", pa.bool_())
    ]

    def to_pyarrow(self):
        # return a plain tuple in dataclass field order; flush_parquet_buffer
        # converts buffered tuples into pyarrow arrays using pa_schema_fields
        return dataclasses.astuple(self)

    def to_tsv_row(self):

        row = []
        for f in dataclasses.fields(self):
            val = getattr(self, f.name)
            if val is None:
                row.append("")
            elif f.type == bool:
                row.append("TRUE" if val else "FALSE")

            elif f.type == datetime:
                row.append(val.strftime('%Y-%m-%d %H:%M:%S'))

            elif f.name in {'editor', 'title'}:
                # wrap user-supplied strings in quotes for fread
                if f.name in TO_ENCODE:
                    row.append(quote(str(val)))
                else:
                    row.append('"' + val + '"')

            elif f.type == list[int]:
                row.append('"' + ",".join([str(x) for x in val]) + '"')

            elif f.type == str:
                if f.name in TO_ENCODE:
                    row.append(quote(str(val)))
                else:
                    row.append(val)
            else:
                row.append(str(val))

        return '\t'.join(row)

@dataclass()
class RevDataCollapse(RevDataBase):
    collapsed_revs: int = None
    pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
    pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
    pa_schema = pa.schema(pa_schema_fields)

@dataclass()
class RevDataPersistence(RevDataBase):
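    """Row type with token persistence measures: how many later revisions
    the tokens added here survived (token_revs), plus counts of tokens
    added and removed within the persistence window."""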
    token_revs: int = None
    tokens_added: int = None
    tokens_removed: int = None
    tokens_window: int = None

    pa_persistence_schema_fields = [
        pa.field("token_revs", pa.int64()),
        pa.field("tokens_added", pa.int64()),
        pa.field("tokens_removed", pa.int64()),
        pa.field("tokens_window", pa.int64())]

    pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields

@dataclass()
class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
    # dataclass field order under multiple inheritance is the base fields,
    # then the persistence fields, then collapsed_revs, so the schema must
    # list them in the same order
    pa_schema_fields = (RevDataBase.pa_schema_fields
                        + RevDataPersistence.pa_persistence_schema_fields
                        + [RevDataCollapse.pa_collapsed_revs_schema])

class WikiqParser():
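    """Drive the parse: iterate over the pages and revisions of a dump,
    compute per-revision measures, and write rows as TSV or parquet."""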
    def __init__(self, input_file, output_file,
                 regex_match_revision, regex_match_comment,
                 regex_revision_label, regex_comment_label,
                 collapse_user=False, persist=None, urlencode=False,
                 namespaces=None, revert_radius=15,
                 output_parquet=True, parquet_buffer_size=2000):
        """
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value
        """
        self.input_file = input_file

        self.collapse_user = collapse_user
        self.persist = persist
        self.namespaces = []
        self.urlencode = urlencode
        self.revert_radius = revert_radius

        self.output_buffer = []
        self.output_buffer_size = parquet_buffer_size

        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        self.regex_schemas = []
        self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
        self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)

        if self.collapse_user is True:
            if self.persist == PersistMethod.none:
                revdata_type = RevDataCollapse
            else:
                revdata_type = RevDataCollapsePersistence
        elif self.persist != PersistMethod.none:
            revdata_type = RevDataPersistence
        else:
            revdata_type = RevDataBase

        # extend the row dataclass with one list[str] column per regex label
        regex_fields = [(pa_field.name, list[str], dataclasses.field(default=None))
                        for pa_field in self.regex_schemas]
        self.revdata_type = dataclasses.make_dataclass('RevData_Parser',
                                                       fields=regex_fields,
                                                       bases=(revdata_type,))

        self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas

        if output_parquet is True:
            self.output_parquet = True
            self.pq_writer = None
            self.output_file = output_file
        else:
            self.output_parquet = False
            self.output_header = True
            # output_file may be sys.stdout (via --stdout) or a filename
            if hasattr(output_file, 'write'):
                self.output_file = output_file
            else:
                self.output_file = open(output_file, 'w')

    def make_matchmake_pairs(self, patterns, labels):
        if (patterns is not None and labels is not None) and \
           (len(patterns) == len(labels)):
            result = []
            for pattern, label in zip(patterns, labels):
                result.append(RegexPair(pattern, label))
                self.regex_schemas.append(pa.field(label, pa.list_(pa.string())))

            return result
        elif (patterns is None and labels is None):
            return []
        else:
            sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')

    def matchmake(self, rev, rev_data):
        rev_data = self.matchmake_revision(rev.text, rev_data)
        rev_data = self.matchmake_comment(rev.comment, rev_data)
        return rev_data

    def matchmake_revision(self, text, rev_data):
        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)

    def matchmake_comment(self, comment, rev_data):
        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)

    def matchmake_pairs(self, text, rev_data, pairs):
        for pair in pairs:
            rev_data = pair.matchmake(text, rev_data)
        return rev_data

    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

    def process(self):
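        """Iterate through the dump's pages and revisions, apply the
        namespace filter, detect reverts, optionally compute persistence
        measures, and emit one output row per revision."""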

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                         r'output/wikiq-\1-\2.tsv',
        #                         input_filename)

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self.namespace_filter is not None:
                if namespace not in self.namespace_filter:
                    continue

            rev_detector = mwreverts.Detector(radius = self.revert_radius)

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                # self.persist == PersistMethod.legacy
                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:

                # editorid is None for deleted or anonymous edits
                rev_data = self.revdata_type(revid = rev.id,
                                             date_time = rev.timestamp,
                                             articleid = page.id,
                                             editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id,
                                             title = page.title,
                                             namespace = namespace,
                                             deleted = rev.deleted.text
                                             )

                rev_data = self.matchmake(rev, rev_data)

                if not rev.deleted.text:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""
                    # if a sha1 exists, use it; otherwise generate one

                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data.sha1 = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data.text_chars = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)
                    rev_data.revert = revert is not None

                    if revert:
                        rev_data.reverteds = revert.reverteds

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data.minor = rev.minor

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data.editor = rev.user.text
                    rev_data.anon = rev.user.id is None

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse user was on, lets run that
                if self.collapse_user:
                    rev_data.collapsed_revs = rev.collapsed_revs

                if self.persist != PersistMethod.none:

                    if not rev.deleted.text:

                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            # attach the persistence measures to the oldest
                            # revision in the window, which is the one we print
                            old_rev_data.token_revs = num_token_revs
                            old_rev_data.tokens_added = num_tokens
                            old_rev_data.tokens_removed = len(old_tokens_removed)
                            old_rev_data.tokens_window = PERSISTENCE_RADIUS - 1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data.token_revs = num_token_revs
                    rev_data.tokens_added = num_tokens
                    rev_data.tokens_removed = len(tokens_removed)
                    rev_data.tokens_window = len(window) - (i + 1)
                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

        if self.output_parquet is True:
            self.flush_parquet_buffer()
            self.pq_writer.close()

        else:
            self.flush_tsv_buffer()
            self.output_file.close()

    def write_parquet_row(self, rev_data):
        padata = rev_data.to_pyarrow()
        self.output_buffer.append(padata)

        if len(self.output_buffer) >= self.output_buffer_size:
            self.flush_parquet_buffer()

    def flush_parquet_buffer(self):
        """Convert the buffered row tuples into a pyarrow table and write
        it out, creating the ParquetWriter on first use."""
        schema = pa.schema(self.revdata_type.pa_schema_fields)

        # transpose the buffered row tuples into one list per column
        def rows_to_columns(rows, my_schema):
            cols = [[] for _ in my_schema.types]
            for row in rows:
                for j, val in enumerate(row):
                    cols[j].append(val)

            return [pa.array(col, typ) for col, typ in zip(cols, my_schema.types)]

        outtable = pa.Table.from_arrays(rows_to_columns(self.output_buffer, schema), schema=schema)

        if self.pq_writer is None:
            self.pq_writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')

        self.pq_writer.write_table(outtable)
        self.output_buffer = []

    def print_rev_data(self, rev_data):
        # dispatch to the TSV or parquet writer
        if self.output_parquet is False:
            printfunc = self.write_tsv_row
        else:
            printfunc = self.write_parquet_row

        printfunc(rev_data)

    def write_tsv_row(self, rev_data):
        self.output_buffer.append(rev_data.to_tsv_row())

        if len(self.output_buffer) >= self.output_buffer_size:
            self.flush_tsv_buffer()

    def flush_tsv_buffer(self):
        # write the header the first time we flush, then the buffered rows
        if self.output_header:
            self.output_file.write('\t'.join([f.name for f in dataclasses.fields(self.revdata_type)]) + '\n')
            self.output_header = False
        if len(self.output_buffer) > 0:
            self.output_file.write('\n'.join(self.output_buffer) + '\n')
        self.output_buffer = []

def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, "*.xml"]
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

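    # if the filename matched none of the patterns, cmd is never bound and
    # the NameError below falls through to opening the file as plain XML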
    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        input_file = open(input_filename, 'r')

    return input_file

def get_output_filename(input_filename, parquet = False):
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml$', '', output_filename)
    if parquet is False:
        output_filename = output_filename + ".tsv"
    else:
        output_filename = output_filename + ".parquet"
    return output_filename

def open_output_file(input_filename):
    # derive the output filename from the input filename
    output_filename = get_output_filename(input_filename, parquet = False)
    output_file = open(output_filename, "w")
    return output_file

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

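# Example invocation (hypothetical dump filename):
#   ./wikiq enwiki-20230101-pages-meta-history1.xml.bz2 -o output --collapse-user -p sequence
# writes output/enwiki-20230101-pages-meta-history1.tsv
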
# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files. If it ends with .parquet output will be in parquet format.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for the old behavior used in older research projects. Use -p=segment for an advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="ID number of namespace to include. Can be specified more than once.")

parser.add_argument('-rr',
                    '--revert-radius',
                    dest="revert_radius",
                    type=int,
                    action='store',
                    default=15,
                    help="Number of edits to check when looking for reverts (default: 15)")

parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
                    help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")

parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
                    help="The label for the output column based on matching the regex in revision text.")

parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
                    help="The regular expression to search for in comments of revisions.")

parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
                    help="The label for the output column based on matching the regex in comments.")

args = parser.parse_args()

# set persistence method
if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None

if len(args.dumpfiles) > 0:
    output_parquet = False
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        if output_dir.endswith(".parquet"):
            output_parquet = True

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = get_output_filename(filename, parquet = output_parquet)

        wikiq = WikiqParser(input_file,
                            output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
                            regex_match_revision = args.regex_match_revision,
                            regex_revision_label = args.regex_revision_label,
                            regex_match_comment = args.regex_match_comment,
                            regex_comment_label = args.regex_comment_label,
                            output_parquet=output_parquet)

        wikiq.process()

        # close things
        input_file.close()

else:
    wikiq = WikiqParser(sys.stdin,
                        sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces=namespaces,
                        revert_radius=args.revert_radius,
                        regex_match_revision = args.regex_match_revision,
                        regex_revision_label = args.regex_revision_label,
                        regex_match_comment = args.regex_match_comment,
                        regex_comment_label = args.regex_comment_label,
                        output_parquet=False)

    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
