#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size

import argparse
import sys
import os, os.path
import re
from datetime import datetime

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump

from deltas.tokenizers import wikitext_split
import mwpersistence
import mwreverts
from urllib.parse import quote

from deltas import SequenceMatcher
from deltas import SegmentMatcher

import dataclasses
from dataclasses import dataclass
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7
class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3

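# summarize token persistence: returns a tuple of (the total number of
# later revisions in which the added tokens survived, the number of
# tokens added in the revision)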
def calculate_persistence(tokens_added):
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))


class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)

class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            self.title = page.title
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    # also yield if we don't know who the user is

                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif rev.user.text != prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)


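# a RegexPair bundles one user-supplied regular expression with the label
# of the output column it fills; matchmake() runs the regex against
# revision text or comments and attaches the matches to the rev_data row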
class RegexPair(object):
    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def _make_key(self, cap_group):
        return ("{}_{}".format(self.label, cap_group))

    def matchmake(self, content, rev_data):

        temp_dict = {}
        # if there are named capture groups in the regex
        if self.has_groups:

            # if there are matches of some sort in this revision content, fill the lists for each cap_group
            if self.pattern.search(content) is not None:
                m = self.pattern.finditer(content)
                matchobjects = list(m)

                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_list = []
                    for match in matchobjects:
                        # we only want to add the match for the capture group if the match is not None
                        if match.group(cap_group) is not None:
                            temp_list.append(match.group(cap_group))

                    # if temp_list of matches is empty just make that column None
                    if len(temp_list) == 0:
                        temp_dict[key] = None
                    # else we put in the list we made in the for-loop above
                    else:
                        temp_dict[key] = ', '.join(temp_list)

            # there are no matches at all in this revision content, we default values to None
            else:
                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_dict[key] = None

        # there are no capture groups, we just search for all the matches of the regex
        else:
            # given that there is content to search
            if isinstance(content, (str, bytes)):
                if self.pattern.search(content) is not None:
                    m = self.pattern.findall(content)
                    temp_dict[self.label] = ', '.join(m)
                else:
                    temp_dict[self.label] = None

        # update rev_data with our new columns
        for k, v in temp_dict.items():
            setattr(rev_data, k, v)

        return rev_data


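# one output row per revision. The pyarrow schema is kept alongside the
# dataclass fields so the parquet writer and the tsv writer stay in sync;
# the subclasses below add columns for collapsed revisions and persistence.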
@dataclass()
class RevDataBase():
    revid: int
    date_time: datetime
    articleid: int
    editorid: int
    title: str
    namespace: int
    deleted: bool
    text_chars: int = None
    revert: bool = None
    reverteds: list[int] = None
    sha1: str = None
    minor: bool = None
    editor: str = None
    anon: bool = None

    pa_schema_fields = [
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('ms')),
        pa.field("articleid", pa.int64()),
        pa.field("editorid", pa.int64()),
        pa.field("title", pa.string()),
        pa.field("namespace", pa.int32()),
        pa.field("deleted", pa.bool_()),
        pa.field("text_chars", pa.int32()),
        pa.field("revert", pa.bool_()),
        pa.field("reverteds", pa.list_(pa.int64())),
        pa.field("sha1", pa.string()),
        pa.field("minor", pa.bool_()),
        pa.field("editor", pa.string()),
        pa.field("anon", pa.bool_())
    ]

    def to_pyarrow(self):
        # represent this revision as a dict keyed by schema field name,
        # suitable for pa.Table.from_pylist
        return {f.name: getattr(self, f.name, None) for f in self.pa_schema_fields}

    def to_tsv_row(self):

        row = []
        for f in dataclasses.fields(self):
            val = getattr(self, f.name)
            if val is None:
                row.append("")
            elif f.type == bool:
                row.append("TRUE" if val else "FALSE")

            elif f.type == datetime:
                row.append(val.strftime('%Y-%m-%d %H:%M:%S'))

            elif f.name in {'editor', 'title'}:
                if f.name in TO_ENCODE:
                    row.append(quote(str(val)))
                else:
                    # wrap user-supplied strings in quotes for fread
                    row.append('"' + val + '"')

            elif f.type == list[int]:
                row.append('"' + ",".join([str(x) for x in val]) + '"')

            elif f.type == str:
                if f.name in TO_ENCODE:
                    row.append(quote(str(val)))
                else:
                    row.append(val)
            else:
                row.append(str(val))

        return '\t'.join(row)


@dataclass()
class RevDataCollapse(RevDataBase):
    collapsed_revs: int = None
    pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
    pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
    pa_schema = pa.schema(pa_schema_fields)

@dataclass()
class RevDataPersistence(RevDataBase):
    token_revs: int = None
    tokens_added: int = None
    tokens_removed: int = None
    tokens_window: int = None

    pa_persistence_schema_fields = [
        pa.field("token_revs", pa.int64()),
        pa.field("tokens_added", pa.int64()),
        pa.field("tokens_removed", pa.int64()),
        pa.field("tokens_window", pa.int64())]

    pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields

@dataclass()
class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
    pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields

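# WikiqParser drives the run: it iterates the dump, builds a RevData row
# per revision, optionally computes reverts and token persistence, and
# streams rows out in tsv or parquet format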
class WikiqParser():
    def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces=None, revert_radius=15, output_parquet=False, output_buffer_size=2000):
        """
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value
        """
        self.input_file = input_file

        self.collapse_user = collapse_user
        self.persist = persist
        self.namespaces = []
        self.urlencode = urlencode
        self.revert_radius = revert_radius

        self.output_buffer = []
        self.output_buffer_size = output_buffer_size

        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        self.regex_schemas = []
        self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
        self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)

        # pick the output dataclass matching the enabled options
        if self.collapse_user is True:
            if self.persist == PersistMethod.none:
                revdata_type = RevDataCollapse
            else:
                revdata_type = RevDataCollapsePersistence
        elif self.persist != PersistMethod.none:
            revdata_type = RevDataPersistence
        else:
            revdata_type = RevDataBase

        # extend the chosen dataclass with one list-of-strings field per regex label
        regex_fields = [(pa_field.name, list[str], dataclasses.field(default=None))
                        for pa_field in self.regex_schemas]
        self.revdata_type = dataclasses.make_dataclass('RevData_Parser',
                                                       fields=regex_fields,
                                                       bases=(revdata_type,))

        self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas

        if output_parquet is True:
            self.output_parquet = True
            self.pq_writer = None
            self.output_file = output_file
        else:
            self.output_parquet = False
            if isinstance(output_file, str):
                self.output_file = open(output_file, 'w')
            else:
                # output_file may already be an open handle (e.g. sys.stdout)
                self.output_file = output_file
            # header row for tsv output, written on the first flush
            self.output_header = '\t'.join(f.name for f in dataclasses.fields(self.revdata_type))


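    # pair each regex pattern with its label, building a RegexPair per
    # pattern and recording an arrow field for the eventual output schema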
    def make_matchmake_pairs(self, patterns, labels):
        if (patterns is not None and labels is not None) and \
           (len(patterns) == len(labels)):
            result = []
            for pattern, label in zip(patterns, labels):
                result.append(RegexPair(pattern, label))
                self.regex_schemas.append(pa.field(label, pa.list_(pa.string())))

            return result
        elif (patterns is None and labels is None):
            return []
        else:
            sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')

    def matchmake(self, rev, rev_data):
        rev_data = self.matchmake_revision(rev.text, rev_data)
        rev_data = self.matchmake_comment(rev.comment, rev_data)
        return rev_data

    def matchmake_revision(self, text, rev_data):
        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)

    def matchmake_comment(self, comment, rev_data):
        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)

    def matchmake_pairs(self, text, rev_data, pairs):
        for pair in pairs:
            rev_data = pair.matchmake(text, rev_data)
        return rev_data

    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # the main namespace has no name; remember it as the default
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns


    def process(self):

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract the mapping of namespace names to ids
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self.namespace_filter is not None:
                if namespace not in self.namespace_filter:
                    continue

            rev_detector = mwreverts.Detector(radius = self.revert_radius)

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                # self.persist == PersistMethod.legacy
                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:

                rev_data = self.revdata_type(revid = rev.id,
                                             date_time = datetime.fromtimestamp(rev.timestamp.unix()),
                                             articleid = page.id,
                                             editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id,
                                             title = page.title,
                                             namespace = namespace,
                                             deleted = rev.deleted.text
                                             )

                rev_data = self.matchmake(rev, rev_data)

                if not rev.deleted.text:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""
                    # if a sha1 is supplied, use it; otherwise generate one

                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data.sha1 = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data.text_chars = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)
                    rev_data.revert = revert is not None

                    if revert:
                        rev_data.reverteds = revert.reverteds

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data.minor = rev.minor

                if not rev.deleted.user:
                    rev_data.editor = rev.user.text
                    rev_data.anon = rev.user.id is None

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse user was on, lets run that
                if self.collapse_user:
                    rev_data.collapsed_revs = rev.collapsed_revs

                if self.persist != PersistMethod.none:

                    if not rev.deleted.text:

                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            # the oldest revision in the window now has its full
                            # persistence horizon, so compute its stats and print it
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data.token_revs = num_token_revs
                            old_rev_data.tokens_added = num_tokens
                            old_rev_data.tokens_removed = len(old_tokens_removed)
                            old_rev_data.tokens_window = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data.token_revs = num_token_revs
                    rev_data.tokens_added = num_tokens
                    rev_data.tokens_removed = len(tokens_removed)
                    rev_data.tokens_window = len(window)-(i+1)
                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

        if self.output_parquet is True:
            self.flush_parquet_buffer()
            self.pq_writer.close()

        else:
            self.flush_tsv_buffer()
            self.output_file.close()


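    # output plumbing: rows are buffered and flushed in batches of
    # output_buffer_size, either through a parquet writer or to a tsv file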
    def write_parquet_row(self, rev_data):
        padata = rev_data.to_pyarrow()
        self.output_buffer.append(padata)

        if len(self.output_buffer) >= self.output_buffer_size:
            self.flush_parquet_buffer()


    def flush_parquet_buffer(self):
        schema = pa.schema(self.revdata_type.pa_schema_fields)
        outtable = pa.Table.from_pylist(self.output_buffer, schema=schema)

        if self.pq_writer is None:
            self.pq_writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')

        self.pq_writer.write_table(outtable)
        self.output_buffer = []

    def print_rev_data(self, rev_data):
        if self.output_parquet is False:
            printfunc = self.write_tsv_row
        else:
            printfunc = self.write_parquet_row

        printfunc(rev_data)

    def write_tsv_row(self, rev_data):

        self.output_buffer.append(rev_data.to_tsv_row())

        if len(self.output_buffer) >= self.output_buffer_size:
            self.flush_tsv_buffer()


    def flush_tsv_buffer(self):
        # write the header row once, then any buffered rows
        if self.output_header:
            self.output_file.write(self.output_header + '\n')
            self.output_header = None
        self.output_file.writelines(row + '\n' for row in self.output_buffer)
        self.output_buffer = []

def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, "*.xml"]
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # cmd was never assigned: the file isn't compressed, so read it directly
        input_file = open(input_filename, 'r')

    return input_file

def get_output_filename(input_filename, parquet = False):
    output_filename = re.sub(r'\.(7z|gz|bz2)$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    if parquet is False:
        output_filename = output_filename + ".tsv"
    else:
        output_filename = output_filename + ".parquet"
    return output_filename

def open_output_file(input_filename):
    # derive the output filename from the input filename
    output_filename = get_output_filename(input_filename, parquet = False)
    output_file = open(output_filename, "w")
    return output_file

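# Example invocations (hypothetical dump filenames):
#   wikiq enwiki-20200301-pages-meta-history1.xml-p1p857.7z -o out/
#   wikiq dump.xml.bz2 -o out.parquet --collapse-user -p segment -n 0 -n 1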
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files. If it ends with .parquet output will be in parquet format.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for the old behavior used in older research projects. Use -p=segment for an advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="Id number of namespace to include. Can be specified more than once.")

parser.add_argument('-rr',
                    '--revert-radius',
                    dest="revert_radius",
                    type=int,
                    action='store',
                    default=15,
                    help="Number of edits to check when looking for reverts (default: 15)")

parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
                    help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")

parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
                    help="The label for the output column based on matching the regex in revision text.")

parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
                    help="The regular expression to search for in comments of revisions.")

parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
                    help="The label for the output column based on matching the regex in comments.")

args = parser.parse_args()

# set persistence method

if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None

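# process each dump file named on the command line; with no arguments,
# read a dump from stdin and write tsv to stdout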
if len(args.dumpfiles) > 0:
    output_parquet = False
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        if output_dir.endswith(".parquet"):
            output_parquet = True

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = get_output_filename(filename, parquet = output_parquet)

        wikiq = WikiqParser(input_file,
                            output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
                            regex_match_revision = args.regex_match_revision,
                            regex_revision_label = args.regex_revision_label,
                            regex_match_comment = args.regex_match_comment,
                            regex_comment_label = args.regex_comment_label,
                            output_parquet=output_parquet)

        wikiq.process()

        # close things
        input_file.close()

else:
    wikiq = WikiqParser(sys.stdin,
                        sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        #persist_legacy=args.persist_legacy,
                        urlencode=args.urlencode,
                        namespaces=namespaces,
                        revert_radius=args.revert_radius,
                        regex_match_revision = args.regex_match_revision,
                        regex_revision_label = args.regex_revision_label,
                        regex_match_comment = args.regex_match_comment,
                        regex_comment_label = args.regex_comment_label)

    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
