#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
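
# Example invocation (hypothetical paths): parse a compressed dump,
# writing output/<dumpname>.tsv, collapsing runs of consecutive edits
# by the same user and computing sequence persistence measures:
#
#   ./wikiq enwiki-sample.xml.bz2 -o output --collapse-user -p sequence
#
# An output directory name ending in ".parquet" switches output to parquet.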

import argparse
import sys
import os, os.path
import re
from datetime import datetime, timezone

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1
from urllib.parse import quote

from mwxml import Dump

from deltas.tokenizers import wikitext_split
from deltas import SequenceMatcher, SegmentMatcher
import mwpersistence
import mwreverts

import dataclasses as dc
from dataclasses import dataclass, make_dataclass
import pyarrow as pa
import pyarrow.parquet as pq

# fields to urlencode when --url-encode is passed
TO_ENCODE = ('title', 'editor')
# size of the sliding window of revisions used for persistence measures
PERSISTENCE_RADIUS = 7

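# Persistence method constants, chosen by the -p/--persistence flag:
# no flag -> none; bare -p or -p sequence -> sequence; -p segment -> segment;
# -p legacy -> legacy.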
class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3

def calculate_persistence(tokens_added):
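    """
    Return (token_revs, tokens_added): the summed count of later
    revisions in which each added token survived, and the number of
    added tokens. E.g., two tokens that each persist through the next
    two revisions contribute (2 + 2, 2) = (4, 2).
    """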
    return (sum([(len(x.revisions) - 1) for x in tokens_added]),
            len(tokens_added))

class WikiqIterator():
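    """
    Wraps an mwxml.Dump so that iteration yields WikiqPage objects,
    passing the dump's namespace map and the collapse_user setting
    through to each page.
    """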
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)

class WikiqPage():
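    """
    Wraps an mwxml page. Iteration yields its revisions; when
    collapse_user is True, runs of consecutive revisions by the same
    user are collapsed into their last revision, whose collapsed_revs
    attribute counts the run's length (see rev_list below).
    """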
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            self.title = page.title
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    # also yield if we don't know who the user is

                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif rev.user.text != prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)

class RegexPair(object):
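    r"""
    Compiles a regex and attaches its matches on revision content to
    rev_data as columns. A pattern without named groups fills a single
    column named after the label; a pattern with named groups fills one
    column per group, named "<label>_<group>". For example (hypothetical
    label and pattern), label "link" with pattern r"\[\[(?P<target>[^]|]+)"
    yields a "link_target" column holding a comma-separated list of link
    targets, or None when there is no match.
    """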
    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def get_pyarrow_fields(self):
        if self.has_groups:
            fields = [pa.field(self._make_key(cap_group), pa.list_(pa.string()))
                      for cap_group in self.capture_groups]
        else:
            fields = [pa.field(self.label, pa.list_(pa.string()))]

        return fields

    def _make_key(self, cap_group):
        return "{}_{}".format(self.label, cap_group)

    def matchmake(self, content, rev_data):

        temp_dict = {}
        # if there are named capture groups in the regex
        if self.has_groups:

            # if there is content to search and matches in it, fill the lists for each cap_group
            if isinstance(content, (str, bytes)) and self.pattern.search(content) is not None:
                m = self.pattern.finditer(content)
                matchobjects = list(m)

                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_list = []
                    for match in matchobjects:
                        # we only want to add the match for the capture group if the match is not None
                        if match.group(cap_group) is not None:
                            temp_list.append(match.group(cap_group))

                    # if temp_list of matches is empty just make that column None
                    if len(temp_list) == 0:
                        temp_dict[key] = None
                    # else we put in the list we made in the for-loop above
                    else:
                        temp_dict[key] = ', '.join(temp_list)

            # there are no matches at all in this revision content, so we default values to None
            else:
                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_dict[key] = None

        # there are no capture groups, so we just search for all the matches of the regex
        else:
            # given that there is content to search
            if isinstance(content, (str, bytes)):
                if self.pattern.search(content) is not None:
                    m = self.pattern.findall(content)
                    temp_dict[self.label] = ', '.join(m)
                else:
                    temp_dict[self.label] = None

        # update rev_data with our new columns
        for k, v in temp_dict.items():
            setattr(rev_data, k, v)

        return rev_data

@dataclass()
class RevDataBase():
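    """
    One output row. Field order here defines the TSV column order, and
    pa_schema_fields must mirror these fields (names and order) for
    parquet output; to_tsv_row() and to_pyarrow() render the same data
    for each format.
    """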
    revid: int
    date_time: datetime
    articleid: int
    editorid: int
    title: str
    namespace: int
    deleted: bool
    text_chars: int = None
    revert: bool = None
    reverteds: list[int] = None
    sha1: str = None
    minor: bool = None
    editor: str = None
    anon: bool = None

    urlencode = False
    pa_schema_fields = [
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('ms')),
        pa.field("articleid", pa.int64()),
        pa.field("editorid", pa.int64()),
        pa.field("title", pa.string()),
        pa.field("namespace", pa.int32()),
        pa.field("deleted", pa.bool_()),
        pa.field("text_chars", pa.int32()),
        pa.field("revert", pa.bool_()),
        pa.field("reverteds", pa.list_(pa.int64())),
        pa.field("sha1", pa.string()),
        pa.field("minor", pa.bool_()),
        pa.field("editor", pa.string()),
        pa.field("anon", pa.bool_())
    ]

    def to_pyarrow(self):
        return dc.astuple(self)

    def to_tsv_row(self):

        row = []
        for f in dc.fields(self):
            val = getattr(self, f.name)
            if val is None:
                row.append("")
            elif f.type == bool:
                row.append("TRUE" if val else "FALSE")

            elif f.type == datetime:
                row.append(val.strftime('%Y-%m-%d %H:%M:%S'))

            elif f.name in {'editor', 'title'}:
                s = '"' + val + '"'
                if self.urlencode and f.name in TO_ENCODE:
                    row.append(quote(str(s)))
                else:
                    row.append(s)

            elif f.type == list[int]:
                row.append('"' + ",".join([str(x) for x in val]) + '"')

            elif f.type == str:
                if self.urlencode and f.name in TO_ENCODE:
                    row.append(quote(str(val)))
                else:
                    row.append(val)
            else:
                row.append(val)

        return '\t'.join(map(str, row))

    def header_row(self):
        return '\t'.join(map(lambda f: f.name, dc.fields(self)))

@dataclass()
class RevDataCollapse(RevDataBase):
    collapsed_revs: int = None
    pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
    pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]

@dataclass()
class RevDataPersistence(RevDataBase):
    token_revs: int = None
    tokens_added: int = None
    tokens_removed: int = None
    tokens_window: int = None

    pa_persistence_schema_fields = [
        pa.field("token_revs", pa.int64()),
        pa.field("tokens_added", pa.int64()),
        pa.field("tokens_removed", pa.int64()),
        pa.field("tokens_window", pa.int64())]

    pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields

@dataclass()
class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
    pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields

class WikiqParser():
    def __init__(self, input_file, output_file,
                 regex_match_revision, regex_match_comment,
                 regex_revision_label, regex_comment_label,
                 collapse_user=False, persist=None, urlencode=False,
                 namespaces=None, revert_radius=15,
                 output_parquet=True, parquet_buffer_size=2000):
        """
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value.
        """
        self.input_file = input_file

        self.collapse_user = collapse_user
        self.persist = persist
        self.namespaces = []
        self.urlencode = urlencode
        self.revert_radius = revert_radius

        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        self.regex_schemas = []
        self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
        self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)

        if self.collapse_user is True:
            if self.persist == PersistMethod.none:
                revdata_type = RevDataCollapse
            else:
                revdata_type = RevDataCollapsePersistence
        elif self.persist != PersistMethod.none:
            revdata_type = RevDataPersistence
        else:
            revdata_type = RevDataBase

        # extend the row dataclass with one column per regex label
        regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas]

        self.revdata_type = make_dataclass('RevData_Parser',
                                           fields=regex_fields,
                                           bases=(revdata_type,))

        self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas

        self.revdata_type.urlencode = self.urlencode

        if output_parquet is True:
            self.output_parquet = True
            self.pq_writer = None
            self.output_file = output_file
            self.parquet_buffer = []
            self.parquet_buffer_size = parquet_buffer_size
        else:
            self.print_header = True
            if output_file == sys.stdout:
                self.output_file = output_file
            else:
                self.output_file = open(output_file, 'w')
            self.output_parquet = False

    def make_matchmake_pairs(self, patterns, labels):
        if (patterns is not None and labels is not None) and \
           (len(patterns) == len(labels)):
            result = []
            for pattern, label in zip(patterns, labels):
                rp = RegexPair(pattern, label)
                result.append(rp)
                self.regex_schemas = self.regex_schemas + rp.get_pyarrow_fields()
            return result
        elif (patterns is None and labels is None):
            return []
        else:
            sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')

    def matchmake(self, rev, rev_data):
        rev_data = self.matchmake_revision(rev.text, rev_data)
        rev_data = self.matchmake_comment(rev.comment, rev_data)
        return rev_data

    def matchmake_revision(self, text, rev_data):
        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)

    def matchmake_comment(self, comment, rev_data):
        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)

    def matchmake_pairs(self, text, rev_data, pairs):
        for pair in pairs:
            rev_data = pair.matchmake(text, rev_data)
        return rev_data

    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

    def process(self):

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                         r'output/wikiq-\1-\2.tsv',
        #                         input_filename)

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self.namespace_filter is not None:
                if namespace not in self.namespace_filter:
                    continue

            rev_detector = mwreverts.Detector(radius=self.revert_radius)

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                # self.persist == PersistMethod.legacy
                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:

                rev_data = self.revdata_type(revid=rev.id,
                                             date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
                                             articleid=page.id,
                                             # editorid is an int64 column, so use None (not "") when the user is deleted or has no id
                                             editorid=None if rev.deleted.user or rev.user.id is None else rev.user.id,
                                             title=page.title,
                                             deleted=rev.deleted.text,
                                             namespace=namespace
                                             )

                rev_data = self.matchmake(rev, rev_data)

                if not rev.deleted.text:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""
                    # if text exists, we'll check for a sha1 and generate one otherwise

                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data.sha1 = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data.text_chars = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data.revert = True
                        rev_data.reverteds = revert.reverteds
                    else:
                        rev_data.revert = False

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data.minor = rev.minor

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data.editor = rev.user.text
                    rev_data.anon = rev.user.id is None

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse user was on, let's run that
                if self.collapse_user:
                    rev_data.collapsed_revs = rev.collapsed_revs

                # compute persistence measures over the sliding window
                if self.persist != PersistMethod.none:
                    if not rev.deleted.text:

                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data.token_revs = num_token_revs
                            old_rev_data.tokens_added = num_tokens
                            old_rev_data.tokens_removed = len(old_tokens_removed)
                            old_rev_data.tokens_window = PERSISTENCE_RADIUS - 1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data.token_revs = num_token_revs
                    rev_data.tokens_added = num_tokens
                    rev_data.tokens_removed = len(tokens_removed)
                    rev_data.tokens_window = len(window) - (i + 1)
                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

        if self.output_parquet is True:
            self.flush_parquet_buffer()
            # the writer only exists once at least one row has been flushed
            if self.pq_writer is not None:
                self.pq_writer.close()

        else:
            self.output_file.close()


    def write_parquet_row(self, rev_data):
        padata = rev_data.to_pyarrow()
        self.parquet_buffer.append(padata)

        if len(self.parquet_buffer) >= self.parquet_buffer_size:
            self.flush_parquet_buffer()

    def flush_parquet_buffer(self):
        # nothing buffered; avoid building an empty table
        if len(self.parquet_buffer) == 0:
            return

        schema = pa.schema(self.revdata_type.pa_schema_fields)

        # transpose the buffered row tuples into one list per column
        def row_to_col(rg, types):
            cols = []
            first = rg[0]
            for col in first:
                cols.append([col])

            for row in rg[1:]:
                for j in range(len(cols)):
                    cols[j].append(row[j])

            arrays = []
            for col, typ in zip(cols, types):
                arrays.append(pa.array(col, typ))
            return arrays

        outtable = pa.Table.from_arrays(row_to_col(self.parquet_buffer, schema.types), schema=schema)
        if self.pq_writer is None:
            self.pq_writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')

        self.pq_writer.write_table(outtable)
        self.parquet_buffer = []

    def print_rev_data(self, rev_data):
        if self.output_parquet is False:
            printfunc = self.write_tsv_row
        else:
            printfunc = self.write_parquet_row

        printfunc(rev_data)

    def write_tsv_row(self, rev_data):
        if self.print_header:
            print(rev_data.header_row(), file=self.output_file)
            self.print_header = False

        line = rev_data.to_tsv_row()
        print(line, file=self.output_file)

def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, "*.xml"]
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # cmd is unbound when the input is uncompressed; read it directly
        input_file = open(input_filename, 'r')

    return input_file

def get_output_filename(input_filename, parquet=False):
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    if parquet is False:
        output_filename = output_filename + ".tsv"
    else:
        output_filename = output_filename + ".parquet"
    return output_filename

def open_output_file(input_filename):
    # derive the output filename from the input filename
    output_filename = get_output_filename(input_filename, parquet=False)
    output_file = open(output_filename, "w")
    return output_file

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files. If it ends with .parquet output will be in parquet format.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for the old behavior used in older research projects. Use -p=segment for an advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="Id number of namespace to include. Can be specified more than once.")

parser.add_argument('-rr',
                    '--revert-radius',
                    dest="revert_radius",
                    type=int,
                    action='store',
                    default=15,
                    help="Number of edits to check when looking for reverts (default: 15)")

parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
                    help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")

parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
                    help="The label for the outputted column based on matching the regex in revision text.")

parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
                    help="The regular expression to search for in comments of revisions.")

parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
                    help="The label for the outputted column based on matching the regex in comments.")

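# Example regex matching (hypothetical pattern and label): every -RP needs a
# matching -RPl, and likewise -CP/-CPl; otherwise make_matchmake_pairs exits.
#
#   ./wikiq dump.xml.bz2 -RP '\[\[[Cc]ategory:[^]]*\]\]' -RPl category
#
# adds a "category" column holding the comma-separated matches found in each
# revision's text (None when there are none).
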
args = parser.parse_args()

# set persistence method
if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None

if len(args.dumpfiles) > 0:
    output_parquet = False
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        if output_dir.endswith(".parquet"):
            output_parquet = True

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = get_output_filename(filename, parquet=output_parquet)

        wikiq = WikiqParser(input_file,
                            output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
                            regex_match_revision=args.regex_match_revision,
                            regex_revision_label=args.regex_revision_label,
                            regex_match_comment=args.regex_match_comment,
                            regex_comment_label=args.regex_comment_label,
                            output_parquet=output_parquet)

        wikiq.process()

        # close things
        input_file.close()

else:
    wikiq = WikiqParser(sys.stdin,
                        sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces=namespaces,
                        revert_radius=args.revert_radius,
                        regex_match_revision=args.regex_match_revision,
                        regex_revision_label=args.regex_revision_label,
                        regex_match_comment=args.regex_match_comment,
                        regex_comment_label=args.regex_comment_label,
                        # parquet can't be streamed to text-mode stdout, so stdin mode emits TSV
                        output_parquet=False)

    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
