3 # original wikiq headers are: title articleid revid date_time anon
4 # editor editor_id minor text_size text_entropy text_md5 reversion
5 # additions_size deletions_size
11 from datetime import datetime,timezone
14 from subprocess import Popen, PIPE
15 from collections import deque
16 from hashlib import sha1
18 from mwxml import Dump
20 from deltas.tokenizers import wikitext_split
23 from urllib.parse import quote
24 TO_ENCODE = ('title', 'editor')
26 from deltas import SequenceMatcher
27 from deltas import SegmentMatcher
29 import dataclasses as dc
30 from dataclasses import dataclass
32 import pyarrow.parquet as pq
33 from itertools import chain
41 def calculate_persistence(tokens_added):
42 return(sum([(len(x.revisions)-1) for x in tokens_added]),
45 class WikiqIterator():
46 def __init__(self, fh, collapse_user=False):
48 self.collapse_user = collapse_user
49 self.mwiterator = Dump.from_file(self.fh)
50 self.namespace_map = { ns.id : ns.name for ns in
51 self.mwiterator.site_info.namespaces }
52 self.__pages = self.load_pages()
55 for page in self.mwiterator:
57 namespace_map = self.namespace_map,
58 collapse_user=self.collapse_user)
64 return next(self._pages)
67 __slots__ = ('id', 'title', 'namespace', 'redirect',
68 'restrictions', 'mwpage', '__revisions',
71 def __init__(self, page, namespace_map, collapse_user=False):
73 self.namespace = page.namespace
74 # following mwxml, we assume namespace 0 in cases where
75 # page.namespace is inconsistent with namespace_map
76 if page.namespace not in namespace_map:
77 self.title = page.title
79 if page.namespace != 0:
80 self.title = ':'.join([namespace_map[page.namespace], page.title])
82 self.title = page.title
83 self.restrictions = page.restrictions
84 self.collapse_user = collapse_user
86 self.__revisions = self.rev_list()
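# Illustrative note (not in the original source): with collapse_user=True, a page
# edited by users A, A, A, B yields only A's third revision (rev.collapsed_revs == 3)
# followed by B's revision (collapsed_revs == 1); with collapse_user=False every
# revision is yielded.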
89 # Outline for how we want to handle collapse_user=True
90 # iteration rev.user prev_rev.user add prev_rev?
97 for i, rev in enumerate(self.mwpage):
98 # never yield the first time
100 if self.collapse_user:
102 rev.collapsed_revs = collapsed_revs
105 if self.collapse_user:
106 # yield if this is the last edit in a seq by a user and reset
107 # also yield if we don't know who the user is
109 if rev.deleted.user or prev_rev.deleted.user:
112 rev.collapsed_revs = collapsed_revs
114 elif rev.user.text != prev_rev.user.text:
117 rev.collapsed_revs = collapsed_revs
118 # otherwise, add one to the counter
121 rev.collapsed_revs = collapsed_revs
122 # if collapse_user is false, we always yield
128 # also yield the final time
132 return self.__revisions
135 return next(self.__revisions)
139 A RegexPair is defined by a regular expression (pattern) and a label.
140 The pattern can include capture groups. If it does then each capture group will have a resulting column in the output.
141 If the pattern does not include a capture group, then only one output column will result.
143 class RegexPair(object):
144 def __init__(self, pattern, label):
145 self.pattern = pattern
147 if type(self.pattern) is str:
148 self.pattern = re.compile(pattern)
151 self.has_groups = bool(self.pattern.groupindex)
153 self.capture_groups = list(self.pattern.groupindex.keys())
155 def get_pyarrow_fields(self):
157 fields = [pa.field(self._make_key(cap_group),pa.list_(pa.string()))
158 for cap_group in self.capture_groups]
160 fields = [pa.field(self.label, pa.list_(pa.string()))]
164 def _make_key(self, cap_group):
165 return ("{}_{}".format(self.label, cap_group))
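# Illustrative example (not in the original source): a pair constructed as
#   RegexPair(r"\[\[(?P<link>.+?)\]\]", "wikilink")
# has one named capture group, so matchmake() below fills a single output column
# named "wikilink_link" (via _make_key), while a pattern with no capture groups
# would instead fill a column named just "wikilink".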
167 def matchmake(self, content, rev_data):
170 # if there are named capture groups in the regex
173 # if there are matches of some sort in this revision content, fill the lists for each cap_group
174 if self.pattern.search(content) is not None:
175 m = self.pattern.finditer(content)
176 matchobjects = list(m)
178 for cap_group in self.capture_groups:
179 key = self._make_key(cap_group)
181 for match in matchobjects:
182 # we only want to add the match for the capture group if the match is not None
183 if match.group(cap_group) is not None:
184 temp_list.append(match.group(cap_group))
186 # if temp_list of matches is empty just make that column None
187 if len(temp_list)==0:
188 temp_dict[key] = None
189 # else we put in the list we made in the for-loop above
191 temp_dict[key] = ', '.join(temp_list)
193 # there are no matches at all in this revision content, we default values to None
195 for cap_group in self.capture_groups:
196 key = self._make_key(cap_group)
197 temp_dict[key] = None
199 # there are no capture groups, we just search for all the matches of the regex
202 # only search if the content is actually text (str or bytes)
203 if type(content) in (str, bytes):
203 if self.pattern.search(content) is not None:
204 m = self.pattern.findall(content)
205 temp_dict[self.label] = ', '.join(m)
207 temp_dict[self.label] = None
209 # update rev_data with our new columns
210 for k, v in temp_dict.items():
211 setattr(rev_data, k, v)
217 We used to use a dictionary to collect fields for the output.
218 Now we use dataclasses. Compared to a dictionary, this should help:
220 - make it easier to output parquet data.
221 - use class attribute '.' syntax instead of dictionary syntax.
222 - improve support for tooling (autocomplete, type hints)
223 - use type information to define formatting rules
225 Depending on the parameters passed into Wikiq, the output schema can be different.
226 Therefore, we construct a dataclass with the correct output schema at runtime.
227 It also needs to have the correct pyarrow schema so we can write parquet files.
229 The RevDataBase type has all the fields that will be output no matter how wikiq is invoked.
240 text_chars: int = None
242 reverteds: list[int] = None
248 # toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation
251 # defines pyarrow schema.
252 # each field in the data class needs an entry in this array.
253 # the names should match and be in the same order.
254 # this isn't a dataclass field since it doesn't have a type annotation
256 pa.field("revid", pa.int64()),
257 pa.field("date_time", pa.timestamp('ms')),
258 pa.field("articleid",pa.int64()),
259 pa.field("editorid",pa.int64()),
260 pa.field("title",pa.string()),
261 pa.field("namespace",pa.int32()),
262 pa.field("deleted",pa.bool_()),
263 pa.field("text_chars",pa.int32()),
264 pa.field("revert",pa.bool_()),
265 pa.field("reverteds",pa.list_(pa.int64())),
266 pa.field("sha1",pa.string()),
267 pa.field("minor",pa.bool_()),
268 pa.field("editor",pa.string()),
269 pa.field("anon",pa.bool_()),
272 # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function
273 def to_pyarrow(self):
274 return dc.astuple(self)
276 # logic to convert each field into the wikiq tsv format goes here.
277 def to_tsv_row(self):
280 for f in dc.fields(self):
281 val = getattr(self, f.name)
282 if getattr(self, f.name) is None:
285 row.append("TRUE" if val else "FALSE")
287 elif f.type == datetime:
288 row.append(val.strftime('%Y-%m-%d %H:%M:%S'))
290 elif f.name in {'editor','title'}:
292 if self.urlencode and f.name in TO_ENCODE:
293 row.append(quote(str(s)))
297 elif f.type == list[int]:
298 row.append('"' + ",".join([str(x) for x in val]) + '"')
301 if self.urlencode and f.name in TO_ENCODE:
302 row.append(quote(str(val)))
308 return '\t'.join(map(str,row))
310 def header_row(self):
311 return '\t'.join(map(lambda f: f.name, dc.fields(self)))
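# Illustrative sketch (not in the original source): given a populated instance rd,
# the two methods line up column-for-column, e.g.
#   print(rd.header_row())   # field names joined by tabs, in dataclass field order
#   print(rd.to_tsv_row())   # values formatted by the per-type rules above, same order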
315 If collapse_user=True we'll use a RevDataCollapse dataclass.
316 This class inherits from RevDataBase. This means that it has all the same fields and functions.
318 It just adds a new field and updates the pyarrow schema.
322 class RevDataCollapse(RevDataBase):
323 collapsed_revs:int = None
325 pa_collapsed_revs_schema = pa.field('collapsed_revs',pa.int64())
326 pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
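# The subclasses below follow this same two-step pattern: add the dataclass field,
# then extend pa_schema_fields so the parquet schema stays in sync with the
# dataclass.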
330 If persistence data is to be computed we'll need the fields added by RevDataPersistence.
334 class RevDataPersistence(RevDataBase):
335 token_revs:int = None
336 tokens_added:int = None
337 tokens_removed:int = None
338 tokens_window:int = None
340 pa_persistence_schema_fields = [
341 pa.field("token_revs", pa.int64()),
342 pa.field("tokens_added", pa.int64()),
343 pa.field("tokens_removed", pa.int64()),
344 pa.field("tokens_window", pa.int64())]
346 pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
349 The RevDataCollapsePersistence class uses multiple inheritance to make a class that has both persistence and collapse fields.
353 class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
354 pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields
359 def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15, output_parquet=True, parquet_buffer_size=2000, siteinfo_file=None):
362 persist : what persistence method to use. Takes a PersistMethod value
364 self.input_file = input_file
366 self.collapse_user = collapse_user
367 self.persist = persist
369 self.urlencode = urlencode
370 self.revert_radius = revert_radius
372 if namespaces is not None:
373 self.namespace_filter = set(namespaces)
375 self.namespace_filter = None
377 self.regex_schemas = []
378 self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
379 self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
381 if siteinfo_file is not None:
382 siteinfo = open_siteinfo(siteinfo_file)
383 siteinfo = json.loads(siteinfo.read())
385 magicwords = siteinfo.get('query').get('magicwords')
388 redirect_config = list(filter(lambda obj: obj.get("name") == "redirect", magicwords))
389 redirect_aliases = chain(* map(lambda obj: obj.get("aliases"), redirect_config))
390 redirect_aliases = list(map(lambda s: s.lstrip('#'), redirect_aliases))
391 redirect_aliases.append('REDIRECT') # just in case
393 # this regular expression is copied from pywikibot
394 pattern = '(?:' + '|'.join(redirect_aliases) + ')'
395 redirect_regex = re.compile(r'\s*#{pattern}\s*:?\s*\[\[(.+?)(?:\|.*?)?\]\]'
396 .format(pattern=pattern), re.IGNORECASE | re.DOTALL)
398 self.regex_revision_pairs.extend(self.make_matchmake_pairs([redirect_regex], ["redirect"]))
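# Illustrative example (not in the original source), assuming the English
# "REDIRECT" alias is among the magic words:
#   redirect_regex.match("#REDIRECT [[Main Page]]").group(1)  -> "Main Page"
# and each match lands in the "redirect" output column.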
400 # This is where we set the type for revdata.
402 if self.collapse_user is True:
403 if self.persist == PersistMethod.none:
404 revdata_type = RevDataCollapse
406 revdata_type = RevDataCollapsePersistence
407 elif self.persist != PersistMethod.none:
408 revdata_type = RevDataPersistence
410 revdata_type = RevDataBase
412 # if there are regex fields, we need to add them to the revdata type.
413 regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas]
415 # make_dataclass is a function that defines a new dataclass type.
416 # here we extend the type we have already chosen and add the regular expression types
417 self.revdata_type = dc.make_dataclass('RevData_Parser',
419 bases=(revdata_type,))
421 # we also need to make sure that we have the right pyarrow schema
422 self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
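# Roughly what make_dataclass produces here (illustrative sketch; the field name
# "wikilink" is hypothetical): for each regex schema field it is as if we wrote
#   class RevData_Parser(revdata_type):
#       wikilink: list[str] = None
# so regex match results travel alongside the base revision fields.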
424 self.revdata_type.urlencode = self.urlencode
426 self.schema = pa.schema(self.revdata_type.pa_schema_fields)
428 # here we initialize the variables we need for output.
429 if output_parquet is True:
430 self.output_parquet = True
431 self.pq_writer = None
432 self.output_file = output_file
433 self.parquet_buffer = []
434 self.parquet_buffer_size = parquet_buffer_size
436 self.print_header = True
437 if output_file == sys.stdout:
439 self.output_file = output_file
441 self.output_file = open(output_file,'w')
442 self.output_parquet = False
444 def make_matchmake_pairs(self, patterns, labels):
445 if (patterns is not None and labels is not None) and \
446 (len(patterns) == len(labels)):
448 for pattern, label in zip(patterns, labels):
449 rp = RegexPair(pattern, label)
451 self.regex_schemas = self.regex_schemas + rp.get_pyarrow_fields()
453 elif (patterns is None and labels is None):
456 sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
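# Illustrative call (not in the original source): with the command-line options
#   -RP '\[\[(?P<link>.+?)\]\]' -RPl wikilink
# this method builds one RegexPair and registers a "wikilink_link" field in
# self.regex_schemas for the output schema.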
458 def matchmake_revision(self, rev, rev_data):
459 rev_data = self.matchmake_text(rev.text, rev_data)
460 rev_data = self.matchmake_comment(rev.comment, rev_data)
463 def matchmake_text(self, text, rev_data):
464 return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
466 def matchmake_comment(self, comment, rev_data):
467 return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
469 def matchmake_pairs(self, text, rev_data, pairs):
471 rev_data = pair.matchmake(text, rev_data)
474 def __get_namespace_from_title(self, title):
477 for ns in self.namespaces:
478 # skip if the namespace is not defined
480 default_ns = self.namespaces[ns]
483 if title.startswith(ns + ":"):
484 return self.namespaces[ns]
486 # if we've made it this far with no matches, we return the default namespace
492 # create a regex that creates the output filename
493 # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
494 # r'output/wikiq-\1-\2.tsv',
497 # Construct dump file iterator
498 dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
500 # extract the list of namespaces
501 self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}
506 # Iterate through pages
508 namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)
510 # skip namespaces not in the filter
511 if self.namespace_filter is not None:
512 if namespace not in self.namespace_filter:
515 rev_detector = mwreverts.Detector(radius = self.revert_radius)
517 if self.persist != PersistMethod.none:
518 window = deque(maxlen=PERSISTENCE_RADIUS)
520 if self.persist == PersistMethod.sequence:
521 state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
522 revert_radius=PERSISTENCE_RADIUS)
524 elif self.persist == PersistMethod.segment:
525 state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
526 revert_radius=PERSISTENCE_RADIUS)
528 # self.persist == PersistMethod.legacy
530 from mw.lib import persistence
531 state = persistence.State()
533 # Iterate through a page's revisions
536 # create a new data object instead of a dictionary.
537 rev_data = self.revdata_type(revid = rev.id,
538 date_time = datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
540 editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id,
542 deleted = rev.deleted.text,
543 namespace = namespace
546 rev_data = self.matchmake_revision(rev, rev_data)
548 if not rev.deleted.text:
549 # rev.text can be None if the page has no text
552 # if text exists, use the revision's sha1 when present; otherwise compute one
557 text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
559 rev_data.sha1 = text_sha1
561 # TODO rev.bytes doesn't work.. looks like a bug
562 rev_data.text_chars = len(rev.text)
564 # generate revert data
565 revert = rev_detector.process(text_sha1, rev.id)
568 rev_data.revert = True
569 rev_data.reverteds = revert.reverteds
571 rev_data.revert = False
573 # if the fact that the edit was minor can be hidden, this might be an issue
574 rev_data.minor = rev.minor
576 if not rev.deleted.user:
577 # wrap user-defined editors in quotes for fread
578 rev_data.editor = rev.user.text
579 rev_data.anon = rev.user.id is None
581 #TODO missing: additions_size deletions_size
583 # if collapse_user was on, record the collapsed revision count
584 if self.collapse_user:
585 rev_data.collapsed_revs = rev.collapsed_revs
588 if self.persist != PersistMethod.none:
589 if not rev.deleted.text:
591 if self.persist != PersistMethod.legacy:
592 _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
595 _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
597 window.append((rev.id, rev_data, tokens_added, tokens_removed))
599 if len(window) == PERSISTENCE_RADIUS:
600 old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
602 num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
604 old_rev_data.token_revs = num_token_revs
605 old_rev_data.tokens_added = num_tokens
606 old_rev_data.tokens_removed = len(old_tokens_removed)
607 old_rev_data.tokens_window = PERSISTENCE_RADIUS-1
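# the window still holds PERSISTENCE_RADIUS revisions including the one being
# flushed, so PERSISTENCE_RADIUS-1 later revisions were available when measuring
# how long its tokens persisted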
609 self.print_rev_data(old_rev_data)
612 self.print_rev_data(rev_data)
616 if self.persist != PersistMethod.none:
617 # print out metadata for the last PERSISTENCE_RADIUS revisions
618 for i, item in enumerate(window):
619 # if the window was full, we've already printed item 0
620 if len(window) == PERSISTENCE_RADIUS and i == 0:
623 rev_id, rev_data, tokens_added, tokens_removed = item
624 num_token_revs, num_tokens = calculate_persistence(tokens_added)
626 rev_data.token_revs = num_token_revs
627 rev_data.tokens_added = num_tokens
628 rev_data.tokens_removed = len(tokens_removed)
629 rev_data.tokens_window = len(window)-(i+1)
630 self.print_rev_data(rev_data)
634 print("Done: %s revisions and %s pages." % (rev_count, page_count),
637 # remember to flush the parquet_buffer if we're done
638 if self.output_parquet is True:
639 self.flush_parquet_buffer()
640 self.pq_writer.close()
643 self.output_file.close()
647 For performance reasons it's better to write parquet in batches instead of one row at a time.
648 So this function just adds the row to a buffer. When the buffer is full, it gets flushed (written).
650 def write_parquet_row(self, rev_data):
651 padata = rev_data.to_pyarrow()
652 self.parquet_buffer.append(padata)
654 if len(self.parquet_buffer) >= self.parquet_buffer_size:
655 self.flush_parquet_buffer()
659 Function that actually writes data to the parquet file.
660 It needs to transpose the buffered data from rows to columns.
662 def flush_parquet_buffer(self):
665 Returns the pyarrow table that we'll write
667 def rows_to_table(rg, schema):
674 for j in range(len(cols)):
675 cols[j].append(row[j])
678 for col, typ in zip(cols, schema.types):
679 arrays.append(pa.array(col, typ))
680 return pa.Table.from_arrays(arrays, schema=schema)
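# e.g. buffered rows [(1, "a"), (2, "b")] are transposed into the column arrays
# [1, 2] and ["a", "b"] before being handed to pyarrow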
682 outtable = rows_to_table(self.parquet_buffer, self.schema)
683 if self.pq_writer is None:
684 self.pq_writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark')
686 self.pq_writer.write_table(outtable)
687 self.parquet_buffer = []
689 # depending on whether we are configured to write tsv or parquet, we call a different function.
690 def print_rev_data(self, rev_data):
691 if self.output_parquet is False:
692 printfunc = self.write_tsv_row
694 printfunc = self.write_parquet_row
698 def write_tsv_row(self, rev_data):
699 if self.print_header:
700 print(rev_data.header_row(), file=self.output_file)
701 self.print_header = False
703 line = rev_data.to_tsv_row()
704 print(line, file=self.output_file)
706 def open_siteinfo(siteinfo_file):
707 if re.match(r'.*\.7z$', siteinfo_file):
708 cmd = ["7za", "x", "-so", siteinfo_file, "*.json"]
709 elif re.match(r'.*\.gz$', siteinfo_file):
710 cmd = ["zcat", siteinfo_file]
711 elif re.match(r'.*\.bz2$', siteinfo_file):
712 cmd = ["bzcat", "-dk", siteinfo_file]
715 input_file = Popen(cmd, stdout=PIPE).stdout
717 input_file = open(siteinfo_file, 'r')
722 def open_input_file(input_filename):
723 if re.match(r'.*\.7z$', input_filename):
724 cmd = ["7za", "x", "-so", input_filename, "*.xml"]
725 elif re.match(r'.*\.gz$', input_filename):
726 cmd = ["zcat", input_filename]
727 elif re.match(r'.*\.bz2$', input_filename):
728 cmd = ["bzcat", "-dk", input_filename]
731 input_file = Popen(cmd, stdout=PIPE).stdout
733 input_file = open(input_filename, 'r')
737 def get_output_filename(input_filename, parquet = False):
738 output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
739 output_filename = re.sub(r'\.xml', '', output_filename)
741 output_filename = output_filename + ".tsv"
743 output_filename = output_filename + ".parquet"
744 return output_filename
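# For example (illustrative): get_output_filename("enwiki-20200101-pages.xml.bz2")
# strips the ".bz2" and ".xml" suffixes and returns "enwiki-20200101-pages.tsv",
# or "enwiki-20200101-pages.parquet" when parquet=True.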
746 def open_output_file(input_filename):
747 # derive the output filename from the input filename
748 output_filename = get_output_filename(input_filename, parquet = False)
749 output_file = open(output_filename, "w")
752 parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab-delimited data.')
754 # arguments for the input files
755 parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
756 help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")
758 parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
759 help="Directory for output files. If it ends with .parquet output will be in parquet format.")
761 parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
762 help="Write output to standard out (do not create dump file)")
764 parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
765 help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
767 parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
768 help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
770 parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
771 help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
773 parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
774 help="Id number of namspace to include. Can be specified more than once.")
776 parser.add_argument('-rr',
778 dest="revert_radius",
782 help="Number of edits to check when looking for reverts (default: 15)")
784 parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
785 help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
787 parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
788 help="The label for the outputted column based on matching the regex in revision text.")
790 parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
791 help="The regular expression to search for in comments of revisions.")
793 parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
794 help="The label for the outputted column based on matching the regex in comments.")
796 parser.add_argument('--SI', '--siteinfo', dest="siteinfo", default=None, type=str,
797 help="Path to archive containing siteinfo json. This is required for resolving redirects")
801 args = parser.parse_args()
805 # set persistence method
807 if args.persist is None:
808 persist = PersistMethod.none
809 elif args.persist == "segment":
810 persist = PersistMethod.segment
811 elif args.persist == "legacy":
812 persist = PersistMethod.legacy
814 persist = PersistMethod.sequence
816 if args.namespace_filter is not None:
817 namespaces = args.namespace_filter
821 if len(args.dumpfiles) > 0:
822 output_parquet = False
823 for filename in args.dumpfiles:
824 input_file = open_input_file(filename)
826 # open directory for output
828 output_dir = args.output_dir[0]
832 if output_dir.endswith(".parquet"):
833 output_parquet = True
835 print("Processing file: %s" % filename, file=sys.stderr)
838 output_file = sys.stdout
840 filename = os.path.join(output_dir, os.path.basename(filename))
841 output_file = get_output_filename(filename, parquet = output_parquet)
844 wikiq = WikiqParser(input_file,
846 collapse_user=args.collapse_user,
848 urlencode=args.urlencode,
849 namespaces=namespaces,
850 revert_radius=args.revert_radius,
851 regex_match_revision = args.regex_match_revision,
852 regex_revision_label = args.regex_revision_label,
853 regex_match_comment = args.regex_match_comment,
854 regex_comment_label = args.regex_comment_label,
855 output_parquet=output_parquet,
856 siteinfo_file = args.siteinfo)
864 wikiq = WikiqParser(sys.stdin,
866 collapse_user=args.collapse_user,
868 #persist_legacy=args.persist_legacy,
869 urlencode=args.urlencode,
870 namespaces=namespaces,
871 revert_radius=args.revert_radius,
872 regex_match_revision = args.regex_match_revision,
873 regex_revision_label = args.regex_revision_label,
874 regex_match_comment = args.regex_match_comment,
875 regex_comment_label = args.regex_comment_label,
876 siteinfo_file = args.siteinfo)
880 # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
881 # stop_words = stop_words.split(",")