3 # original wikiq headers are: title articleid revid date_time anon
4 # editor editor_id minor text_size text_entropy text_md5 reversion
5 # additions_size deletions_size
11 from datetime import datetime,timezone
14 from subprocess import Popen, PIPE
15 from collections import deque
16 from hashlib import sha1
18 from mwxml import Dump
20 from deltas.tokenizers import wikitext_split
23 from urllib.parse import quote
24 TO_ENCODE = ('title', 'editor')
26 from deltas import SequenceMatcher
27 from deltas import SegmentMatcher
29 import dataclasses as dc
30 from dataclasses import dataclass
32 import pyarrow.parquet as pq
33 from itertools import chain
41 def calculate_persistence(tokens_added):
42 return(sum([(len(x.revisions)-1) for x in tokens_added]),
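# Hedged example (not part of wikiq): calculate_persistence() expects the
# "tokens added" sequence produced by mwpersistence, where each token carries a
# .revisions list of the revisions it appeared in. A SimpleNamespace stand-in is
# enough to show the input shape; the values below are made up.
from types import SimpleNamespace

def _calculate_persistence_example():
    t1 = SimpleNamespace(revisions=[101, 102, 103])  # token survived two later revisions
    t2 = SimpleNamespace(revisions=[103])            # token only present where it was added
    return calculate_persistence([t1, t2])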
45 class WikiqIterator():
46 def __init__(self, fh, collapse_user=False):
48 self.collapse_user = collapse_user
49 self.mwiterator = Dump.from_file(self.fh)
50 self.namespace_map = { ns.id : ns.name for ns in
51 self.mwiterator.site_info.namespaces }
52 self.__pages = self.load_pages()
55 for page in self.mwiterator:
57 namespace_map = self.namespace_map,
58 collapse_user=self.collapse_user)
64 return next(self._pages)
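# Hedged usage sketch (hypothetical path): WikiqIterator wraps mwxml.Dump and
# yields WikiqPage objects, so a dump can be walked like this:
#
#   with open("dump.xml") as fh:
#       for page in WikiqIterator(fh, collapse_user=True):
#           print(page.title, page.namespace)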
67 __slots__ = ('id', 'title', 'namespace', 'redirect',
68 'restrictions', 'mwpage', '__revisions',
71 def __init__(self, page, namespace_map, collapse_user=False):
73 self.namespace = page.namespace
74 # following mwxml, we assume namespace 0 in cases where
75 # page.namespace is inconsistent with namespace_map
76 if page.namespace not in namespace_map:
77 self.title = page.title
79 if page.namespace != 0:
80 self.title = ':'.join([namespace_map[page.namespace], page.title])
82 self.title = page.title
83 self.restrictions = page.restrictions
84 self.collapse_user = collapse_user
86 self.__revisions = self.rev_list()
89 # Outline for how we want to handle collapse_user=True
90 # iteration rev.user prev_rev.user add prev_rev?
97 for i, rev in enumerate(self.mwpage):
98 # never yield the first time
100 if self.collapse_user:
102 rev.collapsed_revs = collapsed_revs
105 if self.collapse_user:
106 # yield if this is the last edit in a seq by a user and reset
107 # also yield if we do not know who the user is
109 if rev.deleted.user or prev_rev.deleted.user:
112 rev.collapsed_revs = collapsed_revs
114 elif not rev.user.text == prev_rev.user.text:
117 rev.collapsed_revs = collapsed_revs
118 # otherwise, add one to the counter
121 rev.collapsed_revs = collapsed_revs
122 # if collapse_user is false, we always yield
128 # also yield the final time
132 return self.__revisions
135 return next(self.__revisions)
139 A RegexPair is defined by a regular expression (pattern) and a label.
140 The pattern can include capture groups. If it does then each capture group will have a resulting column in the output.
141 If the pattern does not include a capture group, then only one output column will result.
143 class RegexPair(object):
144 def __init__(self, pattern, label):
145 self.pattern = pattern
147 if type(self.pattern) is str:
148 self.pattern = re.compile(pattern)
151 self.has_groups = bool(self.pattern.groupindex)
153 self.capture_groups = list(self.pattern.groupindex.keys())
155 def get_pyarrow_fields(self):
157 fields = [pa.field(self._make_key(cap_group),pa.list_(pa.string()))
158 for cap_group in self.capture_groups]
160 fields = [pa.field(self.label, pa.list_(pa.string()))]
164 def _make_key(self, cap_group):
165 return ("{}_{}".format(self.label, cap_group))
167 def matchmake(self, content, rev_data):
170 # if there are named capture groups in the regex
173 # if there are matches of some sort in this revision content, fill the lists for each cap_group
174 if self.pattern.search(content) is not None:
175 m = self.pattern.finditer(content)
176 matchobjects = list(m)
178 for cap_group in self.capture_groups:
179 key = self._make_key(cap_group)
181 for match in matchobjects:
182 # we only want to add the match for the capture group if the match is not None
183 if match.group(cap_group) is not None:
184 temp_list.append(match.group(cap_group))
186 # if temp_list of matches is empty just make that column None
187 if len(temp_list)==0:
188 temp_dict[key] = None
189 # otherwise we join the matches collected in the for-loop above
191 temp_dict[key] = ', '.join(temp_list)
193 # there are no matches at all in this revision content, we default values to None
195 for cap_group in self.capture_groups:
196 key = self._make_key(cap_group)
197 temp_dict[key] = None
199 # there are no capture groups, we just search for all the matches of the regex
201 #given that there are matches to be made
202 if type(content) in (str, bytes):
203 if self.pattern.search(content) is not None:
204 m = self.pattern.findall(content)
205 temp_dict[self.label] = ', '.join(m)
207 temp_dict[self.label] = None
209 # update rev_data with our new columns
210 for k, v in temp_dict.items():
211 setattr(rev_data, k, v)
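# Hedged example (illustrative patterns, not wikiq defaults): a pattern with named
# capture groups produces one "<label>_<group>" column per group, while a pattern
# without groups produces a single column named after the label. rev_data can be
# any object that accepts attribute assignment.
def _regexpair_example():
    from types import SimpleNamespace
    rev_data = SimpleNamespace()
    with_groups = RegexPair(r"\[\[(?P<target>[^\]|]+)", "link")   # adds column link_target
    no_groups = RegexPair(r"\{\{citation needed\}\}", "flag")     # adds column flag
    rev_data = with_groups.matchmake("See [[Main Page|home]].", rev_data)
    rev_data = no_groups.matchmake("Claim.{{citation needed}}", rev_data)
    return rev_data  # -> link_target='Main Page', flag='{{citation needed}}'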
217 We used to use a dictionary to collect fields for the output.
218 Now we use dataclasses. Compared to a dictionary, this should help:
220 - make it easier to output parquet data.
221 - use class attribute '.' syntax instead of dictionary syntax.
222 - improve support for tooling (autocomplete, type hints)
223 - use type information to define formatting rules
225 Depending on the parameters passed into Wikiq, the output schema can be different.
226 Therefore, we need to end up constructing a dataclass with the correct output schema.
227 It also needs to have the correct pyarrow schema so we can write parquet files.
229 The RevDataBase type has all the fields that will be output no matter how wikiq is invoked.
240 text_chars: int = None
242 reverteds: list[int] = None
248 # toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation
251 # defines pyarrow schema.
252 # each field in the data class needs an entry in this array.
253 # the names should match and be in the same order.
254 # this isn't a dataclass field since it doesn't have a type annotation
256 pa.field("revid", pa.int64()),
257 pa.field("date_time", pa.timestamp('ms')),
258 pa.field("articleid",pa.int64()),
259 pa.field("editorid",pa.int64()),
260 pa.field("title",pa.string()),
261 pa.field("namespace",pa.int32()),
262 pa.field("deleted",pa.bool_()),
263 pa.field("text_chars",pa.int32()),
264 pa.field("revert",pa.bool_()),
265 pa.field("reverteds",pa.list_(pa.int64())),
266 pa.field("sha1",pa.string()),
267 pa.field("minor",pa.bool_()),
268 pa.field("editor",pa.string()),
269 pa.field("anon",pa.bool_()),
272 # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function
273 def to_pyarrow(self):
274 return dc.astuple(self)
276 # logic to convert each field into the wikiq tsv format goes here.
277 def to_tsv_row(self):
280 for f in dc.fields(self):
281 val = getattr(self, f.name)
282 if getattr(self, f.name) is None:
285 row.append("TRUE" if val else "FALSE")
287 elif f.type == datetime:
288 row.append(val.strftime('%Y-%m-%d %H:%M:%S'))
290 elif f.name in {'editor','title'}:
292 if self.urlencode and f.name in TO_ENCODE:
293 row.append(quote(str(s)))
297 elif f.type == list[int]:
298 row.append('"' + ",".join([str(x) for x in val]) + '"')
301 if self.urlencode and f.name in TO_ENCODE:
302 row.append(quote(str(val)))
308 return '\t'.join(map(str,row))
310 def header_row(self):
311 return '\t'.join(map(lambda f: f.name, dc.fields(self)))
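# Hedged illustration of the per-type formatting rules in to_tsv_row()
# (values are made up):
#   bool      -> "TRUE" / "FALSE"
#   datetime  -> "2008-01-01 12:00:00"      (strftime('%Y-%m-%d %H:%M:%S'))
#   list[int] -> '"101,102,103"'            (quoted, comma-joined)
#   editor/title are passed through urllib.parse.quote when --url-encode is on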
315 If collapse_user=True we'll use a RevDataCollapse dataclass.
316 This class inherits from RevDataBase. This means that it has all the same fields and functions.
318 It just adds a new field and updates the pyarrow schema.
322 class RevDataCollapse(RevDataBase):
323 collapsed_revs:int = None
325 pa_collapsed_revs_schema = pa.field('collapsed_revs',pa.int64())
326 pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
330 If persistence data is to be computed we'll need the fields added by RevDataPersistence.
334 class RevDataPersistence(RevDataBase):
335 token_revs:int = None
336 tokens_added:int = None
337 tokens_removed:int = None
338 tokens_window:int = None
340 pa_persistence_schema_fields = [
341 pa.field("token_revs", pa.int64()),
342 pa.field("tokens_added", pa.int64()),
343 pa.field("tokens_removed", pa.int64()),
344 pa.field("tokens_window", pa.int64())]
346 pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
349 The RevDataCollapsePersistence class uses multiple inheritance to make a class that has both persistence and collapse fields.
353 class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
354 pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields
359 def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15, output_parquet=True, parquet_buffer_size=2000, siteinfo_file=None):
362 persist : what persistence method to use. Takes a PersistMethod value
364 self.input_file = input_file
366 self.collapse_user = collapse_user
367 self.persist = persist
369 self.urlencode = urlencode
370 self.revert_radius = revert_radius
372 if namespaces is not None:
373 self.namespace_filter = set(namespaces)
375 self.namespace_filter = None
377 self.regex_schemas = []
378 self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
379 self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
381 if siteinfo_file is not None:
382 siteinfo = open_siteinfo(siteinfo_file)
383 siteinfo = json.loads(siteinfo.read())
385 magicwords = siteinfo.get('query').get('magicwords')
388 redirect_config = list(filter(lambda obj: obj.get("name") == "redirect", magicwords))
389 redirect_aliases = chain(* map(lambda obj: obj.get("aliases"), redirect_config))
390 redirect_aliases = list(map(lambda s: s.lstrip('#'), redirect_aliases))
391 redirect_aliases.append('REDIRECT') # just in case
392 pattern = '(?:' + '|'.join(redirect_aliases) + ')'
393 redirect_regex = re.compile(r'\s*#{pattern}\s*:?\s*\[\[(.+?)(?:\|.*?)?\]\]'
394 .format(pattern=pattern), re.IGNORECASE | re.DOTALL)
396 self.regex_revision_pairs.extend(self.make_matchmake_pairs([redirect_regex], ["redirect"]))
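# Hedged illustration (made-up aliases): with redirect aliases like
# ['REDIRECT', 'WEITERLEITUNG'] the compiled pattern matches revision text such as
#   "#REDIRECT [[Target page|label]]"
# and captures "Target page", which the extra RegexPair above exposes as a
# "redirect" column in the output.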
398 # This is where we set the type for revdata.
400 if self.collapse_user is True:
401 if self.persist == PersistMethod.none:
402 revdata_type = RevDataCollapse
404 revdata_type = RevDataCollapsePersistence
405 elif self.persist != PersistMethod.none:
406 revdata_type = RevDataPersistence
408 revdata_type = RevDataBase
410 # if there are regex fields, we need to add them to the revdata type.
411 regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas]
413 # make_dataclass is a function that defines a new dataclass type.
414 # here we extend the type we have already chosen and add the regular expression types
415 self.revdata_type = dc.make_dataclass('RevData_Parser',
417 bases=(revdata_type,))
419 # we also need to make sure that we have the right pyarrow schema
420 self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
422 self.revdata_type.urlencode = self.urlencode
424 self.schema = pa.schema(self.revdata_type.pa_schema_fields)
426 # here we initialize the variables we need for output.
427 if output_parquet is True:
428 self.output_parquet = True
429 self.pq_writer = None
430 self.output_file = output_file
431 self.parquet_buffer = []
432 self.parquet_buffer_size = parquet_buffer_size
434 self.print_header = True
435 if output_file == sys.stdout:
437 self.output_file = output_file
439 self.output_file = open(output_file,'w')
440 self.output_parquet = False
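# Hedged usage sketch (hypothetical paths; the keyword values are illustrative and
# mirror what the command-line entry point passes):
#
#   parser = WikiqParser(open_input_file("dump.xml.bz2"),
#                        "dump.parquet",
#                        regex_match_revision=[r"\{\{citation needed\}\}"],
#                        regex_match_comment=None,
#                        regex_revision_label=["cn"],
#                        regex_comment_label=None,
#                        persist=PersistMethod.none,
#                        namespaces=[0],
#                        output_parquet=True)
#
# followed by a call to the main parsing loop defined below.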
442 def make_matchmake_pairs(self, patterns, labels):
443 if (patterns is not None and labels is not None) and \
444 (len(patterns) == len(labels)):
446 for pattern, label in zip(patterns, labels):
447 rp = RegexPair(pattern, label)
449 self.regex_schemas = self.regex_schemas + rp.get_pyarrow_fields()
451 elif (patterns is None and labels is None):
454 sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
456 def matchmake_revision(self, rev, rev_data):
457 rev_data = self.matchmake_text(rev.text, rev_data)
458 rev_data = self.matchmake_comment(rev.comment, rev_data)
461 def matchmake_text(self, text, rev_data):
462 return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
464 def matchmake_comment(self, comment, rev_data):
465 return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
467 def matchmake_pairs(self, text, rev_data, pairs):
469 rev_data = pair.matchmake(text, rev_data)
472 def __get_namespace_from_title(self, title):
475 for ns in self.namespaces:
476 # skip if the namespace is not defined
478 default_ns = self.namespaces[ns]
481 if title.startswith(ns + ":"):
482 return self.namespaces[ns]
484 # if we've made it this far with no matches, we return the default namespace
490 # create a regex that creates the output filename
491 # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
492 # r'output/wikiq-\1-\2.tsv',
495 # Construct dump file iterator
496 dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
498 # extract the list of namespaces
499 self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}
504 # Iterate through pages
506 namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)
508 # skip namespaces not in the filter
509 if self.namespace_filter is not None:
510 if namespace not in self.namespace_filter:
513 rev_detector = mwreverts.Detector(radius = self.revert_radius)
515 if self.persist != PersistMethod.none:
516 window = deque(maxlen=PERSISTENCE_RADIUS)
518 if self.persist == PersistMethod.sequence:
519 state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
520 revert_radius=PERSISTENCE_RADIUS)
522 elif self.persist == PersistMethod.segment:
523 state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
524 revert_radius=PERSISTENCE_RADIUS)
526 # self.persist == PersistMethod.legacy
528 from mw.lib import persistence
529 state = persistence.State()
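# Hedged summary of the window logic below: each revision is appended to a deque
# of maxlen PERSISTENCE_RADIUS along with its token diff. Once the deque is full,
# the oldest revision has been followed by PERSISTENCE_RADIUS-1 later revisions in
# which its tokens could persist, so its persistence statistics are emitted; any
# revisions still in the window at the end of the page are emitted with a smaller
# tokens_window.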
531 # Iterate through a page's revisions
534 # create a new data object instead of a dictionary.
535 rev_data = self.revdata_type(revid = rev.id,
536 date_time = datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
538 editorid = "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
540 deleted = rev.deleted.text,
541 namespace = namespace
544 rev_data = self.matchmake_revision(rev, rev_data)
546 if not rev.deleted.text:
547 # rev.text can be None if the page has no text
550 # if text exists, use the sha1 provided by the dump and generate one otherwise
555 text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
557 rev_data.sha1 = text_sha1
559 # TODO rev.bytes doesn't work.. looks like a bug
560 rev_data.text_chars = len(rev.text)
562 # generate revert data
563 revert = rev_detector.process(text_sha1, rev.id)
566 rev_data.revert = True
567 rev_data.reverteds = revert.reverteds
569 rev_data.revert = False
571 # if the fact that the edit was minor can be hidden, this might be an issue
572 rev_data.minor = rev.minor
574 if not rev.deleted.user:
575 # wrap user-defined editors in quotes for fread
576 rev_data.editor = rev.user.text
577 rev_data.anon = rev.user.id is None
579 #TODO missing: additions_size deletions_size
581 # if collapse_user was on, record the number of collapsed revisions
582 if self.collapse_user:
583 rev_data.collapsed_revs = rev.collapsed_revs
586 if self.persist != PersistMethod.none:
587 if not rev.deleted.text:
589 if self.persist != PersistMethod.legacy:
590 _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
593 _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
595 window.append((rev.id, rev_data, tokens_added, tokens_removed))
597 if len(window) == PERSISTENCE_RADIUS:
598 old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
600 num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
602 old_rev_data.token_revs = num_token_revs
603 old_rev_data.tokens_added = num_tokens
604 old_rev_data.tokens_removed = len(old_tokens_removed)
605 old_rev_data.tokens_window = PERSISTENCE_RADIUS-1
607 self.print_rev_data(old_rev_data)
610 self.print_rev_data(rev_data)
614 if self.persist != PersistMethod.none:
615 # print out metadata for the last RADIUS revisions
616 for i, item in enumerate(window):
617 # if the window was full, we've already printed item 0
618 if len(window) == PERSISTENCE_RADIUS and i == 0:
621 rev_id, rev_data, tokens_added, tokens_removed = item
622 num_token_revs, num_tokens = calculate_persistence(tokens_added)
624 rev_data.token_revs = num_token_revs
625 rev_data.tokens_added = num_tokens
626 rev_data.tokens_removed = len(tokens_removed)
627 rev_data.tokens_window = len(window)-(i+1)
628 self.print_rev_data(rev_data)
632 print("Done: %s revisions and %s pages." % (rev_count, page_count),
635 # remember to flush the parquet_buffer if we're done
636 if self.output_parquet is True:
637 self.flush_parquet_buffer()
638 self.pq_writer.close()
641 self.output_file.close()
645 For performance reasons it's better to write parquet in batches instead of one row at a time.
646 So this function just puts the data on a buffer. If the buffer is full, then it gets flushed (written).
648 def write_parquet_row(self, rev_data):
649 padata = rev_data.to_pyarrow()
650 self.parquet_buffer.append(padata)
652 if len(self.parquet_buffer) >= self.parquet_buffer_size:
653 self.flush_parquet_buffer()
657 Function that actually writes data to the parquet file.
658 It needs to transpose the data from row-by-row to column-by-column
660 def flush_parquet_buffer(self):
663 Returns the pyarrow table that we'll write
665 def rows_to_table(rg, schema):
672 for j in range(len(cols)):
673 cols[j].append(row[j])
676 for col, typ in zip(cols, schema.types):
677 arrays.append(pa.array(col, typ))
678 return pa.Table.from_arrays(arrays, schema=schema)
680 outtable = rows_to_table(self.parquet_buffer, self.schema)
681 if self.pq_writer is None:
682 self.pq_writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark')
684 self.pq_writer.write_table(outtable)
685 self.parquet_buffer = []
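# Hedged illustration of the transposition above (made-up values): three buffered
# rows (1, "a"), (2, "b"), (3, "c") become two column arrays,
#   pa.array([1, 2, 3], pa.int64()) and pa.array(["a", "b", "c"], pa.string()),
# which pa.Table.from_arrays() assembles into one table written per flush.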
687 # depending on whether we are configured to write tsv or parquet, we call a different function.
688 def print_rev_data(self, rev_data):
689 if self.output_parquet is False:
690 printfunc = self.write_tsv_row
692 printfunc = self.write_parquet_row
696 def write_tsv_row(self, rev_data):
697 if self.print_header:
698 print(rev_data.header_row(), file=self.output_file)
699 self.print_header = False
701 line = rev_data.to_tsv_row()
702 print(line, file=self.output_file)
704 def open_siteinfo(siteinfo_file):
705 if re.match(r'.*\.7z$', siteinfo_file):
706 cmd = ["7za", "x", "-so", siteinfo_file, "*.json"]
707 elif re.match(r'.*\.gz$', siteinfo_file):
708 cmd = ["zcat", siteinfo_file]
709 elif re.match(r'.*\.bz2$', siteinfo_file):
710 cmd = ["bzcat", "-dk", siteinfo_file]
713 input_file = Popen(cmd, stdout=PIPE).stdout
715 input_file = open(siteinfo_file, 'r')
720 def open_input_file(input_filename):
721 if re.match(r'.*\.7z$', input_filename):
722 cmd = ["7za", "x", "-so", input_filename, "*.xml"]
723 elif re.match(r'.*\.gz$', input_filename):
724 cmd = ["zcat", input_filename]
725 elif re.match(r'.*\.bz2$', input_filename):
726 cmd = ["bzcat", "-dk", input_filename]
729 input_file = Popen(cmd, stdout=PIPE).stdout
731 input_file = open(input_filename, 'r')
735 def get_output_filename(input_filename, parquet = False):
736 output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
737 output_filename = re.sub(r'\.xml', '', output_filename)
739 output_filename = output_filename + ".tsv"
741 output_filename = output_filename + ".parquet"
742 return output_filename
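# Hedged examples of the rewriting above (hypothetical filenames):
#   get_output_filename("enwiki-20200101-pages-meta-history1.xml.bz2")
#       -> "enwiki-20200101-pages-meta-history1.tsv"
#   get_output_filename("dump.xml.gz", parquet=True)
#       -> "dump.parquet"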
744 def open_output_file(input_filename):
745 # construct the output filename and open the file for writing
746 output_filename = get_output_filename(input_filename, parquet = False)
747 output_file = open(output_filename, "w")
750 parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab-delimited data.')
752 # arguments for the input direction
753 parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
754 help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")
756 parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
757 help="Directory for output files. If it ends with .parquet output will be in parquet format.")
759 parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
760 help="Write output to standard out (do not create dump file)")
762 parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
763 help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
765 parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
766 help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
768 parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
769 help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
771 parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
772 help="Id number of namspace to include. Can be specified more than once.")
774 parser.add_argument('-rr',
776 dest="revert_radius",
780 help="Number of edits to check when looking for reverts (default: 15)")
782 parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
783 help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
785 parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
786 help="The label for the outputted column based on matching the regex in revision text.")
788 parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
789 help="The regular expression to search for in comments of revisions.")
791 parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
792 help="The label for the outputted column based on matching the regex in comments.")
794 parser.add_argument('--SI', '--siteinfo', dest="siteinfo", default=None, type=str,
795 help="Path to archive containing siteinfo json. This is required for resolving redirects")
799 args = parser.parse_args()
803 # set persistence method
805 if args.persist is None:
806 persist = PersistMethod.none
807 elif args.persist == "segment":
808 persist = PersistMethod.segment
809 elif args.persist == "legacy":
810 persist = PersistMethod.legacy
812 persist = PersistMethod.sequence
814 if args.namespace_filter is not None:
815 namespaces = args.namespace_filter
819 if len(args.dumpfiles) > 0:
820 output_parquet = False
821 for filename in args.dumpfiles:
822 input_file = open_input_file(filename)
824 # open directory for output
826 output_dir = args.output_dir[0]
830 if output_dir.endswith(".parquet"):
831 output_parquet = True
833 print("Processing file: %s" % filename, file=sys.stderr)
836 output_file = sys.stdout
838 filename = os.path.join(output_dir, os.path.basename(filename))
839 output_file = get_output_filename(filename, parquet = output_parquet)
842 wikiq = WikiqParser(input_file,
844 collapse_user=args.collapse_user,
846 urlencode=args.urlencode,
847 namespaces=namespaces,
848 revert_radius=args.revert_radius,
849 regex_match_revision = args.regex_match_revision,
850 regex_revision_label = args.regex_revision_label,
851 regex_match_comment = args.regex_match_comment,
852 regex_comment_label = args.regex_comment_label,
853 output_parquet=output_parquet,
854 siteinfo_file = args.siteinfo)
862 wikiq = WikiqParser(sys.stdin,
864 collapse_user=args.collapse_user,
866 #persist_legacy=args.persist_legacy,
867 urlencode=args.urlencode,
868 namespaces=namespaces,
869 revert_radius=args.revert_radius,
870 regex_match_revision = args.regex_match_revision,
871 regex_revision_label = args.regex_revision_label,
872 regex_match_comment = args.regex_match_comment,
873 regex_comment_label = args.regex_comment_label,
874 siteinfo_file = args.siteinfo)
878 # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
879 # stop_words = stop_words.split(",")