# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import argparse
import sys
import os.path
import re
import json
from datetime import datetime, timezone
from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump

from deltas.tokenizers import wikitext_split
import mwpersistence
import mwreverts
from urllib.parse import quote

TO_ENCODE = ('title', 'editor')

from deltas import SequenceMatcher
from deltas import SegmentMatcher

import dataclasses as dc
from dataclasses import dataclass

import pyarrow as pa
import pyarrow.parquet as pq
from itertools import chain
def calculate_persistence(tokens_added):
    return (sum([(len(x.revisions) - 1) for x in tokens_added]),
            len(tokens_added))
class WikiqIterator:
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = {ns.id: ns.name for ns in
                              self.mwiterator.site_info.namespaces}
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map=self.namespace_map,
                            collapse_user=self.collapse_user)

    def __next__(self):
        return next(self.__pages)
class WikiqPage:
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            self.title = page.title
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.__revisions = self.rev_list()
    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs
            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    # also yield if we don't know who the user is
                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    elif rev.user.text != prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                else:
                    # if collapse_user is false, we always yield
                    yield prev_rev
            prev_rev = rev

        # also yield the final time
        yield prev_rev
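        # Example (illustrative): for the edit sequence A, A, B with
        # collapse_user=True, rev_list yields A's second edit
        # (collapsed_revs=2) and then B's edit (collapsed_revs=1).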
    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
"""
A RegexPair is defined by a regular expression (pattern) and a label.
The pattern can include capture groups. If it does, then each capture group will have a resulting column in the output.
If the pattern does not include a capture group, then only one output column will result.
"""
class RegexPair(object):
    def __init__(self, pattern, label):
        self.pattern = pattern
        self.label = label

        if isinstance(self.pattern, str):
            self.pattern = re.compile(pattern)

        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def get_pyarrow_fields(self):
        if self.has_groups:
            fields = [pa.field(self._make_key(cap_group), pa.list_(pa.string()))
                      for cap_group in self.capture_groups]
        else:
            fields = [pa.field(self.label, pa.list_(pa.string()))]
        return fields

    def _make_key(self, cap_group):
        return "{}_{}".format(self.label, cap_group)
    def matchmake(self, content, rev_data):
        temp_dict = {}

        # if there are named capture groups in the regex
        if self.has_groups:

            # if there are matches of some sort in this revision content, fill the lists for each cap_group
            if self.pattern.search(content) is not None:
                m = self.pattern.finditer(content)
                matchobjects = list(m)

                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_list = []
                    for match in matchobjects:
                        # we only want to add the match for the capture group if the match is not None
                        if match.group(cap_group) is not None:
                            temp_list.append(match.group(cap_group))

                    # if temp_list of matches is empty just make that column None
                    if len(temp_list) == 0:
                        temp_dict[key] = None
                    # else we put in the list we made in the for-loop above
                    else:
                        temp_dict[key] = ', '.join(temp_list)

            # there are no matches at all in this revision content, we default values to None
            else:
                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_dict[key] = None

        # there are no capture groups, we just search for all the matches of the regex
        else:
            # given that there are matches to be made
            if isinstance(content, (str, bytes)):
                if self.pattern.search(content) is not None:
                    m = self.pattern.findall(content)
                    temp_dict[self.label] = m
                else:
                    temp_dict[self.label] = None

        # update rev_data with our new columns
        for k, v in temp_dict.items():
            setattr(rev_data, k, v)

        return rev_data
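
# Example (an illustrative sketch, not called anywhere in wikiq): a named
# capture group yields a "<label>_<group>" column whose value is the
# comma-joined list of matches.
#
#   from types import SimpleNamespace
#   rp = RegexPair(r"\[\[(?P<target>[^\]|]+)", "wikilink")
#   rd = SimpleNamespace()                 # stand-in for a RevData instance
#   rp.matchmake("See [[Foo]] and [[Bar|bar]].", rd)
#   rd.wikilink_target                     # -> 'Foo, Bar'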
"""
We used to use a dictionary to collect fields for the output.
Now we use dataclasses. Compared to a dictionary, this should help:
- make it easier to output parquet data.
- use class attribute '.' syntax instead of dictionary syntax.
- improve support for tooling (autocomplete, type hints)
- use type information to define formatting rules

Depending on the parameters passed into Wikiq, the output schema can be different.
Therefore, we need to end up constructing a dataclass with the correct output schema.
It also needs to have the correct pyarrow schema so we can write parquet files.

The RevDataBase type has all the fields that will be output no matter how wikiq is invoked.
"""
@dataclass
class RevDataBase:
    revid: int
    date_time: datetime
    articleid: int
    editorid: int
    title: str
    namespace: int
    deleted: bool
    text_chars: int = None
    revert: bool = None
    reverteds: list[int] = None
    sha1: str = None
    minor: bool = None
    editor: str = None
    anon: bool = None

    # toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation
    urlencode = False

    # defines pyarrow schema.
    # each field in the data class needs an entry in this array.
    # the names should match and be in the same order.
    # this isn't a dataclass field since it doesn't have a type annotation
    pa_schema_fields = [
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('ms')),
        pa.field("articleid", pa.int64()),
        pa.field("editorid", pa.int64()),
        pa.field("title", pa.string()),
        pa.field("namespace", pa.int32()),
        pa.field("deleted", pa.bool_()),
        pa.field("text_chars", pa.int32()),
        pa.field("revert", pa.bool_()),
        pa.field("reverteds", pa.list_(pa.int64())),
        pa.field("sha1", pa.string()),
        pa.field("minor", pa.bool_()),
        pa.field("editor", pa.string()),
        pa.field("anon", pa.bool_()),
    ]
    # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function
    def to_pyarrow(self):
        return dc.astuple(self)
    # logic to convert each field into the wikiq tsv format goes here.
    def to_tsv_row(self):
        row = []
        for f in dc.fields(self):
            val = getattr(self, f.name)
            if val is None:
                row.append("")
            elif f.type == bool:
                row.append("TRUE" if val else "FALSE")
            elif f.type == datetime:
                row.append(val.strftime('%Y-%m-%d %H:%M:%S'))
            elif f.name in {'editor', 'title'}:
                s = '"' + val + '"'
                if self.urlencode and f.name in TO_ENCODE:
                    row.append(quote(str(val)))
                else:
                    row.append(s)
            elif f.type == list[int]:
                row.append('"' + ",".join([str(x) for x in val]) + '"')
            elif f.type == list[str]:
                row.append('"' + ",".join(val) + '"')
            else:
                if self.urlencode and f.name in TO_ENCODE:
                    row.append(quote(str(val)))
                else:
                    row.append(val)
        return '\t'.join(map(str, row))
    def header_row(self):
        return '\t'.join(map(lambda f: f.name, dc.fields(self)))
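
    # Example (illustrative): under the rules above, a False bool renders as
    # 'FALSE', a datetime as '2008-01-01 12:00:00', and reverteds=[1, 2] as
    # '"1,2"' in the tsv row.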
"""
If collapse=True we'll use a RevDataCollapse dataclass.
This class inherits from RevDataBase. This means that it has all the same fields and functions.

It just adds a new field and updates the pyarrow schema.
"""
@dataclass
class RevDataCollapse(RevDataBase):
    collapsed_revs: int = None

    pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
    pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
"""
If persistence data is to be computed we'll need the fields added by RevDataPersistence.
"""
@dataclass
class RevDataPersistence(RevDataBase):
    token_revs: int = None
    tokens_added: int = None
    tokens_removed: int = None
    tokens_window: int = None

    pa_persistence_schema_fields = [
        pa.field("token_revs", pa.int64()),
        pa.field("tokens_added", pa.int64()),
        pa.field("tokens_removed", pa.int64()),
        pa.field("tokens_window", pa.int64())]

    pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
"""
RevDataCollapsePersistence uses multiple inheritance to make a class that has both persistence and collapse fields.
"""
@dataclass
class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
    pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields
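
# Note (an assumption based on documented dataclass semantics): dc.fields()
# collects inherited fields in reverse-MRO order (RevDataBase, then
# RevDataPersistence, then RevDataCollapse), so astuple()/to_pyarrow() emits
# the persistence fields before collapsed_revs:
#
#   [f.name for f in dc.fields(RevDataCollapsePersistence)]
#   # -> [..., 'token_revs', ..., 'tokens_window', 'collapsed_revs']
#
# pa_schema_fields must list fields in that same order for parquet columns
# to line up with to_pyarrow() rows.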
class WikiqParser:
    def __init__(self, input_file, output_file,
                 regex_match_revision, regex_match_comment,
                 regex_revision_label, regex_comment_label,
                 collapse_user=False, persist=None, urlencode=False,
                 namespaces=None, revert_radius=15,
                 output_parquet=True, parquet_buffer_size=2000,
                 siteinfo_file=None):
        """
        persist : what persistence method to use. Takes a PersistMethod value.
        """
        self.input_file = input_file

        self.collapse_user = collapse_user
        self.persist = persist
        self.urlencode = urlencode
        self.revert_radius = revert_radius

        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        self.regex_schemas = []
        self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
        self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
        if siteinfo_file is not None:
            siteinfo = open_siteinfo(siteinfo_file)
            siteinfo = json.loads(siteinfo.read())

            magicwords = siteinfo.get('query').get('magicwords')

            redirect_config = list(filter(lambda obj: obj.get("name") == "redirect", magicwords))
            redirect_aliases = chain(*map(lambda obj: obj.get("aliases"), redirect_config))
            redirect_aliases = list(map(lambda s: s.lstrip('#'), redirect_aliases))
            redirect_aliases.append('REDIRECT')  # just in case

            # this regular expression is copied from pywikibot
            pattern = '(?:' + '|'.join(redirect_aliases) + ')'
            redirect_regex = re.compile(r'\s*#{pattern}\s*:?\s*\[\[(.+?)(?:\|.*?)?\]\]'
                                        .format(pattern=pattern), re.IGNORECASE | re.DOTALL)

            self.regex_revision_pairs.extend(self.make_matchmake_pairs([redirect_regex], ["redirect"]))
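
            # Example (illustrative): with aliases ['REDIRECT'], the compiled
            # regex matches '#REDIRECT [[Main Page]]' and captures 'Main Page';
            # for '#redirect [[A|label]]' it captures 'A', since the
            # non-capturing '(?:\|.*?)?' swallows the '|label' part.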
        # This is where we set the type for revdata.
        if self.collapse_user is True:
            if self.persist == PersistMethod.none:
                revdata_type = RevDataCollapse
            else:
                revdata_type = RevDataCollapsePersistence
        elif self.persist != PersistMethod.none:
            revdata_type = RevDataPersistence
        else:
            revdata_type = RevDataBase
        # if there are regex fields, we need to add them to the revdata type.
        regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas]

        # make_dataclass is a function that defines a new dataclass type.
        # here we extend the type we have already chosen and add the regular expression types
        self.revdata_type = dc.make_dataclass('RevData_Parser',
                                              fields=regex_fields,
                                              bases=(revdata_type,))

        # we also need to make sure that we have the right pyarrow schema
        self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas

        self.revdata_type.urlencode = self.urlencode

        self.schema = pa.schema(self.revdata_type.pa_schema_fields)
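
        # Example (illustrative): invoking wikiq with -RP '\bfoo\b' -RPl foo
        # adds a "foo: list[str] = None" field to RevData_Parser and a
        # matching list<string> column named 'foo' to self.schema.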
        # here we initialize the variables we need for output.
        if output_parquet is True:
            self.output_parquet = True
            self.pq_writer = None
            self.output_file = output_file
            self.parquet_buffer = []
            self.parquet_buffer_size = parquet_buffer_size
        else:
            self.print_header = True
            if output_file == sys.stdout:
                self.output_file = output_file
            else:
                self.output_file = open(output_file, 'w')
            self.output_parquet = False
    def make_matchmake_pairs(self, patterns, labels):
        if (patterns is not None and labels is not None) and \
           (len(patterns) == len(labels)):
            result = []
            for pattern, label in zip(patterns, labels):
                rp = RegexPair(pattern, label)
                result.append(rp)
                self.regex_schemas = self.regex_schemas + rp.get_pyarrow_fields()
            return result
        elif (patterns is None and labels is None):
            return []
        else:
            sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
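
    # Example (illustrative):
    #   self.make_matchmake_pairs([r'foo', r'(?P<n>\d+)'], ['f', 'num'])
    # builds two RegexPairs whose output columns are 'f' and 'num_n'.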
    def matchmake_revision(self, rev, rev_data):
        rev_data = self.matchmake_text(rev.text, rev_data)
        rev_data = self.matchmake_comment(rev.comment, rev_data)
        return rev_data

    def matchmake_text(self, text, rev_data):
        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)

    def matchmake_comment(self, comment, rev_data):
        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)

    def matchmake_pairs(self, text, rev_data, pairs):
        for pair in pairs:
            rev_data = pair.matchmake(text, rev_data)
        return rev_data
    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns
    def process(self):
        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                          r'output/wikiq-\1-\2.tsv',
        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces}
        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self.namespace_filter is not None:
                if namespace not in self.namespace_filter:
                    continue
            rev_detector = mwreverts.Detector(radius=self.revert_radius)

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                else:
                    # self.persist == PersistMethod.legacy
                    from mw.lib import persistence
                    state = persistence.State()
            # Iterate through a page's revisions
            for rev in page:
                # create a new data object instead of a dictionary.
                rev_data = self.revdata_type(revid=rev.id,
                                             date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
                                             articleid=page.id,
                                             editorid=None if rev.deleted.user or rev.user.id is None else rev.user.id,
                                             title=page.title,
                                             deleted=rev.deleted.text,
                                             namespace=namespace
                                             )
                rev_data = self.matchmake_revision(rev, rev_data)
                if not rev.deleted.text:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""
                    # if text exists, we'll check for a sha1 and generate one otherwise
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data.sha1 = text_sha1

                    # TODO rev.bytes doesn't work... looks like a bug
                    rev_data.text_chars = len(rev.text)
                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data.revert = True
                        rev_data.reverteds = revert.reverteds
                    else:
                        rev_data.revert = False

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data.minor = rev.minor
                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data.editor = rev.user.text
                    rev_data.anon = rev.user.id is None
                # TODO missing: additions_size deletions_size

                # if collapse user was on, let's run that
                if self.collapse_user:
                    rev_data.collapsed_revs = rev.collapsed_revs
                if self.persist != PersistMethod.none:
                    if not rev.deleted.text:

                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data.token_revs = num_token_revs
                            old_rev_data.tokens_added = num_tokens
                            old_rev_data.tokens_removed = len(old_tokens_removed)
                            old_rev_data.tokens_window = PERSISTENCE_RADIUS - 1

                            self.print_rev_data(old_rev_data)
                else:
                    self.print_rev_data(rev_data)
            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data.token_revs = num_token_revs
                    rev_data.tokens_added = num_tokens
                    rev_data.tokens_removed = len(tokens_removed)
                    rev_data.tokens_window = len(window) - (i + 1)
                    self.print_rev_data(rev_data)
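
            # Example (illustrative, assuming PERSISTENCE_RADIUS is 7): a page
            # with only 3 revisions never fills the window, so the rows flushed
            # here get tokens_window values 2, 1, and 0.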
        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)
        # remember to flush the parquet_buffer if we're done
        if self.output_parquet is True:
            self.flush_parquet_buffer()
            self.pq_writer.close()
        else:
            self.output_file.close()
    """
    For performance reasons it's better to write parquet in batches instead of one row at a time.
    So this function just puts the data on a buffer. If the buffer is full, then it gets flushed (written).
    """
    def write_parquet_row(self, rev_data):
        padata = rev_data.to_pyarrow()
        self.parquet_buffer.append(padata)

        if len(self.parquet_buffer) >= self.parquet_buffer_size:
            self.flush_parquet_buffer()
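
    # Example (illustrative): with the default parquet_buffer_size of 2000, a
    # stream of 5000 revisions triggers two flushes here and leaves 1000
    # buffered rows for the final flush in process().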
    """
    Function that actually writes data to the parquet file.
    It needs to transpose the data from row-by-row to column-by-column.
    """
    def flush_parquet_buffer(self):
        """
        Builds the pyarrow table from the buffered rows and writes it out.
        """
        def rows_to_table(rg, schema):
            cols = []
            first = rg[0]
            for col in first:
                cols.append([col])

            for row in rg[1:]:
                for j in range(len(cols)):
                    cols[j].append(row[j])

            arrays = []
            for col, typ in zip(cols, schema.types):
                arrays.append(pa.array(col, typ))
            return pa.Table.from_arrays(arrays, schema=schema)

        outtable = rows_to_table(self.parquet_buffer, self.schema)
        if self.pq_writer is None:
            self.pq_writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark')

        self.pq_writer.write_table(outtable)
        self.parquet_buffer = []
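
    # Example (illustrative): rows_to_table transposes the buffered rows
    # [(1, 'a'), (2, 'b')] under a (int64, string) schema into the column
    # arrays [1, 2] and ['a', 'b'] before building the Table.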
    # depending on whether we are configured to write tsv or parquet, we'll call a different function.
    def print_rev_data(self, rev_data):
        if self.output_parquet is False:
            printfunc = self.write_tsv_row
        else:
            printfunc = self.write_parquet_row

        printfunc(rev_data)
    def write_tsv_row(self, rev_data):
        if self.print_header:
            print(rev_data.header_row(), file=self.output_file)
            self.print_header = False

        line = rev_data.to_tsv_row()
        print(line, file=self.output_file)
def open_siteinfo(siteinfo_file):
    if re.match(r'.*\.7z$', siteinfo_file):
        cmd = ["7za", "x", "-so", siteinfo_file, "*.json"]
    elif re.match(r'.*\.gz$', siteinfo_file):
        cmd = ["zcat", siteinfo_file]
    elif re.match(r'.*\.bz2$', siteinfo_file):
        cmd = ["bzcat", "-dk", siteinfo_file]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        input_file = open(siteinfo_file, 'r')

    return input_file
def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, "*.xml"]
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        input_file = open(input_filename, 'r')

    return input_file
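
# Example (illustrative): open_input_file('dump.xml.bz2') shells out to
# 'bzcat -dk dump.xml.bz2' and reads its stdout; for a plain '.xml' path no
# cmd is bound, so the NameError fallback opens the file directly.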
def get_output_filename(input_filename, parquet=False):
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    if parquet is False:
        output_filename = output_filename + ".tsv"
    else:
        output_filename = output_filename + ".parquet"
    return output_filename
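
# Example: get_output_filename('enwiki-pages.xml.bz2', parquet=True)
# strips '.bz2' and then '.xml', returning 'enwiki-pages.parquet'.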
def open_output_file(input_filename):
    # derive the output filename from the input filename
    output_filename = get_output_filename(input_filename, parquet=False)
    output_file = open(output_filename, "w")
    return output_file
def main():
    parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab-delimited data.')

    # arguments for the input direction
    parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                        help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

    parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                        help="Directory for output files. If it ends with .parquet output will be in parquet format.")

    parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                        help="Write output to standard out (do not create dump file)")

    parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                        help="Operate only on the final revision within each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

    parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                        help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for the old behavior used in older research projects. Use -p=segment for an advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

    parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                        help="Output url-encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

    parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                        help="Id number of namespace to include. Can be specified more than once.")

    parser.add_argument('-rr',
                        '--revert-radius',
                        dest="revert_radius",
                        type=int,
                        action='store',
                        default=15,
                        help="Number of edits to check when looking for reverts (default: 15)")

    parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
                        help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")

    parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
                        help="The label for the output column based on matching the regex in revision text.")

    parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
                        help="The regular expression to search for in comments of revisions.")

    parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
                        help="The label for the output column based on matching the regex in comments.")

    parser.add_argument('--SI', '--siteinfo', dest="siteinfo", default=None, type=str,
                        help="Path to archive containing siteinfo json. This is required for resolving redirects.")
    args = parser.parse_args()

    # set persistence method
    if args.persist is None:
        persist = PersistMethod.none
    elif args.persist == "segment":
        persist = PersistMethod.segment
    elif args.persist == "legacy":
        persist = PersistMethod.legacy
    else:
        persist = PersistMethod.sequence

    if args.namespace_filter is not None:
        namespaces = args.namespace_filter
    else:
        namespaces = None
    if len(args.dumpfiles) > 0:
        output_parquet = False
        for filename in args.dumpfiles:
            input_file = open_input_file(filename)

            # open directory for output
            if args.output_dir:
                output_dir = args.output_dir[0]
            else:
                output_dir = "."

            if output_dir.endswith(".parquet"):
                output_parquet = True

            print("Processing file: %s" % filename, file=sys.stderr)

            if args.stdout:
                output_file = sys.stdout
            else:
                filename = os.path.join(output_dir, os.path.basename(filename))
                output_file = get_output_filename(filename, parquet=output_parquet)

            print(args.siteinfo, file=sys.stderr)
            wikiq = WikiqParser(input_file,
                                output_file,
                                collapse_user=args.collapse_user,
                                persist=persist,
                                urlencode=args.urlencode,
                                namespaces=namespaces,
                                revert_radius=args.revert_radius,
                                regex_match_revision=args.regex_match_revision,
                                regex_revision_label=args.regex_revision_label,
                                regex_match_comment=args.regex_match_comment,
                                regex_comment_label=args.regex_comment_label,
                                output_parquet=output_parquet,
                                siteinfo_file=args.siteinfo)

            wikiq.process()
    else:
        wikiq = WikiqParser(sys.stdin,
                            sys.stdout,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            # persist_legacy=args.persist_legacy,
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
                            regex_match_revision=args.regex_match_revision,
                            regex_revision_label=args.regex_revision_label,
                            regex_match_comment=args.regex_match_comment,
                            regex_comment_label=args.regex_comment_label,
                            siteinfo_file=args.siteinfo)

        wikiq.process()
    # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
    # stop_words = stop_words.split(",")


if __name__ == "__main__":
    main()