3 # original wikiq headers are: title articleid revid date_time anon
4 # editor editor_id minor text_size text_entropy text_md5 reversion
5 # additions_size deletions_size
11 from datetime import datetime
13 from subprocess import Popen, PIPE
14 from collections import deque
15 from hashlib import sha1
17 from mwxml import Dump
19 from deltas.tokenizers import wikitext_split
22 from urllib.parse import quote
23 TO_ENCODE = ('title', 'editor')
25 from deltas import SequenceMatcher
26 from deltas import SegmentMatcher
28 from dataclasses import dataclass
30 import pyarrow.parquet as pq
38 def calculate_persistence(tokens_added):
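    # tokens_added is a sequence of mwpersistence token objects; each carries
    # the list of revisions it appears in, so summing len(revisions)-1 counts
    # how many later revisions each added token persisted through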
    return (sum([len(x.revisions) - 1 for x in tokens_added]),
class WikiqIterator:
44 def __init__(self, fh, collapse_user=False):
46 self.collapse_user = collapse_user
47 self.mwiterator = Dump.from_file(self.fh)
48 self.namespace_map = { ns.id : ns.name for ns in
49 self.mwiterator.site_info.namespaces }
50 self.__pages = self.load_pages()
53 for page in self.mwiterator:
55 namespace_map = self.namespace_map,
56 collapse_user=self.collapse_user)
        return next(self.__pages)
65 __slots__ = ('id', 'title', 'namespace', 'redirect',
66 'restrictions', 'mwpage', '__revisions',
69 def __init__(self, page, namespace_map, collapse_user=False):
71 self.namespace = page.namespace
72 # following mwxml, we assume namespace 0 in cases where
73 # page.namespace is inconsistent with namespace_map
74 if page.namespace not in namespace_map:
75 self.title = page.title
77 if page.namespace != 0:
78 self.title = ':'.join([namespace_map[page.namespace], page.title])
80 self.title = page.title
81 self.restrictions = page.restrictions
82 self.collapse_user = collapse_user
84 self.__revisions = self.rev_list()
87 # Outline for how we want to handle collapse_user=True
88 # iteration rev.user prev_rev.user add prev_rev?
95 for i, rev in enumerate(self.mwpage):
96 # never yield the first time
98 if self.collapse_user:
100 rev.collapsed_revs = collapsed_revs
103 if self.collapse_user:
104 # yield if this is the last edit in a seq by a user and reset
                    # also yield if we don't know who the user is
107 if rev.deleted.user or prev_rev.deleted.user:
110 rev.collapsed_revs = collapsed_revs
                    elif rev.user.text != prev_rev.user.text:
115 rev.collapsed_revs = collapsed_revs
116 # otherwise, add one to the counter
119 rev.collapsed_revs = collapsed_revs
120 # if collapse_user is false, we always yield
126 # also yield the final time
130 return self.__revisions
133 return next(self.__revisions)
class RegexPair:
137 def __init__(self, pattern, label):
138 self.pattern = re.compile(pattern)
140 self.has_groups = bool(self.pattern.groupindex)
142 self.capture_groups = list(self.pattern.groupindex.keys())
144 def _make_key(self, cap_group):
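        # e.g. label "lang" with capture group "code" yields the column name "lang_code"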
145 return ("{}_{}".format(self.label, cap_group))
147 def matchmake(self, content, rev_data):
150 # if there are named capture groups in the regex
153 # if there are matches of some sort in this revision content, fill the lists for each cap_group
154 if self.pattern.search(content) is not None:
155 m = self.pattern.finditer(content)
156 matchobjects = list(m)
158 for cap_group in self.capture_groups:
159 key = self._make_key(cap_group)
161 for match in matchobjects:
162 # we only want to add the match for the capture group if the match is not None
                        if match.group(cap_group) is not None:
164 temp_list.append(match.group(cap_group))
166 # if temp_list of matches is empty just make that column None
                    if len(temp_list) == 0:
168 temp_dict[key] = None
169 # else we put in the list we made in the for-loop above
171 temp_dict[key] = ', '.join(temp_list)
173 # there are no matches at all in this revision content, we default values to None
175 for cap_group in self.capture_groups:
176 key = self._make_key(cap_group)
177 temp_dict[key] = None
179 # there are no capture groups, we just search for all the matches of the regex
            # content may be None (e.g. for deleted revisions), so only search text
            if isinstance(content, (str, bytes)):
183 if self.pattern.search(content) is not None:
184 m = self.pattern.findall(content)
185 temp_dict[self.label] = ', '.join(m)
187 temp_dict[self.label] = None
189 # update rev_data with our new columns
        for k, v in temp_dict.items():
            setattr(rev_data, k, v)
204 text_chars: int = None
206 reverteds: list[int] = None
214 pa.field("revid", pa.int64),
215 pa.field("date_time",pa.timestamp('ms')),
216 pa.field("articleid",pa.int64()),
217 pa.field("editorid",pa.int64()),
218 pa.field("title",pa.string()),
219 pa.field("namespace",pa.int32()),
220 pa.field("deleted",pa.binary()),
221 pa.field("test_chars",pa.int32()),
222 pa.field("revert",pa.binary()),
223 pa.field("reverteds",pa.list_(pa.int64())),
224 pa.field("sha1",pa.string()),
225 pa.field("minor",pa.binary()),
226 pa.field("editor",pa.string()),
227 pa.field("anon",pa.binary())
    def to_pyarrow(self):
        # rows are buffered as plain tuples; flush_parquet_buffer() turns them
        # into typed columnar arrays using pa_schema_fields
        return dataclasses.astuple(self)
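    # NOTE: keep pa_schema_fields in the same order as the dataclass fields
    # above; dataclasses.astuple() emits values in declaration order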
234 def to_tsv_row(self):
        for f in dataclasses.fields(self):
            val = getattr(self, f.name)
            if val is None:
242 row.append("TRUE" if val else "FALSE")
244 elif f.type == datetime:
245 row.append(val.strftime('%Y-%m-%d %H:%M:%S'))
247 elif f.name in {'editor','title'}:
249 if f.name in TO_ENCODE:
250 row.append(quote(str(val)))
252 elif f.type == list[int]:
253 row.append('"' + ",".join([str(x) for x in val]) + '"')
256 if f.name in TO_ENCODE:
257 row.append(quote(str(val)))
261 return '\t'.join(row)
class RevDataCollapse(RevDataBase):
    collapsed_revs: int = None
    pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
    pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
    pa_schema = pa.schema(pa_schema_fields)
class RevDataPersistence(RevDataBase):
    token_revs: int = None
    tokens_added: int = None
    tokens_removed: int = None
    tokens_window: int = None

    pa_persistence_schema_fields = [
        pa.field("token_revs", pa.int64()),
        pa.field("tokens_added", pa.int64()),
        pa.field("tokens_removed", pa.int64()),
        pa.field("tokens_window", pa.int64())]
299 pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
    pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields
    def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces=None, revert_radius=15, output_parquet=True, output_buffer_size=2000):
309 persist : what persistence method to use. Takes a PersistMethod value
311 self.input_file = input_file
313 self.collapse_user = collapse_user
314 self.persist = persist
316 self.urlencode = urlencode
317 self.revert_radius = revert_radius
319 self.output_buffer = []
320 self.output_buffer_size = output_buffer_size
322 if namespaces is not None:
323 self.namespace_filter = set(namespaces)
325 self.namespace_filter = None
327 self.regex_schemas = []
328 self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
329 self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
331 if self.collapse_user is True:
332 if self.persist == PersistMethod.none:
333 revdata_type = RevDataCollapse
335 revdata_type = RevDataCollapsePersistence
336 elif self.persist != PersistMethod.none:
337 revdata_type = RevDataPersistence
339 revdata_type = RevDataBase
        regex_fields = [(field.name, list[str], dataclasses.field(default=None))
                        for field in self.regex_schemas]
        self.revdata_type = dataclasses.make_dataclass('RevData_Parser',
                                                       fields=regex_fields,
                                                       bases=(revdata_type,))

        self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
351 if output_parquet is True:
352 self.output_parquet = True
353 self.pq_writer = None
354 self.output_file = output_file
356 self.output_file = open(output_file,'w')
359 def make_matchmake_pairs(self, patterns, labels):
360 if (patterns is not None and labels is not None) and \
361 (len(patterns) == len(labels)):
363 for pattern, label in zip(patterns, labels):
364 result.append(RegexPair(pattern, label))
365 self.regex_schemas.append(pa.field(label, pa.list_(pa.string())))
368 elif (patterns is None and labels is None):
371 sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
373 def matchmake(self, rev, rev_data):
374 rev_data = self.matchmake_revision(rev.text, rev_data)
375 rev_data = self.matchmake_comment(rev.comment, rev_data)
378 def matchmake_revision(self, text, rev_data):
379 return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
381 def matchmake_comment(self, comment, rev_data):
382 return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
384 def matchmake_pairs(self, text, rev_data, pairs):
386 rev_data = pair.matchmake(text, rev_data)
389 def __get_namespace_from_title(self, title):
392 for ns in self.namespaces:
393 # skip if the namespace is not defined
395 default_ns = self.namespaces[ns]
398 if title.startswith(ns + ":"):
399 return self.namespaces[ns]
401 # if we've made it this far with no matches, we return the default namespace
412 # Construct dump file iterator
413 dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
        # extract list of namespaces
416 self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}
422 # Iterate through pages
424 namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)
426 # skip namespaces not in the filter
427 if self.namespace_filter is not None:
428 if namespace not in self.namespace_filter:
431 rev_detector = mwreverts.Detector(radius = self.revert_radius)
433 if self.persist != PersistMethod.none:
434 window = deque(maxlen=PERSISTENCE_RADIUS)
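                # bounded buffer of recent revisions whose persistence
                # statistics are still accumulating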
436 if self.persist == PersistMethod.sequence:
437 state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
438 revert_radius=PERSISTENCE_RADIUS)
440 elif self.persist == PersistMethod.segment:
441 state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
442 revert_radius=PERSISTENCE_RADIUS)
444 # self.persist == PersistMethod.legacy
446 from mw.lib import persistence
447 state = persistence.State()
449 # Iterate through a page's revisions
452 rev_data = self.revdata_type(revid = rev.id,
453 date_time = rev.timestamp,
455 editorid = "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
457 deleted = rev.deleted.text
460 rev_data = self.matchmake(rev, rev_data)
462 if not rev.deleted.text:
463 # rev.text can be None if the page has no text
466 # if text exists, we'll check for a sha1 and generate one otherwise
472 text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
474 rev_data.sha1 = text_sha1
476 # TODO rev.bytes doesn't work.. looks like a bug
477 rev_data.text_chars = len(rev.text)
479 # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)
                    rev_data.revert = revert is not None
483 rev_data.reverteds = revert.reverteds
485 # if the fact that the edit was minor can be hidden, this might be an issue
486 rev_data.minor = rev.minor
488 if not rev.deleted.user:
489 # wrap user-defined editors in quotes for fread
490 rev_data.editor = rev.user.text
                    rev_data.anon = rev.user.id is None
493 #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
498 #TODO missing: additions_size deletions_size
                # if collapse_user was on, record the collapsed revision count
501 if self.collapse_user:
502 rev_data.collapsed_revs = rev.collapsed_revs
504 if self.persist != PersistMethod.none:
506 if not rev.deleted.text:
508 if self.persist != PersistMethod.legacy:
509 _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
512 _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
514 window.append((rev.id, rev_data, tokens_added, tokens_removed))
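                        # once the window is full, the oldest revision in it has been
                        # seen by PERSISTENCE_RADIUS-1 later revisions, so its
                        # persistence statistics are final and it can be printed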
516 if len(window) == PERSISTENCE_RADIUS:
517 old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
519 num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
                            old_rev_data.token_revs = num_token_revs
                            old_rev_data.tokens_added = num_tokens
                            old_rev_data.tokens_removed = len(old_tokens_removed)
                            old_rev_data.tokens_window = PERSISTENCE_RADIUS - 1

                            self.print_rev_data(old_rev_data)
529 self.print_rev_data(rev_data)
533 if self.persist != PersistMethod.none:
534 # print out metadata for the last RADIUS revisions
535 for i, item in enumerate(window):
536 # if the window was full, we've already printed item 0
537 if len(window) == PERSISTENCE_RADIUS and i == 0:
540 rev_id, rev_data, tokens_added, tokens_removed = item
541 num_token_revs, num_tokens = calculate_persistence(tokens_added)
543 rev_data.token_revs = num_token_revs
544 rev_data.tokens_added = num_tokens
545 rev_data.tokens_removed = len(tokens_removed)
546 rev_data.tokens_window = len(window)-(i+1)
547 self.print_rev_data(rev_data)
551 print("Done: %s revisions and %s pages." % (rev_count, page_count),
        if self.output_parquet is True:
            self.flush_parquet_buffer()
            if self.pq_writer is not None:
                self.pq_writer.close()
562 def write_parquet_row(self, rev_data):
563 padata = rev_data.to_pyarrow()
564 self.output_buffer.append(padata)
566 if len(self.output_buffer) >= self.output_buffer_size:
567 self.flush_parquet_buffer()
    def flush_parquet_buffer(self):
        if len(self.output_buffer) == 0:
            return
        schema = pa.schema(self.revdata_type.pa_schema_fields)
        # transpose buffered row tuples into columns, one typed array per field
        columns = list(zip(*self.output_buffer))
        arrays = [pa.array(col, type=f.type) for col, f in zip(columns, schema)]
        outtable = pa.Table.from_arrays(arrays, schema=schema)
        if self.pq_writer is None:
            self.pq_writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
        self.pq_writer.write_table(outtable)
        self.output_buffer = []
579 def print_rev_data(self, rev_data):
580 if self.output_parquet is False:
581 printfunc = self.write_tsv_row
583 printfunc = self.write_parquet_row
587 def write_tsv_row(self, rev_data):
        self.output_buffer.append(rev_data.to_tsv_row())
591 if len(self.output_buffer) >= self.output_buffer_size:
592 self.flush_tsv_buffer()
    def flush_tsv_buffer(self):
596 if self.output_header:
598 def open_input_file(input_filename):
599 if re.match(r'.*\.7z$', input_filename):
600 cmd = ["7za", "x", "-so", input_filename, "*.xml"]
601 elif re.match(r'.*\.gz$', input_filename):
602 cmd = ["zcat", input_filename]
603 elif re.match(r'.*\.bz2$', input_filename):
604 cmd = ["bzcat", "-dk", input_filename]
607 input_file = Popen(cmd, stdout=PIPE).stdout
609 input_file = open(input_filename, 'r')
613 def get_output_filename(input_filename, parquet = False):
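    # e.g. an input named "enwiki-20180301-pages.xml.bz2" (illustrative) maps
    # to "enwiki-20180301-pages.tsv" or "enwiki-20180301-pages.parquet"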
614 output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml$', '', output_filename)
617 output_filename = output_filename + ".tsv"
619 output_filename = output_filename + ".parquet"
620 return output_filename
622 def open_output_file(input_filename):
    # derive the output filename from the input filename
624 output_filename = get_output_filename(input_filename, parquet = False)
625 output_file = open(output_filename, "w")
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')
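# example invocation (the dump filename is illustrative):
#   ./wikiq enwiki-20180301-pages.xml.bz2 -o output --collapse-user -p sequence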
# arguments for the input files
631 parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
632 help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")
634 parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
635 help="Directory for output files. If it ends with .parquet output will be in parquet format.")
637 parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
638 help="Write output to standard out (do not create dump file)")
640 parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
641 help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
643 parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
644 help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
646 parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
647 help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
649 parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
650 help="Id number of namspace to include. Can be specified more than once.")
652 parser.add_argument('-rr',
654 dest="revert_radius",
658 help="Number of edits to check when looking for reverts (default: 15)")
660 parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
661 help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
663 parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
664 help="The label for the outputted column based on matching the regex in revision text.")
666 parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
667 help="The regular expression to search for in comments of revisions.")
669 parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
670 help="The label for the outputted column based on matching the regex in comments.")
672 args = parser.parse_args()
676 # set persistence method
678 if args.persist is None:
679 persist = PersistMethod.none
680 elif args.persist == "segment":
681 persist = PersistMethod.segment
682 elif args.persist == "legacy":
683 persist = PersistMethod.legacy
685 persist = PersistMethod.sequence
687 if args.namespace_filter is not None:
688 namespaces = args.namespace_filter
692 if len(args.dumpfiles) > 0:
693 output_parquet = False
694 for filename in args.dumpfiles:
695 input_file = open_input_file(filename)
697 # open directory for output
699 output_dir = args.output_dir[0]
703 if output_dir.endswith(".parquet"):
704 output_parquet = True
706 print("Processing file: %s" % filename, file=sys.stderr)
709 output_file = sys.stdout
711 filename = os.path.join(output_dir, os.path.basename(filename))
712 output_file = get_output_filename(filename, parquet = output_parquet)
714 wikiq = WikiqParser(input_file,
716 collapse_user=args.collapse_user,
718 urlencode=args.urlencode,
719 namespaces=namespaces,
720 revert_radius=args.revert_radius,
721 regex_match_revision = args.regex_match_revision,
722 regex_revision_label = args.regex_revision_label,
723 regex_match_comment = args.regex_match_comment,
724 regex_comment_label = args.regex_comment_label,
725 output_parquet=output_parquet)
734 wikiq = WikiqParser(sys.stdin,
736 collapse_user=args.collapse_user,
739 urlencode=args.urlencode,
740 namespaces=namespaces,
741 revert_radius=args.revert_radius,
742 regex_match_revision = args.regex_match_revision,
743 regex_revision_label = args.regex_revision_label,
744 regex_match_comment = args.regex_match_comment,
745 regex_comment_label = args.regex_comment_label)
749 # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
750 # stop_words = stop_words.split(",")