3 # original wikiq headers are: title articleid revid date_time anon
4 # editor editor_id minor text_size text_entropy text_md5 reversion
5 # additions_size deletions_size
11 from datetime import datetime,timezone
13 from subprocess import Popen, PIPE
14 from collections import deque
15 from hashlib import sha1
17 from mwxml import Dump
19 from deltas.tokenizers import wikitext_split
22 from urllib.parse import quote
23 TO_ENCODE = ('title', 'editor')
25 from deltas import SequenceMatcher
26 from deltas import SegmentMatcher
28 import dataclasses as dc
29 from dataclasses import dataclass, make_dataclass
31 import pyarrow.parquet as pq
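# Note: later code in this file refers to a PersistMethod enum-like class and a
# PERSISTENCE_RADIUS constant that are not shown in this excerpt. A minimal
# stand-in would look like the sketch below; the numeric values are assumptions,
# not taken from this file.
#
#   class PersistMethod:
#       none = 0
#       sequence = 1
#       segment = 2
#       legacy = 3
#
#   PERSISTENCE_RADIUS = 7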
def calculate_persistence(tokens_added):
    # returns (persisting revisions summed over added tokens, number of tokens added)
    return (sum([(len(x.revisions) - 1) for x in tokens_added]),
            len(tokens_added))
43 class WikiqIterator():
44 def __init__(self, fh, collapse_user=False):
46 self.collapse_user = collapse_user
47 self.mwiterator = Dump.from_file(self.fh)
48 self.namespace_map = { ns.id : ns.name for ns in
49 self.mwiterator.site_info.namespaces }
50 self.__pages = self.load_pages()
53 for page in self.mwiterator:
55 namespace_map = self.namespace_map,
56 collapse_user=self.collapse_user)
62 return next(self._pages)
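# The page wrapper below exposes mwxml page metadata via __slots__, prefixes
# titles with their namespace name, and, when collapse_user is set, rev_list()
# merges each run of consecutive revisions by the same editor into the final
# revision of the run, tagging it with a collapsed_revs count.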
__slots__ = ('id', 'title', 'namespace', 'redirect',
             'restrictions', 'mwpage', '__revisions',
             'collapse_user')
69 def __init__(self, page, namespace_map, collapse_user=False):
71 self.namespace = page.namespace
72 # following mwxml, we assume namespace 0 in cases where
73 # page.namespace is inconsistent with namespace_map
74 if page.namespace not in namespace_map:
75 self.title = page.title
77 if page.namespace != 0:
78 self.title = ':'.join([namespace_map[page.namespace], page.title])
80 self.title = page.title
81 self.restrictions = page.restrictions
82 self.collapse_user = collapse_user
84 self.__revisions = self.rev_list()
87 # Outline for how we want to handle collapse_user=True
88 # iteration rev.user prev_rev.user add prev_rev?
95 for i, rev in enumerate(self.mwpage):
96 # never yield the first time
98 if self.collapse_user:
100 rev.collapsed_revs = collapsed_revs
103 if self.collapse_user:
104 # yield if this is the last edit in a seq by a user and reset
# also yield if we don't know who the user is
107 if rev.deleted.user or prev_rev.deleted.user:
110 rev.collapsed_revs = collapsed_revs
elif rev.user.text != prev_rev.user.text:
115 rev.collapsed_revs = collapsed_revs
116 # otherwise, add one to the counter
119 rev.collapsed_revs = collapsed_revs
120 # if collapse_user is false, we always yield
126 # also yield the final time
130 return self.__revisions
133 return next(self.__revisions)
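# RegexPair (below) compiles one user-supplied pattern and attaches its matches
# to a rev_data object as extra columns. A rough, hypothetical illustration
# (pattern and label invented for this example):
#
#   rp = RegexPair(r'\[\[(?P<link>.*?)\]\]', 'wikilink')
#   rp.get_pyarrow_fields()  # -> [pa.field('wikilink_link', pa.list_(pa.string()))]
#   # matchmake(content, rev_data) then sets rev_data.wikilink_link to a
#   # comma-separated string of the captured values, or None if nothing matched.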
136 class RegexPair(object):
137 def __init__(self, pattern, label):
138 self.pattern = re.compile(pattern)
140 self.has_groups = bool(self.pattern.groupindex)
142 self.capture_groups = list(self.pattern.groupindex.keys())
144 def get_pyarrow_fields(self):
146 fields = [pa.field(self._make_key(cap_group),pa.list_(pa.string()))
147 for cap_group in self.capture_groups]
149 fields = [pa.field(self.label, pa.list_(pa.string()))]
153 def _make_key(self, cap_group):
154 return ("{}_{}".format(self.label, cap_group))
156 def matchmake(self, content, rev_data):
159 # if there are named capture groups in the regex
162 # if there are matches of some sort in this revision content, fill the lists for each cap_group
163 if self.pattern.search(content) is not None:
164 m = self.pattern.finditer(content)
165 matchobjects = list(m)
167 for cap_group in self.capture_groups:
168 key = self._make_key(cap_group)
170 for match in matchobjects:
171 # we only want to add the match for the capture group if the match is not None
if match.group(cap_group) is not None:
173 temp_list.append(match.group(cap_group))
175 # if temp_list of matches is empty just make that column None
176 if len(temp_list)==0:
177 temp_dict[key] = None
178 # else we put in the list we made in the for-loop above
180 temp_dict[key] = ', '.join(temp_list)
182 # there are no matches at all in this revision content, we default values to None
184 for cap_group in self.capture_groups:
185 key = self._make_key(cap_group)
186 temp_dict[key] = None
188 # there are no capture groups, we just search for all the matches of the regex
# only search if the content is actually a string or bytes
if type(content) in (str, bytes):
192 if self.pattern.search(content) is not None:
193 m = self.pattern.findall(content)
194 temp_dict[self.label] = ', '.join(m)
196 temp_dict[self.label] = None
198 # update rev_data with our new columns
199 for k, v in temp_dict.items():
200 setattr(rev_data, k, v)
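# The dataclass fields and pyarrow schema below model one output row. Field
# order matters: to_pyarrow() emits dc.astuple(self) and flush_parquet_buffer()
# pairs the resulting columns positionally with pa_schema_fields, so the
# dataclass fields and the schema fields are expected to stay in the same order.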
213 text_chars: int = None
215 reverteds: list[int] = None
223 pa.field("revid", pa.int64()),
224 pa.field("date_time",pa.timestamp('ms')),
225 pa.field("articleid",pa.int64()),
226 pa.field("editorid",pa.int64()),
227 pa.field("title",pa.string()),
228 pa.field("namespace",pa.int32()),
229 pa.field("deleted",pa.bool_()),
230 pa.field("test_chars",pa.int32()),
231 pa.field("revert",pa.bool_()),
232 pa.field("reverteds",pa.list_(pa.int64())),
233 pa.field("sha1",pa.string()),
234 pa.field("minor",pa.bool_()),
235 pa.field("editor",pa.string()),
236 pa.field("anon",pa.bool_())
239 def to_pyarrow(self):
240 return dc.astuple(self)
242 def to_tsv_row(self):
245 for f in dc.fields(self):
246 val = getattr(self, f.name)
if val is None:
250 row.append("TRUE" if val else "FALSE")
252 elif f.type == datetime:
253 row.append(val.strftime('%Y-%m-%d %H:%M:%S'))
255 elif f.name in {'editor','title'}:
257 if self.urlencode and f.name in TO_ENCODE:
row.append(quote(str(val)))
262 elif f.type == list[int]:
263 row.append('"' + ",".join([str(x) for x in val]) + '"')
266 if self.urlencode and f.name in TO_ENCODE:
267 row.append(quote(str(val)))
273 return '\t'.join(map(str,row))
275 def header_row(self):
276 return '\t'.join(map(lambda f: f.name, dc.fields(self)))
279 class RevDataCollapse(RevDataBase):
280 collapsed_revs:int = None
281 pa_collapsed_revs_schema = pa.field('collapsed_revs',pa.int64())
282 pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
285 class RevDataPersistence(RevDataBase):
286 token_revs:int = None
287 tokens_added:int = None
288 tokens_removed:int = None
289 tokens_window:int = None
291 pa_persistence_schema_fields = [
292 pa.field("token_revs", pa.int64()),
293 pa.field("tokens_added", pa.int64()),
294 pa.field("tokens_removed", pa.int64()),
295 pa.field("tokens_window", pa.int64())]
297 pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
300 class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
301 pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields
304 def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15, output_parquet=True, parquet_buffer_size=2000):
307 persist : what persistence method to use. Takes a PersistMethod value
309 self.input_file = input_file
311 self.collapse_user = collapse_user
312 self.persist = persist
314 self.urlencode = urlencode
315 self.revert_radius = revert_radius
317 if namespaces is not None:
318 self.namespace_filter = set(namespaces)
320 self.namespace_filter = None
322 self.regex_schemas = []
323 self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
324 self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
326 if self.collapse_user is True:
327 if self.persist == PersistMethod.none:
328 revdata_type = RevDataCollapse
330 revdata_type = RevDataCollapsePersistence
331 elif self.persist != PersistMethod.none:
332 revdata_type = RevDataPersistence
334 revdata_type = RevDataBase
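# graft one optional column per user-supplied regex capture onto the chosen
# base row type; building the dataclass at runtime keeps the TSV header and
# the parquet schema in sync with whatever pattern labels were passed on the
# command line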
336 regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas]
338 self.revdata_type = make_dataclass('RevData_Parser',
340 bases=(revdata_type,))
342 self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
344 self.revdata_type.urlencode = self.urlencode
346 if output_parquet is True:
347 self.output_parquet = True
348 self.pq_writer = None
349 self.output_file = output_file
350 self.parquet_buffer = []
351 self.parquet_buffer_size = parquet_buffer_size
353 self.print_header = True
354 if output_file == sys.stdout:
356 self.output_file = output_file
358 self.output_file = open(output_file,'w')
359 self.output_parquet = False
361 def make_matchmake_pairs(self, patterns, labels):
362 if (patterns is not None and labels is not None) and \
363 (len(patterns) == len(labels)):
365 for pattern, label in zip(patterns, labels):
366 rp = RegexPair(pattern, label)
368 self.regex_schemas = self.regex_schemas + rp.get_pyarrow_fields()
370 elif (patterns is None and labels is None):
373 sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
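# e.g. running wikiq with -RP '\[\[Category:.*?\]\]' -RPl category (a purely
# illustrative pattern/label) produces one RegexPair in regex_revision_pairs
# and one extra output column named "category"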
375 def matchmake(self, rev, rev_data):
376 rev_data = self.matchmake_revision(rev.text, rev_data)
377 rev_data = self.matchmake_comment(rev.comment, rev_data)
380 def matchmake_revision(self, text, rev_data):
381 return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
383 def matchmake_comment(self, comment, rev_data):
384 return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
386 def matchmake_pairs(self, text, rev_data, pairs):
388 rev_data = pair.matchmake(text, rev_data)
391 def __get_namespace_from_title(self, title):
394 for ns in self.namespaces:
395 # skip if the namespace is not defined
397 default_ns = self.namespaces[ns]
400 if title.startswith(ns + ":"):
401 return self.namespaces[ns]
403 # if we've made it this far with no matches, we return the default namespace
409 # create a regex that creates the output filename
410 # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
411 # r'output/wikiq-\1-\2.tsv',
414 # Construct dump file iterator
415 dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
# extract list of namespaces
418 self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}
424 # Iterate through pages
426 namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)
428 # skip namespaces not in the filter
429 if self.namespace_filter is not None:
430 if namespace not in self.namespace_filter:
433 rev_detector = mwreverts.Detector(radius = self.revert_radius)
435 if self.persist != PersistMethod.none:
436 window = deque(maxlen=PERSISTENCE_RADIUS)
438 if self.persist == PersistMethod.sequence:
439 state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
440 revert_radius=PERSISTENCE_RADIUS)
442 elif self.persist == PersistMethod.segment:
443 state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
444 revert_radius=PERSISTENCE_RADIUS)
446 # self.persist == PersistMethod.legacy
448 from mw.lib import persistence
449 state = persistence.State()
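# whichever backend is selected, persistence statistics for a revision only
# become final once the window of PERSISTENCE_RADIUS revisions has filled, so
# revisions are buffered in `window` and written out with a lag (see below)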
451 # Iterate through a page's revisions
454 rev_data = self.revdata_type(revid = rev.id,
455 date_time = datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id,
459 deleted = rev.deleted.text,
460 namespace = namespace
463 rev_data = self.matchmake(rev, rev_data)
465 if not rev.deleted.text:
466 # rev.text can be None if the page has no text
# use the sha1 provided by the dump if present; otherwise compute one from the text
474 text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
476 rev_data.sha1 = text_sha1
478 # TODO rev.bytes doesn't work.. looks like a bug
479 rev_data.text_chars = len(rev.text)
481 # generate revert data
482 revert = rev_detector.process(text_sha1, rev.id)
485 rev_data.revert = True
486 rev_data.reverteds = revert.reverteds
488 rev_data.revert = False
490 # if the fact that the edit was minor can be hidden, this might be an issue
491 rev_data.minor = rev.minor
493 if not rev.deleted.user:
# record the editor name and whether the edit was anonymous
495 rev_data.editor = rev.user.text
496 rev_data.anon = rev.user.id is None
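# registered accounts have a numeric user id; anonymous edits carry only an IP
# address in user.text, which is why a missing id marks the edit as anon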
498 #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
503 #TODO missing: additions_size deletions_size
# if collapse_user was on, record how many revisions were collapsed into this one
506 if self.collapse_user:
507 rev_data.collapsed_revs = rev.collapsed_revs
510 if self.persist != PersistMethod.none:
511 if not rev.deleted.text:
513 if self.persist != PersistMethod.legacy:
514 _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
517 _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
519 window.append((rev.id, rev_data, tokens_added, tokens_removed))
521 if len(window) == PERSISTENCE_RADIUS:
522 old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
524 num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
526 old_rev_data.token_revs = num_token_revs
527 old_rev_data.tokens_added = num_tokens
528 old_rev_data.tokens_removed = len(old_tokens_removed)
529 old_rev_data.tokens_window = PERSISTENCE_RADIUS-1
531 self.print_rev_data(old_rev_data)
534 self.print_rev_data(rev_data)
538 if self.persist != PersistMethod.none:
# print out metadata for the last PERSISTENCE_RADIUS revisions
540 for i, item in enumerate(window):
541 # if the window was full, we've already printed item 0
542 if len(window) == PERSISTENCE_RADIUS and i == 0:
545 rev_id, rev_data, tokens_added, tokens_removed = item
546 num_token_revs, num_tokens = calculate_persistence(tokens_added)
548 rev_data.token_revs = num_token_revs
549 rev_data.tokens_added = num_tokens
550 rev_data.tokens_removed = len(tokens_removed)
551 rev_data.tokens_window = len(window)-(i+1)
552 self.print_rev_data(rev_data)
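# revisions near the end of a page get a smaller persistence window; the
# tokens_window column records how many later revisions were actually seen, so
# downstream analyses can discount these rows if needed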
556 print("Done: %s revisions and %s pages." % (rev_count, page_count),
559 if self.output_parquet is True:
560 self.flush_parquet_buffer()
561 self.pq_writer.close()
564 self.output_file.close()
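# parquet output path: rows are buffered as tuples and flushed through a
# ParquetWriter in column-oriented batches of parquet_buffer_size rows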
567 def write_parquet_row(self, rev_data):
568 padata = rev_data.to_pyarrow()
569 self.parquet_buffer.append(padata)
571 if len(self.parquet_buffer) >= self.parquet_buffer_size:
572 self.flush_parquet_buffer()
575 def flush_parquet_buffer(self):
576 schema = pa.schema(self.revdata_type.pa_schema_fields)
578 def row_to_col(rg, types):
585 for j in range(len(cols)):
586 cols[j].append(row[j])
589 for col, typ in zip(cols, types):
590 arrays.append(pa.array(col, typ))
593 outtable = pa.Table.from_arrays(row_to_col(self.parquet_buffer, schema.types), schema=schema)
594 if self.pq_writer is None:
595 self.pq_writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
597 self.pq_writer.write_table(outtable)
598 self.parquet_buffer = []
600 def print_rev_data(self, rev_data):
601 if self.output_parquet is False:
602 printfunc = self.write_tsv_row
604 printfunc = self.write_parquet_row
608 def write_tsv_row(self, rev_data):
609 if self.print_header:
610 print(rev_data.header_row(), file=self.output_file)
611 self.print_header = False
613 line = rev_data.to_tsv_row()
614 print(line, file=self.output_file)
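# compressed dumps are streamed through external decompressors (7za / zcat /
# bzcat) so the XML never has to be fully extracted to disk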
617 def open_input_file(input_filename):
618 if re.match(r'.*\.7z$', input_filename):
619 cmd = ["7za", "x", "-so", input_filename, "*.xml"]
620 elif re.match(r'.*\.gz$', input_filename):
621 cmd = ["zcat", input_filename]
622 elif re.match(r'.*\.bz2$', input_filename):
623 cmd = ["bzcat", "-dk", input_filename]
626 input_file = Popen(cmd, stdout=PIPE).stdout
628 input_file = open(input_filename, 'r')
632 def get_output_filename(input_filename, parquet = False):
633 output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
634 output_filename = re.sub(r'\.xml', '', output_filename)
636 output_filename = output_filename + ".tsv"
638 output_filename = output_filename + ".parquet"
639 return output_filename
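# e.g. get_output_filename("dumps/enwiki-20200101-pages.xml.bz2", parquet=True)
# would return "dumps/enwiki-20200101-pages.parquet" (hypothetical path)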
641 def open_output_file(input_filename):
# construct the output filename from the input filename
643 output_filename = get_output_filename(input_filename, parquet = False)
644 output_file = open(output_filename, "w")
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')
649 # arguments for the input direction
650 parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
651 help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")
653 parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
654 help="Directory for output files. If it ends with .parquet output will be in parquet format.")
656 parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
657 help="Write output to standard out (do not create dump file)")
659 parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
660 help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
662 parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
663 help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
665 parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
666 help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
668 parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
669 help="Id number of namspace to include. Can be specified more than once.")
671 parser.add_argument('-rr',
673 dest="revert_radius",
677 help="Number of edits to check when looking for reverts (default: 15)")
679 parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
680 help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
682 parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
683 help="The label for the outputted column based on matching the regex in revision text.")
685 parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
686 help="The regular expression to search for in comments of revisions.")
688 parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
689 help="The label for the outputted column based on matching the regex in comments.")
691 args = parser.parse_args()
695 # set persistence method
697 if args.persist is None:
698 persist = PersistMethod.none
699 elif args.persist == "segment":
700 persist = PersistMethod.segment
701 elif args.persist == "legacy":
702 persist = PersistMethod.legacy
704 persist = PersistMethod.sequence
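# note: a bare "-p" passes the empty-string const, which falls through to the
# default sequence method here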
706 if args.namespace_filter is not None:
707 namespaces = args.namespace_filter
711 if len(args.dumpfiles) > 0:
712 output_parquet = False
713 for filename in args.dumpfiles:
714 input_file = open_input_file(filename)
716 # open directory for output
718 output_dir = args.output_dir[0]
722 if output_dir.endswith(".parquet"):
723 output_parquet = True
725 print("Processing file: %s" % filename, file=sys.stderr)
728 output_file = sys.stdout
730 filename = os.path.join(output_dir, os.path.basename(filename))
731 output_file = get_output_filename(filename, parquet = output_parquet)
733 wikiq = WikiqParser(input_file,
735 collapse_user=args.collapse_user,
737 urlencode=args.urlencode,
738 namespaces=namespaces,
739 revert_radius=args.revert_radius,
740 regex_match_revision = args.regex_match_revision,
741 regex_revision_label = args.regex_revision_label,
742 regex_match_comment = args.regex_match_comment,
743 regex_comment_label = args.regex_comment_label,
744 output_parquet=output_parquet)
752 wikiq = WikiqParser(sys.stdin,
754 collapse_user=args.collapse_user,
756 #persist_legacy=args.persist_legacy,
757 urlencode=args.urlencode,
758 namespaces=namespaces,
759 revert_radius=args.revert_radius,
760 regex_match_revision = args.regex_match_revision,
761 regex_revision_label = args.regex_revision_label,
762 regex_match_comment = args.regex_match_comment,
763 regex_comment_label = args.regex_comment_label)
767 # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
768 # stop_words = stop_words.split(",")