# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import argparse
import sys
import os.path
import re
from datetime import datetime
from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump

from deltas.tokenizers import wikitext_split
import mwpersistence
import mwreverts

from urllib.parse import quote
TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7  # size of the moving window used for persistence stats

from deltas import SequenceMatcher
from deltas import SegmentMatcher

import dataclasses
from dataclasses import dataclass, field

import pyarrow as pa
import pyarrow.parquet as pq
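
# Constants for the persistence-method option, referenced throughout below.
# Reconstructed from how the names are used; the numeric values are arbitrary tags.
class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3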
def calculate_persistence(tokens_added):
    # returns (persistent token revisions, number of tokens added): each token
    # contributes the count of revisions it survived beyond the one that added it
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))
class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map=self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)
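
# WikiqIterator wraps mwxml.Dump, carrying the namespace id -> name mapping
# from the dump's siteinfo into each WikiqPage it yields.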
class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            self.title = page.title
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration    rev.user    prev_rev.user    add prev_rev?
        #         0           A             None            never
        #         1           A                A            False
        #         2           B                A             True
        #         3           A                B             True
        #         4           A                A            False
        # final                                A           always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs
            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset;
                    # also yield if we don't know who the user is
                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    elif not rev.user.text == prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev
            prev_rev = rev

        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
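
# Note on rev_list above: with collapse_user=True only the last revision of
# each run of consecutive edits by one user is yielded, and rev.collapsed_revs
# records the length of the run.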
class RegexPair(object):
    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def _make_key(self, cap_group):
        return ("{}_{}".format(self.label, cap_group))
    def matchmake(self, content, rev_data):
        temp_dict = {}
        # if there are named capture groups in the regex
        if self.has_groups:

            # if there are matches of some sort in this revision content, fill the lists for each cap_group
            if content is not None and self.pattern.search(content) is not None:
                m = self.pattern.finditer(content)
                matchobjects = list(m)

                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_list = []
                    for match in matchobjects:
                        # we only want to add the match for the capture group if the match is not None
                        if match.group(cap_group) is not None:
                            temp_list.append(match.group(cap_group))

                    # if temp_list of matches is empty just make that column None
                    if len(temp_list) == 0:
                        temp_dict[key] = None
                    # else we put in the list we made in the for-loop above
                    else:
                        temp_dict[key] = ', '.join(temp_list)

            # there are no matches at all in this revision content, so we default values to None
            else:
                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_dict[key] = None

        # there are no capture groups, so we just search for all the matches of the regex
        else:
            # given that there are matches to be made
            if type(content) in (str, bytes):
                if self.pattern.search(content) is not None:
                    m = self.pattern.findall(content)
                    temp_dict[self.label] = ', '.join(m)
                else:
                    temp_dict[self.label] = None

        # update rev_data with our new columns
        for k, v in temp_dict.items():
            setattr(rev_data, k, v)

        return rev_data
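
# Example (hypothetical pattern): RegexPair(r'\[\[(?P<link>[^\]|]+)', 'wikilink')
# yields a 'wikilink_link' column holding a comma-separated list of link targets
# found in the content, or None when there is no match.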
@dataclass()
class RevDataBase():
    revid: int
    date_time: datetime
    articleid: int
    editorid: int
    title: str
    namespace: int
    deleted: bool
    text_chars: int = None
    revert: bool = None
    reverteds: list[int] = None
    sha1: str = None
    minor: bool = None
    editor: str = None
    anon: bool = None

    # class attribute (no annotation, so not a dataclass field): toggles
    # url-encoding of the TO_ENCODE columns in to_tsv_row
    urlencode = False

    # pyarrow schema fields; these must match the dataclass fields in name and order
    pa_schema_fields = [
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('ms')),
        pa.field("articleid", pa.int64()),
        pa.field("editorid", pa.int64()),
        pa.field("title", pa.string()),
        pa.field("namespace", pa.int32()),
        pa.field("deleted", pa.bool_()),
        pa.field("text_chars", pa.int32()),
        pa.field("revert", pa.bool_()),
        pa.field("reverteds", pa.list_(pa.int64())),
        pa.field("sha1", pa.string()),
        pa.field("minor", pa.bool_()),
        pa.field("editor", pa.string()),
        pa.field("anon", pa.bool_())
    ]

    def to_pyarrow(self):
        # emit the field values as a tuple in schema order; write_parquet_row
        # buffers these rows and flush_parquet_buffer pivots them to columns
        return dataclasses.astuple(self)

    def to_tsv_row(self):
        row = []
        for f in dataclasses.fields(self):
            val = getattr(self, f.name)
            if getattr(self, f.name) is None:
                row.append("")
            elif f.type == bool:
                row.append("TRUE" if val else "FALSE")
            elif f.type == datetime:
                row.append(val.strftime('%Y-%m-%d %H:%M:%S'))
            elif f.name in {'editor', 'title'}:
                # wrap user-supplied strings in quotes for fread
                s = '"' + val + '"'
                if self.urlencode and f.name in TO_ENCODE:
                    row.append(quote(str(s)))
                else:
                    row.append(s)
            elif f.type == list[int]:
                row.append('"' + ",".join([str(x) for x in val]) + '"')
            elif f.type == str:
                if self.urlencode and f.name in TO_ENCODE:
                    row.append(quote(str(val)))
                else:
                    row.append(val)
            else:
                row.append(str(val))

        return '\t'.join(row)
@dataclass()
class RevDataCollapse(RevDataBase):
    collapsed_revs: int = None

    pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
    pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
    pa_schema = pa.schema(pa_schema_fields)
@dataclass()
class RevDataPersistence(RevDataBase):
    token_revs: int = None
    tokens_added: int = None
    tokens_removed: int = None
    tokens_window: int = None

    pa_persistence_schema_fields = [
        pa.field("token_revs", pa.int64()),
        pa.field("tokens_added", pa.int64()),
        pa.field("tokens_removed", pa.int64()),
        pa.field("tokens_window", pa.int64())]

    pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
@dataclass()
class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
    pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields
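
# Each RevData subclass pairs its extra dataclass fields with matching pyarrow
# schema fields, so dataclasses.astuple() stays aligned with pa_schema_fields
# under every combination of output options.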
class WikiqParser():
    def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces=None, revert_radius=15, output_parquet=True, parquet_buffer_size=2000):
        """
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value
        """
        self.input_file = input_file

        self.collapse_user = collapse_user
        self.persist = persist
        self.urlencode = urlencode
        self.revert_radius = revert_radius

        self.output_buffer = []
        self.output_buffer_size = parquet_buffer_size

        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        self.regex_schemas = []
        self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
        self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)

        # pick the revision data class matching the output options
        if self.collapse_user is True:
            if self.persist == PersistMethod.none:
                revdata_type = RevDataCollapse
            else:
                revdata_type = RevDataCollapsePersistence
        elif self.persist != PersistMethod.none:
            revdata_type = RevDataPersistence
        else:
            revdata_type = RevDataBase

        # extend the revdata type with a default-None field for each regex label
        self.revdata_type = dataclasses.make_dataclass('RevData_Parser',
                                                       fields=map(lambda pa_field: (pa_field.name,
                                                                                    list[str],
                                                                                    field(default=None)),
                                                                  self.regex_schemas),
                                                       bases=(revdata_type,))

        self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas

        # propagate the -u/--url-encode flag to the revdata class
        self.revdata_type.urlencode = self.urlencode

        if output_parquet is True:
            self.output_parquet = True
            self.pq_writer = None
            self.output_file = output_file
        else:
            self.output_parquet = False
            self.output_header = True
            # output_file may already be an open stream (e.g. sys.stdout)
            if output_file is sys.stdout:
                self.output_file = output_file
            else:
                self.output_file = open(output_file, 'w')
    def make_matchmake_pairs(self, patterns, labels):
        if (patterns is not None and labels is not None) and \
           (len(patterns) == len(labels)):
            result = []
            for pattern, label in zip(patterns, labels):
                result.append(RegexPair(pattern, label))
                self.regex_schemas.append(pa.field(label, pa.list_(pa.string())))
            return result
        elif (patterns is None and labels is None):
            return []
        else:
            sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
    def matchmake(self, rev, rev_data):
        rev_data = self.matchmake_revision(rev.text, rev_data)
        rev_data = self.matchmake_comment(rev.comment, rev_data)
        return rev_data

    def matchmake_revision(self, text, rev_data):
        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)

    def matchmake_comment(self, comment, rev_data):
        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)

    def matchmake_pairs(self, text, rev_data, pairs):
        for pair in pairs:
            rev_data = pair.matchmake(text, rev_data)
        return rev_data
    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns
    def process(self):
        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract the mapping of namespace names to ids
        self.namespaces = {ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0
        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self.namespace_filter is not None:
                if namespace not in self.namespace_filter:
                    continue

            rev_detector = mwreverts.Detector(radius=self.revert_radius)

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                # self.persist == PersistMethod.legacy
                else:
                    from mw.lib import persistence
                    state = persistence.State()
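
            # sequence and segment persistence use mwpersistence's DiffState
            # with a deltas matcher; legacy falls back to the older `mw`
            # library's persistence state.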
            # Iterate through a page's revisions
            for rev in page:
                rev_data = self.revdata_type(revid=rev.id,
                                             date_time=datetime.fromtimestamp(rev.timestamp.unix()),
                                             articleid=page.id,
                                             editorid=None if rev.deleted.user is True or rev.user.id is None else rev.user.id,
                                             title=page.title,
                                             namespace=namespace,
                                             deleted=rev.deleted.text)

                rev_data = self.matchmake(rev, rev_data)

                if not rev.deleted.text:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # if text exists, we'll check for a sha1 and generate one otherwise
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data.sha1 = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data.text_chars = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data.revert = True
                        rev_data.reverteds = revert.reverteds
                    else:
                        rev_data.revert = False

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data.minor = rev.minor

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data.editor = rev.user.text
                    rev_data.anon = rev.user.id is None

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):

                #TODO missing: additions_size deletions_size

                # if collapse user was on, lets run that
                if self.collapse_user:
                    rev_data.collapsed_revs = rev.collapsed_revs
                if self.persist != PersistMethod.none:
                    if not rev.deleted.text:

                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            # attach the persistence stats to the revision now
                            # leaving the window, then print that revision
                            old_rev_data.token_revs = num_token_revs
                            old_rev_data.tokens_added = num_tokens
                            old_rev_data.tokens_removed = len(old_tokens_removed)
                            old_rev_data.tokens_window = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)
                else:
                    self.print_rev_data(rev_data)

                rev_count += 1
            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data.token_revs = num_token_revs
                    rev_data.tokens_added = num_tokens
                    rev_data.tokens_removed = len(tokens_removed)
                    rev_data.tokens_window = len(window)-(i+1)
                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

        if self.output_parquet is True:
            self.flush_parquet_buffer()
            self.pq_writer.close()
        else:
            self.flush_tsv_buffer()
            self.output_file.close()
    def write_parquet_row(self, rev_data):
        padata = rev_data.to_pyarrow()
        self.output_buffer.append(padata)

        if len(self.output_buffer) >= self.output_buffer_size:
            self.flush_parquet_buffer()
    def flush_parquet_buffer(self):
        # pivot the buffered rows into columns and write them as one table
        schema = pa.schema(self.revdata_type.pa_schema_fields)
        cols = list(zip(*self.output_buffer))
        arrays = [pa.array(col, typ) for col, typ in zip(cols, schema.types)]
        outtable = pa.Table.from_arrays(arrays, schema=schema)

        if self.pq_writer is None:
            self.pq_writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')

        self.pq_writer.write_table(outtable)
        self.output_buffer = []
    def print_rev_data(self, rev_data):
        if self.output_parquet is False:
            printfunc = self.write_tsv_row
        else:
            printfunc = self.write_parquet_row

        printfunc(rev_data)
    def write_tsv_row(self, rev_data):
        self.output_buffer.append(rev_data.to_tsv_row())

        if len(self.output_buffer) >= self.output_buffer_size:
            self.flush_tsv_buffer()

    def flush_tsv_buffer(self):
        # write the header once before the first batch of rows
        if self.output_header:
            self.output_file.write('\t'.join([f.name for f in dataclasses.fields(self.revdata_type)]) + '\n')
            self.output_header = False
        if self.output_buffer:
            self.output_file.write('\n'.join(self.output_buffer) + '\n')
            self.output_buffer = []
def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, "*.xml"]
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # uncompressed: no cmd was set above, so open the file directly
        input_file = open(input_filename, 'r')

    return input_file
def get_output_filename(input_filename, parquet=False):
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    if parquet is False:
        output_filename = output_filename + ".tsv"
    else:
        output_filename = output_filename + ".parquet"
    return output_filename
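
# e.g. get_output_filename("enwiki-20200301-pages.xml.bz2", parquet=False)
# yields "enwiki-20200301-pages.tsv"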
def open_output_file(input_filename):
    # derive the output filename from the input filename
    output_filename = get_output_filename(input_filename, parquet=False)
    output_file = open(output_filename, "w")
    return output_file
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files. If it ends with .parquet output will be in parquet format.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) the number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for the old behavior used in older research projects. Use -p=segment for an advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="Id number of namespace to include. Can be specified more than once.")

parser.add_argument('-rr',
                    '--revert-radius',
                    dest="revert_radius",
                    type=int,
                    action='store',
                    default=15,
                    help="Number of edits to check when looking for reverts (default: 15)")

parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
                    help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")

parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
                    help="The label for the output column based on matching the regex in revision text.")

parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
                    help="The regular expression to search for in comments of revisions.")

parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
                    help="The label for the output column based on matching the regex in comments.")
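
# Example invocations (hypothetical paths):
#   wikiq enwiki-dump.xml.bz2 -o out/           # TSV output under out/
#   wikiq enwiki-dump.xml.bz2 -o out.parquet    # parquet output
#   wikiq enwiki-dump.xml.7z --collapse-user -p sequence -n 0 -n 1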
args = parser.parse_args()

# set persistence method
if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None
if len(args.dumpfiles) > 0:
    output_parquet = False
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        if output_dir.endswith(".parquet"):
            output_parquet = True

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = get_output_filename(filename, parquet=output_parquet)

        wikiq = WikiqParser(input_file,
                            output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
                            regex_match_revision=args.regex_match_revision,
                            regex_revision_label=args.regex_revision_label,
                            regex_match_comment=args.regex_match_comment,
                            regex_comment_label=args.regex_comment_label,
                            output_parquet=output_parquet)

        wikiq.process()
        input_file.close()
else:
    wikiq = WikiqParser(sys.stdin,
                        sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        #persist_legacy=args.persist_legacy,
                        urlencode=args.urlencode,
                        namespaces=namespaces,
                        revert_radius=args.revert_radius,
                        regex_match_revision=args.regex_match_revision,
                        regex_revision_label=args.regex_revision_label,
                        regex_match_comment=args.regex_match_comment,
                        regex_comment_label=args.regex_comment_label,
                        output_parquet=False)

    wikiq.process()