# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import argparse
import sys
import os
import re
from datetime import datetime

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump

from deltas.tokenizers import wikitext_split

import mwpersistence
import mwreverts
from urllib.parse import quote

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7

from deltas import SequenceMatcher
from deltas import SegmentMatcher

import dataclasses as dc
from dataclasses import dataclass, make_dataclass

import pyarrow as pa
import pyarrow.parquet as pq

class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3
def calculate_persistence(tokens_added):
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))
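# A minimal, illustrative check of what calculate_persistence() returns. The
# FakeToken class is a hypothetical stand-in for the token objects that
# mwpersistence yields (each carries the list of revisions it appeared in);
# this function is never called by the script itself.
def _calculate_persistence_example():
    class FakeToken:
        def __init__(self, revisions):
            self.revisions = revisions

    tokens = [FakeToken([101, 102, 103]), FakeToken([103])]
    # the first token persisted through 2 later revisions, the second through 0,
    # so we get (2 persistent token-revisions, 2 tokens added)
    assert calculate_persistence(tokens) == (2, 2)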
class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = {ns.id: ns.name for ns in
                              self.mwiterator.site_info.namespaces}
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map=self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)
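# A sketch of driving WikiqIterator directly, assuming an uncompressed XML
# dump at a hypothetical path "dump.xml" (inside this script,
# WikiqParser.process() plays this role):
#
#   with open("dump.xml") as fh:
#       for page in WikiqIterator(fh, collapse_user=False):
#           for rev in page:
#               print(page.title, rev.id)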
class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            self.title = page.title
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()
    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # post-loop                                     always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    # also yield if we don't know who the user is
                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif not rev.user.text == prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs

                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
class RegexPair(object):
    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def _make_key(self, cap_group):
        return ("{}_{}".format(self.label, cap_group))

    def matchmake(self, content, rev_data):
        temp_dict = {}
        # if there are named capture groups in the regex
        if self.has_groups:

            # if there are matches of some sort in this revision content, fill the lists for each cap_group
            if self.pattern.search(content) is not None:
                m = self.pattern.finditer(content)
                matchobjects = list(m)

                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_list = []
                    for match in matchobjects:
                        # we only want to add the match for the capture group if the match is not None
                        if match.group(cap_group) is not None:
                            temp_list.append(match.group(cap_group))

                    # if temp_list of matches is empty just make that column None
                    if len(temp_list) == 0:
                        temp_dict[key] = None
                    # else we put in the list we made in the for-loop above
                    else:
                        temp_dict[key] = ', '.join(temp_list)

            # there are no matches at all in this revision content, so default the values to None
            else:
                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_dict[key] = None

        else:
            # there are no capture groups, so we just search for all the matches of the regex,
            # given that the content is actually a string
            if isinstance(content, (str, bytes)):
                if self.pattern.search(content) is not None:
                    m = self.pattern.findall(content)
                    temp_dict[self.label] = ', '.join(m)
                else:
                    temp_dict[self.label] = None
            else:
                temp_dict[self.label] = None

        # update rev_data with our new columns
        for k, v in temp_dict.items():
            setattr(rev_data, k, v)

        return rev_data
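# An illustrative use of RegexPair, never called by the script itself;
# SimpleNamespace stands in for the generated RevData dataclass, which only
# needs settable attributes.
def _regex_pair_example():
    from types import SimpleNamespace
    pair = RegexPair(r'\[\[(?P<link>.*?)\]\]', 'wikilink')
    rd = pair.matchmake("see [[Foo]] and [[Bar]]", SimpleNamespace())
    # output columns are named label + "_" + capture-group name
    assert rd.wikilink_link == 'Foo, Bar'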
@dataclass()
class RevDataBase():
    revid: int
    date_time: datetime
    articleid: int
    editorid: int
    title: str
    namespace: int
    deleted: bool
    text_chars: int = None
    revert: bool = None
    reverteds: list[int] = None
    sha1: str = None
    minor: bool = None
    editor: str = None
    anon: bool = None

    pa_schema_fields = [
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('ms')),
        pa.field("articleid", pa.int64()),
        pa.field("editorid", pa.int64()),
        pa.field("title", pa.string()),
        pa.field("namespace", pa.int32()),
        pa.field("deleted", pa.bool_()),
        pa.field("text_chars", pa.int32()),
        pa.field("revert", pa.bool_()),
        pa.field("reverteds", pa.list_(pa.int64())),
        pa.field("sha1", pa.string()),
        pa.field("minor", pa.bool_()),
        pa.field("editor", pa.string()),
        pa.field("anon", pa.bool_())
    ]
    def to_pyarrow(self):
        return dc.astuple(self)

    def to_tsv_row(self):
        row = []
        for f in dc.fields(self):
            val = getattr(self, f.name)
            if getattr(self, f.name) is None:
                row.append("")
            elif f.type == bool:
                row.append("TRUE" if val else "FALSE")

            elif f.type == datetime:
                row.append(val.strftime('%Y-%m-%d %H:%M:%S'))

            elif f.name in {'editor', 'title'}:
                s = '"' + val + '"'
                if f.name in TO_ENCODE:
                    row.append(quote(str(val)))
                else:
                    row.append(s)

            elif f.type == list[int]:
                row.append('"' + ",".join([str(x) for x in val]) + '"')

            elif f.type == str:
                if f.name in TO_ENCODE:
                    row.append(quote(str(val)))
                else:
                    row.append(val)

            else:
                row.append(str(val))

        return '\t'.join(row)
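    # For reference, to_tsv_row() renders one revision per line: None -> "",
    # bools -> TRUE/FALSE, datetimes -> "2008-01-01 12:00:00"-style strings,
    # reverteds -> a quoted comma-separated list, and title/editor are
    # URL-encoded, e.g. quote("Talk:Foo Bar") -> "Talk%3AFoo%20Bar".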
@dataclass()
class RevDataCollapse(RevDataBase):
    collapsed_revs: int = None

    pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
    pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]

@dataclass()
class RevDataPersistence(RevDataBase):
    token_revs: int = None
    tokens_added: int = None
    tokens_removed: int = None
    tokens_window: int = None

    pa_persistence_schema_fields = [
        pa.field("token_revs", pa.int64()),
        pa.field("tokens_added", pa.int64()),
        pa.field("tokens_removed", pa.int64()),
        pa.field("tokens_window", pa.int64())]

    pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields

@dataclass()
class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
    pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields
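# The pa_schema_fields class attributes compose by concatenation, so (for
# example) the collapse+persistence variant ends with:
#
#   [f.name for f in RevDataCollapsePersistence.pa_schema_fields][-5:]
#   # -> ['collapsed_revs', 'token_revs', 'tokens_added',
#   #     'tokens_removed', 'tokens_window']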
class WikiqParser():
    def __init__(self, input_file, output_file,
                 regex_match_revision, regex_match_comment,
                 regex_revision_label, regex_comment_label,
                 collapse_user=False, persist=None, urlencode=False,
                 namespaces=None, revert_radius=15,
                 output_parquet=True, parquet_buffer_size=2000):
        """
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value
        """
        self.input_file = input_file

        self.collapse_user = collapse_user
        self.persist = persist
        self.urlencode = urlencode
        self.revert_radius = revert_radius

        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        self.regex_schemas = []
        self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
        self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)

        if self.collapse_user is True:
            if self.persist == PersistMethod.none:
                revdata_type = RevDataCollapse
            else:
                revdata_type = RevDataCollapsePersistence
        elif self.persist != PersistMethod.none:
            revdata_type = RevDataPersistence
        else:
            revdata_type = RevDataBase

        # make_dataclass adds one list[str] column per regex label
        regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas]

        self.revdata_type = make_dataclass('RevData_Parser',
                                           fields=regex_fields,
                                           bases=(revdata_type,))

        self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
        if output_parquet is True:
            self.output_parquet = True
            self.pq_writer = None
            self.output_file = output_file
            self.parquet_buffer = []
            self.parquet_buffer_size = parquet_buffer_size
        else:
            self.output_parquet = False
            self.output_file = open(output_file, 'w')
    def make_matchmake_pairs(self, patterns, labels):
        if (patterns is not None and labels is not None) and \
           (len(patterns) == len(labels)):
            result = []
            for pattern, label in zip(patterns, labels):
                result.append(RegexPair(pattern, label))
                self.regex_schemas.append(pa.field(label, pa.list_(pa.string())))
            return result
        elif (patterns is None and labels is None):
            return []
        else:
            sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
    def matchmake(self, rev, rev_data):
        rev_data = self.matchmake_revision(rev.text, rev_data)
        rev_data = self.matchmake_comment(rev.comment, rev_data)
        return rev_data

    def matchmake_revision(self, text, rev_data):
        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)

    def matchmake_comment(self, comment, rev_data):
        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)

    def matchmake_pairs(self, text, rev_data, pairs):
        for pair in pairs:
            rev_data = pair.matchmake(text, rev_data)
        return rev_data
    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns
    def process(self):
        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0
        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self.namespace_filter is not None:
                if namespace not in self.namespace_filter:
                    continue

            rev_detector = mwreverts.Detector(radius=self.revert_radius)

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                else:
                    # self.persist == PersistMethod.legacy
                    from mw.lib import persistence
                    state = persistence.State()
            # Iterate through a page's revisions
            for rev in page:
                # editorid is an int64 column, so use None (not "") when the
                # user is deleted or has no id
                rev_data = self.revdata_type(revid=rev.id,
                                             date_time=datetime.fromtimestamp(rev.timestamp.unix()),
                                             articleid=page.id,
                                             editorid=None if rev.deleted.user or rev.user.id is None else rev.user.id,
                                             title=page.title,
                                             deleted=rev.deleted.text,
                                             namespace=namespace)

                rev_data = self.matchmake(rev, rev_data)
                if not rev.deleted.text:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""
                    # if text exists, we'll check for a sha1 and generate one otherwise
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data.sha1 = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data.text_chars = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data.revert = True
                        rev_data.reverteds = revert.reverteds
                    else:
                        rev_data.revert = False

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data.minor = rev.minor

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data.editor = rev.user.text
                    rev_data.anon = rev.user.id is None
                # if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #     redirect = True

                # TODO missing: additions_size deletions_size
                # if collapse user was on, let's run that
                if self.collapse_user:
                    rev_data.collapsed_revs = rev.collapsed_revs

                if self.persist != PersistMethod.none:
                    if not rev.deleted.text:

                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            # the oldest revision in the window is the one we print now
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data.token_revs = num_token_revs
                            old_rev_data.tokens_added = num_tokens
                            old_rev_data.tokens_removed = len(old_tokens_removed)
                            old_rev_data.tokens_window = PERSISTENCE_RADIUS - 1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1
            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data.token_revs = num_token_revs
                    rev_data.tokens_added = num_tokens
                    rev_data.tokens_removed = len(tokens_removed)
                    rev_data.tokens_window = len(window) - (i + 1)
                    self.print_rev_data(rev_data)

            page_count += 1
        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

        if self.output_parquet is True:
            self.flush_parquet_buffer()
            self.pq_writer.close()
        else:
            self.output_file.close()
    def write_parquet_row(self, rev_data):
        padata = rev_data.to_pyarrow()
        self.parquet_buffer.append(padata)

        if len(self.parquet_buffer) >= self.parquet_buffer_size:
            self.flush_parquet_buffer()
    def flush_parquet_buffer(self):
        schema = pa.schema(self.revdata_type.pa_schema_fields)

        def row_to_col(rg, types):
            cols = []
            first = rg[0]
            for col in first:
                cols.append([col])

            for row in rg[1:]:
                for j in range(len(cols)):
                    cols[j].append(row[j])

            arrays = []
            for col, typ in zip(cols, types):
                arrays.append(pa.array(col, typ))
            return arrays

        outtable = pa.Table.from_arrays(row_to_col(self.parquet_buffer, schema.types), schema=schema)
        if self.pq_writer is None:
            self.pq_writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')

        self.pq_writer.write_table(outtable)
        self.parquet_buffer = []
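    # Illustration of the row-to-column pivot above (hypothetical values):
    # buffered rows [(1, "a"), (2, "b")] become columns [1, 2] and ["a", "b"],
    # which pa.array() turns into typed arrays for pa.Table.from_arrays().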
    def print_rev_data(self, rev_data):
        if self.output_parquet is False:
            printfunc = self.write_tsv_row
        else:
            printfunc = self.write_parquet_row

        printfunc(rev_data)

    def write_tsv_row(self, rev_data):
        line = rev_data.to_tsv_row()
        print(line, file=self.output_file)
def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, "*.xml"]
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # none of the compressed-file patterns matched, so cmd is undefined;
        # treat the input as an uncompressed XML file
        input_file = open(input_filename, 'r')

    return input_file
def get_output_filename(input_filename, parquet=False):
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    if parquet is False:
        output_filename = output_filename + ".tsv"
    else:
        output_filename = output_filename + ".parquet"
    return output_filename
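# For example: get_output_filename("dumps/enwiki-latest.xml.bz2") returns
# "dumps/enwiki-latest.tsv"; with parquet=True it returns
# "dumps/enwiki-latest.parquet".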
def open_output_file(input_filename):
    # derive the output filename from the input filename
    output_filename = get_output_filename(input_filename, parquet=False)
    output_file = open(output_filename, "w")
    return output_file
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files. If it ends with .parquet, output will be in parquet format.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision in each sequence of consecutive edits by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str,
                    choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for the old behavior used in older research projects. Use -p=segment for an advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="Id number of namespace to include. Can be specified more than once.")

parser.add_argument('-rr',
                    '--revert-radius',
                    dest="revert_radius",
                    type=int,
                    action='store',
                    default=15,
                    help="Number of edits to check when looking for reverts (default: 15)")

parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
                    help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")

parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
                    help="The label for the output column based on matching the regex in revision text.")

parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
                    help="The regular expression to search for in comments of revisions.")

parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
                    help="The label for the output column based on matching the regex in comments.")
args = parser.parse_args()

# set persistence method
if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None
if len(args.dumpfiles) > 0:
    output_parquet = False
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        if output_dir.endswith(".parquet"):
            output_parquet = True

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = get_output_filename(filename, parquet=output_parquet)

        wikiq = WikiqParser(input_file,
                            output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
                            regex_match_revision=args.regex_match_revision,
                            regex_revision_label=args.regex_revision_label,
                            regex_match_comment=args.regex_match_comment,
                            regex_comment_label=args.regex_comment_label,
                            output_parquet=output_parquet)

        wikiq.process()

        # close things
        input_file.close()
else:
    wikiq = WikiqParser(sys.stdin,
                        sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces=namespaces,
                        revert_radius=args.revert_radius,
                        regex_match_revision=args.regex_match_revision,
                        regex_revision_label=args.regex_revision_label,
                        regex_match_comment=args.regex_match_comment,
                        regex_comment_label=args.regex_comment_label)

    wikiq.process()