#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size

import argparse
import sys
import os.path
import re

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump

from deltas.tokenizers import wikitext_split
from deltas import SequenceMatcher
from deltas import SegmentMatcher
import mwpersistence
import mwreverts
from urllib.parse import quote

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7
class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3

def calculate_persistence(tokens_added):
    # returns (persistent token revisions, number of tokens added)
    return (sum([(len(x.revisions) - 1) for x in tokens_added]),
            len(tokens_added))
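# Illustrative note (not part of the original script): with mwpersistence,
# each token in tokens_added carries a .revisions list of the revisions it
# appears in, so a token that survives into three later revisions within the
# persistence window contributes 3 to the first element of the returned tuple;
# the second element is simply the number of tokens the revision added.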
class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = {ns.id: ns.name for ns in
                              self.mwiterator.site_info.namespaces}
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map=self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)
class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            self.title = page.title
        else:
            if page.namespace != 0:
                self.title = ':'.join([namespace_map[page.namespace], page.title])
            else:
                self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()
    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration    rev.user    prev_rev.user    add prev_rev?
        # Post-loop    A                            Always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    # also yield if we don't know who the user is
                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif not rev.user.text == prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs

                else:
                    # if collapse_user is false, we always yield
                    yield prev_rev

            prev_rev = rev

        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
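# Example of the collapse_user behavior outlined above (illustrative, with
# made-up editors): for a page whose revisions are authored by A, A, B, A and
# collapse_user=True, rev_list() yields only the second A revision (with
# collapsed_revs == 2), then the B revision, then the final A revision.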
class RegexPair(object):
    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def _make_key(self, cap_group):
        return ("{}_{}".format(self.label, cap_group))

    def matchmake(self, content, rev_data):
        temp_dict = {}

        # the searched text (content, which is rev.comment or rev.text) is empty
        if content is None:
            # if there are capture groups, we go through and put in a value for each group
            if self.has_groups:
                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_dict[key] = None
            # if no capture groups, just put the value in for the associated label
            else:
                temp_dict[self.label] = None

        # searched text is not empty and we do the searches
        else:
            # if there are named capture groups in the regex
            if self.has_groups:
                # if there are matches of some sort in this revision content, fill the lists for each cap_group
                if self.pattern.search(content) is not None:
                    m = self.pattern.finditer(content)
                    matchobjects = list(m)

                    for cap_group in self.capture_groups:
                        key = self._make_key(cap_group)
                        temp_list = []
                        for match in matchobjects:
                            # we only want to add the match for the capture group if the match is not None
                            if match.group(cap_group) is not None:
                                temp_list.append(match.group(cap_group))

                        # if temp_list of matches is empty just make that column None
                        if len(temp_list) == 0:
                            temp_dict[key] = None
                        # else we put in the list we made in the for-loop above
                        else:
                            temp_dict[key] = ', '.join(temp_list)

                # there are no matches at all in this revision content, we default values to None
                else:
                    for cap_group in self.capture_groups:
                        key = self._make_key(cap_group)
                        temp_dict[key] = None

            # there are no capture groups, we just search for all the matches of the regex
            else:
                # given that there are matches to be made
                if self.pattern.search(content) is not None:
                    m = self.pattern.findall(content)
                    temp_dict[self.label] = ', '.join(m)
                else:
                    temp_dict[self.label] = None

        # update rev_data with our new columns
        rev_data.update(temp_dict)
        return rev_data
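# Illustrative usage (not part of the original script; the pattern and label
# are made up):
#
#   rp = RegexPair(r'(?P<tmpl>\{\{[^}]*\}\})', 'template')
#   rp.matchmake("add {{citation needed}}", {})
#   # -> {'template_tmpl': '{{citation needed}}'}
#
# With no named capture groups, matched strings are joined under the bare
# label instead, yielding a single 'template' column.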
class WikiqParser():
    def __init__(self, input_file, output_file,
                 regex_match_revision, regex_match_comment,
                 regex_revision_label, regex_comment_label,
                 collapse_user=False, persist=None, urlencode=False,
                 namespaces=None, revert_radius=15):
        """
        persist : what persistence method to use. Takes a PersistMethod value
        """
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode
        self.revert_radius = revert_radius

        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
        self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)

    def make_matchmake_pairs(self, patterns, labels):
        if (patterns is not None and labels is not None) and \
           (len(patterns) == len(labels)):
            return [RegexPair(pattern, label) for pattern, label in zip(patterns, labels)]
        elif (patterns is None and labels is None):
            return []
        else:
            sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')

    def matchmake(self, rev, rev_data):
        # comments can be deleted or empty; treat them as empty strings
        if not rev.comment:
            rev.comment = ""

        rev_data = self.matchmake_revision(rev.text, rev_data)
        rev_data = self.matchmake_comment(rev.comment, rev_data)
        return rev_data

    def matchmake_revision(self, text, rev_data):
        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)

    def matchmake_comment(self, comment, rev_data):
        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)

    def matchmake_pairs(self, content, rev_data, pairs):
        for pair in pairs:
            rev_data = pair.matchmake(content, rev_data)
        return rev_data
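    # Illustrative note (not part of the original script; patterns and labels
    # are made up): patterns and labels are paired positionally, so a parser
    # constructed with regex_match_revision=[r'\[\[Category:'] and
    # regex_revision_label=['category_link'] adds a 'category_link' column to
    # every output row (None when the pattern does not match).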
    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns
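    # Illustrative example (not part of the original script; the mapping is
    # made up): once self.namespaces maps names to ids, e.g. {'Talk': 1, '': 0},
    # a title like "Talk:Foo" resolves to 1 via the prefix check above, while a
    # title with no recognized prefix falls back to the default namespace.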
    def process(self):

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                          r'output/wikiq-\1-\2.tsv',

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self.namespace_filter is not None:
                if namespace not in self.namespace_filter:
                    continue
            rev_detector = mwreverts.Detector(radius=self.revert_radius)

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                # self.persist == PersistMethod.legacy
                else:
                    from mw.lib import persistence
                    state = persistence.State()
            # Iterate through a page's revisions
            for rev in page:

                # initialize rev_data
                rev_data = {
                    'revid': rev.id,
                    'date_time': rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                    'articleid': page.id,
                    'editor_id': "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                    'title': '"' + page.title + '"',
                    'namespace': namespace,
                    'deleted': "TRUE" if rev.deleted.text else "FALSE"
                }
                rev_data = self.matchmake(rev, rev_data)

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # if text exists, we'll check for a sha1 and generate one otherwise
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""
                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"
                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""
                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #TODO missing: additions_size deletions_size

                # if collapse user was on, let's run that
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs
                if self.persist != PersistMethod.none:
                    if rev.deleted.text:
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            old_rev_data[k] = None
                    else:
                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS - 1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1
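            # Illustrative note (not part of the original script): the deque
            # above delays output, so a revision's persistence numbers are only
            # printed once PERSISTENCE_RADIUS - 1 = 6 later revisions have been
            # processed, or when the page ends and the remaining window is
            # flushed below.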
            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window) - (i + 1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)
    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
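# Illustrative note (not part of the original script): output is tab-separated
# with columns in sorted(key) order, so a minimal run starts with columns like
#   anon  articleid  date_time  deleted  editor  editor_id  ...
# and extra columns appear when --collapse-user, --persistence, or regex
# patterns are enabled.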
def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # no decompression command was set, so treat the file as uncompressed
        input_file = open(input_filename, 'r')

    return input_file
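# Illustrative usage (not part of the original script; the filename is made
# up): open_input_file("dumps/enwiki-sample.xml.bz2") spawns
# "bzcat -dk dumps/enwiki-sample.xml.bz2" and returns its stdout pipe, while a
# plain .xml file falls through to a regular open().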
def open_output_file(input_filename):
    # build the output filename from the input filename
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file
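# Illustrative example (not part of the original script; the filename is made
# up): open_output_file("enwiki-sample.xml.bz2") strips the compression suffix
# and ".xml", then opens "enwiki-sample.tsv" for writing.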
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str,
                    choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) the number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for the old behavior used in older research projects. Use -p=segment for an advanced persistence calculation method that is robust to content moves but prone to bugs, and slower.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output URL-encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="ID number of namespace to include. Can be specified more than once.")

parser.add_argument('-rr',
                    '--revert-radius',
                    dest="revert_radius",
                    type=int,
                    action='store',
                    default=15,
                    help="Number of edits to check when looking for reverts (default: 15)")

parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
                    help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")

parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
                    help="The label for the output column based on matching the regex in revision text.")

parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
                    help="The regular expression to search for in comments of revisions.")

parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
                    help="The label for the output column based on matching the regex in comments.")

args = parser.parse_args()
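# Illustrative invocation (not part of the original script; file and directory
# names are made up):
#
#   python3 wikiq enwiki-sample.xml.bz2 -o output/ -p sequence -n 0 -n 1 -u
#
# This parses one dump, keeps only namespaces 0 and 1, computes sequence-based
# persistence, url-encodes the title and editor columns, and writes
# output/enwiki-sample.tsv.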
# set persistence method

if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None
if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)
        wikiq = WikiqParser(input_file,
                            output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
                            regex_match_revision=args.regex_match_revision,
                            regex_revision_label=args.regex_revision_label,
                            regex_match_comment=args.regex_match_comment,
                            regex_comment_label=args.regex_comment_label)

        wikiq.process()

        # close things
        input_file.close()
        output_file.close()
else:
    wikiq = WikiqParser(sys.stdin,
                        sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        # persist_legacy=args.persist_legacy,
                        urlencode=args.urlencode,
                        namespaces=namespaces,
                        revert_radius=args.revert_radius,
                        regex_match_revision=args.regex_match_revision,
                        regex_revision_label=args.regex_revision_label,
                        regex_match_comment=args.regex_match_comment,
                        regex_comment_label=args.regex_comment_label)

    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")