#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import argparse
import os
import re
import sys
from collections import deque
from hashlib import sha1
from subprocess import Popen, PIPE
from urllib.parse import quote

import mwpersistence
import mwreverts
from deltas import SequenceMatcher
from deltas import SegmentMatcher
from deltas.tokenizers import wikitext_split
from mwxml import Dump

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7
class PersistMethod:
    """Enumeration of the supported token-persistence calculation methods.

    Values are plain ints used only for identity comparisons
    (``self.persist != PersistMethod.none`` etc.).
    """
    none = 0
    sequence = 1
    segment = 2
    legacy = 3
def calculate_persistence(tokens_added):
    """Summarize persistence of the tokens added by one revision.

    Returns a 2-tuple:
      - total number of later revisions those tokens survived
        (``len(x.revisions) - 1`` per token: the adding revision itself
        does not count)
      - the number of tokens added

    Callers unpack it as ``num_token_revs, num_tokens``.
    """
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))
class WikiqIterator():
    """Wrap an mwxml dump file handle, yielding WikiqPage objects."""

    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        # map namespace id -> namespace name, taken from the dump header
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        """Generator over the dump's pages, wrapped as WikiqPage objects."""
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        # BUG FIX: the original read `next(self._pages)`, but the attribute
        # set in __init__ is the name-mangled `self.__pages`; `self._pages`
        # never exists and raised AttributeError.
        return next(self.__pages)
class WikiqPage():
    """Wrap an mwxml page: normalize the title with its namespace prefix
    and (optionally) collapse consecutive revisions by the same user so
    only the last revision of each run is yielded."""

    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            self.title = page.title
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs
            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    # also yield if we do know who the user is
                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    elif not rev.user.text == prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                else:
                    # if collapse_user is false, we always yield
                    yield prev_rev
            prev_rev = rev

        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
class RegexPair(object):
    """A compiled regex plus the output-column label(s) it produces.

    With named capture groups, each group gets its own column named
    '<label>_<group>'; otherwise all matches go in a single '<label>'
    column as a comma-separated string.
    """

    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def _make_key(self, cap_group):
        """Column name for one capture group."""
        return ("{}_{}".format(self.label, cap_group))

    def matchmake(self, content, rev_data):
        """Search `content` and merge the resulting columns into rev_data.

        Missing/empty content and no-match cases yield None columns, so
        every revision row carries the same set of keys.
        """
        temp_dict = {}
        # the searched text (content, which is rev.comment or rev.text) is empty
        if content is None:
            # if there are capture groups, we go through and put in a value for each group
            if self.has_groups:
                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_dict[key] = None
            # if no capture groups, just put the value in for the associated label
            else:
                temp_dict[self.label] = None
        # searched text is not empty and we do the searches
        else:
            # if there are named capture groups in the regex
            if self.has_groups:
                # if there are matches of some sort in this revision content, fill the lists for each cap_group
                if self.pattern.search(content) is not None:
                    m = self.pattern.finditer(content)
                    matchobjects = list(m)

                    for cap_group in self.capture_groups:
                        key = self._make_key(cap_group)
                        temp_list = []
                        for match in matchobjects:
                            # we only want to add the match for the capture group if the match is not None
                            if match.group(cap_group) != None:
                                temp_list.append(match.group(cap_group))

                        # if temp_list of matches is empty just make that column None
                        if len(temp_list)==0:
                            temp_dict[key] = None
                        # else we put in the list we made in the for-loop above
                        else:
                            temp_dict[key] = ', '.join(temp_list)

                # there are no matches at all in this revision content, we default values to None
                else:
                    for cap_group in self.capture_groups:
                        key = self._make_key(cap_group)
                        temp_dict[key] = None

            # there are no capture groups, we just search for all the matches of the regex
            else:
                #given that there are matches to be made
                if self.pattern.search(content) is not None:
                    m = self.pattern.findall(content)
                    m_fixed = []
                    for match in m:
                        # findall returns tuples when the regex has unnamed groups
                        if type(match) is tuple:
                            matchies = set()
                            for sub_m in match:
                                matchies.add(sub_m)
                            m_fixed += matchies
                        else:
                            m_fixed.append(match)
                    temp_dict[self.label] = ', '.join(m_fixed)
                else:
                    temp_dict[self.label] = None

        # update rev_data with our new columns
        rev_data.update(temp_dict)
        return rev_data
class WikiqParser():
    """Drive parsing of one dump: iterate pages/revisions and emit TSV rows."""

    def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
        """
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value
        """
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        # header is printed lazily, once, by print_rev_data
        self.printed_header = False
        # filled in by process() from the dump's siteinfo
        self.namespaces = []
        self.urlencode = urlencode
        self.revert_radius = revert_radius

        # optional whitelist of namespace ids to process
        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
        self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
235 def make_matchmake_pairs(self, patterns, labels):
\r
236 if (patterns is not None and labels is not None) and \
\r
237 (len(patterns) == len(labels)):
\r
238 return [RegexPair(pattern, label) for pattern, label in zip(patterns, labels)]
\r
239 elif (patterns is None and labels is None):
\r
242 sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
\r
244 def matchmake(self, rev, rev_data):
\r
247 if not rev.comment:
\r
250 rev_data = self.matchmake_revision(rev.text, rev_data)
\r
251 rev_data = self.matchmake_comment(rev.comment, rev_data)
\r
254 def matchmake_revision(self, text, rev_data):
\r
255 return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
\r
257 def matchmake_comment(self, comment, rev_data):
\r
258 return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
\r
260 def matchmake_pairs(self, content, rev_data, pairs):
\r
262 rev_data = pair.matchmake(content, rev_data)
\r
265 def __get_namespace_from_title(self, title):
\r
268 for ns in self.namespaces:
\r
269 # skip if the namespace is not defined
\r
271 default_ns = self.namespaces[ns]
\r
274 if title.startswith(ns + ":"):
\r
275 return self.namespaces[ns]
\r
277 # if we've made it this far with no matches, we return the default namespace
\r
283 # create a regex that creates the output filename
\r
284 # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
\r
285 # r'output/wikiq-\1-\2.tsv',
\r
288 # Construct dump file iterator
\r
289 dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
\r
291 # extract list of namspaces
\r
292 self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}
\r
298 # Iterate through pages
\r
300 namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)
\r
302 # skip namespaces not in the filter
\r
303 if self.namespace_filter is not None:
\r
304 if namespace not in self.namespace_filter:
\r
307 rev_detector = mwreverts.Detector(radius = self.revert_radius)
\r
309 if self.persist != PersistMethod.none:
\r
310 window = deque(maxlen=PERSISTENCE_RADIUS)
\r
312 if self.persist == PersistMethod.sequence:
\r
313 state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
\r
314 revert_radius=PERSISTENCE_RADIUS)
\r
316 elif self.persist == PersistMethod.segment:
\r
317 state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
\r
318 revert_radius=PERSISTENCE_RADIUS)
\r
320 # self.persist == PersistMethod.legacy
\r
322 from mw.lib import persistence
\r
323 state = persistence.State()
\r
325 # Iterate through a page's revisions
\r
328 # initialize rev_data
\r
331 'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
\r
332 'articleid' : page.id,
\r
333 'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
\r
334 'title' : '"' + page.title + '"',
\r
335 'namespace' : namespace,
\r
336 'deleted' : "TRUE" if rev.deleted.text else "FALSE"
\r
339 rev_data = self.matchmake(rev, rev_data)
\r
341 # if revisions are deleted, /many/ things will be missing
\r
342 if rev.deleted.text:
\r
343 rev_data['text_chars'] = ""
\r
344 rev_data['sha1'] = ""
\r
345 rev_data['revert'] = ""
\r
346 rev_data['reverteds'] = ""
\r
349 # rev.text can be None if the page has no text
\r
352 # if text exists, we'll check for a sha1 and generate one otherwise
\r
355 text_sha1 = rev.sha1
\r
358 text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
\r
360 rev_data['sha1'] = text_sha1
\r
362 # TODO rev.bytes doesn't work.. looks like a bug
\r
363 rev_data['text_chars'] = len(rev.text)
\r
365 # generate revert data
\r
366 revert = rev_detector.process(text_sha1, rev.id)
\r
369 rev_data['revert'] = "TRUE"
\r
370 rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
\r
372 rev_data['revert'] = "FALSE"
\r
373 rev_data['reverteds'] = ""
\r
375 # if the fact that the edit was minor can be hidden, this might be an issue
\r
376 rev_data['minor'] = "TRUE" if rev.minor else "FALSE"
\r
378 if not rev.deleted.user:
\r
379 # wrap user-defined editors in quotes for fread
\r
380 rev_data['editor'] = '"' + rev.user.text + '"'
\r
381 rev_data['anon'] = "TRUE" if rev.user.id == None else "FALSE"
\r
384 rev_data['anon'] = ""
\r
385 rev_data['editor'] = ""
\r
387 #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
\r
392 #TODO missing: additions_size deletions_size
\r
394 # if collapse user was on, lets run that
\r
395 if self.collapse_user:
\r
396 rev_data['collapsed_revs'] = rev.collapsed_revs
\r
398 if self.persist != PersistMethod.none:
\r
399 if rev.deleted.text:
\r
400 for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
\r
401 old_rev_data[k] = None
\r
404 if self.persist != PersistMethod.legacy:
\r
405 _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
\r
408 _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
\r
410 window.append((rev.id, rev_data, tokens_added, tokens_removed))
\r
412 if len(window) == PERSISTENCE_RADIUS:
\r
413 old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
\r
415 num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
\r
417 old_rev_data["token_revs"] = num_token_revs
\r
418 old_rev_data["tokens_added"] = num_tokens
\r
419 old_rev_data["tokens_removed"] = len(old_tokens_removed)
\r
420 old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1
\r
422 self.print_rev_data(old_rev_data)
\r
425 self.print_rev_data(rev_data)
\r
429 if self.persist != PersistMethod.none:
\r
430 # print out metadata for the last RADIUS revisions
\r
431 for i, item in enumerate(window):
\r
432 # if the window was full, we've already printed item 0
\r
433 if len(window) == PERSISTENCE_RADIUS and i == 0:
\r
436 rev_id, rev_data, tokens_added, tokens_removed = item
\r
437 num_token_revs, num_tokens = calculate_persistence(tokens_added)
\r
439 rev_data["token_revs"] = num_token_revs
\r
440 rev_data["tokens_added"] = num_tokens
\r
441 rev_data["tokens_removed"] = len(tokens_removed)
\r
442 rev_data["tokens_window"] = len(window)-(i+1)
\r
444 self.print_rev_data(rev_data)
\r
448 print("Done: %s revisions and %s pages." % (rev_count, page_count),
\r
451 def print_rev_data(self, rev_data):
\r
452 # if it's the first time through, print the header
\r
454 for field in TO_ENCODE:
\r
455 rev_data[field] = quote(str(rev_data[field]))
\r
457 if not self.printed_header:
\r
458 print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
\r
459 self.printed_header = True
\r
461 print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
\r
def open_input_file(input_filename):
    """Open a possibly-compressed dump, returning a readable file object.

    Known compression suffixes are streamed through an external
    decompressor; anything else falls through to plain open() (the
    NameError comes from `cmd` being unbound when no suffix matched).
    """
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        input_file = open(input_filename, 'r')

    return input_file
def open_output_file(input_filename):
    """Open the .tsv output file derived from the input dump's filename."""
    # create a regex that creates the output filename
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file
# ---- script entry: parse CLI arguments and run WikiqParser per dump ----
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="Id number of namspace to include. Can be specified more than once.")

parser.add_argument('-rr',
                    '--revert-radius',
                    dest="revert_radius",
                    type=int,
                    action='store',
                    default=15,
                    help="Number of edits to check when looking for reverts (default: 15)")

parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
                    help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")

parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
                    help="The label for the outputted column based on matching the regex in revision text.")

parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
                    help="The regular expression to search for in comments of revisions.")

parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
                    help="The label for the outputted column based on matching the regex in comments.")

args = parser.parse_args()

# set persistence method

if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            # place the derived .tsv next to the requested output directory
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file,
                            output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
                            regex_match_revision = args.regex_match_revision,
                            regex_revision_label = args.regex_revision_label,
                            regex_match_comment = args.regex_match_comment,
                            regex_comment_label = args.regex_comment_label)

        wikiq.process()

        # close things
        input_file.close()
        output_file.close()
else:
    # no files given: read the dump from stdin, write TSV to stdout
    wikiq = WikiqParser(sys.stdin,
                        sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        #persist_legacy=args.persist_legacy,
                        urlencode=args.urlencode,
                        namespaces=namespaces,
                        revert_radius=args.revert_radius,
                        regex_match_revision = args.regex_match_revision,
                        regex_revision_label = args.regex_revision_label,
                        regex_match_comment = args.regex_match_comment,
                        regex_comment_label = args.regex_comment_label)

    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")