# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size

import argparse
import sys
import os, os.path
import re

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump, Page, LogItem
from mwxml.errors import MalformedXML

from deltas.tokenizers import wikitext_split
from deltas import SequenceMatcher
from deltas import SegmentMatcher

from mwdiffs.utilities import dump2diffs

from mwpersistence.state import DiffState
from mwpersistence import Token
from mwpersistence.utilities import diffs2persistence

import mwreverts

from urllib.parse import quote

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7
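
# Token *type* names emitted by deltas' wikitext_split tokenizer. These lists
# let calculate_persistence() optionally ignore whitespace and
# punctuation/markup tokens when counting persistent tokens.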
ws_lex = ['break', 'whitespace']
punct_lex = ['period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
             'paren_open', 'paren_close', 'brack_open', 'brack_close',
             'dbrack_close', 'dbrack_open', 'tab_close', 'tab_open',
             'dcurly_close', 'dcurly_open', 'equals', 'bar', 'etc', 'bold',
             'italic', 'tag', 'comment_end', 'comment_start']

class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3

def calculate_persistence(tokens_added, tokens_removed, exclude_ws=False, exclude_punct=False, legacy=False):

    # note: with the exclude flags at their False defaults the lambda
    # short-circuits, so it is safe even for legacy tokens that lack a .type
    cond = lambda t: not (exclude_punct and (t.type in punct_lex)) \
           and not (exclude_ws and (t.type in ws_lex))

    tokens_added = [t for t in tokens_added if cond(t)]
    tokens_removed = [t for t in tokens_removed if cond(t)]

    return (sum([(len(x.revisions) - 1) for x in tokens_added]),
            len(tokens_added),
            len(tokens_removed))
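
# Illustration of the return value: a token added in revision r that is still
# present in the next three revisions of the persistence window has
# len(token.revisions) - 1 == 3 and so contributes 3 to the first element of
# the tuple ("token_revs"); the other two elements are simple counts of the
# (filtered) added and removed tokens.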

class WikiqIterator(Dump):
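    """
    Dump subclass whose process_item() hook wraps <page> elements in
    WikiqPage (passing the collapse_user flag through) instead of
    mwxml's plain Page.
    """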

    @classmethod
    def from_file(cls, fh, collapse_user=False):
        cls.collapse_user = collapse_user
        cls = super(WikiqIterator, cls).from_file(fh)
        return cls

    @classmethod
    def process_item(cls, item_element, namespace_map):
        if not hasattr(cls, 'inv_namespace_map'):
            cls.inv_namespace_map = {ns.id: name for name, ns in namespace_map.items()}

        if item_element.tag == "page":
            return WikiqPage.from_element(item_element, namespace_map,
                                          cls.inv_namespace_map, cls.collapse_user)
        elif item_element.tag == "logitem":
            return LogItem.from_element(item_element, namespace_map)
        else:
            raise MalformedXML("Expected to see <page> or <logitem>. " +
                               "Instead saw <{0}>".format(item_element.tag))

class WikiqPage(Page):
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'collapse_user')

    @classmethod
    def from_element(cls, item_element, namespace_map, inv_namespace_map, collapse_user=False):
        cls = super(WikiqPage, cls).from_element(item_element, namespace_map)

        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map;
        # this undoes the "correction" of the namespace in mwxml
        if cls.namespace not in inv_namespace_map:
            cls.namespace = 0
        if cls.namespace != 0:
            cls.title = ':'.join([inv_namespace_map[cls.namespace], cls.title])

        cls.collapse_user = collapse_user
        cls.revisions = cls._Page__revisions
        return cls

    @staticmethod
    def _correct_sha(rev_data):
        if rev_data.deleted.text:
            rev_data.text = ""
            rev_data.text_chars = 0
            rev_data.sha1 = ""
            rev_data.revert = ""
            rev_data.reverteds = ""
        else:
            if rev_data.text is None:
                rev_data.text = ""

            rev_data.text_chars = len(rev_data.text)

            if hasattr(rev_data, "sha1") and rev_data.sha1 is not None:
                text_sha1 = rev_data.sha1
            else:
                text_sha1 = sha1(bytes(rev_data.text, "utf8")).hexdigest()

            rev_data.sha1 = text_sha1

        return rev_data

    # Outline for how we want to handle collapse_user=True
    # iteration    rev.user    prev_rev.user   add prev_rev?
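    # (illustrative rows, consistent with the logic below)
    # n            A           None            no
    # n+1          A           A               no  (collapsed_revs += 1)
    # n+2          B           A               yes (user changed)
    # n+3          deleted     B               yes (user unknown)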

    def __find_next_revision(self):
        if self.prev_rev is None:
            prev_rev = WikiqPage._correct_sha(next(self.revisions))
            self.prev_rev = prev_rev
        else:
            prev_rev = self.prev_rev

        if self.collapse_user:
            collapsed_revs = 1
            self.prev_rev.collapsed_revs = collapsed_revs
            prev_rev = self.prev_rev

        for rev in self.revisions:
            rev = WikiqPage._correct_sha(rev)
            if self.collapse_user:
                # yield if this is the last edit in a seq by a user and reset
                # also yield if we don't know who the user is
                if rev.deleted.user or prev_rev.deleted.user:
                    self.prev_rev = rev
                    if prev_rev is not None:
                        prev_rev.collapsed_revs = collapsed_revs
                        return prev_rev

                elif not rev.user.text == prev_rev.user.text:
                    self.prev_rev = rev
                    if prev_rev is not None:
                        prev_rev.collapsed_revs = collapsed_revs
                        return prev_rev

                # otherwise, add one to the counter
                else:
                    collapsed_revs += 1
                    rev.collapsed_revs = collapsed_revs
                    self.prev_rev = rev
                    prev_rev = rev

            # if collapse_user is false, we always yield
            else:
                self.prev_rev = rev
                if prev_rev is not None:
                    return prev_rev

        # out of revisions: flush the final (possibly collapsed) revision
        prev_rev = self.prev_rev
        self.prev_rev = None
        if prev_rev is not None:
            if self.collapse_user:
                prev_rev.collapsed_revs = collapsed_revs
            return prev_rev
        raise StopIteration()

    def __next__(self):
        revision = self.__find_next_revision()
        return revision

    def __iter__(self):
        while True:
            try:
                revision = self.__find_next_revision()
            except StopIteration:
                return
            yield revision

class WikiqParser():

    def __init__(self, input_file, output_file, collapse_user=False, persist=PersistMethod.none, urlencode=False, namespaces=None):
        """
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.printed_header = False
        self.urlencode = urlencode

        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                          r'output/wikiq-\1-\2.tsv',

        # Construct dump file iterator
        self.dump = WikiqIterator.from_file(self.input_file, self.collapse_user)

        self.diff_engine = None

        if self.persist == PersistMethod.sequence:
            self.diff_engine = SequenceMatcher(tokenizer=wikitext_split)

        elif self.persist == PersistMethod.segment:
            self.diff_engine = SegmentMatcher(tokenizer=wikitext_split)
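
    # Usage sketch (mirrors the command-line code below): read a dump on stdin
    # and write TSV to stdout with persistence calculation disabled:
    #   wikiq = WikiqParser(sys.stdin, sys.stdout, persist=PersistMethod.none)
    #   wikiq.process()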

    # def __get_namespace_from_title(self, title):
    #     for ns in self.namespaces:
    #         # skip if the namespace is not defined
    #         default_ns = self.namespaces[ns]
    #         if title.startswith(ns + ":"):
    #             return self.namespaces[ns]
    #     # if we've made it this far with no matches, we return the default namespace

    # def _set_namespace(self, rev_docs):
    #     for rev_data in rev_docs:
    #         if 'namespace' not in rev_data['page']:
    #             namespace = self.__get_namespace_from_title(page['title'])
    #             rev_data['page']['namespace'] = namespace

    def process(self):
        page_count = 0
        rev_count = 0

        for page in self.dump:

            # skip pages not in the namespaces we want
            if self.namespace_filter is not None and page.namespace not in self.namespace_filter:
                continue

            rev_detector = mwreverts.Detector()

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                      revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                      revert_radius=PERSISTENCE_RADIUS)

                else:  # self.persist == PersistMethod.legacy
                    from mw.lib import persistence
                    state = persistence.State()
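
            # note: DiffState.update() and the legacy State.process() both
            # return an (operations, tokens_added, tokens_removed) triple, so
            # the per-revision code below can treat the engines uniformly.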

            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid': rev.id,
                            'date_time': rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid': page.id,
                            'editor_id': "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                            'title': '"' + page.title + '"',
                            'namespace': page.namespace,
                            'deleted': "TRUE" if rev.deleted.text else "FALSE"}

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # if text exists, we'll check for a sha1 and generate one otherwise
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)
                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""
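
                    # mwreverts detects identity reverts: process() returns a
                    # Revert object whenever this checksum matches an earlier
                    # revision within the detector's radius, and .reverteds
                    # lists the revision ids that were undone.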

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"
                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                # if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):

                # TODO missing: additions_size deletions_size

                # if collapse user was on, lets run that
                # if self.collapse_user:
                #     rev_data.collapsed_revs = rev.collapsed_revs

                if self.persist != PersistMethod.none:
                    if rev.deleted.text:
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            old_rev_data[k] = None
                    else:
                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens_added, num_tokens_removed = calculate_persistence(old_tokens_added, old_tokens_removed, legacy=(self.persist == PersistMethod.legacy))

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens_added
                            old_rev_data["tokens_removed"] = num_tokens_removed
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS - 1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item

                    num_token_revs, num_tokens_added, num_tokens_removed = calculate_persistence(tokens_added, tokens_removed, legacy=(self.persist == PersistMethod.legacy))

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens_added
                    rev_data["tokens_removed"] = num_tokens_removed
                    rev_data["tokens_window"] = len(window) - (i + 1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)

def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # cmd is undefined for uncompressed input, so fall back to plain open()
        input_file = open(input_filename, 'r')

    return input_file

def open_output_file(input_filename):
    # build the output filename from the input filename
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file
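
# e.g. "enwiki-pages.xml.bz2" -> "enwiki-pages.tsv": strip the compression
# suffix, strip ".xml", then append ".tsv".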

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. Use -p=segment for an advanced persistence calculation method that is robust to content moves, but can be very slow. Use -p=legacy for legacy behavior.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="Id number of namespace to include. Can be specified more than once.")

args = parser.parse_args()

# set persistence method
if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces)
        wikiq.process()

        input_file.close()
        output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces=namespaces)
    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")