3 # original wikiq headers are: title articleid revid date_time anon
4 # editor editor_id minor text_size text_entropy text_md5 reversion
5 # additions_size deletions_size
11 from subprocess import Popen, PIPE
12 from collections import deque
13 from hashlib import sha1
15 from mwxml import Dump, Page
17 from deltas.tokenizers import wikitext_split
18 from mwdiffs.utilities import dump2diffs
20 from mwpersistence.state import DiffState
22 from mwpersistence import Token
23 from mwpersistence.utilities import diffs2persistence
25 from urllib.parse import quote
27 from deltas import SequenceMatcher
28 from deltas import SegmentMatcher
# Fields that are percent-encoded (urllib.parse.quote) by print_rev_data
# when url-encoding is enabled, protecting the TSV output from newlines
# and tabs embedded in page titles and editor names.
29 TO_ENCODE = ('title', 'editor')
# Tokenizer type labels treated as whitespace; dropped from persistence
# measures when exclude_ws is set in calculate_persistence().
32 ws_lex = ['break','whitespace']
# Tokenizer type labels treated as punctuation/markup; dropped from
# persistence measures when exclude_punct is set.
33 punct_lex = ['period','qmark','epoint','comma','colon','scolon','paren_open','paren_close','brack_open','brack_close','dbrack_close','dbrack_open','tab_close','tab_open','dcurly_close','dcurly_open','equals','bar','etc','bold','italic','tag','comment_end','comment_start']
# Reduce one revision's persistence tokens to summary counts.
#
# tokens_added / tokens_removed: mwpersistence Token sequences; each Token
# records in `.revisions` the list of revisions in which it survived.
# exclude_ws / exclude_punct: drop tokens whose tokenizer type label is in
# ws_lex / punct_lex before counting.
# NOTE(review): this excerpt elides lines (original numbering jumps 41->44,
# and the return tuple is truncated after its first element); the `legacy`
# flag's effect is not visible here — consult the full file.
41 def calculate_persistence(tokens_added, tokens_removed, exclude_ws = False, exclude_punct = False, legacy = False):
# Keep a token unless it is excluded punctuation or excluded whitespace.
44 cond = lambda t: not (exclude_punct and (t.type in punct_lex)) \
45 and not(exclude_ws and (t.type in ws_lex))
47 tokens_added = [t for t in tokens_added if cond(t)]
48 tokens_removed = [t for t in tokens_removed if cond(t)]
# First returned value: total revisions survived by the added tokens
# (len(revisions)-1 excludes the revision that introduced each token).
50 return(sum([(len(x.revisions)-1) for x in tokens_added]),
# Iterator over a MediaWiki XML dump that yields WikiqPage / LogItem
# wrappers instead of plain mwxml pages.
# NOTE(review): this excerpt elides lines (numbering jumps) — e.g. any
# @classmethod decorators and from_file's return statement are missing.
55 class WikiqIterator(Dump):
# Alternate constructor wrapping mwxml.Dump.from_file on a file handle.
# NOTE(review): collapse_user is stored on the class object itself, so it
# is shared by every dump constructed this way — confirm this is intended.
58 def from_file(cls, fh, collapse_user = False):
60 cls.collapse_user = collapse_user
61 cls = super(WikiqIterator, cls).from_file(fh)
# Dispatch each top-level dump element to the matching wrapper type.
65 def process_item(cls, item_element, namespace_map):
# Lazily build (once) a reverse map from namespace id -> namespace name,
# cached on the class for reuse across items.
66 if not hasattr(cls,'inv_namespace_map'):
67 cls.inv_namespace_map = {ns.id:name for name, ns in namespace_map.items()}
69 if item_element.tag == "page":
70 return WikiqPage.from_element(item_element, namespace_map, cls.inv_namespace_map, cls.collapse_user)
71 elif item_element.tag == "logitem":
72 return LogItem.from_element(item_element, namespace_map)
# Anything other than <page>/<logitem> means the dump is malformed.
74 raise MalformedXML("Expected to see <page> or <logitem>. " +
75 "Instead saw <{0}>".format(item_element.tag))
# Page wrapper that restores mwxml's namespace "correction" in the title,
# optionally collapses consecutive revisions by the same user, and
# normalizes each revision's sha1/text_chars fields.
# NOTE(review): this excerpt elides many lines (numbering jumps), including
# the __iter__/__next__ scaffolding around __find_next_revision, the
# initialization of collapsed_revs, and several yield statements — consult
# the full file before changing control flow here.
77 class WikiqPage(Page):
78 __slots__ = ('id', 'title', 'namespace', 'redirect',
79 'restrictions','collapse_user')
# Alternate constructor from a dump <page> element.
82 def from_element(cls, item_element, namespace_map, inv_namespace_map, collapse_user = False):
85 cls = super(WikiqPage, cls).from_element(item_element, namespace_map)
87 # following mwxml, we assume namespace 0 in cases where
88 # page.namespace is inconsistent with namespace_map
89 # this undoes the "correction" of the namespace in mwxml
91 if cls.namespace not in inv_namespace_map:
# Re-prefix the title with its namespace name for non-main namespaces.
93 if cls.namespace != 0:
94 cls.title = ':'.join([inv_namespace_map[cls.namespace], cls.title])
96 cls.collapse_user = collapse_user
# Reach into the name-mangled private attribute of mwxml.Page to obtain
# the underlying revision iterator.
97 cls.revisions = cls._Page__revisions
# Normalize a revision in place: zero out text-derived fields when the
# text is deleted, and ensure a sha1 is present (computing one from the
# text when the dump did not provide it).
101 def _correct_sha(rev_data):
103 if rev_data.deleted.text:
105 rev_data.text_chars = 0
108 rev_data.reverteds = ""
111 if rev_data.text is None :
114 rev_data.text_chars = len(rev_data.text)
# Prefer the dump-provided sha1; otherwise hash the utf8-encoded text.
116 if hasattr(rev_data,"sha1") and rev_data.sha1 is not None:
117 text_sha1 = rev_data.sha1
120 text_sha1 = sha1(bytes(rev_data.text, "utf8")).hexdigest()
122 rev_data.sha1 = text_sha1
126 # Outline for how we want to handle collapse_user=True
127 # iteration rev.user prev_rev.user add prev_rev?
# Advance to the next revision to report.  With collapse_user=True, runs
# of consecutive edits by the same user are collapsed and the run length
# is recorded on the yielded revision as .collapsed_revs.
134 def __find_next_revision(self):
# Prime prev_rev on the first call.
135 if self.prev_rev is None:
136 prev_rev = WikiqPage._correct_sha(next(self.revisions))
137 self.prev_rev = prev_rev
139 prev_rev = self.prev_rev
141 if self.collapse_user:
143 self.prev_rev.collapsed_revs = collapsed_revs
144 prev_rev = self.prev_rev
146 for rev in self.revisions:
147 rev = WikiqPage._correct_sha(rev)
148 if self.collapse_user:
149 # yield if this is the last edit in a seq by a user and reset
150 # also yield if we do not know who the user is
152 if rev.deleted.user or prev_rev.deleted.user:
154 if prev_rev is not None:
155 prev_rev.collapsed_revs = collapsed_revs
# A different (known) user ends the previous user's run of edits.
158 elif not rev.user.text == prev_rev.user.text:
160 if prev_rev is not None:
161 prev_rev.collapsed_revs = collapsed_revs
164 # otherwise, add one to the counter
167 rev.collapsed_revs = collapsed_revs
168 # if collapse_user is false, we always yield
171 if prev_rev is not None:
# After the iterator is exhausted, flush the final pending revision.
177 if self.collapse_user:
178 prev_rev.collapsed_revs = collapsed_revs
# NOTE(review): the two calls below appear to come from __iter__/__next__
# bodies whose def lines are elided in this excerpt — confirm in full file.
183 revision = self.__find_next_revision()
189 revision = self.__find_next_revision()
# Build a parser over one dump stream (method of WikiqParser; the class
# statement is elided from this excerpt).
#
# input_file: readable dump stream; output_file: TSV sink;
# collapse_user: collapse consecutive same-user revisions;
# persist: a PersistMethod value selecting the persistence algorithm;
# urlencode: percent-encode the TO_ENCODE fields on output;
# namespaces: iterable of namespace ids to include (None = all);
# exclude_punct / exclude_ws: drop those token classes from persistence.
# NOTE(review): lines are elided here — e.g. the docstring delimiters
# around original line 198 and the `else:` before line 210.
195 def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False, namespaces = None, exclude_punct = False, exclude_ws = False):
198 persist : what persistence method to use. Takes a PersistMethod value
200 self.input_file = input_file
201 self.output_file = output_file
202 self.collapse_user = collapse_user
203 self.persist = persist
# The header row is printed lazily by print_rev_data on the first record.
204 self.printed_header = False
206 self.urlencode = urlencode
# Normalize the namespace filter to a set for O(1) membership tests.
207 if namespaces is not None:
208 self.namespace_filter = set(namespaces)
210 self.namespace_filter = None
212 self.exclude_punct = exclude_punct
213 self.exclude_ws = exclude_ws
215 # Construct dump file iterator
216 self.dump = WikiqIterator.from_file(self.input_file, self.collapse_user)
# Pre-build the diff engine for the token-level persistence methods.
218 self.diff_engine = None
220 if self.persist == PersistMethod.sequence:
221 self.diff_engine = SequenceMatcher(tokenizer = wikitext_split)
223 if self.persist == PersistMethod.segment:
224 self.diff_engine = SegmentMatcher(tokenizer = wikitext_split)
# Body of WikiqParser.process (its def line is elided from this excerpt):
# iterate pages and revisions, compute revert and persistence measures,
# and emit one TSV row per revision via print_rev_data.
230 for page in self.dump:
232 # skip pages not in the namespaces we want
233 if self.namespace_filter is not None and page.namespace not in self.namespace_filter:
# Fresh revert detector per page; it matches revisions by sha1 within a
# sliding radius to find identity reverts.
236 rev_detector = mwreverts.Detector()
# Persistence needs a trailing window of revisions: a token's fate is
# only known PERSISTENCE_RADIUS revisions after it was added.
238 if self.persist != PersistMethod.none:
239 window = deque(maxlen=PERSISTENCE_RADIUS)
241 if self.persist == PersistMethod.sequence:
242 state = DiffState(SequenceMatcher(tokenizer = wikitext_split),
243 revert_radius=PERSISTENCE_RADIUS)
245 elif self.persist == PersistMethod.segment:
246 state = DiffState(SegmentMatcher(tokenizer = wikitext_split),
247 revert_radius=PERSISTENCE_RADIUS)
# Legacy method: persistence state from the older `mw` library.
250 from mw.lib import persistence
251 state = persistence.State()
253 # Iterate through a page's revisions
# Base row fields; title is wrapped in quotes for downstream TSV readers.
255 rev_data = {'revid' : rev.id,
256 'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
257 'articleid' : page.id,
258 'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
259 'title' : '"' + page.title + '"',
260 'namespace' : page.namespace,
261 'deleted' : "TRUE" if rev.deleted.text else "FALSE" }
263 # if revisions are deleted, /many/ things will be missing
265 rev_data['text_chars'] = ""
266 rev_data['sha1'] = ""
267 rev_data['revert'] = ""
268 rev_data['reverteds'] = ""
271 # rev.text can be None if the page has no text
274 # if text exists, we'll check for a sha1 and generate one otherwise
280 text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
282 rev_data['sha1'] = text_sha1
284 # TODO rev.bytes doesn't work.. looks like a bug
285 rev_data['text_chars'] = len(rev.text)
287 # generate revert data
288 revert = rev_detector.process(text_sha1, rev.id)
291 rev_data['revert'] = "TRUE"
292 rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
294 rev_data['revert'] = "FALSE"
295 rev_data['reverteds'] = ""
297 # if the fact that the edit was minor can be hidden, this might be an issue
298 rev_data['minor'] = "TRUE" if rev.minor else "FALSE"
300 if not rev.deleted.user:
301 # wrap user-defined editors in quotes for fread
302 rev_data['editor'] = '"' + rev.user.text + '"'
303 rev_data['anon'] = "TRUE" if rev.user.id == None else "FALSE"
306 rev_data['anon'] = ""
307 rev_data['editor'] = ""
309 # we can easily add redirect info
310 # rev_data['redirect'] = rev.page.redirect
312 if self.collapse_user:
313 rev_data['collapsed_revs'] = rev.collapsed_revs
315 if self.persist != PersistMethod.none:
# NOTE(review): the guard before line 317 is elided (numbering jumps
# 315 -> 317), and old_rev_data is used here before any visible
# assignment — consult the full file for the missing branch.
317 for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
318 old_rev_data[k] = None
# Feed revision text to the persistence state: modern methods use
# DiffState.update; the legacy state also takes the sha1.
321 if self.persist != PersistMethod.legacy:
322 _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
325 _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
327 window.append((rev.id, rev_data, tokens_added, tokens_removed))
# Once the window is full, the oldest revision's token fate is settled
# and its row can finally be emitted.
329 if len(window) == PERSISTENCE_RADIUS:
330 old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
334 num_tokens_removed = \
335 calculate_persistence(
338 exclude_ws = self.exclude_ws,
339 exclude_punct = self.exclude_punct,
340 legacy = self.persist == PersistMethod.legacy)
342 old_rev_data["token_revs"] = num_token_revs
343 old_rev_data["tokens_added"] = num_tokens_added
344 old_rev_data["tokens_removed"] = num_tokens_removed
# The oldest row had a full window of look-ahead (minus itself).
345 old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1
347 self.print_rev_data(old_rev_data)
# No persistence requested: emit the row immediately.
350 self.print_rev_data(rev_data)
# After the page's last revision, drain the still-pending window.
354 if self.persist != PersistMethod.none:
355 # print out metadata for the last RADIUS revisions
356 for i, item in enumerate(window):
357 # if the window was full, we've already printed item 0
358 if len(window) == PERSISTENCE_RADIUS and i == 0:
361 rev_id, rev_data, tokens_added, tokens_removed = item
365 num_tokens_removed = calculate_persistence(
368 exclude_ws = self.exclude_ws,
369 exclude_punct = self.exclude_punct,
370 legacy = self.persist == PersistMethod.legacy)
373 rev_data["token_revs"] = num_token_revs
374 rev_data["tokens_added"] = num_tokens_added
375 rev_data["tokens_removed"] = num_tokens_removed
# Trailing rows have progressively shorter look-ahead windows.
376 rev_data["tokens_window"] = len(window)-(i+1)
378 self.print_rev_data(rev_data)
# Final progress report (continuation of the call is elided).
382 print("Done: %s revisions and %s pages." % (rev_count, page_count),
# Emit one revision's fields as a tab-separated row, preceded (once per
# parser) by a header line of the sorted keys.  Values are emitted in
# sorted-key order so every row's columns match the header.
# NOTE(review): original line 387 is elided here; the TO_ENCODE loop is
# presumably guarded by `if self.urlencode:` — confirm in the full file.
385 def print_rev_data(self, rev_data):
386 # if it's the first time through, print the header
# Percent-encode fields that may contain newlines/tabs (title, editor).
388 for field in TO_ENCODE:
389 rev_data[field] = quote(str(rev_data[field]))
391 if not self.printed_header:
392 print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
393 self.printed_header = True
395 print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
# Open a (possibly compressed) dump for reading: shell out to the matching
# decompressor for .7z/.gz/.bz2 and stream its stdout; otherwise open the
# file directly in text mode.
# NOTE(review): the branch structure between lines 404/407/409 and the
# return statement are elided from this excerpt (numbering jumps).
398 def open_input_file(input_filename):
399 if re.match(r'.*\.7z$', input_filename):
400 cmd = ["7za", "x", "-so", input_filename, '*']
401 elif re.match(r'.*\.gz$', input_filename):
402 cmd = ["zcat", input_filename]
403 elif re.match(r'.*\.bz2$', input_filename):
404 cmd = ["bzcat", "-dk", input_filename]
# Stream the decompressor's stdout rather than materializing the dump.
407 input_file = Popen(cmd, stdout=PIPE).stdout
# Uncompressed fallback: plain open.
409 input_file = open(input_filename, 'r')
# Derive and open the .tsv output file from the input dump filename by
# stripping a trailing compression suffix, then an .xml suffix.
# NOTE(review): the return statement is elided from this excerpt.  Also,
# the first pattern's optional group means a bare trailing '.' is stripped
# too, and the second sub is unanchored so '.xml' anywhere in the name
# would be removed — confirm intended behavior against the full file.
413 def open_output_file(input_filename):
414 # create a regex that creates the output filename
415 output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
416 output_filename = re.sub(r'\.xml', '', output_filename)
417 output_filename = output_filename + ".tsv"
418 output_file = open(output_filename, "w")
# Script entry: parse CLI arguments, map --persistence onto a PersistMethod
# value, then run a WikiqParser per dump file (or stdin -> stdout).
# NOTE(review): many lines are elided in this excerpt (numbering jumps),
# including the `persist=` keyword in the first WikiqParser call and the
# wikiq.process() invocations — consult the full file.
422 parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.')
424 # arguments for the input direction
425 parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
426 help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")
428 parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
429 help="Directory for output files.")
431 parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
432 help="Write output to standard out (do not create dump file)")
434 parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
435 help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
# -p with no value yields const='' which maps to the sequence method below.
437 parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
438 help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. Use -p=segment for advanced persistence calculation method that is robust to content moves. This might be very slow. Use -p=legacy for legacy behavior.")
440 parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
441 help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
443 parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
444 help="Id number of namspace to include. Can be specified more than once.")
446 parser.add_argument('--exclude-whitespace', dest="exclude_ws", action="store_true",
447 help="Flag to remove whitespace from persistence measures.")
449 parser.add_argument('--exclude-punctuation', dest="exclude_punct", action="store_true",
450 help="Flag to remove punctuation from persistence measures.")
452 args = parser.parse_args()
454 # set persistence method
# Map the CLI string onto the PersistMethod enum; any other accepted
# choice ('' or 'sequence') falls through to the sequence method.
456 if args.persist is None:
457 persist = PersistMethod.none
458 elif args.persist == "segment":
459 persist = PersistMethod.segment
460 elif args.persist == "legacy":
461 persist = PersistMethod.legacy
463 persist = PersistMethod.sequence
465 if args.namespace_filter is not None:
466 namespaces = args.namespace_filter
# File mode: one input/output pair per dump file named on the CLI.
470 if len(args.dumpfiles) > 0:
471 for filename in args.dumpfiles:
472 input_file = open_input_file(filename)
474 # open directory for output
476 output_dir = args.output_dir[0]
480 print("Processing file: %s" % filename, file=sys.stderr)
# --stdout writes rows to stdout; otherwise derive a per-file .tsv path
# inside output_dir.
483 output_file = sys.stdout
485 filename = os.path.join(output_dir, os.path.basename(filename))
486 output_file = open_output_file(filename)
488 wikiq = WikiqParser(input_file, output_file,
489 collapse_user=args.collapse_user,
491 urlencode=args.urlencode,
492 namespaces = namespaces,
493 exclude_punct = args.exclude_punct,
494 exclude_ws = args.exclude_ws)
# stdin mode: no dump files given, read from stdin and write to stdout.
502 wikiq = WikiqParser(sys.stdin, sys.stdout,
503 collapse_user=args.collapse_user,
# NOTE(review): args defines no 'persist_legacy' attribute and the visible
# WikiqParser.__init__ has no such parameter — this call looks broken; it
# should presumably pass persist=persist. Confirm against the full file.
505 persist_legacy=args.persist_legacy,
506 urlencode=args.urlencode,
507 namespaces = namespaces)
510 # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
511 # stop_words = stop_words.split(",")