# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
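
# This script streams a MediaWiki XML dump and emits one tab-separated row per
# revision (or per collapsed run of revisions when --collapse-user is given).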

import argparse, os.path, re, sys

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump
import mwpersistence
import mwreverts
from deltas.tokenizers import wikitext_split
from deltas import SequenceMatcher

from urllib.parse import quote
TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7  # width (in revisions) of the sliding window used for persistence measures
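
# calculate_persistence() summarizes the tokens one revision added: the total number
# of later revisions those tokens survived through, and how many tokens were added.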
def calculate_persistence(tokens_added):
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))

class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)
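
# Minimal usage sketch (hypothetical file handle `dump_fh`):
#   dump = WikiqIterator(dump_fh, collapse_user=True)
#   for page in dump:        # WikiqPage objects
#       for rev in page:     # mwxml revision objects
#           print(rev.id)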

class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration    rev.user    prev_rev.user    add prev_rev?
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs
            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    # also yield if we don't know who the user is
                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    elif not rev.user.text == prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                else:
                    # if collapse_user is false, we always yield
                    yield prev_rev
            prev_rev = rev

        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
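
# With collapse_user=True, WikiqPage yields only the last revision in each run of
# consecutive edits by the same user and sets rev.collapsed_revs to the run length;
# otherwise every revision is yielded unchanged.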

class WikiqParser():
    def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.persist_legacy = persist_legacy
        self.printed_header = False
        self.urlencode = urlencode

    def __get_namespace_from_title(self, title):
        default_ns = None
        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns
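
    # Example (hypothetical namespace map): with self.namespaces == {'Talk': 1, 'User': 2},
    # __get_namespace_from_title('Talk:Sandbox') would return 1.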

    def process(self):
        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                          r'output/wikiq-\1-\2.tsv',

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            rev_detector = mwreverts.Detector()

            if self.persist or self.persist_legacy:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if not self.persist_legacy:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                else:
                    from mw.lib import persistence
                    state = persistence.State()
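
            # The persistence `state` scores how long each token survives across later
            # revisions; `window` (a deque of the last PERSISTENCE_RADIUS revisions)
            # delays printing so those scores can accumulate before a row is emitted.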

            # Iterate through a page's revisions
            for rev in page:
                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title),
                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" }

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # if text exists, we'll check for a sha1 and generate one otherwise
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"
                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):

                #TODO missing: additions_size deletions_size

                # if collapse_user was on, let's run that
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist or self.persist_legacy:
                    if rev.deleted.text:
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                    else:
                        if not self.persist_legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1
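
            # Once the page's revision loop ends, any revisions still waiting in
            # `window` have not been printed; they are flushed below with a
            # correspondingly smaller tokens_window value.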

            if self.persist or self.persist_legacy:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
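
    # Fields are written in sorted key order, so the header row and every data row
    # stay aligned even when optional fields (collapsed_revs, token_revs, ...) are present.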

def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # no decompression command matched, so treat the input as an uncompressed file
        input_file = open(input_filename, 'r')

    return input_file
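
# Note: decompression shells out to external 7za / zcat / bzcat binaries, which
# therefore need to be available on the PATH.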

def open_output_file(input_filename):
    # derive the output filename from the input dump filename
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by a user within each sequence of consecutive edits they made. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) the number of revisions used in computing the first measure.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
                    help="Use the legacy (mw.lib.persistence) implementation of the persistence calculation.")

args = parser.parse_args()

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=args.persist,
                            persist_legacy=args.persist_legacy,
                            urlencode=args.urlencode)

        wikiq.process()
        input_file.close()
        output_file.close()

else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=args.persist,
                        persist_legacy=args.persist_legacy,
                        urlencode=args.urlencode)

    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
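
# Example invocations (hypothetical dump filename):
#   python wikiq enwiki-latest-pages-meta-history1.xml.bz2 -o output/
#   bzcat enwiki-latest-pages-meta-history1.xml.bz2 | python wikiq --collapse-user -p > revs.tsv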