# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import argparse
import os.path
import re
import sys
from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump
import mwpersistence
import mwreverts
from deltas.tokenizers import wikitext_split
from deltas import SequenceMatcher
from urllib.parse import quote

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7  # number of later revisions considered when measuring token persistence

def calculate_persistence(tokens_added):
    # returns (persistent token revisions, number of tokens added)
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))
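# Note on the persistence measure (explanatory comment, not from the original script):
# each element of tokens_added is expected to be a token whose .revisions list holds
# the revision that introduced it plus every later revision it survived into, so
# len(x.revisions) - 1 counts survivals. For example, a token introduced in one
# revision and still present in the next three contributes 3 to token_revs.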

class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page, collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)

class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, collapse_user=False):
        self.id = page.id
        self.title = page.title
        self.namespace = page.namespace
        self.redirect = page.redirect
        self.restrictions = page.restrictions

        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
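        # Illustrative example (comment added for clarity, not from the original):
        # for a page whose consecutive revision authors are A, A, B, A, the two
        # leading A edits collapse into a single yielded revision carrying
        # collapsed_revs == 2, followed by B (collapsed_revs == 1) and the final
        # A (collapsed_revs == 1); deleted/suppressed usernames always break a run.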
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs
            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    # also yield if we don't know who the user is
                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    elif rev.user.text != prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev
            prev_rev = rev
        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)

class WikiqParser():

    def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):

        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.persist_legacy = persist_legacy
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode

    def __get_namespace_from_title(self, title):
        default_ns = None
        for ns in self.namespaces:
            # the unnamed (None) entry is the default/main namespace
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns
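        # Example (comment added for clarity; namespace ids are the usual MediaWiki
        # values and can differ per wiki): with self.namespaces == {'Talk': 1, None: 0},
        # 'Talk:Anarchism' resolves to 1, while 'Anarchism' falls through to the default 0.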

    def process(self):
        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                          r'output/wikiq-\1-\2.tsv',

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            rev_detector = mwreverts.Detector()

            if self.persist or self.persist_legacy:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if not self.persist_legacy:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                else:
                    from mw.lib import persistence
                    state = persistence.State()
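                # Comment added for clarity (not from the original): both state objects
                # expose the diff-and-track call used below, which is expected to return
                # (operations, tokens_added, tokens_removed) for each revision; those token
                # lists are what feed calculate_persistence().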

            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title),
                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" }

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""
                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # if the dump provides a sha1 we use it; otherwise we generate one
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work... looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)
                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""
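                    # Comment added for clarity (not from the original): rev_detector
                    # remembers recent checksums, so when this revision's sha1 matches an
                    # earlier one it is treated as a revert, and revert.reverteds lists the
                    # ids of the intervening revisions that were undone.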

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"
                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):

                # TODO missing: additions_size deletions_size

                # if collapse_user was on, let's record how many revisions were collapsed
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist or self.persist_legacy:
                    if rev.deleted.text:
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            old_rev_data[k] = None
                    else:
                        if not self.persist_legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)
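                            # Comment added for clarity (not from the original): persistence
                            # stats for a revision are only final once PERSISTENCE_RADIUS
                            # later revisions have been diffed against it, so each row is
                            # held in the deque and printed with that delay; the remainder
                            # is flushed after the page's revision loop ends.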

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist or self.persist_legacy:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)
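                    # Comment added for clarity (not from the original): tokens_window
                    # records how many later revisions each of these trailing rows was
                    # actually compared against (len(window)-(i+1)), which is fewer than
                    # PERSISTENCE_RADIUS because the page ran out of revisions.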
292 print("Done: %s revisions and %s pages." % (rev_count, page_count),

    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        # keys are sorted so the header and every subsequent row share the same column order
        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)


def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    # if the filename matched a compressed format, stream it through the external
    # decompressor; otherwise cmd is unbound and we fall back to opening it directly
    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        input_file = open(input_filename, 'r')

    return input_file

def open_output_file(input_filename):
    # construct the output filename from the input filename
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file
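# Example (comment added for clarity; the filename is illustrative):
# open_output_file("enwiki-20180301-pages-meta-history1.xml.bz2") strips the
# ".xml.bz2" suffixes and writes to "enwiki-20180301-pages-meta-history1.tsv".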

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url-encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
                    help="Use the legacy (mw library) persistence calculation instead of mwpersistence.")

args = parser.parse_args()
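
# Example invocation (comment added for clarity; the script and dump names are illustrative):
#     python wikiq.py enwiki-20180301-pages-meta-history1.xml.bz2 -o output/ --collapse-user --persistence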

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=args.persist,
                            persist_legacy=args.persist_legacy,
                            urlencode=args.urlencode)

        wikiq.process()

else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=args.persist,
                        persist_legacy=args.persist_legacy,
                        urlencode=args.urlencode)

    wikiq.process()
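
# Example of the stdin/stdout mode (comment added for clarity; paths illustrative):
#     bzcat enwiki-20180301-pages-meta-history1.xml.bz2 | python wikiq.py > out.tsv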

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")