# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
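
# This version emits one tab-separated row per revision. Columns (sorted by
# name in the header row) are: anon, articleid, date_time, deleted, editor,
# editor_id, minor, namespace, revert, reverteds, revid, sha1, text_chars,
# title, plus collapsed_revs when --collapse-user is set and token_revs,
# tokens_added, tokens_removed, tokens_window when a persistence flag is set.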
import argparse
import os
import re
import sys
from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1
from urllib.parse import quote

from mw.xml_dump import Iterator
import mwpersistence
import mwreverts
from deltas.tokenizers import wikitext_split
from deltas import SequenceMatcher

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7  # sliding window (in revisions) used for persistence stats
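
# calculate_persistence() summarizes one window entry: the first element sums,
# over every token added by a revision, the number of *later* revisions the
# token appears in; the second is simply how many tokens were added. This
# assumes each item in tokens_added carries a .revisions list, as the token
# objects returned by the persistence state below do.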
def calculate_persistence(tokens_added):
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))
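

# WikiqIterator wraps mw.xml_dump.Iterator and yields one WikiqPage per <page>
# element in the dump, passing the collapse_user setting through.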
class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Iterator.from_file(self.fh)
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page, collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)
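

# WikiqPage wraps a single page from the dump. With collapse_user=True,
# rev_list() yields only the final revision in each consecutive run of edits
# by the same editor and records the run length in rev.collapsed_revs.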
class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, collapse_user=False):
        self.id = page.id
        self.title = page.title
        self.namespace = page.namespace
        self.redirect = page.redirect
        self.restrictions = page.restrictions

        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for collapse_user=True: yield prev_rev only once the current
        # revision's editor differs from prev_rev's editor (or after the loop
        # ends), so each editor's consecutive run collapses to its final revision.
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs
            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    if not rev.contributor.user_text == prev_rev.contributor.user_text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev
            prev_rev = rev

        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
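

# WikiqParser drives the parse: it walks pages and revisions, detects reverts
# with mwreverts, optionally computes token persistence, and writes one TSV
# row per revision via print_rev_data().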
class WikiqParser():

    def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.persist_legacy = persist_legacy
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode
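
    # Map a title prefix such as "Talk:" to its namespace id. Titles with no
    # matching prefix fall back to the id stored under the None key (the
    # unnamed default namespace), if the dump defines one.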
    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns == None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

    def process(self):
        # the output filename could also be derived from the input filename, e.g.:
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                          r'output/wikiq-\1-\2.tsv',
        #                          input_filename)

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            rev_detector = mwreverts.Detector()

            if self.persist or self.persist_legacy:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if not self.persist_legacy:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                else:
                    from mw.lib import persistence
                    state = persistence.State()
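
            # The two persistence backends expose different update APIs
            # (DiffState.update() vs the legacy State.process()); both feed
            # the same sliding window of recent revisions below.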

            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.contributor.id == None else rev.contributor.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : page.namespace if page.namespace else self.__get_namespace_from_title(page.title),
                            'deleted' : "TRUE" if rev.text.deleted else "FALSE" }

                # if revisions are deleted, /many/ things will be missing
                if rev.text.deleted:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # if text exists, we'll check for a sha1 and generate one otherwise
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)
                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if rev.contributor.user_text:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.contributor.user_text + '"'
                    rev_data['anon'] = "TRUE" if rev.contributor.id == None else "FALSE"
                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):

                #TODO missing: additions_size deletions_size

                # if collapse user was on, let's run that
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist or self.persist_legacy:
                    if rev.text.deleted:
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            old_rev_data[k] = None
                    else:
                        if not self.persist_legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)
                else:
                    self.print_rev_data(rev_data)

                rev_count += 1
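
            # With persistence on, rows are emitted PERSISTENCE_RADIUS-1
            # revisions late so each row's counts reflect a full window;
            # whatever is still buffered when the page ends is flushed here
            # with a correspondingly smaller tokens_window.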

            if self.persist or self.persist_legacy:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
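

# I/O helpers: open_input_file shells out to an external decompressor (7za,
# zcat, or bzcat, which must be on PATH) for compressed dumps and falls back
# to a plain open(); open_output_file derives a .tsv name from the dump filename.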

def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # no decompressor matched, so cmd is unbound; read the file directly
        input_file = open(input_filename, 'r')

    return input_file

def open_output_file(input_filename):
    # derive the output filename from the input filename
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file
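
# For example, open_output_file("dump.xml.bz2") would write to "dump.tsv"
# (filename purely illustrative).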

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# positional arguments: the input dump file(s)
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by a user within each sequence of consecutive edits by that user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
                    help="Use the legacy behavior for the persistence calculation.")

args = parser.parse_args()

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # pick the directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=args.persist,
                            persist_legacy=args.persist_legacy,
                            urlencode=args.urlencode)
        wikiq.process()

        input_file.close()
        output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=args.persist,
                        persist_legacy=args.persist_legacy,
                        urlencode=args.urlencode)
    wikiq.process()
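
# Example invocations (illustrative paths; assumes this script is saved as "wikiq"):
#   python3 wikiq enwiki-20170101-pages-meta-history1.xml.7z -o output/
#   zcat dump.xml.gz | python3 wikiq --collapse-user -p > revisions.tsv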

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")