#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import pdb
import argparse
import sys
import os, os.path
import re

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump

from deltas.tokenizers import wikitext_split
from deltas import SequenceMatcher
import mwpersistence
import mwreverts
from urllib.parse import quote

# fields that may contain arbitrary user text and therefore get url-encoded
TO_ENCODE = ('title', 'editor')
# size of the sliding window used for the persistence calculation
PERSISTENCE_RADIUS = 7

def calculate_persistence(tokens_added):
    # returns (persistent token revisions, number of tokens added); each
    # token's .revisions lists the revisions it appeared in, so
    # len(x.revisions) - 1 counts the later revisions the token survived into
    return (sum([(len(x.revisions) - 1) for x in tokens_added]),
            len(tokens_added))
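# An illustrative sketch (not part of the script) of what
# calculate_persistence computes; SimpleNamespace stands in for the
# mwpersistence token objects, which carry a .revisions list:
#
#   >>> from types import SimpleNamespace
#   >>> toks = [SimpleNamespace(revisions=[1, 2, 3]),
#   ...         SimpleNamespace(revisions=[1])]
#   >>> calculate_persistence(toks)
#   (2, 2)
#
# The first token persisted through two later revisions, the second through
# none, and two tokens were added in total.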


class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)

class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a sequence by a user
                    # and reset; also yield if we don't know who the user is

                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif rev.user.text != prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final revision
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
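# An illustrative, hypothetical use of the two classes above (not part of the
# script): iterate over pages and revisions in an uncompressed dump file.
#
#   with open("dump.xml") as fh:
#       for page in WikiqIterator(fh, collapse_user=True):
#           for rev in page:
#               print(page.title, rev.id, rev.collapsed_revs)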

class WikiqParser():

    def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):

        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.persist_legacy = persist_legacy
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode

    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # the default namespace is keyed by None; remember it and keep looking
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

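    # An illustrative example (hypothetical values, not part of the script):
    # with self.namespaces == {'Talk': 1, 'User': 2},
    # __get_namespace_from_title("Talk:Foo") returns 1, while
    # __get_namespace_from_title("Foo") returns None since no default
    # (None-keyed) namespace entry is present.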
    def process(self):

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                         r'output/wikiq-\1-\2.tsv',
        #                         input_filename)

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract the list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0


        # Iterate through pages
        for page in dump:
            rev_detector = mwreverts.Detector()

            if self.persist or self.persist_legacy:
                # sliding window of recent revisions: a revision's persistence
                # stats are computed once PERSISTENCE_RADIUS - 1 later
                # revisions have been seen
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if not self.persist_legacy:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title),
                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" }

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # use the dump's sha1 if present; otherwise compute one from the text
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that an edit was minor can itself be hidden, this could be misleading
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse_user is on, record how many revisions were collapsed
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist or self.persist_legacy:
                    if rev.deleted.text:
                        # we can't compute persistence without the text, so
                        # leave the persistence fields empty for this revision
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                    else:

                        if not self.persist_legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS - 1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist or self.persist_legacy:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window) - (i + 1)

                    self.print_rev_data(rev_data)
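                # Illustrative timeline (hypothetical, assuming a 10-revision
                # page and PERSISTENCE_RADIUS == 7): revisions 1-4 are printed
                # inside the revision loop as the window fills and slides,
                # each with tokens_window == 6; the flush above then prints
                # revisions 5-10 with tokens_window counting down from 5 to 0.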

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        # url-encode fields that may contain problematic characters
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
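    # Illustrative output shape (hypothetical values): one tab-separated
    # header row of the sorted rev_data keys, then one row per revision, e.g.
    #
    #   anon<TAB>articleid<TAB>date_time<TAB>deleted<TAB>editor<TAB>...
    #   FALSE<TAB>12<TAB>2004-07-07 17:40:39<TAB>FALSE<TAB>"SomeEditor"<TAB>...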


def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # cmd is undefined for unrecognized extensions: treat the input
        # as an uncompressed file and open it directly
        input_file = open(input_filename, 'r')

    return input_file

def open_output_file(input_filename):
    # derive the output filename from the input filename
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file
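# For example (hypothetical filename):
#   open_output_file("enwiki-20180301-pages-meta-history1.xml.bz2")
# strips ".bz2" and then ".xml", opening
# "enwiki-20180301-pages-meta-history1.tsv" for writing.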

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input files
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) the number of revisions used in computing the first measure.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
                    help="Use the legacy (mw library) behavior for the persistence calculation.")

args = parser.parse_args()

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # choose the directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=args.persist,
                            persist_legacy=args.persist_legacy,
                            urlencode=args.urlencode)

        wikiq.process()

        # close things
        input_file.close()
        output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=args.persist,
                        persist_legacy=args.persist_legacy,
                        urlencode=args.urlencode)
    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
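
# Example invocations (hypothetical paths):
#
#   ./wikiq enwiki-dump.xml.bz2                     # writes ./enwiki-dump.tsv
#   ./wikiq enwiki-dump.xml.7z -o output/           # writes output/enwiki-dump.tsv
#   ./wikiq --persistence enwiki-dump.xml.bz2 -s > out.tsv
#   bzcat enwiki-dump.xml.bz2 | ./wikiq > out.tsv   # stdin/stdout mode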
