7cf4be26e60db633db9d210e7f5ec74addf1f357
[mediawiki_dump_tools.git] / wikiq
1 #!/usr/bin/env python3
2
3 # original wikiq headers are: title articleid revid date_time anon
4 # editor editor_id minor text_size text_entropy text_md5 reversion
5 # additions_size deletions_size
6 import pdb
7 import argparse
8 import sys
9 import os, os.path
10 import re
11
12 from subprocess import Popen, PIPE
13 from collections import deque
14 from hashlib import sha1
15
16 from mwxml import Dump
17
18 from deltas.tokenizers import wikitext_split
19 import mwpersistence
20 import mwreverts
21 from urllib.parse import quote
22 TO_ENCODE = ('title', 'editor')
23 PERSISTENCE_RADIUS=7
24 from deltas import SequenceMatcher
25 from deltas import SegmentMatcher
26
class PersistMethod:
    """Integer codes naming the available token-persistence strategies."""
    # Unpacked in declaration order: none=0, sequence=1, segment=2, legacy=3.
    none, sequence, segment, legacy = range(4)
32
def calculate_persistence(tokens_added):
    """Summarize a collection of added tokens.

    Each token is expected to expose a sized ``revisions`` attribute.

    Returns a ``(token_revs, token_count)`` tuple where ``token_revs`` is the
    sum over all tokens of ``len(token.revisions) - 1`` and ``token_count``
    is the number of tokens passed in.
    """
    token_revs = 0
    for token in tokens_added:
        # one entry of each token's revision list is not counted
        token_revs += len(token.revisions) - 1

    return (token_revs, len(tokens_added))
36
class WikiqIterator():
    """Iterate over the pages of a MediaWiki XML dump as WikiqPage objects.

    Parameters:
       fh : file handle of the XML dump, handed to ``Dump.from_file``
       collapse_user : forwarded to every WikiqPage that is created
    """

    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        # namespace id -> namespace name, used to build full page titles
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        # a single shared generator: iterating twice does not restart the dump
        self.__pages = self.load_pages()

    def load_pages(self):
        """Lazily wrap each page of the dump in a WikiqPage."""
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        # must use the same name-mangled attribute that __init__ assigned;
        # the previous `self._pages` did not exist and raised AttributeError
        return next(self.__pages)
57
class WikiqPage():
    """A single page of the dump together with an iterator over its revisions.

    Parameters:
       page : the page object produced by the dump iterator; iterating it
              yields the page's revisions
       namespace_map : dict mapping namespace id -> namespace name
       collapse_user : if True, consecutive revisions by the same user are
                       collapsed and only the last one of each run is yielded
    """
    # NOTE: 'redirect' is reserved as a slot but never assigned in this class
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # outside namespace 0 the title is prefixed with the namespace name
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        """Yield the page's revisions, one step behind the underlying iterator.

        With collapse_user=True every revision gets a ``collapsed_revs``
        attribute (the length of the run of consecutive same-user edits up to
        and including it) and only the last revision of each run is yielded.
        A page without any revisions yields nothing.
        """
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        prev_rev = None
        collapsed_revs = 0

        for rev in self.mwpage:
            if self.collapse_user:
                if prev_rev is None:
                    # first revision: nothing to yield yet, start a new run
                    collapsed_revs = 1
                elif (rev.deleted.user or prev_rev.deleted.user
                      or rev.user.text != prev_rev.user.text):
                    # the previous revision closed a run: either the editor
                    # changed, or we do NOT know who one of the editors is
                    yield prev_rev
                    collapsed_revs = 1
                else:
                    # same editor as before: extend the current run
                    collapsed_revs += 1
                rev.collapsed_revs = collapsed_revs

            elif prev_rev is not None:
                # if collapse_user is false, we always yield
                yield prev_rev

            prev_rev = rev

        # also yield the final revision -- unless the page had none at all
        if prev_rev is not None:
            yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
123
class WikiqParser():
    """Turn a MediaWiki XML dump into tab-separated rows, one per revision."""

    def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False):
        """ 
        Parameters:
           input_file : file object of the XML dump, handed to WikiqIterator
           output_file : file object the header and rows are printed to
           collapse_user : collapse consecutive edits by the same user and
                           add a 'collapsed_revs' column
           persist : what persistence method to use. Takes a PersistMethod value;
                     None is treated like PersistMethod.none
           urlencode : url-encode the fields listed in TO_ENCODE before printing
        """

        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode
        
    def __get_namespace_from_title(self, title):
        """Return the id of the namespace whose name prefixes `title`.

        Falls back to the id of the unnamed namespace (if one was seen),
        otherwise to None.
        """
        default_ns = None

        for ns in self.namespaces:
            # a namespace without a name (None or "") cannot be matched by
            # prefix; remember its id as the fallback instead
            if not ns:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

    def process(self):
        """Read the whole dump and print one row per (possibly collapsed) revision.

        When a persistence method is active, rows are delayed until
        PERSISTENCE_RADIUS later revisions have been seen (or the page ends);
        text-deleted revisions are printed immediately with empty persistence
        columns.
        """

        # the constructor default (None) means "no persistence"; without this
        # normalization None would fall through to the legacy branch below
        persist = PersistMethod.none if self.persist is None else self.persist

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            rev_detector = mwreverts.Detector()

            if persist != PersistMethod.none:
                # sliding window of revisions still waiting to be printed
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                elif persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                # persist == PersistMethod.legacy
                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title),
                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" } 

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # use the sha1 from the dump if present, otherwise compute it
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
                    
                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)
               
                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)
                    
                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id == None else "FALSE"
                    
                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #TODO missing: additions_size deletions_size
                
                # if collapse user was on, lets run that
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if persist != PersistMethod.none:
                    if rev.deleted.text:
                        # no text to diff: fill the persistence columns of THIS
                        # row with None and emit it right away (it never
                        # enters the window, so nothing else would print it)
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                        self.print_rev_data(rev_data)
                    else:

                        if persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
                            
                        window.append((rev.id, rev_data, tokens_added, tokens_removed))
                        
                        # once the window is full, its oldest entry has been
                        # followed by PERSISTENCE_RADIUS-1 revisions: print it
                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
                            self._print_persistence_row(old_rev_data, old_tokens_added,
                                                        old_tokens_removed, PERSISTENCE_RADIUS-1)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    # these rows were followed by fewer revisions than a full window
                    self._print_persistence_row(rev_data, tokens_added,
                                                tokens_removed, len(window)-(i+1))

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def _print_persistence_row(self, rev_data, tokens_added, tokens_removed, tokens_window):
        """Add the four persistence columns to `rev_data` and print the row."""
        num_token_revs, num_tokens = calculate_persistence(tokens_added)

        rev_data["token_revs"] = num_token_revs
        rev_data["tokens_added"] = num_tokens
        rev_data["tokens_removed"] = len(tokens_removed)
        rev_data["tokens_window"] = tokens_window

        self.print_rev_data(rev_data)

    def print_rev_data(self, rev_data):
        """Print `rev_data` as one tab-separated row, columns ordered by key.

        The header row (the sorted keys) is printed before the first row.
        With urlencode enabled the TO_ENCODE fields of `rev_data` are
        replaced in place by their url-quoted form.
        """
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        columns = sorted(rev_data.keys())

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in columns]), file=self.output_file)
            self.printed_header = True
        
        print("\t".join([str(rev_data[k]) for k in columns]), file=self.output_file)
324
325
def open_input_file(input_filename):
    """Open a dump file for reading, decompressing it on the fly if needed.

    Files ending in .7z, .gz or .bz2 are streamed through the external tools
    7za, zcat or bzcat respectively and the child's stdout pipe is returned.
    Any other file is opened directly in text mode.

    NOTE(review): the pipe is opened without a text mode, so compressed
    inputs presumably yield bytes while plain files yield str -- confirm the
    consumer accepts both.
    """
    if input_filename.endswith('.7z'):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif input_filename.endswith('.gz'):
        cmd = ["zcat", input_filename]
    elif input_filename.endswith('.bz2'):
        cmd = ["bzcat", "-dk", input_filename]
    else:
        # not a known compressed extension: read the file as-is
        return open(input_filename, 'r')

    # explicit branch instead of catching NameError on an unset `cmd`, which
    # would also have hidden any genuine NameError raised by this call
    return Popen(cmd, stdout=PIPE).stdout
340
def open_output_file(input_filename):
    """Open (for writing) the .tsv file that corresponds to a dump filename.

    The output name is the input name with a trailing .7z/.gz/.bz2 extension
    and any ".xml" removed, plus ".tsv"; it is created in the same directory
    as `input_filename`. Returns the open file object.
    """
    # only rewrite the file name itself: running the unanchored ".xml"
    # substitution over the whole path would also alter directory names
    directory, basename = os.path.split(input_filename)

    basename = re.sub(r'\.(7z|gz|bz2)?$', '', basename)
    # deliberately unanchored: names like "x.xml-p1p2.7z" carry .xml mid-name
    basename = re.sub(r'\.xml', '', basename)

    output_filename = os.path.join(directory, basename + ".tsv")
    output_file = open(output_filename, "w")

    return output_file
349
# ---------------------------------------------------------------------------
# Command line entry point: parse the options, then convert every dump file
# given on the command line (or stdin -> stdout when none is given).
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str, 
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow.  Use -p=segment for advanced persistence calculation method that is robust to content moves. This might be very slow. Use -p=legacy for legacy behavior.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

args = parser.parse_args()

# set persistence method: no -p at all disables it, a bare -p (const '') or
# -p=sequence selects the sequence matcher
if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if len(args.dumpfiles) > 0:
    # directory for output; the same for every dump file
    if args.output_dir:
        output_dir = args.output_dir[0]
    else:
        output_dir = "."

    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        # build the parser for BOTH output modes (it used to be created only
        # in the file-output branch, leaving --stdout without a parser)
        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode)

        wikiq.process()

        # close things, but leave sys.stdout open for the next dump file
        input_file.close()
        if output_file is not sys.stdout:
            output_file.close()
else:
    # no dump files given: read from stdin, write to stdout
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode)
    wikiq.process()
420
421 # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
422 # stop_words = stop_words.split(",")

Community Data Science Collective || Want to submit a patch?