#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size

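# A minimal usage sketch (hypothetical file names, based on the options
# defined below):
#
#   ./wikiq enwiki-20180101-pages-meta-history1.xml.bz2 -o output/
#   ./wikiq dump.xml --stdout --collapse-user -p sequence > dump.tsv
#
# Dump files may be .7z, .gz, .bz2, or uncompressed XML; with no
# DUMPFILE argument, wikiq reads a dump on stdin and writes TSV to
# stdout.
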
import argparse
import sys
import os, os.path
import re

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1
from urllib.parse import quote

from mwxml import Dump
from deltas.tokenizers import wikitext_split
from deltas import SequenceMatcher, SegmentMatcher
import mwpersistence
import mwreverts

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7

class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3

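# These values mirror the -p/--persistence command-line choices mapped
# near the bottom of this script: no flag -> none, bare -p or
# -p sequence -> sequence, -p segment -> segment, -p legacy -> legacy.
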
def calculate_persistence(tokens_added):
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))

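# A worked example (a sketch, assuming each element of tokens_added is
# an mwpersistence token whose .revisions lists the revisions it
# appears in): two tokens appearing in 4 and 6 revisions respectively
# give (4-1) + (6-1) = 8 persistent token revisions across 2 tokens
# added, i.e. the tuple (8, 2).
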
class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)

class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        prev_rev = None
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    # also yield if we don't know who the user is

                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif not rev.user.text == prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final time (guard against pages with no revisions)
        if prev_rev is not None:
            yield prev_rev

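    # An illustrative trace (not from the source): with
    # collapse_user=True and consecutive revisions by users A, A, B, A,
    # rev_list yields the second A revision with collapsed_revs == 2,
    # then the B revision and the final A revision with
    # collapsed_revs == 1 each.
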
    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)

class WikiqParser():

    def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False, namespaces=None, revert_radius=15):
        """
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value
           revert_radius : number of revisions to check back through for reverts
        """

        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.printed_header = False
        self.namespaces = {}
        self.urlencode = urlencode
        self.revert_radius = revert_radius
        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # an unnamed namespace supplies the default; skip prefix matching
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

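    # For example (a sketch): with self.namespaces == {"Talk": 1}, a
    # title of "Talk:Foo" resolves to namespace 1, while "Foo" matches
    # no prefix and falls through to the default.
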
    def process(self):

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                         r'output/wikiq-\1-\2.tsv',
        #                         input_filename)

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self.namespace_filter is not None:
                if namespace not in self.namespace_filter:
                    continue

            # a fresh revert detector per page; it scans the last
            # revert_radius checksums for an identity revert
            rev_detector = mwreverts.Detector(radius = self.revert_radius)

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                # self.persist == PersistMethod.legacy
                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : namespace,
                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" }

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # if the dump doesn't provide a sha1, we generate one
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse user was on, let's run that
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist != PersistMethod.none:
                    if rev.deleted.text:
                        # deleted revisions have no text to diff; emit them
                        # immediately with null persistence measures
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                        self.print_rev_data(rev_data)
                    else:
                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

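    # Note on tokens_window: it records how many later revisions were
    # seen when persistence was measured; PERSISTENCE_RADIUS - 1 for
    # revisions printed from a full window, and progressively fewer for
    # revisions near the end of a page's history.
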
    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)

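    # A sketch of the resulting TSV (columns are the sorted key names;
    # the exact set depends on the flags in use). With defaults the
    # header row is:
    #
    #   anon articleid date_time deleted editor editor_id minor
    #   namespace revert reverteds revid sha1 text_chars title
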
def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    # if no decompressor matched, cmd is unbound: fall back to reading
    # the file as uncompressed XML
    except NameError:
        input_file = open(input_filename, 'r')

    return input_file

def open_output_file(input_filename):
    # build the output filename by stripping the compression and xml
    # extensions and appending .tsv
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml$', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file

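# For example (hypothetical file name):
# "enwiki-20180101-pages-meta-history1.xml.bz2" becomes
# "enwiki-20180101-pages-meta-history1.tsv".
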
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) the number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for the old behavior used in older research projects. Use -p=segment for an advanced persistence calculation method that is robust to content moves but prone to bugs, and slower.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="Id number of namespace to include. Can be specified more than once.")

parser.add_argument('-rr',
                    '--revert-radius',
                    dest="revert_radius",
                    type=int,
                    action='store',
                    default=15,
                    help="Number of edits to check when looking for reverts (default: 15)")

args = parser.parse_args()

# set persistence method

if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file,
                            output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius)

        wikiq.process()

        # close things
        input_file.close()
        output_file.close()
else:
    wikiq = WikiqParser(sys.stdin,
                        sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces=namespaces,
                        revert_radius=args.revert_radius)
    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")