#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
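
# Example invocations (a sketch; see the argparse section at the bottom of this
# file for the authoritative flag definitions; "dump.xml.bz2" is a placeholder):
#
#   wikiq dump.xml.bz2                       # write dump.tsv in the current directory
#   wikiq dump.xml.bz2 -o output/            # write output/dump.tsv
#   wikiq dump.xml.bz2 -s                    # write rows to standard out
#   wikiq dump.xml.bz2 -p segment -n 0 -n 1  # persistence stats for namespaces 0 and 1
#   bzcat dump.xml.bz2 | wikiq               # read from stdin, write to stdout
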
import argparse
import sys
import os, os.path
import re

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump, Page, LogItem
from mwxml.errors import MalformedXML

from deltas.tokenizers import wikitext_split
from mwdiffs.utilities import dump2diffs
import mwpersistence
from mwpersistence.state import Version, apply_opdocs, apply_operations, persist_revision_once

from mwpersistence import Token
from mwpersistence.utilities import diffs2persistence
import mwreverts
from urllib.parse import quote

from deltas import SequenceMatcher
from deltas import SegmentMatcher

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7
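# PERSISTENCE_RADIUS is both the size of the sliding window of revisions used to
# measure token persistence and the revert_radius passed to the diff state: a
# revision's tokens are tracked across the next PERSISTENCE_RADIUS - 1 revisions
# before its row is emitted.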

# this is a simple override of mwpersistence.DiffState that doesn't do anything special for reverts.
class WikiqDiffState(mwpersistence.DiffState):
    def _update(self, text=None, checksum=None, opdocs=None, revision=None):
        if checksum is None:
            if text is None:
                raise TypeError("Either 'text' or 'checksum' must be " +
                                "specified.")
            else:
                checksum = sha1(bytes(text, 'utf8')).hexdigest()

        current_version = Version()

        # the main difference we have is that we don't do anything special for reverts
        if opdocs is not None:
            transition = apply_opdocs(opdocs, self.last.tokens or [])
            current_version.tokens, _, _ = transition
        else:
            # NOTICE: HEAVY COMPUTATION HERE!!!
            #
            # Diffs usually run in O(n^2) -- O(n^3) time and most
            # tokenizers produce a lot of tokens.
            if self.diff_processor is None:
                raise RuntimeError("DiffState cannot process raw text " +
                                   "without a diff_engine specified.")
            operations, _, current_tokens = \
                self.diff_processor.process(text, token_class=Token)

            transition = apply_operations(operations,
                                          self.last.tokens or [],
                                          current_tokens)
            current_version.tokens, _, _ = transition

        # Record persistence
        persist_revision_once(current_version.tokens, revision)

        # Update last version
        self.last = current_version

        # Return the transitioned state
        return transition

class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3

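# These values are selected from the -p/--persistence flag parsed below:
# no flag -> none, bare -p or -p=sequence -> sequence, -p=segment -> segment,
# -p=legacy -> legacy.
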
def calculate_persistence(tokens_added, tokens_removed, exclude_ws=True, exclude_punct=False):
    ws_lex = ['break', 'whitespace']
    punct_lex = ['period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
                 'paren_open', 'paren_close', 'brack_open', 'brack_close',
                 'dbrack_close', 'dbrack_open', 'tab_close', 'tab_open',
                 'dcurly_close', 'dcurly_open', 'equals', 'bar', 'etc',
                 'bold', 'italic', 'tag', 'comment_end', 'comment_start']

    # filter out whitespace and/or punctuation tokens before counting
    def cond(t):
        if exclude_ws and t.type in ws_lex:
            return False
        if exclude_punct and t.type in punct_lex:
            return False
        return True

    tokens_added = [t for t in tokens_added if cond(t)]
    tokens_removed = [t for t in tokens_removed if cond(t)]

    return (sum(len(t.revisions) - 1 for t in tokens_added),
            len(tokens_added),
            len(tokens_removed))

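# A minimal sketch of what calculate_persistence returns. SimpleNamespace here
# is a hypothetical stand-in for the token objects produced by mwpersistence,
# which carry a .type and a .revisions list:
#
#   >>> from types import SimpleNamespace
#   >>> word = SimpleNamespace(type='word', revisions=[101, 102, 103])
#   >>> space = SimpleNamespace(type='whitespace', revisions=[101])
#   >>> calculate_persistence([word, space], [])
#   (2, 1, 0)
#
# The word token persisted through two later revisions; the whitespace token is
# excluded by default.
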
class WikiqIterator(Dump):

    @classmethod
    def from_file(cls, fh, collapse_user=False):
        # mwxml's Dump.from_file returns an instance; we attach our extra state to it
        cls = super(WikiqIterator, cls).from_file(fh)
        cls.fh = fh
        cls.collapse_user = collapse_user
        cls.namespace_map = {ns.id: ns.name for ns in
                             cls.site_info.namespaces}
        return cls

    @classmethod
    def process_item(cls, item_element, namespace_map, collapse_user=False):
        if item_element.tag == "page":
            return WikiqPage.from_element(item_element, namespace_map, collapse_user)
        elif item_element.tag == "logitem":
            return LogItem.from_element(item_element, namespace_map)
        else:
            raise MalformedXML("Expected to see <page> or <logitem>.  " +
                               "Instead saw <{0}>".format(item_element.tag))

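# Sketch of standalone use, mirroring how WikiqParser drives this iterator
# below ("dump.xml" is a placeholder for an uncompressed XML dump):
#
#   with open("dump.xml") as fh:
#       dump = WikiqIterator.from_file(fh, collapse_user=False)
#       for page in dump:
#           for rev in page:
#               print(rev.id)
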
class WikiqPage(Page):
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'collapse_user')

    @classmethod
    def from_element(cls, item_element, namespace_map, collapse_user=False):
        cls.prev_rev = None

        inv_namespace_map = {ns.id: name for name, ns in namespace_map.items()}

        cls = super(WikiqPage, cls).from_element(item_element, namespace_map)

        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map;
        # this undoes the "correction" of the namespace in mwxml
        if cls.namespace not in inv_namespace_map:
            cls.namespace = 0
        if cls.namespace != 0:
            cls.title = ':'.join([inv_namespace_map[cls.namespace], cls.title])

        cls.collapse_user = collapse_user
        cls.revisions = cls._Page__revisions
        return cls

    @staticmethod
    def _correct_sha(rev_data):

        if rev_data.deleted.text:
            rev_data.text = ""
            rev_data.text_chars = 0
            rev_data.sha1 = ""
            rev_data.revert = ""
            rev_data.reverteds = ""

        else:
            if rev_data.text is None:
                rev_data.text = ""

        rev_data.text_chars = len(rev_data.text)

        if hasattr(rev_data, "sha1") and rev_data.sha1 is not None:
            text_sha1 = rev_data.sha1
        else:
            text_sha1 = sha1(bytes(rev_data.text, "utf8")).hexdigest()

        rev_data.sha1 = text_sha1

        return rev_data

    # Outline for how we want to handle collapse_user=True
    # iteration   rev.user   prev_rev.user   add prev_rev?
    #         0          A            None           Never
    #         1          A               A           False
    #         2          B               A            True
    #         3          A               B            True
    #         4          A               A           False
    # Post-loop                          A          Always
    def __find_next_revision(self):

        if self.prev_rev is None:
            prev_rev = WikiqPage._correct_sha(next(self.revisions))
            self.prev_rev = prev_rev
        else:
            prev_rev = self.prev_rev

        if self.collapse_user:
            collapsed_revs = 1
            prev_rev.collapsed_revs = collapsed_revs

        for rev in self.revisions:
            rev = WikiqPage._correct_sha(rev)
            if self.collapse_user:
                # yield if this is the last edit in a sequence by a user and reset;
                # also yield if we don't know who the user is

                if rev.deleted.user or prev_rev.deleted.user:
                    self.prev_rev = rev
                    if prev_rev is not None:
                        prev_rev.collapsed_revs = collapsed_revs
                        return prev_rev

                elif not rev.user.text == prev_rev.user.text:
                    self.prev_rev = rev
                    if prev_rev is not None:
                        prev_rev.collapsed_revs = collapsed_revs
                        return prev_rev

                # otherwise, add one to the counter
                else:
                    collapsed_revs += 1
                    rev.collapsed_revs = collapsed_revs
            # if collapse_user is false, we always yield
            else:
                self.prev_rev = rev
                if prev_rev is not None:
                    return prev_rev
            prev_rev = rev

        self.prev_rev = None

        if self.collapse_user:
            prev_rev.collapsed_revs = collapsed_revs
        return prev_rev

    def __next__(self):
        revision = self.__find_next_revision()
        revision.page = self
        return revision

    def __iter__(self):
        while True:
            revision = self.__find_next_revision()
            revision.page = self
            yield revision

    # def __iter__(self):
    #     return self.__revisions

    # def __next__(self):
    #     return next(self.__revisions)

class WikiqParser():

    def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False, namespaces=None):
        """
        Parameters:
           persist : which persistence method to use. Takes a PersistMethod value.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode
        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                         r'output/wikiq-\1-\2.tsv',
        #                         input_filename)

        # Construct dump file iterator
        self.dump = WikiqIterator.from_file(self.input_file, self.collapse_user)

        self.diff_engine = None

        if self.persist == PersistMethod.sequence:
            self.diff_engine = SequenceMatcher(tokenizer=wikitext_split)

        if self.persist == PersistMethod.segment:
            self.diff_engine = SegmentMatcher(tokenizer=wikitext_split)

    # def __get_namespace_from_title(self, title):
    #     default_ns = None

    #     for ns in self.namespaces:
    #         # skip if the namespace is not defined
    #         if ns == None:
    #             default_ns = self.namespaces[ns]
    #             continue

    #         if title.startswith(ns + ":"):
    #             return self.namespaces[ns]

    #     # if we've made it this far with no matches, we return the default namespace
    #     return default_ns

    # def _set_namespace(self, rev_docs):

    #     for rev_data in rev_docs:
    #         if 'namespace' not in rev_data['page']:
    #             namespace = self.__get_namespace_from_title(page['title'])
    #             rev_data['page']['namespace'] = namespace
    #         yield rev_data

    def process(self):
        page_count = 0
        rev_count = 0

        for page in self.dump:

            # skip pages not in the namespaces we want
            if self.namespace_filter is not None and page.namespace not in self.namespace_filter:
                continue

            rev_detector = mwreverts.Detector()

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = WikiqDiffState(SequenceMatcher(tokenizer=wikitext_split),
                                           revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = WikiqDiffState(SegmentMatcher(tokenizer=wikitext_split),
                                           revert_radius=PERSISTENCE_RADIUS)

                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:
                rev_data = {'revid': rev.id,
                            'date_time': rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid': page.id,
                            'editor_id': "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                            'title': '"' + page.title + '"',
                            'namespace': page.namespace,
                            'deleted': "TRUE" if rev.deleted.text else "FALSE"}

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # if a sha1 is in the dump we use it; otherwise we generate one
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                # if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #     redirect = True
                # else:
                #     redirect = False

                # TODO missing: additions_size deletions_size

                # if collapse user was on, let's run that
                # if self.collapse_user:
                #     rev_data.collapsed_revs = rev.collapsed_revs

                if self.persist != PersistMethod.none:
                    if rev.deleted.text:
                        # deleted revisions have no text to track, so the
                        # persistence fields are empty and the row is printed
                        # immediately rather than passing through the window
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                        self.print_rev_data(rev_data)
                    else:
                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens_added, num_tokens_removed = calculate_persistence(old_tokens_added, old_tokens_removed)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens_added
                            old_rev_data["tokens_removed"] = num_tokens_removed
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS - 1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item

                    num_token_revs, num_tokens_added, num_tokens_removed = calculate_persistence(tokens_added, tokens_removed)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens_added
                    rev_data["tokens_removed"] = num_tokens_removed
                    rev_data["tokens_window"] = len(window) - (i + 1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)

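# With no persistence option the header row (keys sorted alphabetically) is:
#
#   anon  articleid  date_time  deleted  editor  editor_id  minor  namespace
#   revert  reverteds  revid  sha1  text_chars  title
#
# The -p variants add token_revs, tokens_added, tokens_removed, and tokens_window.
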
def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # cmd is unbound for unrecognized extensions, so we fall back to
        # treating the input as an uncompressed file
        input_file = open(input_filename, 'r')

    return input_file

def open_output_file(input_filename):
    # derive the output filename from the input filename
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file

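# For example (a hypothetical dump filename):
#   "enwiki-20180101-pages-meta-history1.xml.bz2" -> "enwiki-20180101-pages-meta-history1.tsv"
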
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. Use -p=segment for an advanced persistence calculation method that is robust to content moves but might be very slow. Use -p=legacy for legacy behavior.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="Id number of namespace to include. Can be specified more than once.")

args = parser.parse_args()

# set persistence method
if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # choose the directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces)

        wikiq.process()

        # close things
        input_file.close()
        output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces=namespaces)
    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")