#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import pdb
import argparse
import sys
import os, os.path
import re

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump, Page, LogItem
from mwxml.errors import MalformedXML

from deltas.tokenizers import wikitext_split
from mwdiffs.utilities import dump2diffs
import mwpersistence
from mwpersistence.state import Version, apply_opdocs, apply_operations, persist_revision_once

from mwpersistence import Token
from mwpersistence.utilities import diffs2persistence
import mwreverts
from urllib.parse import quote

from deltas import SequenceMatcher
from deltas import SegmentMatcher

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7
# this is a simple override of mwpersistence.DiffState that doesn't do anything special for reverts.
class WikiqDiffState(mwpersistence.DiffState):
    def _update(self, text=None, checksum=None, opdocs=None, revision=None):
        if checksum is None:
            if text is None:
                raise TypeError("Either 'text' or 'checksum' must be " +
                                "specified.")
            else:
                checksum = sha1(bytes(text, 'utf8')).hexdigest()

        current_version = Version()

        # the main difference we have is that we don't do anything special for reverts
        if opdocs is not None:
            transition = apply_opdocs(opdocs, self.last.tokens or [])
            current_version.tokens, _, _ = transition
        else:
            # NOTICE: HEAVY COMPUTATION HERE!!!
            #
            # Diffs usually run in O(n^2) -- O(n^3) time and most
            # tokenizers produce a lot of tokens.
            if self.diff_processor is None:
                raise RuntimeError("DiffState cannot process raw text " +
                                   "without a diff_engine specified.")
            operations, _, current_tokens = \
                self.diff_processor.process(text, token_class=Token)

            transition = apply_operations(operations,
                                          self.last.tokens or [],
                                          current_tokens)
            current_version.tokens, _, _ = transition

        # Record persistence
        persist_revision_once(current_version.tokens, revision)

        # Update last version
        self.last = current_version

        # Return the transitioned state
        return transition

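# A minimal usage sketch (illustrative, assuming mwpersistence's DiffState
# convention that update() returns a (tokens, tokens_added, tokens_removed)
# triple; the revision id 12345 is hypothetical):
#
#   state = WikiqDiffState(SequenceMatcher(tokenizer=wikitext_split),
#                          revert_radius=PERSISTENCE_RADIUS)
#   _, tokens_added, tokens_removed = state.update("some wikitext", 12345)
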
class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3

def calculate_persistence(tokens_added):
    return (sum([(len(x.revisions)-1) for x in tokens_added]),
            len(tokens_added))

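# Worked example (illustrative): if tokens_added holds two tokens whose
# Token.revisions lists have lengths 4 and 6 (the introducing revision plus
# each later revision the token survived), calculate_persistence returns
# ((4-1) + (6-1), 2) == (8, 2): 8 persistent token-revisions across 2 tokens.
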
class WikiqIterator(Dump):

    @classmethod
    def from_file(cls, fh, collapse_user=False):
        # build the Dump with mwxml, then attach our extra state to the instance
        dump = super(WikiqIterator, cls).from_file(fh)
        dump.fh = fh
        dump.collapse_user = collapse_user
        dump.namespace_map = {ns.id: ns.name for ns in
                              dump.site_info.namespaces}
        return dump

    @classmethod
    def process_item(cls, item_element, namespace_map, collapse_user=False):
        if item_element.tag == "page":
            return WikiqPage.from_element(item_element, namespace_map, collapse_user)
        elif item_element.tag == "logitem":
            return LogItem.from_element(item_element, namespace_map)
        else:
            raise MalformedXML("Expected to see <page> or <logitem>.  " +
                               "Instead saw <{0}>".format(item_element.tag))

class WikiqPage(Page):
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'collapse_user')

    @classmethod
    def from_element(cls, item_element, namespace_map, collapse_user=False):
        inv_namespace_map = {ns.id: name for name, ns in namespace_map.items()}

        page = super(WikiqPage, cls).from_element(item_element, namespace_map)
        page.prev_rev = None

        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        # this undoes the "correction" of the namespace in mwxml

        if page.namespace not in inv_namespace_map:
            page.namespace = 0
        if page.namespace != 0:
            page.title = ':'.join([inv_namespace_map[page.namespace], page.title])

        page.collapse_user = collapse_user
        # expose mwxml's name-mangled private revisions iterator
        page.revisions = page._Page__revisions
        return page

    @staticmethod
    def _correct_sha(rev_data):

        if rev_data.deleted.text:
            rev_data.text = ""
            rev_data.text_chars = 0
            rev_data.sha1 = ""
            rev_data.revert = ""
            rev_data.reverteds = ""

        else:
            if rev_data.text is None:
                rev_data.text = ""

        rev_data.text_chars = len(rev_data.text)

        if hasattr(rev_data, "sha1") and rev_data.sha1 is not None:
            text_sha1 = rev_data.sha1
        else:
            text_sha1 = sha1(bytes(rev_data.text, "utf8")).hexdigest()

        rev_data.sha1 = text_sha1

        return rev_data

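    # Illustrative behavior of _correct_sha (hypothetical revision objects):
    # a revision with deleted text has text/sha1/revert/reverteds blanked and
    # text_chars recomputed as len("") == 0; a visible revision that lacks a
    # sha1 gets one computed as sha1(bytes(rev_data.text, "utf8")).hexdigest().
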
    # Outline for how we want to handle collapse_user=True
    # iteration   rev.user   prev_rev.user   add prev_rev?
    #         0          A            None           Never
    #         1          A               A           False
    #         2          B               A            True
    #         3          A               B            True
    #         4          A               A           False
    # Post-loop                          A          Always
    def __find_next_revision(self):

        if self.prev_rev is None:
            prev_rev = WikiqPage._correct_sha(next(self.revisions))
            self.prev_rev = prev_rev
        else:
            prev_rev = self.prev_rev

        if self.collapse_user:
            collapsed_revs = 1
            prev_rev.collapsed_revs = collapsed_revs

        for rev in self.revisions:
            rev = WikiqPage._correct_sha(rev)
            if self.collapse_user:
                # yield if this is the last edit in a seq by a user and reset
                # also yield if we don't know who the user is

                if rev.deleted.user or prev_rev.deleted.user:
                    self.prev_rev = rev
                    if prev_rev is not None:
                        prev_rev.collapsed_revs = collapsed_revs
                        return prev_rev

                elif rev.user.text != prev_rev.user.text:
                    self.prev_rev = rev
                    if prev_rev is not None:
                        prev_rev.collapsed_revs = collapsed_revs
                        return prev_rev

                # otherwise, add one to the counter
                else:
                    collapsed_revs += 1
                    rev.collapsed_revs = collapsed_revs
            # if collapse_user is false, we always yield
            else:
                self.prev_rev = rev
                if prev_rev is not None:
                    return prev_rev
            prev_rev = rev

        self.prev_rev = None

        if self.collapse_user:
            prev_rev.collapsed_revs = collapsed_revs
        return prev_rev


    def __next__(self):
        revision = self.__find_next_revision()
        revision.page = self
        return revision

    def __iter__(self):
        while True:
            # under PEP 479, a StopIteration escaping __find_next_revision
            # would become a RuntimeError inside this generator, so catch it
            # and end iteration explicitly
            try:
                revision = self.__find_next_revision()
            except StopIteration:
                return
            revision.page = self
            yield revision

    # def __iter__(self):
    #     return self.__revisions

    # def __next__(self):
    #     return next(self.__revisions)

class WikiqParser():

    def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False, namespaces=None):
        """
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value
        """
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode
        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                         r'output/wikiq-\1-\2.tsv',
        #                         input_filename)

        # Construct dump file iterator
        self.dump = WikiqIterator.from_file(self.input_file, self.collapse_user)

        self.diff_engine = None

        if self.persist == PersistMethod.sequence:
            self.diff_engine = SequenceMatcher(tokenizer=wikitext_split)

        if self.persist == PersistMethod.segment:
            self.diff_engine = SegmentMatcher(tokenizer=wikitext_split)

    # def __get_namespace_from_title(self, title):
    #     default_ns = None

    #     for ns in self.namespaces:
    #         # skip if the namespace is not defined
    #         if ns == None:
    #             default_ns = self.namespaces[ns]
    #             continue

    #         if title.startswith(ns + ":"):
    #             return self.namespaces[ns]

    #     # if we've made it this far with no matches, we return the default namespace
    #     return default_ns

    # def _set_namespace(self, rev_docs):

    #     for rev_data in rev_docs:
    #         if 'namespace' not in rev_data['page']:
    #             namespace = self.__get_namespace_from_title(page['title'])
    #             rev_data['page']['namespace'] = namespace
    #         yield rev_data

    def process(self):
        page_count = 0
        rev_count = 0

        for page in self.dump:

            # skip pages not in the namespaces we want
            if self.namespace_filter is not None and page.namespace not in self.namespace_filter:
                continue

            rev_detector = mwreverts.Detector()

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = WikiqDiffState(SequenceMatcher(tokenizer=wikitext_split),
                                           revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = WikiqDiffState(SegmentMatcher(tokenizer=wikitext_split),
                                           revert_radius=PERSISTENCE_RADIUS)

                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:
                rev_data = {'revid': rev.id,
                            'date_time': rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid': page.id,
                            'editor_id': "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                            'title': '"' + page.title + '"',
                            'namespace': page.namespace,
                            'deleted': "TRUE" if rev.deleted.text else "FALSE"}

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""
                    # if text exists, we'll check for a sha1 and generate one otherwise

                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse user was on, lets run that
                # if self.collapse_user:
                #     rev_data.collapsed_revs = rev.collapsed_revs

                if self.persist != PersistMethod.none:
                    if rev.deleted.text:
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                    else:

                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

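            # Worked example (illustrative): with PERSISTENCE_RADIUS == 7 and
            # a page of 10 revisions, revisions 1-4 are printed inside the
            # loop above with tokens_window == 6 as the deque fills and
            # slides; the flush below then prints revisions 5-10 with
            # tokens_window counting down from 5 to 0.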
            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)

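# Illustrative output of print_rev_data (hypothetical minimal rev_data): for
# rev_data = {'revid': 1, 'anon': 'FALSE'}, the first call prints the sorted
# header "anon\trevid" and then the row "FALSE\t1"; later calls print rows only.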

def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # cmd is unbound for uncompressed files, so fall back to plain open()
        input_file = open(input_filename, 'r')

    return input_file

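# For example (hypothetical filenames), "dump.xml.bz2" is streamed through
# bzcat, while "dump.xml" matches no pattern and is opened directly via the
# NameError fallback.
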
def open_output_file(input_filename):
    # create a regex that creates the output filename
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file

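# For example, a hypothetical input "enwiki-20180301-pages.xml.bz2" yields
# the output filename "enwiki-20180301-pages.tsv".
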
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab-delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. Use -p=segment for an advanced persistence calculation method that is robust to content moves but can be very slow. Use -p=legacy for legacy behavior.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="Id number of namespace to include. Can be specified more than once.")


args = parser.parse_args()

# set persistence method

if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces)

        wikiq.process()

        # close things
        input_file.close()
        output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces=namespaces)
    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
