#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import argparse
import sys
import os, os.path
import re

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump, Page, LogItem
from mwxml.errors import MalformedXML

from deltas.tokenizers import wikitext_split
from mwdiffs.utilities import dump2diffs
import mwpersistence
from mwpersistence.state import DiffState

from mwpersistence import Token
from mwpersistence.utilities import diffs2persistence
import mwreverts
from urllib.parse import quote

from deltas import SequenceMatcher
from deltas import SegmentMatcher

TO_ENCODE = ('title', 'editor')
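
# PERSISTENCE_RADIUS sets the size of the moving window of revisions used
# when computing token persistence. ws_lex and punct_lex below name token
# types produced by deltas' wikitext_split tokenizer, so whitespace and
# punctuation tokens can optionally be excluded from persistence counts.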
PERSISTENCE_RADIUS = 7

ws_lex = ['break', 'whitespace']
punct_lex = ['period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
             'paren_open', 'paren_close', 'brack_open', 'brack_close',
             'dbrack_close', 'dbrack_open', 'tab_close', 'tab_open',
             'dcurly_close', 'dcurly_open', 'equals', 'bar', 'etc', 'bold',
             'italic', 'tag', 'comment_end', 'comment_start']

class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3

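# calculate_persistence summarizes one revision's tokens once its window is
# complete, returning (persistent token revisions, tokens added, tokens
# removed). Each surviving token contributes len(token.revisions) - 1, the
# number of later revisions it persisted through. Legacy (mw.lib.persistence)
# tokens lack a .type attribute, so filtering is skipped in legacy mode.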
def calculate_persistence(tokens_added, tokens_removed, exclude_ws=False, exclude_punct=False, legacy=False):

    if not legacy:
        cond = lambda t: not (exclude_punct and (t.type in punct_lex)) \
               and not (exclude_ws and (t.type in ws_lex))

        tokens_added = [t for t in tokens_added if cond(t)]
        tokens_removed = [t for t in tokens_removed if cond(t)]

    return (sum([(len(x.revisions) - 1) for x in tokens_added]),
            len(tokens_added),
            len(tokens_removed))

class WikiqIterator(Dump):

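    # NB: from_file and process_item keep their state (fh, collapse_user,
    # inv_namespace_map) on the class itself rather than on instances, so
    # only one dump should be processed per run. from_file returns the
    # mwxml.Dump built by the parent class.
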
    @classmethod
    def from_file(cls, fh, collapse_user=False):
        cls.fh = fh
        cls.collapse_user = collapse_user
        cls = super(WikiqIterator, cls).from_file(fh)
        return cls

    @classmethod
    def process_item(cls, item_element, namespace_map):
        if not hasattr(cls, 'inv_namespace_map'):
            cls.inv_namespace_map = {ns.id: name for name, ns in namespace_map.items()}

        if item_element.tag == "page":
            return WikiqPage.from_element(item_element, namespace_map, cls.inv_namespace_map, cls.collapse_user)
        elif item_element.tag == "logitem":
            return LogItem.from_element(item_element, namespace_map)
        else:
            raise MalformedXML("Expected to see <page> or <logitem>.  " +
                               "Instead saw <{0}>".format(item_element.tag))

class WikiqPage(Page):
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'collapse_user')

    @classmethod
    def from_element(cls, item_element, namespace_map, inv_namespace_map, collapse_user=False):
        cls.prev_rev = None

        cls = super(WikiqPage, cls).from_element(item_element, namespace_map)

        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        # this undoes the "correction" of the namespace in mwxml

        if cls.namespace not in inv_namespace_map:
            cls.namespace = 0
        if cls.namespace != 0:
            cls.title = ':'.join([inv_namespace_map[cls.namespace], cls.title])

        cls.collapse_user = collapse_user
        cls.revisions = cls._Page__revisions
        return cls

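    # _correct_sha normalizes a revision in place: deleted text gets empty
    # placeholder fields, and a sha1 is computed from the text whenever the
    # dump does not supply one.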
    @staticmethod
    def _correct_sha(rev_data):

        if rev_data.deleted.text:
            rev_data.text = ""
            rev_data.text_chars = 0
            rev_data.sha1 = ""
            rev_data.revert = ""
            rev_data.reverteds = ""

        else:
            if rev_data.text is None:
                rev_data.text = ""

        rev_data.text_chars = len(rev_data.text)

        if hasattr(rev_data, "sha1") and rev_data.sha1 is not None:
            text_sha1 = rev_data.sha1
        else:
            text_sha1 = sha1(bytes(rev_data.text, "utf8")).hexdigest()

        rev_data.sha1 = text_sha1

        return rev_data

    # Outline for how we want to handle collapse_user=True
    # iteration   rev.user   prev_rev.user   add prev_rev?
    #         0          A            None           Never
    #         1          A               A           False
    #         2          B               A            True
    #         3          A               B            True
    #         4          A               A           False
    # Post-loop                          A          Always
    def __find_next_revision(self):

        if self.prev_rev is None:
            prev_rev = WikiqPage._correct_sha(next(self.revisions))
            self.prev_rev = prev_rev
        else:
            prev_rev = self.prev_rev

        if self.collapse_user:
            collapsed_revs = 1
            self.prev_rev.collapsed_revs = collapsed_revs
            prev_rev = self.prev_rev

        for rev in self.revisions:
            rev = WikiqPage._correct_sha(rev)
            if self.collapse_user:
                # yield if this is the last edit in a seq by a user and reset
                # also yield if we don't know who the user is

                if rev.deleted.user or prev_rev.deleted.user:
                    self.prev_rev = rev
                    if prev_rev is not None:
                        prev_rev.collapsed_revs = collapsed_revs
                        return prev_rev

                elif rev.user.text != prev_rev.user.text:
                    self.prev_rev = rev
                    if prev_rev is not None:
                        prev_rev.collapsed_revs = collapsed_revs
                        return prev_rev

                # otherwise, add one to the counter
                else:
                    collapsed_revs += 1
                    rev.collapsed_revs = collapsed_revs
            # if collapse_user is false, we always yield
            else:
                self.prev_rev = rev
                if prev_rev is not None:
                    return prev_rev
            prev_rev = rev

        self.prev_rev = None

        if self.collapse_user:
            prev_rev.collapsed_revs = collapsed_revs
        return prev_rev

    def __next__(self):
        revision = self.__find_next_revision()
        revision.page = self
        return revision

    def __iter__(self):
        # Under PEP 479 (Python 3.7+), a StopIteration escaping a generator
        # is turned into a RuntimeError, so end iteration explicitly when
        # the underlying revisions run out.
        while True:
            try:
                revision = self.__find_next_revision()
            except StopIteration:
                return
            revision.page = self
            yield revision

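# WikiqParser drives the conversion: it iterates pages and revisions from
# the dump, optionally computes reverts and token persistence, and writes
# one tab-separated row per (possibly collapsed) revision.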
class WikiqParser():

    def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False, namespaces=None):
        """
        Parameters:
           input_file : file handle for the (possibly decompressed) XML dump
           output_file : file handle to write tab-separated rows to
           collapse_user : collapse consecutive revisions by the same user
           persist : which persistence method to use; takes a PersistMethod value
           urlencode : url-encode the fields named in TO_ENCODE
           namespaces : iterable of namespace ids to include, or None for all
        """
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode
        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                         r'output/wikiq-\1-\2.tsv',
        #                         input_filename)

        # Construct dump file iterator
        self.dump = WikiqIterator.from_file(self.input_file, self.collapse_user)

        self.diff_engine = None

        if self.persist == PersistMethod.sequence:
            self.diff_engine = SequenceMatcher(tokenizer=wikitext_split)

        if self.persist == PersistMethod.segment:
            self.diff_engine = SegmentMatcher(tokenizer=wikitext_split)

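    # NB: self.diff_engine is not used below; process() constructs a fresh
    # DiffState per page so persistence state does not leak across pages.
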
    # def __get_namespace_from_title(self, title):
    #     default_ns = None

    #     for ns in self.namespaces:
    #         # skip if the namespace is not defined
    #         if ns == None:
    #             default_ns = self.namespaces[ns]
    #             continue

    #         if title.startswith(ns + ":"):
    #             return self.namespaces[ns]

    #     # if we've made it this far with no matches, we return the default namespace
    #     return default_ns

    # def _set_namespace(self, rev_docs):

    #     for rev_data in rev_docs:
    #         if 'namespace' not in rev_data['page']:
    #             namespace = self.__get_namespace_from_title(page['title'])
    #             rev_data['page']['namespace'] = namespace
    #         yield rev_data

    def process(self):
        page_count = 0
        rev_count = 0

        for page in self.dump:

            # skip pages not in the namespaces we want
            if self.namespace_filter is not None and page.namespace not in self.namespace_filter:
                continue

            rev_detector = mwreverts.Detector()

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                      revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                      revert_radius=PERSISTENCE_RADIUS)

                else:
                    from mw.lib import persistence
                    state = persistence.State()

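            # With persistence enabled, rows are buffered in a deque of up to
            # PERSISTENCE_RADIUS entries so each revision's tokens can be
            # tracked through the following PERSISTENCE_RADIUS - 1 revisions
            # before its row is printed.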
            # Iterate through a page's revisions
            for rev in page:
                rev_data = {'revid': rev.id,
                            'date_time': rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid': page.id,
                            'editor_id': "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                            'title': '"' + page.title + '"',
                            'namespace': page.namespace,
                            'deleted': "TRUE" if rev.deleted.text else "FALSE"}

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # use the dump's sha1 if present; otherwise generate one
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse user was on, let's run that
                # if self.collapse_user:
                #     rev_data.collapsed_revs = rev.collapsed_revs

                if self.persist != PersistMethod.none:
                    if rev.deleted.text:
                        # deleted revisions have no text to diff; print the
                        # row immediately with empty persistence fields
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                        self.print_rev_data(rev_data)

                    else:

                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens_added, num_tokens_removed = calculate_persistence(old_tokens_added, old_tokens_removed, legacy=self.persist == PersistMethod.legacy)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens_added
                            old_rev_data["tokens_removed"] = num_tokens_removed
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS - 1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item

                    num_token_revs, num_tokens_added, num_tokens_removed = calculate_persistence(tokens_added, tokens_removed, legacy=self.persist == PersistMethod.legacy)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens_added
                    rev_data["tokens_removed"] = num_tokens_removed
                    rev_data["tokens_window"] = len(window) - (i + 1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header; keys are sorted
        # so the header and each row's columns line up, which requires every
        # row to carry the same set of keys
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)


def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # cmd is unbound when the filename has no recognized compression
        # extension, so fall back to reading the file directly
        input_file = open(input_filename, 'r')

    return input_file

def open_output_file(input_filename):
    # build the output filename by stripping the compression and xml
    # extensions and appending .tsv
    output_filename = re.sub(r'\.(7z|gz|bz2)$', '', input_filename)
    output_filename = re.sub(r'\.xml$', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. Use -p=segment for an advanced persistence calculation method that is robust to content moves, but can be very slow. Use -p=legacy for legacy behavior.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="Id number of namespace to include. Can be specified more than once.")

args = parser.parse_args()

# set persistence method

if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces)

        wikiq.process()

        # close things
        input_file.close()
        output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces=namespaces)
    wikiq.process()

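# Example invocation (hypothetical dump filename); this writes
# output/enwiki-20180101-pages-meta-history1.xml-p10p2123.tsv:
#   ./wikiq -u -p sequence -n 0 -o output \
#       enwiki-20180101-pages-meta-history1.xml-p10p2123.bz2
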
# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
