]> code.communitydata.science - mediawiki_dump_tools.git/blob - wikiq
add flag for excluding whitespace and punctuation
[mediawiki_dump_tools.git] / wikiq
1 #!/usr/bin/env python3
2
3 # original wikiq headers are: title articleid revid date_time anon
4 # editor editor_id minor text_size text_entropy text_md5 reversion
5 # additions_size deletions_size
import argparse
import os, os.path
import re
import sys
from collections import deque
from hashlib import sha1
from subprocess import Popen, PIPE
from urllib.parse import quote

from deltas import SequenceMatcher
from deltas import SegmentMatcher
from deltas.tokenizers import wikitext_split
from mwdiffs.utilities import dump2diffs
import mwpersistence
from mwpersistence import Token
from mwpersistence.state import DiffState
from mwpersistence.utilities import diffs2persistence
import mwreverts
from mwxml import Dump, Page
from mwxml import LogItem
from mwxml.errors import MalformedXML
# Fields that may contain arbitrary user text; they get url-encoded when
# --url-encode is passed (see WikiqParser.print_rev_data).
TO_ENCODE = ('title', 'editor')
# How many revisions ahead the persistence window looks.
PERSISTENCE_RADIUS=7

# wikitext_split lexeme names treated as whitespace / punctuation.  Tokens of
# these types can be dropped from persistence measures via the
# --exclude-whitespace / --exclude-punctuation flags.
ws_lex = ['break', 'whitespace']
punct_lex = ['period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
             'paren_open', 'paren_close', 'brack_open', 'brack_close',
             'dbrack_close', 'dbrack_open', 'tab_close', 'tab_open',
             'dcurly_close', 'dcurly_open', 'equals', 'bar', 'etc', 'bold',
             'italic', 'tag', 'comment_end', 'comment_start']

class PersistMethod:
    """Enumeration of the supported token-persistence algorithms."""
    none = 0      # do not compute persistence
    sequence = 1  # deltas.SequenceMatcher diffs
    segment = 2   # deltas.SegmentMatcher diffs (robust to content moves)
    legacy = 3    # old mw.lib.persistence behaviour

def calculate_persistence(tokens_added, tokens_removed, exclude_ws = False, exclude_punct = False, legacy = False):
    """Summarize token persistence for one revision.

    Parameters:
       tokens_added / tokens_removed : sequences of persistence tokens; each
           has a `.revisions` list, and (non-legacy only) a `.type` holding a
           wikitext_split lexeme name.
       exclude_ws / exclude_punct : drop whitespace / punctuation tokens
           before counting.  Ignored when legacy=True, because legacy tokens
           carry no `.type`.
       legacy : tokens come from mw.lib.persistence.

    Returns a tuple:
       (persistent token revisions, tokens added, tokens removed).
    """
    if not legacy:
        # Build the set of excluded lexeme types once; set membership is O(1)
        # per token instead of scanning the lists for every token.
        excluded = set()
        if exclude_ws:
            excluded.update(ws_lex)
        if exclude_punct:
            excluded.update(punct_lex)

        if excluded:
            tokens_added = [t for t in tokens_added if t.type not in excluded]
            tokens_removed = [t for t in tokens_removed if t.type not in excluded]

    # A token present in k revisions persisted through k-1 of them.
    return(sum((len(t.revisions) - 1) for t in tokens_added),
           len(tokens_added),
           len(tokens_removed)
    )
54
class WikiqIterator(Dump):
    """mwxml Dump that yields WikiqPage objects (and LogItems).

    NOTE(review): from_file/process_item stash state on the *class* object
    rather than on instances, because mwxml invokes process_item as a
    classmethod callback with no instance handle.  That means only one
    WikiqIterator can safely be active per process — confirm before reuse.
    """

    @classmethod
    def from_file(cls, fh, collapse_user = False):
        # Stash the arguments on the class so process_item can see them.
        cls.fh = fh
        cls.collapse_user = collapse_user
        cls = super(WikiqIterator, cls).from_file(fh)
        return cls

    @classmethod
    def process_item(cls, item_element, namespace_map):
        # Lazily build the id -> name namespace map; used by WikiqPage to
        # re-prefix titles with their namespace.
        if not hasattr(cls,'inv_namespace_map'):
            cls.inv_namespace_map = {ns.id:name for name, ns in namespace_map.items()}

        if item_element.tag == "page":
            return WikiqPage.from_element(item_element, namespace_map, cls.inv_namespace_map, cls.collapse_user)
        elif item_element.tag == "logitem":
            return LogItem.from_element(item_element, namespace_map)
        else:
            # Bug fix: MalformedXML (and LogItem above) were never imported,
            # so this line previously raised NameError instead of the
            # intended exception.  Both are now imported from mwxml.
            raise MalformedXML("Expected to see <page> or <logitem>.  " +
                               "Instead saw <{0}>".format(item_element.tag))
76
class WikiqPage(Page):
    """mwxml Page that yields revisions one at a time, optionally collapsing
    runs of consecutive revisions by the same editor into the final one.

    NOTE(review): like WikiqIterator, from_element assigns "instance" state
    onto the class object itself (mwxml builds pages via classmethod
    callbacks), so the attributes below actually live on the class.
    """
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions','collapse_user')

    @classmethod
    def from_element(cls, item_element, namespace_map, inv_namespace_map, collapse_user = False):
        # prev_rev carries the pending revision between __find_next_revision
        # calls (None means "start fresh from the underlying iterator").
        cls.prev_rev = None

        cls = super(WikiqPage, cls).from_element(item_element, namespace_map)

        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        # this undoes the "correction" of the namespace in mwxml
        if cls.namespace not in inv_namespace_map:
            cls.namespace = 0
        if cls.namespace != 0:
            # re-prefix the title with its namespace name (mwxml strips it)
            cls.title = ':'.join([inv_namespace_map[cls.namespace], cls.title])

        cls.collapse_user = collapse_user
        # expose Page's name-mangled private revision iterator publicly
        cls.revisions = cls._Page__revisions
        return cls

    @staticmethod
    def _correct_sha(rev_data):
        """Normalize a revision in place: blank fields when the text is
        deleted, replace a None text with "", and compute a sha1 when the
        dump did not include one.  Returns the same object."""

        if rev_data.deleted.text:
            rev_data.text = ""
            rev_data.text_chars = 0
            rev_data.sha1 = ""
            rev_data.revert = ""
            rev_data.reverteds = ""

        else:
            if rev_data.text is None :
                rev_data.text = ""

        rev_data.text_chars = len(rev_data.text)

        if hasattr(rev_data,"sha1") and rev_data.sha1 is not None:
            text_sha1 = rev_data.sha1

        else:
            # dump omitted the sha1; derive it from the (possibly empty) text
            text_sha1 = sha1(bytes(rev_data.text, "utf8")).hexdigest()

        rev_data.sha1 = text_sha1

        return rev_data

    # Outline for how we want to handle collapse_user=True
    # iteration   rev.user   prev_rev.user   add prev_rev?
    #         0          A            None           Never
    #         1          A               A           False
    #         2          B               A            True
    #         3          A               B            True
    #         4          A               A           False
    # Post-loop                          A          Always
    def __find_next_revision(self):
        """Return the next revision to report.  With collapse_user, consume
        a run of consecutive same-user revisions and return the pending one,
        annotated with collapsed_revs (the run length).  Raises StopIteration
        when the underlying revision stream is exhausted."""
        if self.prev_rev is None:
            prev_rev = WikiqPage._correct_sha(next(self.revisions))
            self.prev_rev = prev_rev
        else:
            prev_rev = self.prev_rev

        if self.collapse_user:
            collapsed_revs = 1
            self.prev_rev.collapsed_revs = collapsed_revs
            prev_rev = self.prev_rev

        for rev in self.revisions:
            rev = WikiqPage._correct_sha(rev)
            if self.collapse_user:
                # yield if this is the last edit in a seq by a user and reset
                # also yield if we do know who the user is

                if rev.deleted.user or prev_rev.deleted.user:
                    self.prev_rev = rev
                    if prev_rev is not None:
                        prev_rev.collapsed_revs = collapsed_revs
                        return prev_rev

                elif not rev.user.text == prev_rev.user.text:
                    # the editor changed: emit the previous run
                    self.prev_rev = rev
                    if prev_rev is not None:
                        prev_rev.collapsed_revs = collapsed_revs
                        return prev_rev

                # otherwise, add one to the counter
                else:
                    collapsed_revs += 1
                    rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
            else:
                self.prev_rev = rev
                if prev_rev is not None:
                    return prev_rev
            prev_rev = rev

        # revisions exhausted: emit the final pending revision; the *next*
        # call will hit next(self.revisions) and raise StopIteration
        self.prev_rev = None

        if self.collapse_user:
            prev_rev.collapsed_revs = collapsed_revs
        return prev_rev


    def __next__(self):
        revision = self.__find_next_revision()
        revision.page = self
        return revision

    def __iter__(self):
        while(True):
            # Bug fix: under PEP 479 (Python 3.7+), a StopIteration escaping
            # __find_next_revision inside this generator is re-raised as
            # RuntimeError, crashing at the end of every page.  Convert it
            # into a normal end of iteration instead.
            try:
                revision = self.__find_next_revision()
            except StopIteration:
                return
            revision.page = self
            yield revision
192
class WikiqParser():
    """Parse a MediaWiki XML dump and write one tab-separated row per
    revision, with optional revert and token-persistence measures."""

    def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False, namespaces = None, exclude_punct = False, exclude_ws = False):
        """ 
        Parameters:
           input_file : file-like XML dump to read
           output_file : file-like destination for TSV rows
           collapse_user : collapse runs of consecutive edits by one editor
           persist : what persistence method to use. Takes a PersistMethod value
           urlencode : url-encode the TO_ENCODE fields in the output
           namespaces : iterable of namespace ids to include (None = all)
           exclude_punct / exclude_ws : drop punctuation / whitespace tokens
               from persistence measures
        """
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode
        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        self.exclude_punct = exclude_punct
        self.exclude_ws = exclude_ws

        # Construct dump file iterator
        self.dump = WikiqIterator.from_file(self.input_file, self.collapse_user)

        self.diff_engine = None

        if self.persist == PersistMethod.sequence:
            self.diff_engine = SequenceMatcher(tokenizer = wikitext_split)

        if self.persist == PersistMethod.segment:
            self.diff_engine = SegmentMatcher(tokenizer = wikitext_split)

    def process(self):
        """Iterate pages and revisions, computing revert and (optionally)
        persistence measures, and print one row per revision."""
        page_count = 0
        rev_count = 0

        for page in self.dump:

            # skip pages not in the namespaces we want
            if self.namespace_filter is not None and page.namespace not in self.namespace_filter:
                continue

            # detects identity reverts (sha1 matches) within this page
            rev_detector = mwreverts.Detector()

            if self.persist != PersistMethod.none:
                # persistence needs to look PERSISTENCE_RADIUS revisions
                # ahead, so finished rows are buffered in this window
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                      revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = DiffState(SegmentMatcher(tokenizer = wikitext_split),
                                      revert_radius=PERSISTENCE_RADIUS)

                else:
                    # legacy: import lazily so the old `mw` library is only
                    # required when this mode is actually requested
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:
                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : page.namespace,
                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" }

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""
                    # if text exists, we'll check for a sha1 and generate one otherwise

                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id == None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                # we can easily add redirect info
                # rev_data['redirect'] = rev.page.redirect

                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist != PersistMethod.none:
                    if rev.deleted.text:
                        # Deleted text cannot enter the diff state, so emit
                        # this row immediately with null persistence fields.
                        # (Bug fix: this previously wrote into old_rev_data,
                        # which is unbound until the first window fills --
                        # a NameError -- and the row was never printed.)
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                        self.print_rev_data(rev_data)
                    else:

                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            # the oldest row now has a full look-ahead window
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, \
                                num_tokens_added, \
                                num_tokens_removed = \
                                                     calculate_persistence(
                                                         old_tokens_added,
                                                         old_tokens_removed,
                                                         exclude_ws = self.exclude_ws,
                                                         exclude_punct = self.exclude_punct,
                                                         legacy = self.persist == PersistMethod.legacy)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens_added
                            old_rev_data["tokens_removed"] = num_tokens_removed
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item

                    num_token_revs, \
                        num_tokens_added, \
                        num_tokens_removed = calculate_persistence(
                            tokens_added,
                            tokens_removed,
                            exclude_ws = self.exclude_ws,
                            exclude_punct = self.exclude_punct,
                            legacy = self.persist == PersistMethod.legacy)


                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens_added
                    rev_data["tokens_removed"] = num_tokens_removed
                    # these rows had fewer than RADIUS revisions after them
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        """Write one TSV row (and, the first time, a header of the sorted
        field names) to the output file."""
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        # fields are emitted in sorted-key order to match the header
        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
396
397
def open_input_file(input_filename):
    """Open a possibly-compressed dump for reading.

    .7z/.gz/.bz2 files are streamed through an external decompressor
    subprocess; any other filename is opened directly as text.
    """
    cmd = None
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    if cmd is None:
        # Not a recognized compressed format: read the file directly.
        # (The original relied on `cmd` being unbound here and catching the
        # resulting NameError from Popen -- fragile exception-driven flow.)
        return open(input_filename, 'r')

    return Popen(cmd, stdout=PIPE).stdout
412
def open_output_file(input_filename):
    """Open '<input stem>.tsv' for writing, deriving the stem from
    input_filename by stripping a trailing compression suffix and then a
    trailing '.xml'."""
    # strip a trailing .7z/.gz/.bz2 (or a bare trailing dot, as before)
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    # Bug fix: the pattern was unanchored (r'\.xml'), so '.xml' was removed
    # anywhere in the path, mangling names like 'a.xmlb' or dirs containing
    # '.xml'.  Only strip it from the end.
    output_filename = re.sub(r'\.xml$', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file
421
# ---- command-line interface -------------------------------------------------
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow.  Use -p=segment for advanced persistence calculation method that is robust to content moves. This might be very slow. Use -p=legacy for legacy behavior.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="Id number of namespace to include. Can be specified more than once.")

parser.add_argument('--exclude-whitespace', dest="exclude_ws", action="store_true",
                    help="Flag to remove whitespace from persistence measures.")

parser.add_argument('--exclude-punctuation', dest="exclude_punct", action="store_true",
                    help="Flag to remove punctuation from persistence measures.")

args = parser.parse_args()

# map the --persistence choice onto a PersistMethod value
# (a bare -p, i.e. const='', selects the default sequence method)
if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            # name the .tsv after the dump file, inside the output directory
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces = namespaces,
                            exclude_punct = args.exclude_punct,
                            exclude_ws = args.exclude_ws)

        wikiq.process()

        # close things 
        input_file.close()
        output_file.close()
else:
    # No files given: read the dump from stdin and write TSV to stdout.
    # Bug fix: this branch passed persist_legacy=args.persist_legacy, an
    # argument that neither WikiqParser nor argparse defines, so stdin mode
    # crashed before processing began; it also ignored the exclude flags.
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces = namespaces,
                        exclude_punct = args.exclude_punct,
                        exclude_ws = args.exclude_ws)
    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")

Community Data Science Collective || Want to submit a patch?