]> code.communitydata.science - mediawiki_dump_tools.git/blob - wikiq
add namespace filter parameter
[mediawiki_dump_tools.git] / wikiq
1 #!/usr/bin/env python3
2
3 # original wikiq headers are: title articleid revid date_time anon
4 # editor editor_id minor text_size text_entropy text_md5 reversion
5 # additions_size deletions_size
6 import pdb
7 import argparse
8 import sys
9 import os, os.path
10 import re
11
12 from subprocess import Popen, PIPE
13 from collections import deque
14 from hashlib import sha1
15
16 from mwxml import Dump
17
18 from deltas.tokenizers import wikitext_split
19 import mwpersistence
20 import mwreverts
21 from urllib.parse import quote
22 TO_ENCODE = ('title', 'editor')
23 PERSISTENCE_RADIUS=7
24 from deltas import SequenceMatcher
25 from deltas import SegmentMatcher
26
class PersistMethod:
    """Integer codes naming the token-persistence strategies."""
    # none = 0, sequence = 1, segment = 2, legacy = 3
    none, sequence, segment, legacy = range(4)
32
def calculate_persistence(tokens_added):
    """Summarise the persistence of a collection of tokens.

    Returns a ``(token_revs, token_count)`` tuple.  ``token_revs`` adds up,
    over all tokens, the length of the token's ``revisions`` minus one;
    ``token_count`` is simply the number of tokens passed in.
    """
    token_revs = 0
    for token in tokens_added:
        token_revs += len(token.revisions) - 1
    return (token_revs, len(tokens_added))
36
class WikiqIterator():
    """Iterate over the pages of a MediaWiki XML dump as WikiqPage objects.

    Parameters:
       fh : file object handed to ``Dump.from_file``
       collapse_user : forwarded to every WikiqPage that is created
    """

    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        # namespace id -> namespace name, taken from the dump's site info
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        # a single shared generator: iterating twice continues where the
        # previous iteration stopped
        self.__pages = self.load_pages()

    def load_pages(self):
        """Generator wrapping each page of the dump in a WikiqPage."""
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        # must use the same (name-mangled) attribute that __init__ assigns;
        # reading self._pages raised AttributeError on every call
        return next(self.__pages)
57
class WikiqPage():
    """A dump page together with an iterator over its revisions.

    Parameters:
       page : page object from the dump iterator; it is iterated to obtain
              revisions and must expose id, namespace, title and restrictions
       namespace_map : dict mapping namespace id to namespace name
       collapse_user : when true, only the last revision of each run of
                       consecutive edits by the same user is yielded, and
                       every revision gets a ``collapsed_revs`` counter
    """
    # NOTE: 'redirect' is declared here but never assigned by this class.
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        # NOTE: self.namespace keeps the value found on the page, even when
        # the page object itself is reset to namespace 0 below.
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        """Generator over the page's revisions.

        Every revision is emitted one step late (when its successor is
        seen) so that, with collapse_user, a run of edits by one user can be
        folded into its final revision.  A page without revisions yields
        nothing.
        """
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        prev_rev = None
        collapsed_revs = 0

        for rev in self.mwpage:
            if self.collapse_user:
                if prev_rev is None:
                    # never yield the first time
                    collapsed_revs = 1
                elif (rev.deleted.user or prev_rev.deleted.user
                      or rev.user.text != prev_rev.user.text):
                    # yield if this is the last edit in a seq by a user and
                    # reset; also yield if we do not know who the user is
                    yield prev_rev
                    collapsed_revs = 1
                else:
                    # otherwise, add one to the counter
                    collapsed_revs += 1
                rev.collapsed_revs = collapsed_revs
            elif prev_rev is not None:
                # if collapse_user is false, we always yield
                yield prev_rev

            prev_rev = rev

        # also yield the final time (unless the page had no revisions at all)
        if prev_rev is not None:
            yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
128
class WikiqParser():
    """Turn a MediaWiki XML dump into tab-separated, one-row-per-revision output."""

    def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False, namespaces = None):
        """
        Parameters:
           input_file : file object the XML dump is read from
           output_file : file object the rows are printed to
           collapse_user : forwarded to WikiqIterator; also adds a
                           collapsed_revs column to the output
           persist : what persistence method to use. Takes a PersistMethod
                     value; None is treated as PersistMethod.none
           urlencode : when true, the fields named in TO_ENCODE are url-quoted
           namespaces : iterable of namespace ids to keep; None keeps all
        """

        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        # None used to compare unequal to PersistMethod.none and therefore
        # silently selected the legacy persistence code path
        self.persist = PersistMethod.none if persist is None else persist
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode
        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

    def __get_namespace_from_title(self, title):
        """Guess a namespace id from a "Namespace:Title" style prefix.

        self.namespaces maps namespace name -> id.  The id stored under the
        None name (if any) is returned when no prefix matches.
        """
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

    def _is_filtered_out(self, namespace):
        """Return True when a namespace filter is set and excludes `namespace`."""
        return (self.namespace_filter is not None
                and namespace not in self.namespace_filter)

    def _new_persistence_state(self):
        """Create the per-page persistence tracker for self.persist."""
        if self.persist == PersistMethod.sequence:
            return mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                           revert_radius=PERSISTENCE_RADIUS)

        if self.persist == PersistMethod.segment:
            return mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
                                           revert_radius=PERSISTENCE_RADIUS)

        # self.persist == PersistMethod.legacy
        from mw.lib import persistence
        return persistence.State()

    @staticmethod
    def _set_persistence_fields(rev_data, tokens_added, tokens_removed, tokens_window):
        """Store the persistence columns for one revision in rev_data."""
        num_token_revs, num_tokens = calculate_persistence(tokens_added)

        rev_data["token_revs"] = num_token_revs
        rev_data["tokens_added"] = num_tokens
        rev_data["tokens_removed"] = len(tokens_removed)
        rev_data["tokens_window"] = tokens_window

    def process(self):
        """Read the whole dump and print one row per revision.

        A summary line with the number of revisions and pages handled is
        written to stderr at the end.
        """

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self._is_filtered_out(namespace):
                continue

            rev_detector = mwreverts.Detector()

            if self.persist != PersistMethod.none:
                # rows are held back in this window until enough later
                # revisions have been seen to measure persistence
                window = deque(maxlen=PERSISTENCE_RADIUS)
                state = self._new_persistence_state()

            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : namespace,
                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" }

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # if text exists, we'll check for a sha1 and generate one otherwise
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse user was on, lets run that
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist != PersistMethod.none:
                    if rev.deleted.text:
                        # There is no text to diff, so this revision never
                        # enters the window.  Fill its persistence columns
                        # with None and print it right away (it may therefore
                        # appear before earlier rows still held in the window).
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                        self.print_rev_data(rev_data)
                    else:

                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            # the oldest held-back row has now been followed by
                            # PERSISTENCE_RADIUS-1 revisions: finalize and print it
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            self._set_persistence_fields(old_rev_data,
                                                         old_tokens_added,
                                                         old_tokens_removed,
                                                         PERSISTENCE_RADIUS-1)

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item

                    self._set_persistence_fields(rev_data, tokens_added,
                                                 tokens_removed,
                                                 len(window)-(i+1))

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        """Print one row (values ordered by sorted key) to self.output_file.

        With urlencode set, the TO_ENCODE fields of rev_data are replaced
        in place by their url-quoted form before printing.
        """
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
342
343
def open_input_file(input_filename):
    """Return a readable file object for a dump file.

    Names ending in .7z, .gz or .bz2 are streamed through an external
    decompressor (7za, zcat or bzcat) and the child's stdout pipe is
    returned; any other name is opened directly in text mode.
    """
    decompressors = (
        (r'.*\.7z$', ["7za", "x", "-so", input_filename, '*']),
        (r'.*\.gz$', ["zcat", input_filename]),
        (r'.*\.bz2$', ["bzcat", "-dk", input_filename]),
    )

    # first matching suffix wins
    for pattern, command in decompressors:
        if re.match(pattern, input_filename):
            return Popen(command, stdout=PIPE).stdout

    # not a known compressed format
    return open(input_filename, 'r')
358
def open_output_file(input_filename):
    """Open, for writing, the .tsv file that corresponds to a dump name.

    A trailing compression suffix (.7z, .gz, .bz2) and every ".xml" are
    stripped from input_filename before ".tsv" is appended.
    """
    without_compression = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    stem = re.sub(r'\.xml', '', without_compression)
    return open(stem + ".tsv", "w")
367
# Command line interface: positional dump files plus output and analysis options.
parser = argparse.ArgumentParser(
    description='Parse MediaWiki XML database dumps into tab delimitted data.')

# where the input comes from
parser.add_argument(
    'dumpfiles',
    metavar="DUMPFILE",
    nargs="*",
    type=str,
    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

# where the output goes (nargs=1 stores the directory as a one-element list)
parser.add_argument(
    '-o', '--output-dir',
    metavar='DIR',
    dest='output_dir',
    type=str,
    nargs=1,
    help="Directory for output files.")

parser.add_argument(
    '-s', '--stdout',
    dest="stdout",
    action="store_true",
    help="Write output to standard out (do not create dump file)")

# how revisions are processed
parser.add_argument(
    '--collapse-user',
    dest="collapse_user",
    action="store_true",
    help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

# a bare -p stores the empty string; omitting the flag stores None
parser.add_argument(
    '-p', '--persistence',
    dest="persist",
    default=None,
    const='',
    type=str,
    choices = ['','segment','sequence','legacy'],
    nargs='?',
    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow.  Use -p=segment for advanced persistence calculation method that is robust to content moves. This might be very slow. Use -p=legacy for legacy behavior.")

parser.add_argument(
    '-u', '--url-encode',
    dest="urlencode",
    action="store_true",
    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument(
    '-ns', '--namespace-filter',
    dest="namespace_filter",
    type=str,
    help="Comma-seperate list of namespaces numbers to include",
    default=None)


args = parser.parse_args()
393
# set persistence method
# (a bare -p arrives as the empty string and selects the sequence method)

if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

# the namespace filter arrives as a comma separated string of integer ids
if args.namespace_filter is not None:
    namespaces = [int(ns) for ns in args.namespace_filter.split(',')]
else:
    namespaces = None

if len(args.dumpfiles) > 0:
    # directory for output (argparse stores it as a one-element list)
    if args.output_dir:
        output_dir = args.output_dir[0]
    else:
        output_dir = "."

    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        try:
            # the parser has to be built for both output modes; it used to be
            # created only when writing to a file, which broke --stdout
            wikiq = WikiqParser(input_file, output_file,
                                collapse_user=args.collapse_user,
                                persist=persist,
                                urlencode=args.urlencode,
                                namespaces = namespaces)

            wikiq.process()
        finally:
            # close things, but leave stdout open for the next dump file
            input_file.close()
            if output_file is not sys.stdout:
                output_file.close()
else:
    # no dump files given: read the dump from stdin and write rows to stdout
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces = namespaces)
    wikiq.process()
448
449 # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
450 # stop_words = stop_words.split(",")

Community Data Science Collective || Want to submit a patch?