#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size

import argparse
import sys
import os, os.path
import re

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1
from urllib.parse import quote

from mwxml import Dump
from deltas.tokenizers import wikitext_split
from deltas import SequenceMatcher
from deltas import SegmentMatcher
import mwpersistence
import mwreverts

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7

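# symbolic names for the --persistence modes; 'legacy' defers to the older
# `mw` library rather than mwpersistence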
class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3

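# Summarize persistence for the tokens added in one revision: each
# mwpersistence token records the revisions it appeared in, so we return
# (total revisions persisted across all added tokens, number of tokens added).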
def calculate_persistence(tokens_added):
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))

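# WikiqIterator wraps an mwxml.Dump and yields WikiqPage objects, passing
# along the dump's namespace map and the collapse_user setting.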
class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)

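# WikiqPage wraps an mwxml page and, when collapse_user is set, collapses
# each run of consecutive revisions by the same user into its final revision.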
class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a sequence by a user and reset;
                    # also yield if we don't know who the user is

                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif not rev.user.text == prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)

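# WikiqParser drives the parse: it iterates over pages and revisions,
# computes reverts (and, optionally, token persistence), and writes one
# TSV row per revision.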
class WikiqParser():

    def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False, namespaces=None):
        """
        Parameters:
           persist : which persistence method to use. Takes a PersistMethod value.
           namespaces : list of namespace ids to include; None disables filtering.
        """

        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode
        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

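    # Resolve a namespace id from a title prefix: e.g. "Talk:Foo" returns the
    # id registered for "Talk". Titles with no matching prefix fall back to
    # the default namespace entry, if the dump defines one.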
    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

    def process(self):

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                         r'output/wikiq-\1-\2.tsv',
        #                         input_filename)

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract the list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self.namespace_filter is not None:
                if namespace not in self.namespace_filter:
                    continue

            rev_detector = mwreverts.Detector()

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)
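                # revisions are buffered in this sliding window so a row is
                # only emitted once the PERSISTENCE_RADIUS - 1 following
                # revisions (or the end of the page) have been seen and the
                # fate of its tokens is known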

                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                # self.persist == PersistMethod.legacy
                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : namespace,
                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" }

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # use the dump's sha1 if available; generate one otherwise
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # note: if the minor flag itself can be deleted/hidden, this may be inaccurate
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse user was on, record the collapsed revision count
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist != PersistMethod.none:
                    if rev.deleted.text:
                        # persistence can't be computed for deleted text, so
                        # emit the row with empty persistence fields
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                        self.print_rev_data(rev_data)

                    else:
                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)


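# Open a dump file for reading, spawning a decompressor chosen by file
# extension. For unrecognized extensions, cmd stays unbound, the Popen call
# raises NameError, and we fall back to reading the file as uncompressed text.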
def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        input_file = open(input_filename, 'r')

    return input_file

def open_output_file(input_filename):
    # build the output filename: strip the compression and .xml extensions
    # from the input filename and append .tsv
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab-delimited data.')

# arguments for the input files
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file).")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) the number of revisions used in computing the first measure. This may be slow. Use -p=segment for an advanced persistence calculation method that is robust to content moves but may be very slow. Use -p=legacy for legacy behavior.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_include", type=int, action='append',
                    help="Id number of a namespace to include. May be given more than once.")


args = parser.parse_args()

# set persistence method

if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_include is not None:
    namespaces = args.namespace_include
else:
    namespaces = None

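# process each dump file named on the command line; with no filenames, read a
# single dump from stdin and write TSV to stdout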
if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces)

        wikiq.process()

        # close things
        input_file.close()
        output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces=namespaces)
    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
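
# Example invocations (hypothetical filenames):
#   ./wikiq enwiki-20180301-pages-meta-history1.xml.7z -o output/
#   ./wikiq dump.xml.bz2 --collapse-user -p sequence -n 0 -n 1 -s > out.tsv
#   bzcat dump.xml.bz2 | ./wikiq > out.tsv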
