#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import pdb
import argparse
import sys
import os, os.path
import re

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump

from deltas.tokenizers import wikitext_split
import mwpersistence
import mwreverts
from urllib.parse import quote
TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7
from deltas import SequenceMatcher
from deltas import SegmentMatcher

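# enumeration of the persistence measurement methods selectable with the
# --persistence / -p command line option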
class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3

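# given the tokens added in a revision, return (total number of later revisions
# in which those tokens persist, number of tokens added)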
def calculate_persistence(tokens_added):
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))

class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)

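# WikiqPage wraps an mwxml page. It prefixes titles with the namespace name
# taken from the namespace map and, when collapse_user is set, collapses each
# run of consecutive revisions by the same user into the final revision of the run.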
class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            self.title = page.title
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    # also yield if we don't know who the user is

                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif rev.user.text != prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final revision
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)

class WikiqParser():

    def __init__(self, input_file, output_file, collapse_user=False, persist=PersistMethod.none, urlencode=False):
        """
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value
        """

        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode

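    # map a page title to a namespace id by matching its "Namespace:" prefix
    # against the namespaces read from the dump; titles with no matching prefix
    # get the default (unnamed) namespace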
    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # remember the default namespace (the one with no name) and keep looking
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

    def process(self):

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                         r'output/wikiq-\1-\2.tsv',
        #                         input_filename)

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            rev_detector = mwreverts.Detector()

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)
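                # revisions are buffered in this window and only written out
                # once PERSISTENCE_RADIUS later revisions have been processed
                # (or the page ends), so their persistence counts are complete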

                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                # self.persist == PersistMethod.legacy
                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title),
                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" }

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""
                    # if text exists, we'll check for a sha1 and generate one otherwise

                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse_user is on, record how many revisions were collapsed
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist != PersistMethod.none:
                    if rev.deleted.text:
                        # deleted text can't enter the persistence calculation,
                        # so write the row out with empty persistence fields
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                        self.print_rev_data(rev_data)

                    else:

                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

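    # write a single tab-separated row to the output file; the header row is
    # printed the first time this is called, so every row must share the same keys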
    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)

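# open a dump file for reading, piping it through the appropriate decompressor
# (7za, zcat, or bzcat); if no decompressor matches the filename, cmd is never
# assigned and the NameError fallback opens the file as plain text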
def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        input_file = open(input_filename, 'r')

    return input_file

def open_output_file(input_filename):
    # derive the output filename from the input filename by stripping the
    # compression and .xml extensions and appending .tsv
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab-delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. Use -p=segment for an advanced persistence calculation method that is robust to content moves. This might be very slow. Use -p=legacy for legacy behavior.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-ns', '--namespace-filter', dest="namespace_filter", type=str,
                    help="Comma-separated list of namespace numbers to include.")

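# TODO: namespace_filter is parsed above but not yet passed to WikiqParser,
# so the namespace filter is not applied anywhere below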
args = parser.parse_args()

# set persistence method

if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode)

        wikiq.process()

        # close things (but leave stdout open for any remaining files)
        input_file.close()
        if output_file is not sys.stdout:
            output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode)
    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
