#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size

import argparse
import sys
import os, os.path
import re

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump

from deltas.tokenizers import wikitext_split
from deltas import SequenceMatcher
import mwpersistence
import mwreverts
from urllib.parse import quote

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7

def calculate_persistence(tokens_added):
    """Return (persistent token revisions, number of tokens added).

    Each added token contributes len(x.revisions) - 1: the number of
    revisions it appeared in beyond the one that introduced it (taking
    the first entry of x.revisions as the introducing revision).
    """
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))
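
# For example, if three tokens were added and their .revisions lists have
# lengths 4, 1, and 2, calculate_persistence returns
# (3 + 0 + 1, 3) == (4, 3).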


class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page, collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)
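
# A minimal usage sketch (hypothetical filename; assumes an uncompressed
# XML dump opened as a file handle):
#
#   dump = WikiqIterator(open("dump.xml"), collapse_user=False)
#   for page in dump:
#       for rev in page:
#           print(page.id, rev.id)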

class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, collapse_user=False):
        self.id = page.id
        self.title = page.title
        self.namespace = page.namespace
        self.redirect = page.redirect
        self.restrictions = page.restrictions

        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a sequence by a user and reset;
                    # also yield if we don't know who the user is

                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif rev.user.text != prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final time
        yield prev_rev
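
    # e.g. with collapse_user=True, a run of edits by users A, A, B, A, A is
    # collapsed to three yielded revisions, carrying collapsed_revs values of
    # 2, 1, and 2 respectively.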

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)

class WikiqParser():

    def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.persist_legacy = persist_legacy
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode

    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # remember the default namespace (which has no name) and skip it
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns
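
    # Illustrative sketch: if self.namespaces were {"Talk": 1, None: 0} (the
    # `ns is None` branch anticipates a main namespace parsed without a name),
    # then "Talk:Foo" resolves to 1 while "Foo" falls back to the default, 0.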

    def process(self):

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                         r'output/wikiq-\1-\2.tsv',
        #                         input_filename)

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            rev_detector = mwreverts.Detector()

            if self.persist or self.persist_legacy:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if not self.persist_legacy:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title),
                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" }

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # if the dump doesn't provide a sha1, generate one from the text
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""
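
                # Note: mwreverts.Detector flags identity reverts, i.e. a
                # revision whose checksum matches an earlier revision within
                # its detection radius; the intervening revisions are
                # reported in revert.reverteds.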

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse_user was on, report the number of collapsed revisions
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist or self.persist_legacy:
                    if rev.deleted.text:
                        # deleted revisions have no text to diff, so emit the
                        # row immediately with empty persistence fields
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                        self.print_rev_data(rev_data)

                    else:
                        if not self.persist_legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            # the window is full: the oldest revision has now been
                            # seen by PERSISTENCE_RADIUS - 1 subsequent revisions
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist or self.persist_legacy:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
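
    # Output is one TSV row per revision, with columns in sorted key order
    # and the header printed before the first row, e.g. (sketch, subset of
    # fields, illustrative values):
    #
    #   anon    articleid   date_time            deleted   editor     ...
    #   FALSE   12          2004-07-23 02:42:06  FALSE     "Example"  ...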


def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # cmd is unbound when the filename has no recognized compression
        # extension, so fall back to reading the file directly
        input_file = open(input_filename, 'r')

    return input_file
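
# Note: decompression shells out to the external 7za, zcat, and bzcat
# binaries, which must be on PATH; only the uncompressed fallback reads
# the file in-process.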

def open_output_file(input_filename):
    # construct the output filename from the input filename
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml$', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file
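
# e.g. "enwiki-20180301-pages-meta-history1.xml.bz2" (hypothetical filename)
# becomes "enwiki-20180301-pages-meta-history1.tsv"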

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision in each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
                    help="Use the legacy persistence calculation, based on the old python-mediawiki-utilities implementation.")
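
# Example invocation (sketch, hypothetical filename):
#
#   python3 wikiq enwiki-20180301-pages-meta-history1.xml.bz2 -o output -p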

args = parser.parse_args()

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=args.persist,
                            persist_legacy=args.persist_legacy,
                            urlencode=args.urlencode)

        wikiq.process()

        # close things
        input_file.close()
        output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=args.persist,
                        persist_legacy=args.persist_legacy,
                        urlencode=args.urlencode)
    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
