#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import pdb
import argparse
import sys
import os, os.path
import re

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mw.xml_dump import Iterator

from deltas.tokenizers import wikitext_split
import mwpersistence
import mwreverts
from urllib.parse import quote
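# TO_ENCODE lists the output fields that are urlencoded when --url-encode is
# given. PERSISTENCE_RADIUS is the revert_radius passed to the persistence
# state and the length of the sliding window of revisions that are buffered
# before their persistence statistics are printed.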
TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS=7
from deltas import SequenceMatcher

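# calculate_persistence() summarizes the tokens a revision added, as tracked by
# the persistence state: it returns (1) the number of "token revisions", i.e.
# the sum over added tokens of how many subsequent revisions each token
# survived, and (2) the number of tokens added.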
def calculate_persistence(tokens_added):
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))


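# WikiqIterator wraps the XML dump iterator and yields WikiqPage objects; a
# WikiqPage in turn yields the page's revisions, optionally collapsing each run
# of consecutive edits by the same editor into its final revision (see
# rev_list() below).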
class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Iterator.from_file(self.fh)
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page, collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)

class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, collapse_user=False):
        self.id = page.id
        self.title = page.title
        self.namespace = page.namespace
        self.redirect = page.redirect
        self.restrictions = page.restrictions

        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a seq by a user and reset
                    if rev.contributor.user_text != prev_rev.contributor.user_text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev
        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)

class WikiqParser():


    def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):

        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.persist_legacy = persist_legacy
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode

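    # Map a title to a namespace id by matching its "Namespace:" prefix against
    # the namespace names extracted from the dump; if nothing matches, fall back
    # to the namespace whose name is undefined, which serves as the default.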
    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns == None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

    def process(self):

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                         r'output/wikiq-\1-\2.tsv',
        #                         input_filename)

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract a mapping from namespace names to namespace ids
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.namespaces}

        page_count = 0
        rev_count = 0


        # Iterate through pages
        for page in dump:
            rev_detector = mwreverts.Detector()

            if self.persist or self.persist_legacy:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if not self.persist_legacy:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.contributor.id == None else rev.contributor.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : page.namespace if page.namespace else self.__get_namespace_from_title(page.title),
                            'deleted' : "TRUE" if rev.text.deleted else "FALSE" }

                # if revisions are deleted, /many/ things will be missing
                if rev.text.deleted:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # if text exists, we'll check for a sha1 and generate one otherwise
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if rev.contributor.user_text:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.contributor.user_text + '"'
                    rev_data['anon'] = "TRUE" if rev.contributor.id == None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse_user is on, record how many revisions were collapsed into this one
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

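                # Persistence measures for a revision depend on the next
                # PERSISTENCE_RADIUS - 1 revisions, so rows are buffered in
                # `window` and only printed once the window fills; anything
                # still buffered when the page ends is flushed below.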
                if self.persist or self.persist_legacy:
                    if rev.text.deleted:
                        # deleted revisions have no text to diff, so the persistence fields stay empty
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                    else:

                        if not self.persist_legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist or self.persist_legacy:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

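    # Write one revision per line as tab-separated values, with fields in
    # sorted key order; the header row is printed once, before the first
    # revision. Fields listed in TO_ENCODE are urlencoded first when
    # --url-encode is in effect.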
    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)


def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

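    # if the filename matched none of the patterns above, `cmd` is never bound,
    # and the NameError raised below makes us fall back to reading the file as
    # plain, uncompressed XML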
    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        input_file = open(input_filename, 'r')

    return input_file

def open_output_file(input_filename):
    # derive the output filename from the input filename by stripping the
    # compression and .xml extensions and appending .tsv (e.g. foo.xml.bz2 -> foo.tsv)
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab-delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision in each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
                    help="Use the legacy (mw.lib.persistence) method of computing content persistence measures.")

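# For example (these filenames are hypothetical):
#   ./wikiq enwiki-sample.xml.bz2 -o output/ --collapse-user
#   ./wikiq enwiki-sample.xml.7z --persistence --url-encode
#   zcat enwiki-sample.xml.gz | ./wikiq > enwiki-sample.tsv
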
args = parser.parse_args()

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=args.persist,
                            persist_legacy=args.persist_legacy,
                            urlencode=args.urlencode)


        wikiq.process()

        # close things
        input_file.close()
        output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=args.persist,
                        persist_legacy=args.persist_legacy,
                        urlencode=args.urlencode)
    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
