# wikiq_util.py (mediawiki_dump_tools)

import sys
import re
from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1
from deltas.tokenizers import wikitext_split
from mwxml import Dump
import mwpersistence
import mwreverts
from urllib.parse import quote, unquote
from deltas import SequenceMatcher

# fields that get urlencoded when urlencoding is enabled
TO_ENCODE = ('title', 'editor')
# size of the sliding window of revisions used to compute token persistence
PERSISTENCE_RADIUS = 7

def try_unquote(obj):
    # url-decode a string and strip surrounding quotes; non-string input
    # (e.g. None) passes through as None
    if isinstance(obj, str):
        obj = unquote(obj)
        return obj.strip('"')
    else:
        return None

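# Example (illustrative values): try_unquote('%22Talk%3AFoo%22') returns
# 'Talk:Foo', while try_unquote(None) returns None.
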
def calculate_persistence(tokens_added):
    # returns (total revisions the added tokens persisted through,
    #          number of tokens added)
    return (sum([(len(x.revisions) - 1) for x in tokens_added]),
            len(tokens_added))

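# For example, if tokens_added holds two tokens whose .revisions lists have
# lengths 3 and 5, this returns ((3 - 1) + (5 - 1), 2) == (6, 2).
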
class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map=self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)

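# Minimal usage sketch (the filename is hypothetical; open_input_file is
# defined later in this module):
#
#   fh = open_input_file("enwiki-sample.xml.bz2")
#   for page in WikiqIterator(fh, collapse_user=False):
#       for rev in page:
#           print(rev.id)
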
class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a sequence by a
                    # user and reset the counter; also yield if we don't
                    # know who the user is
                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif rev.user.text != prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)

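# Illustration of collapse_user: for a page whose revisions are by users
# A, A, B, rev_list() yields the second A revision (collapsed_revs == 2)
# followed by the B revision (collapsed_revs == 1).
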
class WikiqParser():

    def __init__(self, input_file, output_file, collapse_user=False,
                 persist=False, urlencode=False, persist_legacy=False):

        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.persist_legacy = persist_legacy
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode

    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, return the default namespace
        return default_ns

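# e.g. with self.namespaces == {'Talk': 1, None: 0} (illustrative values,
# assuming the main namespace's name is None), a title of 'Talk:Foo' returns
# 1, while 'Foo' falls back to the default namespace, 0.
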
    def process(self):

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                         r'output/wikiq-\1-\2.tsv',
        #                         input_filename)

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            rev_detector = mwreverts.Detector()

            if self.persist or self.persist_legacy:
                # persistence statistics for a revision depend on the
                # revisions that follow it, so rows are buffered in a
                # sliding window and printed with a delay
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if not self.persist_legacy:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title),
                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" }

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # if a sha1 is included, use it; otherwise generate one
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO: rev.bytes doesn't work; looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this
                # might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse_user was on, record the collapsed revision count
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist or self.persist_legacy:
                    if rev.deleted.text:
                        # deleted revisions never enter the persistence
                        # window, so fill in empty persistence fields and
                        # print them directly
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                        self.print_rev_data(rev_data)

                    else:
                        if not self.persist_legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS - 1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist or self.persist_legacy:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window) - (i + 1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

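# Note on window semantics: with PERSISTENCE_RADIUS == 7, a revision printed
# from a full window has tokens_window == 6 (persistence measured over the
# six revisions that follow it); revisions flushed at the end of a page get
# progressively smaller windows.
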
    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)

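# End-to-end sketch (hypothetical filename; mirrors how the wikiq script
# drives this class):
#
#   input_file = open_input_file("enwiki-sample.xml.bz2")
#   output_file = open_output_file("enwiki-sample.xml.bz2")
#   WikiqParser(input_file, output_file, persist=True).process()
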
def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # cmd is unbound when no compressed extension matched, so fall
        # back to reading the file directly
        input_file = open(input_filename, 'r')

    return input_file

def open_output_file(input_filename):
    # derive the output filename from the input filename
    output_filename = re.sub(r'\.(7z|gz|bz2)$', '', input_filename)
    output_filename = re.sub(r'\.xml$', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file

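# e.g. "enwiki-sample.xml.bz2" becomes "enwiki-sample.tsv"
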
class IPCheck(object):

    # IP address regexes taken from https://gist.github.com/mnordhoff/2213179
    ipv4_address = re.compile('^(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$')

    ipv6_address_or_addrz = re.compile('^(?:(?:[0-9A-Fa-f]{1,4}:){6}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|::(?:[0-9A-Fa-f]{1,4}:){5}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){4}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){3}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,2}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){2}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,3}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}:(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,4}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,5}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}|(?:(?:[0-9A-Fa-f]{1,4}:){,6}[0-9A-Fa-f]{1,4})?::)(?:%25(?:[A-Za-z0-9\\-._~]|%[0-9A-Fa-f]{2})+)?$')

    @staticmethod
    def is_ip(username):
        '''Check if a username is an ip (v4 or v6) address. We use this as
        a marker of whether the user is anonymous.'''
        if not isinstance(username, str):
            return False

        return bool(IPCheck.ipv4_address.match(username)
                    or IPCheck.ipv6_address_or_addrz.match(username))
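# e.g. IPCheck.is_ip("192.168.0.1") and IPCheck.is_ip("2001:db8::1") return
# True, while IPCheck.is_ip("Jimbo Wales") returns False.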
