elaborate docstring for persistence
[mediawiki_dump_tools.git] / wikiq
1 #!/usr/bin/env python3
2
3 # original wikiq headers are: title articleid revid date_time anon
4 # editor editor_id minor text_size text_entropy text_md5 reversion
5 # additions_size deletions_size
6
7 import argparse
8 import sys
9 import os, os.path
10 import re
11
12 from subprocess import Popen, PIPE
13 from collections import deque
14 from hashlib import sha1
15
16 from mwxml import Dump
17
18 from deltas.tokenizers import wikitext_split
19 import mwpersistence
20 import mwreverts
21 from urllib.parse import quote
22 TO_ENCODE = ('title', 'editor')
23 PERSISTENCE_RADIUS=7
24 from deltas import SequenceMatcher
25 from deltas import SegmentMatcher
26
class PersistMethod:
    """Integer codes selecting how token persistence is computed.

    ``none`` disables the persistence columns, ``sequence`` and ``segment``
    choose the deltas matcher of the same name, and ``legacy`` selects the
    older ``mw.lib`` persistence implementation.
    """
    none, sequence, segment, legacy = range(4)
32
def calculate_persistence(tokens_added):
    """Summarise how long the tokens added by one revision survived.

    Parameters:
       tokens_added : sized collection of token objects, each exposing a
                      ``revisions`` sequence.

    Returns a ``(token_revs, num_tokens)`` tuple.  ``token_revs`` adds up
    ``len(token.revisions) - 1`` over every token (the revision that
    introduced the token is not counted); ``num_tokens`` is simply
    ``len(tokens_added)``.
    """
    persisted_revisions = 0
    for token in tokens_added:
        persisted_revisions += len(token.revisions) - 1

    return (persisted_revisions, len(tokens_added))
36
class WikiqIterator():
    """Iterate over the pages of a MediaWiki XML dump as WikiqPage objects.

    Parameters:
       fh            : open file handle for the XML dump; it is handed to
                       ``mwxml.Dump.from_file``.
       collapse_user : forwarded to every WikiqPage that is created.

    Attributes:
       mwiterator    : the underlying mwxml Dump object.
       namespace_map : dict mapping namespace id -> namespace name, built
                       from the dump's site info.
    """

    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = {ns.id: ns.name
                              for ns in self.mwiterator.site_info.namespaces}
        # one shared generator: __iter__ and __next__ both consume it
        self.__pages = self.load_pages()

    def load_pages(self):
        """Lazily wrap each page of the dump in a WikiqPage."""
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map=self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        # the attribute is stored name-mangled (self.__pages); reading
        # self._pages here raised AttributeError on every next() call
        return next(self.__pages)
57
class WikiqPage():
    """A single dump page plus an iterator over the revisions to report.

    Parameters:
       page          : page object from the dump; must expose ``id``,
                       ``namespace``, ``title`` and ``restrictions`` and be
                       iterable over its revisions.
       namespace_map : dict mapping namespace id -> namespace name.
       collapse_user : when True, consecutive revisions by the same user are
                       collapsed and only the last one of each run is
                       yielded, tagged with ``collapsed_revs`` (the length of
                       the run).
    """
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        # note: this keeps the namespace id exactly as found in the dump,
        # even when it is reset to 0 on the page object just below
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        """Generate the revisions of the page, optionally collapsed by user.

        Each revision is yielded one step late (when its successor has been
        seen) so that a run of edits by one user can be collapsed into its
        final revision.  A page without any revisions yields nothing.
        """
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        prev_rev = None
        collapsed_revs = 0

        for rev in self.mwpage:
            if prev_rev is None:
                # never yield the first time
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            elif not self.collapse_user:
                # if collapse_user is false, we always yield
                yield prev_rev

            elif (rev.deleted.user or prev_rev.deleted.user
                  or rev.user.text != prev_rev.user.text):
                # yield if this is the last edit in a seq by a user and
                # reset; also yield if we do not know who the user is
                # (the deleted checks come first so a hidden user's text
                # is never read)
                yield prev_rev
                collapsed_revs = 1
                rev.collapsed_revs = collapsed_revs

            else:
                # otherwise, add one to the counter
                collapsed_revs += 1
                rev.collapsed_revs = collapsed_revs

            prev_rev = rev

        # also yield the final time, unless the page had no revisions at all
        # (previously that case failed on the unbound prev_rev)
        if prev_rev is not None:
            yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
128
class WikiqParser():
    """Turn a MediaWiki XML dump into tab-separated rows, one per revision.

    The header row is written once, before the first data row, and lists the
    column names in sorted order; every data row lists its values in the same
    sorted-by-column-name order.
    """

    def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False, namespaces = None):
        """
        Parameters:
           input_file    : open handle on the XML dump
           output_file   : writable handle that receives the TSV rows
           collapse_user : collapse consecutive edits by the same user
           persist       : what persistence method to use. Takes a
                           PersistMethod value; None means PersistMethod.none
           urlencode     : url-encode the fields listed in TO_ENCODE
           namespaces    : iterable of namespace ids to keep, or None to
                           keep every namespace
        """

        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        # None used to compare unequal to every PersistMethod value and so
        # silently selected the legacy code path
        self.persist = PersistMethod.none if persist is None else persist
        self.printed_header = False
        # replaced by a name -> id dict at the start of process()
        self.namespaces = {}
        self.urlencode = urlencode
        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

    def __get_namespace_from_title(self, title):
        """Guess a namespace id from the ``Name:`` prefix of a page title.

        Returns the id of the first namespace whose name prefixes the title,
        otherwise the id stored under the None key (if any), otherwise None.
        """
        default_ns = None

        for ns in self.namespaces:
            # a namespace without a name acts as the default
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

    def _make_persistence_state(self):
        """Build a fresh per-page persistence tracker for self.persist."""
        if self.persist == PersistMethod.sequence:
            return mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                           revert_radius=PERSISTENCE_RADIUS)

        if self.persist == PersistMethod.segment:
            return mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
                                           revert_radius=PERSISTENCE_RADIUS)

        # self.persist == PersistMethod.legacy
        # imported lazily so the old library is only needed for this mode
        from mw.lib import persistence
        return persistence.State()

    def process(self):
        """Read the whole dump and write one TSV row per reported revision.

        With persistence enabled, rows for revisions that have text are held
        back in a window of PERSISTENCE_RADIUS revisions so that later
        revisions can be counted; rows for revisions whose text is deleted
        carry None in the persistence columns and are written immediately,
        so they can appear ahead of rows still waiting in the window.
        A summary line is printed to stderr at the end.
        """

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        persist_enabled = self.persist != PersistMethod.none
        persistence_fields = ("token_revs", "tokens_added", "tokens_removed", "tokens_window")

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self.namespace_filter is not None and namespace not in self.namespace_filter:
                continue

            rev_detector = mwreverts.Detector()

            if persist_enabled:
                window = deque(maxlen=PERSISTENCE_RADIUS)
                state = self._make_persistence_state()

            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : namespace,
                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" }

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # if text exists, we'll check for a sha1 and generate one otherwise
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #TODO missing: additions_size deletions_size

                # if collapse user was on, lets run that
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if not persist_enabled:
                    self.print_rev_data(rev_data)

                elif rev.deleted.text:
                    # no text to diff: fill the persistence columns of THIS
                    # row with None and emit it.  The placeholders used to be
                    # written into old_rev_data (undefined, or a row that had
                    # already been printed) and the row itself was dropped.
                    for k in persistence_fields:
                        rev_data[k] = None

                    self.print_rev_data(rev_data)

                else:
                    if self.persist != PersistMethod.legacy:
                        _, tokens_added, tokens_removed = state.update(rev.text, rev.id)

                    else:
                        _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                    window.append((rev.id, rev_data, tokens_added, tokens_removed))

                    # once the window is full its oldest entry has been seen
                    # by enough later revisions to be reported
                    if len(window) == PERSISTENCE_RADIUS:
                        old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                        num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                        old_rev_data["token_revs"] = num_token_revs
                        old_rev_data["tokens_added"] = num_tokens
                        old_rev_data["tokens_removed"] = len(old_tokens_removed)
                        old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                        self.print_rev_data(old_rev_data)

                rev_count += 1

            if persist_enabled:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    # fewer later revisions were available for these rows
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        """Write one row (and, the first time, the header) to output_file.

        When url-encoding is enabled the TO_ENCODE fields of ``rev_data``
        are replaced in place by their quoted form before printing.
        """
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        columns = sorted(rev_data.keys())

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in columns]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(rev_data[k]) for k in columns]), file=self.output_file)
340
341
def open_input_file(input_filename):
    """Open a dump file for reading, decompressing it when necessary.

    Files ending in ``.7z``, ``.gz`` or ``.bz2`` are streamed through the
    external ``7za``, ``zcat`` or ``bzcat`` program and the stdout pipe of
    that process is returned; any other file is opened directly in text
    mode.

    Parameters:
       input_filename : path of the dump file.

    Returns a readable file object.
    """
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]
    else:
        # not a known archive: read the file as it is.  This case used to
        # be detected by catching the NameError from the unassigned cmd.
        return open(input_filename, 'r')

    return Popen(cmd, stdout=PIPE).stdout
356
def open_output_file(input_filename):
    """Open, for writing, the ``.tsv`` file that corresponds to a dump name.

    The output name is derived from ``input_filename``: a trailing ``.7z``,
    ``.gz`` or ``.bz2`` (or a bare trailing dot) is dropped, every ``.xml``
    is removed, and ``.tsv`` is appended.

    Returns the file object, opened in "w" mode.
    """
    without_archive_suffix = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    stem = re.sub(r'\.xml', '', without_archive_suffix)

    return open(stem + ".tsv", "w")
365
# Command-line interface: every option is declared here and the command
# line is parsed as soon as this module-level code runs.
parser = argparse.ArgumentParser(
    description='Parse MediaWiki XML database dumps into tab delimitted data.')

# arguments for the input direction
parser.add_argument(
    'dumpfiles',
    metavar="DUMPFILE",
    nargs="*",
    type=str,
    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

# where the output goes
parser.add_argument(
    '-o', '--output-dir',
    metavar='DIR',
    dest='output_dir',
    type=str,
    nargs=1,
    help="Directory for output files.")

parser.add_argument(
    '-s', '--stdout',
    dest="stdout",
    action="store_true",
    help="Write output to standard out (do not create dump file)")

# what gets computed
parser.add_argument(
    '--collapse-user',
    dest="collapse_user",
    action="store_true",
    help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

# NOTE(review): because the default is "sequence", args.persist is never
# None, so persistence cannot be switched off from the command line --
# confirm that this is intended.
parser.add_argument(
    '-p', '--persistence',
    dest="persist",
    default="sequence",
    const='',
    type=str,
    choices=['', 'segment', 'sequence', 'legacy'],
    nargs='?',
    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow.  The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

parser.add_argument(
    '-u', '--url-encode',
    dest="urlencode",
    action="store_true",
    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument(
    '-n', '--namespace-include',
    dest="namespace_filter",
    type=int,
    action='append',
    help="Id number of namspace to include. Can be specified more than once.")

args = parser.parse_args()
393
# set persistence method
# Translate the -p value into a PersistMethod code.  Anything that is not
# listed here (including the empty string produced by a bare "-p") selects
# the sequence method.
persist = {
    None: PersistMethod.none,
    "segment": PersistMethod.segment,
    "legacy": PersistMethod.legacy,
}.get(args.persist, PersistMethod.sequence)

# list of namespace ids collected from -n, or None when -n was never given
namespaces = args.namespace_filter
409
# Driver: convert every dump named on the command line, or act as a
# stdin -> stdout filter when no dump file was given.
if len(args.dumpfiles) > 0:
    # open directory for output
    if args.output_dir:
        output_dir = args.output_dir[0]
    else:
        output_dir = "."

    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        try:
            wikiq = WikiqParser(input_file, output_file,
                                collapse_user=args.collapse_user,
                                persist=persist,
                                urlencode=args.urlencode,
                                namespaces = namespaces)

            wikiq.process()
        finally:
            # close things
            input_file.close()
            # stdout is shared by every dump in this loop; closing it after
            # the first file made the next one fail on a closed stream
            if output_file is not sys.stdout:
                output_file.close()
else:
    # persist_legacy is neither a command-line option nor a WikiqParser
    # parameter; passing it made this branch fail before reading any input
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces = namespaces)
    wikiq.process()
447
448 # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
449 # stop_words = stop_words.split(",")

Community Data Science Collective || Want to submit a patch?