#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size

import argparse
import sys
import os, os.path
import re

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump

from deltas.tokenizers import wikitext_split
import mwpersistence
import mwreverts
from urllib.parse import quote

TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7
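# PERSISTENCE_RADIUS is the size of the sliding window of revisions used when
# computing token persistence: a revision's persistence stats are finalized
# once PERSISTENCE_RADIUS - 1 subsequent revisions have been seen.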
from deltas import SequenceMatcher
from deltas import SegmentMatcher

class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3
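
# These values map onto the -p/--persistence command-line choices below:
# flag absent -> none; bare -p or -p sequence -> sequence;
# -p segment -> segment; -p legacy -> legacy.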

def calculate_persistence(tokens_added):
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))
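# Example: for three added tokens whose .revisions lists have lengths 5, 3,
# and 1, calculate_persistence returns (4 + 2 + 0, 3): the summed number of
# later revisions the added tokens persisted through, and the number of
# tokens added.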

def matchmaker(rev_data, regular_expression, scanner, rev):
    # scanner is the list of locations to search, built from the -rs args:
    # 'comment', 'text', or both
    matches = []
    for location in scanner:
        if location == "comment":
            matching_string = rev.comment
        elif location == "text":
            matching_string = rev.text
        else:
            sys.exit("regex scanner location must be 'comment' or 'text'.")

        # the comment or text can be None (e.g. if deleted); treat as no match
        if matching_string is not None:
            matches.extend(m.group(0) for m in re.finditer(regular_expression, matching_string))

    # a comma-separated list of the matched strings across all requested
    # locations, or None if nothing matched
    rev_data['matches'] = ",".join(matches) if matches else None

    return rev_data
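
# Illustrative example (hypothetical revision): with scanner=['comment'] and
# regular_expression=r'\[\[Category:[^\]]+\]\]', a revision whose comment
# contains two category links yields
# rev_data['matches'] == '[[Category:A]],[[Category:B]]'.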


class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)

class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            self.title = page.title
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs

            else:
                if self.collapse_user:
                    # yield if this is the last edit in a sequence by a user and reset;
                    # also yield if we don't know who the user is

                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs

                    elif rev.user.text != prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final revision
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)

class WikiqParser():

    def __init__(self, input_file, output_file, scanner, match_regex, collapse_user=False, persist=None, urlencode=False, namespaces=None):
        """
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value.
        """

        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode
        self.scanner = scanner
        self.match_regex = match_regex

        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns
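
    # For example, with self.namespaces == {'Talk': 1, 'User': 2} (hypothetical
    # values), __get_namespace_from_title('Talk:Foo') returns 1, while 'Foo'
    # falls back to the default namespace.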


    def process(self):

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                         r'output/wikiq-\1-\2.tsv',
        #                         input_filename)

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)

            # skip namespaces not in the filter
            if self.namespace_filter is not None:
                if namespace not in self.namespace_filter:
                    continue

            rev_detector = mwreverts.Detector()

            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)

                # self.persist == PersistMethod.legacy
                else:
                    from mw.lib import persistence
                    state = persistence.State()

            # Iterate through a page's revisions
            for rev in page:
                # initialize rev_data
                rev_data = {}

                if self.scanner is not None:
                    # search the requested locations ('comment' and/or 'text')
                    # for the regex and record any matches
                    rev_data = matchmaker(rev_data, self.match_regex, self.scanner, rev)

                    # skip revisions where the regex found nothing
                    if rev_data['matches'] is None:
                        continue

                # we fill out the rest of the data structure now
                rev_data['revid'] = rev.id
                rev_data['date_time'] = rev.timestamp.strftime('%Y-%m-%d %H:%M:%S')
                rev_data['articleid'] = page.id
                rev_data['editor_id'] = "" if rev.deleted.user or rev.user.id is None else rev.user.id
                rev_data['title'] = '"' + page.title + '"'
                rev_data['namespace'] = namespace
                rev_data['deleted'] = "TRUE" if rev.deleted.text else "FALSE"

                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # if the dump provides a sha1 we use it; otherwise we generate one
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)

                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse user was on, let's run that
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist != PersistMethod.none:
                    if rev.deleted.text:
                        # mark the persistence fields as missing for deleted text
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                    else:
                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
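
    # Output is TSV with columns sorted by field name, so each (hypothetical)
    # row prints its values for anon, articleid, date_time, ... in the same
    # order as the header printed on the first call.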


def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # no decompression command matched above, so assume the file is
        # uncompressed XML and read it directly
        input_file = open(input_filename, 'r')

    return input_file

def open_output_file(input_filename):
    # strip the compression and .xml extensions and add .tsv
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file
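
# For example, a (hypothetical) input of 'enwiki-20230101-pages.xml.bz2'
# becomes 'enwiki-20230101-pages.tsv'.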

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision within each sequence of consecutive edits made by the same user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for the old behavior used in older research projects. Use -p=segment for an advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                    help="ID number of namespace to include. Can be specified more than once.")

parser.add_argument('-rs', '--regex-scanner', dest="scanner", type=str, action='append',
                    help="Where to search for the regex given by -R/--match: 'comment' and/or 'text'. Can be specified more than once.")

parser.add_argument('-R', '--match', dest="match_regex", type=str,
                    help="The regular expression to search for. Matched strings are reported in the 'matches' output column.")

args = parser.parse_args()

# set persistence method
if args.persist is None:
    persist = PersistMethod.none
elif args.persist == "segment":
    persist = PersistMethod.segment
elif args.persist == "legacy":
    persist = PersistMethod.legacy
else:
    persist = PersistMethod.sequence

if args.namespace_filter is not None:
    namespaces = args.namespace_filter
else:
    namespaces = None

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            match_regex=args.match_regex,
                            scanner=args.scanner)

        wikiq.process()

        # close things
        input_file.close()
        output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=persist,
                        urlencode=args.urlencode,
                        namespaces=namespaces,
                        match_regex=args.match_regex,
                        scanner=args.scanner)
    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
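
# Illustrative invocations (filenames hypothetical):
#   wikiq dump.xml.bz2 -o output/ -n 0 -n 1
#     writes revisions from namespaces 0 and 1 to output/dump.tsv
#   wikiq dump.xml.bz2 -s -R '\[\[Category:[^\]]+\]\]' -rs comment -rs text
#     writes to stdout only revisions whose comment or text matches the regex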
