support 7z archives with multiple files. add urlencode parameter
[mediawiki_dump_tools.git] / wikiq
1 #!/usr/bin/env python3
2
3 # original wikiq headers are: title articleid revid date_time anon
4 # editor editor_id minor text_size text_entropy text_md5 reversion
5 # additions_size deletions_size
6
7 import argparse
8 import sys
9 import os, os.path
10 import re
11
12 from subprocess import Popen, PIPE
13 from collections import deque
14 from hashlib import sha1
15
16 from mw.xml_dump import Iterator
17 from mw.lib import persistence
18 from mw.lib import reverts
19 from urllib.parse import quote
# Names of the rev_data fields that print_rev_data() passes through
# urllib.parse.quote() when URL encoding is switched on.
TO_ENCODE = ('title', 'editor')

# Maximum length of the per-page revision window (a deque) used when
# persistence measures are being computed.
PERSISTENCE_RADIUS = 7
22
def calculate_persistence(tokens_added):
    """Summarise how long a batch of added tokens persisted.

    Each element of *tokens_added* must expose a sized ``revisions``
    attribute. Returns a 2-tuple ``(token_revs, num_tokens)`` where
    ``token_revs`` is the sum over all tokens of ``len(revisions) - 1``
    and ``num_tokens`` is ``len(tokens_added)``.
    """
    token_revs = 0
    for token in tokens_added:
        token_revs += len(token.revisions) - 1

    num_tokens = len(tokens_added)
    return (token_revs, num_tokens)
26
class WikiqIterator():
    """Iterate over the pages of a MediaWiki XML dump as WikiqPage objects.

    fh            -- open file handle for the XML dump; handed to
                     Iterator.from_file().
    collapse_user -- forwarded unchanged to every WikiqPage produced.

    The underlying dump iterator is kept on ``self.mwiterator`` so that
    callers can reach dump-level data through it.
    """

    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Iterator.from_file(self.fh)
        # A single generator is shared by __iter__ and __next__, so the dump
        # is consumed exactly once no matter which protocol the caller uses.
        self.__pages = self.load_pages()

    def load_pages(self):
        """Lazily wrap each page from the dump iterator in a WikiqPage."""
        for page in self.mwiterator:
            yield WikiqPage(page, collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        # The attribute is name-mangled (two leading underscores); the
        # previous spelling ``self._pages`` never existed and made
        # next(iterator) fail with AttributeError.
        return next(self.__pages)
43
class WikiqPage():
    """Wrap one page of a dump and iterate over the revisions to report.

    page          -- page object from the dump iterator; its id, title,
                     namespace, redirect and restrictions attributes are
                     copied onto this object and the page itself is kept as
                     ``mwpage``.
    collapse_user -- when true, each run of consecutive revisions whose
                     ``contributor.user_text`` is equal is reduced to the
                     last revision of the run, and that revision gets a
                     ``collapsed_revs`` attribute holding the run length.
    """

    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, collapse_user=False):
        self.id = page.id
        self.title = page.title
        self.namespace = page.namespace
        self.redirect = page.redirect
        self.restrictions = page.restrictions

        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        """Generate the page's revisions, one step behind the input.

        Every revision is held back until its successor has been seen so
        that, with collapse_user=True, only the last edit of a same-user
        run is emitted:

        iteration   rev.user   prev_rev.user   yield prev_rev?
                0          A            None           Never
                1          A               A           False
                2          B               A            True
                3          A               B            True
                4          A               A           False
        Post-loop                          A          Always

        A page without any revisions yields nothing (previously the
        unconditional post-loop yield raised UnboundLocalError).
        """
        prev_rev = None
        collapsed_revs = 0

        for rev in self.mwpage:
            if self.collapse_user:
                # The first revision has no predecessor, so it always starts
                # a new run and nothing is yielded for it yet.
                continues_run = (
                    prev_rev is not None
                    and rev.contributor.user_text == prev_rev.contributor.user_text
                )
                if continues_run:
                    collapsed_revs += 1
                else:
                    # prev_rev was the last edit in its user's run
                    if prev_rev is not None:
                        yield prev_rev
                    collapsed_revs = 1
                rev.collapsed_revs = collapsed_revs
            elif prev_rev is not None:
                # without collapsing, every revision is reported
                yield prev_rev

            prev_rev = rev

        # the final revision is never followed by another, so flush it here
        if prev_rev is not None:
            yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
100
class WikiqParser():
    """Convert a MediaWiki XML dump into tab-separated revision rows.

    input_file    -- open handle on the XML dump.
    output_file   -- writable handle that receives the TSV output.
    collapse_user -- passed to WikiqIterator; also adds a
                     ``collapsed_revs`` column to every row.
    persist       -- when true, token persistence columns (token_revs,
                     tokens_added, tokens_removed, tokens_window) are
                     computed with a window of PERSISTENCE_RADIUS revisions.
    urlencode     -- when true, the fields named in TO_ENCODE are passed
                     through urllib.parse.quote() before printing.
    """

    def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False):

        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.printed_header = False
        # filled in by process() as a {namespace name: namespace id} mapping
        self.namespaces = {}
        self.urlencode = urlencode

    def __get_namespace_from_title(self, title):
        """Return the id of the namespace whose "<name>:" prefixes *title*.

        The entry keyed by None (if any) is used as the default and is
        returned when no named namespace matches; without such an entry the
        default is None.
        """
        default_ns = None

        for ns in self.namespaces:
            # the unnamed namespace cannot be matched by prefix; remember it
            # as the fallback instead
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

    def process(self):
        """Read the whole dump and print one row per (possibly collapsed) revision.

        A summary with the number of revisions and pages handled is written
        to stderr at the end.
        """

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.namespaces}

        page_count = 0
        rev_count = 0
        # Iterate through pages
        for page in dump:
            if self.persist:
                # persistence state and the look-ahead window are per page
                state = persistence.State()
                window = deque(maxlen=PERSISTENCE_RADIUS)

            rev_detector = reverts.Detector()

            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.contributor.id is None else rev.contributor.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : page.namespace if page.namespace else self.__get_namespace_from_title(page.title),
                            'deleted' : "TRUE" if rev.text.deleted else "FALSE" }

                # if revisions are deleted, /many/ things will be missing
                if rev.text.deleted:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # if text exists, we'll check for a sha1 and generate one otherwise
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)
                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""

                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if rev.contributor.user_text:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.contributor.user_text + '"'
                    rev_data['anon'] = "TRUE" if rev.contributor.id is None else "FALSE"

                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""

                #TODO missing: redirect detection, additions_size deletions_size

                # if collapse user was on, lets run that
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs

                if self.persist:
                    if rev.text.deleted:
                        # No text means no persistence measures. Fill the
                        # columns on THIS row and print it straight away: it
                        # never enters the window, so nothing else would
                        # print it. (The earlier code wrote to old_rev_data,
                        # which is unbound until the window first fills and
                        # otherwise refers to an already-printed row.)
                        # NOTE(review): the row is emitted ahead of any
                        # earlier revisions still waiting in the window.
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                        self.print_rev_data(rev_data)
                    else:
                        _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        # once the window is full its oldest entry has been
                        # followed by RADIUS-1 revisions and can be reported
                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1

            if self.persist:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    # fewer revisions follow the entries near the end of the page
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)

    def print_rev_data(self, rev_data):
        """Write *rev_data* as one tab-separated row, columns sorted by key.

        The first call also writes a header row made of the sorted keys.
        With urlencode enabled, the TO_ENCODE fields of *rev_data* are
        replaced in place by their URL-quoted form first.
        """
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        columns = sorted(rev_data.keys())

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in columns]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(rev_data[k]) for k in columns]), file=self.output_file)
272
273
def open_input_file(input_filename):
    """Open a dump for reading, decompressing it on the fly when needed.

    The decompressor is chosen from the filename suffix:

      .7z   -> ``7za x -so <file> *.xml``
      .gz   -> ``zcat <file>``
      .bz2  -> ``bzcat <file>``

    and the stdout pipe of that subprocess is returned. Any other name is
    opened directly as a text file. A missing decompressor binary surfaces
    as the OSError raised by Popen.
    """
    # Test the suffix only: a name such as "dump.7z.xml" is a plain file.
    if input_filename.endswith('.7z'):
        cmd = ["7za", "x", "-so", input_filename, '*.xml']
    elif input_filename.endswith('.gz'):
        cmd = ["zcat", input_filename]
    elif input_filename.endswith('.bz2'):
        # zcat cannot read bzip2 streams; bzcat is the matching tool
        cmd = ["bzcat", input_filename]
    else:
        return open(input_filename, 'r')

    return Popen(cmd, stdout=PIPE).stdout
288
def open_output_file(input_filename):
    """Open, for writing, the .tsv file that corresponds to *input_filename*.

    The output name is derived by dropping a trailing ``.7z``/``.gz``/
    ``.bz2`` (or a bare trailing dot), removing every ``.xml`` occurrence,
    and appending ``.tsv``.
    """
    stem = re.sub(r'\.xml', '', re.sub(r'\.(7z|gz|bz2)?$', '', input_filename))
    return open(stem + ".tsv", "w")
297
# Command-line interface: parse the options, then run one WikiqParser per
# dump file (or a single one over stdin/stdout when no file is given).
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str, 
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

args = parser.parse_args()

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open file for output
        if args.stdout:
            output_file = sys.stdout
        else:
            if args.output_dir:
                output_dir = args.output_dir[0]
            else:
                output_dir = "."

            # Keep the output path in its own variable so the loop variable
            # still names the dump being read when it is reported below.
            output_path = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(output_path)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=args.persist,
                            urlencode=args.urlencode)

        print("Processing file: %s" % filename, file=sys.stderr)

        try:
            wikiq.process()
        finally:
            # close things, but never sys.stdout: with --stdout it is shared
            # by every dump in the list and closing it after the first one
            # makes all later writes fail
            input_file.close()
            if output_file is not sys.stdout:
                output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=args.persist,
                        urlencode=args.urlencode)
    wikiq.process()
355
356 # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
357 # stop_words = stop_words.split(",")

Community Data Science Collective || Want to submit a patch?