3 # Obtain user roles data from the Wikia logevents api
4 # Copyright (C) 2018 Nathan TeBlunthuis
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
17 from json.decoder import JSONDecodeError
18 from scraper_utils import prepare_output, read_wikilist, add_parser_arguments
def write_logevents(logevents, out):
    """Write one tab-separated row per non-hidden log event to *out*.

    Row layout, in order: comment, logid, ns, pageid, timestamp, title,
    type, user, ancient, rights-new, rights-old.  This is the column order
    of the TSV header written by get_events_for_wiki.

    Args:
        logevents: iterable of log-event dicts.  Each non-hidden event must
            provide every key listed above up to ``user``; a ``rights``
            entry, when present, must provide ``new`` and ``old``.
        out: writable text file-like object.

    Raises:
        KeyError: if a non-hidden event lacks one of the required keys.
    """
    hidden_flags = ('userhidden', 'actionhidden', 'commenthidden')

    for logevent in logevents:
        # if there is hidden information, we skip this one because there
        # is nothing to report
        if any(flag in logevent for flag in hidden_flags):
            continue

        # NOTE(review): the ns/title/type entries were restored from the
        # header columns (the listing this was rebuilt from had dropped
        # them) -- confirm against the upstream file.
        le_output = [logevent['comment'],
                     str(logevent['logid']),
                     str(logevent['ns']),
                     str(logevent['pageid']),
                     logevent['timestamp'],
                     str(logevent['title']),
                     str(logevent['type']),
                     str(logevent['user'])]

        if "rights" in logevent:
            # rights data present: ancient = 'false', then new and old rights
            le_output.extend(['false',
                              logevent['rights']['new'],
                              logevent['rights']['old']])
        else:
            # no rights data on the event: ancient = 'true', rights left empty
            le_output.extend(['true', '', ''])

        out.write("\t".join(le_output) + "\n")
def write_blockevents(logevents, out):
    """Write one tab-separated row per non-hidden block/unblock event to *out*.

    Row layout, in order: comment, logid, ns, pageid, timestamp, title,
    type, user, ancient, rights-new, rights-old.  This is the column order
    of the block-events TSV header written by get_events_for_wiki, and the
    same layout write_logevents produces.

    Args:
        logevents: iterable of log-event dicts.  Each non-hidden event must
            provide every key listed above up to ``user``; a ``rights``
            entry, when present, must provide ``new`` and ``old``.
        out: writable text file-like object.

    Raises:
        KeyError: if a non-hidden event lacks one of the required keys.
    """
    hidden_flags = ('userhidden', 'actionhidden', 'commenthidden')

    for logevent in logevents:
        # if there is hidden information, we skip this one because there
        # is nothing to report
        if any(flag in logevent for flag in hidden_flags):
            continue

        # NOTE(review): the ns/title/type entries were restored from the
        # header columns (the listing this was rebuilt from had dropped
        # them) -- confirm against the upstream file.
        le_output = [logevent['comment'],
                     str(logevent['logid']),
                     str(logevent['ns']),
                     str(logevent['pageid']),
                     logevent['timestamp'],
                     str(logevent['title']),
                     str(logevent['type']),
                     str(logevent['user'])]

        if "rights" in logevent:
            # rights data present: ancient = 'false', then new and old rights
            le_output.extend(['false',
                              logevent['rights']['new'],
                              logevent['rights']['old']])
        else:
            # no rights data on the event: ancient = 'true', rights left empty
            le_output.extend(['true', '', ''])

        out.write("\t".join(le_output) + "\n")
82 def get_events_for_wiki(wikiname, url, output_dir, blocks_output=None, wikitype="wikia"):
# Query one wiki's api.php for log events and write them as TSV to
# "<output_dir>/<wikiname>.tsv".  When blocks_output is given, a second TSV
# is opened at "<blocks_output>/<wikiname>.tsv" and receives the
# block/unblock events.
# NOTE(review): this listing carries stray leading line numbers and several
# statements are not visible (the numbering jumps); the comments below
# describe only the lines that are shown.
83 out = open("{0}/{1}.tsv".format(output_dir, wikiname), "w")
# NOTE(review): no close() or `with` for this handle is visible here --
# confirm it is closed on every exit path.
85 out.write("\t".join(['comment', 'logid', 'ns', 'pageid', 'timestamp',
86 'title', 'type', 'user', 'ancient', 'rights-new',
# The API endpoint is "<url>/api.php" for wikitype "wikia" and
# "<url>/w/api.php" for any other wikitype.
89 if wikitype == "wikia":
90 api_url = url + '/api.php'
91 else: # wikitype == wikipedia
92 api_url = url + "/w/api.php"
# With a blocks output directory, block and unblock log types are requested
# in addition to rights, and the block TSV gets the same header columns.
96 if blocks_output is not None:
97 letype = 'rights|block|unblock'
98 blockout = open("{0}/{1}.tsv".format(blocks_output, wikiname), "w")
99 blockout.write("\t".join(['comment', 'logid', 'ns', 'pageid', 'timestamp',
100 'title', 'type', 'user', 'ancient', 'rights-new',
103 query = {'action': 'query',
111 response = requests.get(api_url, params=query)
# JSONDecodeError path: the URL the response actually came from
# (response.url) replaces api_url.  If it matches the hosting site's own
# domain pattern the wiki is reported as no longer existing; the request is
# also re-issued against the new api_url (the branch structure around that
# retry is not visible here).
# NOTE(review): the dots in both patterns are unescaped, so they match any
# character, and the wikia pattern only matches http:// -- confirm intended.
114 except (JSONDecodeError):
115 api_url = response.url
116 # print api_url # debug
117 if wikitype == "wikia":
118 re_str = "^http://(community|www).wikia.com/"
119 else: # wikitype == "wikipedia"
120 re_str = "^(http|https)://.*wikipedia.org/"
122 if re.match(re_str, api_url):
124 # 'http://community.wikia.com/wiki/Community_Central:Not_a_valid_Wikia':
125 print("ERROR: %s no longer exists" % wikiname)
128 response = requests.get(api_url, params=query)
132 logevents = rv['query']['logevents']
# Split the first page: events whose 'action' or 'type' is block/unblock go
# to blockevents, everything else stays in logevents.  `e not in blockevents`
# is a linear scan per event.
134 blockevents = [e for e in logevents
135 if (e['action'] in ['block', 'unblock'])
136 or (e['type'] in ['block', 'unblock'])]
138 logevents = [e for e in logevents if e not in blockevents]
140 write_logevents(logevents, out)
# NOTE(review): blockout is only assigned when blocks_output is not None --
# confirm this call is guarded by a line not shown here.
142 write_blockevents(blockevents, blockout)
145 print("ERROR: %s contains no logevent data" % wikiname)
# Pagination: keep requesting while the reply carries either a
# 'query-continue' or a 'continue' entry.
148 while 'query-continue' in rv or 'continue' in rv:
149 if 'query-continue' in rv:
150 query['lestart'] = rv['query-continue']['logevents']['lestart']
# NOTE(review): this sends str() of the whole rv['continue'] dict as the
# 'continue' parameter -- presumably rv['continue']['continue'] was meant;
# verify against the API's continuation documentation.
152 query['continue'] = str(rv['continue'])
153 query['lecontinue'] = str(rv['continue']['lecontinue'])
155 response = requests.get(api_url, params=query)
157 logevents = rv['query']['logevents']
# NOTE(review): continuation pages are written to `out` in full; no
# block/unblock split like the first page's is visible here -- confirm.
158 write_logevents(logevents, out)
164 # %run userroles_from_logevents.py --sep=\\t --nuke-old --blocks-output=/com/projects/messagewalls/userroles/blockevents ../identifyWikis/wikiteamWikilist.tsv /com/projects/messagewalls/userroles/listusers
165 if __name__ == '__main__':
166 parser = argparse.ArgumentParser(
167 description="Get user roles for Wikis from the Mediawiki list users API")
169 parser = add_parser_arguments(parser)
171 parser.add_argument('--blocks-output',
173 help='Path to output block event logs. If empty, blocks are ignored.'
176 args = parser.parse_args()
177 output_path = args.output
178 blocks_output = args.blocks_output
179 header = not args.no_header
181 prepare_output(output_path, args.nuke_old)
183 if blocks_output is not None:
184 prepare_output(blocks_output, args.nuke_old)
186 wikilist = read_wikilist(args)
190 files = [os.path.join(output_path, i) for i in os.listdir(output_path)]
192 # iterate through the list of wikis
193 # for line in ["anime,http://anime.wikia.com/"]:
194 # for line in ["blogging,http://blogging.wikia.com/"]:
195 wikilist = read_wikilist(args)
197 # for line in open("list_of_wikis.csv", "r").readlines():
199 for wiki, url, wikitype in wikilist:
200 if "{0}.{1}".format(path.join(output_path, wiki), 'tsv') in files:
201 print("SKIPPING: file \"%s\" already exists)" % wiki)
205 print("SKIPPING: file \"%s\" already exists)" % wiki)
208 print("Processing wiki: %s" % wiki)
212 output_dir=output_path,
213 blocks_output=blocks_output,