3 # Obtain user roles data from the Wikia logevents api
4 # Copyright (C) 2018 Nathan TeBlunthuis
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <https://www.gnu.org/licenses/>.
24 from json.decoder import JSONDecodeError
25 from scraper_utils import prepare_output, read_wikilist, add_parser_arguments
def write_logevents(logevents, out):
    """Write one tab-separated row per log event to *out*.

    Columns, in order: comment, logid, ns, pageid, timestamp, title,
    type, user, ancient, rights-new, rights-old.

    The ``ancient`` column is ``'false'`` when the event has a ``'rights'``
    entry (whose ``'new'`` and ``'old'`` values fill the last two columns)
    and ``'true'`` otherwise, in which case the rights columns are empty.

    Args:
        logevents: iterable of log event mappings.
        out: any object with a ``write(str)`` method.

    Raises:
        KeyError: if an event that is not skipped lacks one of the
            fields listed above.
    """
    hidden_markers = ('userhidden', 'actionhidden', 'commenthidden')

    for logevent in logevents:
        # if there is hidden information, we skip this one because there
        # is nothing to report
        if any(marker in logevent for marker in hidden_markers):
            continue

        # Field order must match the header row of the output file.
        le_output = [logevent['comment'],
                     str(logevent['logid']),
                     str(logevent['ns']),
                     str(logevent['pageid']),
                     logevent['timestamp'],
                     logevent['title'],
                     logevent['type'],
                     str(logevent['user'])]

        if "rights" in logevent:
            le_output.extend(['false',
                              logevent['rights']['new'],
                              logevent['rights']['old']])
        else:
            le_output.extend(['true', '', ''])

        out.write("\t".join(le_output) + "\n")
def write_blockevents(logevents, out):
    """Write one tab-separated row per block/unblock log event to *out*.

    The row layout is the same as the one produced by ``write_logevents``:
    comment, logid, ns, pageid, timestamp, title, type, user, ancient,
    rights-new, rights-old.

    The ``ancient`` column is ``'false'`` when the event has a ``'rights'``
    entry (whose ``'new'`` and ``'old'`` values fill the last two columns)
    and ``'true'`` otherwise, in which case the rights columns are empty.

    Args:
        logevents: iterable of log event mappings.
        out: any object with a ``write(str)`` method.

    Raises:
        KeyError: if an event that is not skipped lacks one of the
            fields listed above.
    """
    for logevent in logevents:
        # if there is hidden information, we skip this one because there
        # is nothing to report
        if ('userhidden' in logevent
                or 'actionhidden' in logevent
                or 'commenthidden' in logevent):
            continue

        # Field order must match the header row of the output file.
        le_output = [logevent['comment'],
                     str(logevent['logid']),
                     str(logevent['ns']),
                     str(logevent['pageid']),
                     logevent['timestamp'],
                     logevent['title'],
                     logevent['type'],
                     str(logevent['user'])]

        if "rights" in logevent:
            rights = logevent['rights']
            le_output.extend(['false', rights['new'], rights['old']])
        else:
            le_output.extend(['true', '', ''])

        out.write("\t".join(le_output) + "\n")
# Fetch the log events of a single wiki from its api.php endpoint and write
# them as tab-separated rows to "<output_dir>/<wikiname>.tsv". When
# blocks_output is given, block/unblock events go to a second file,
# "<blocks_output>/<wikiname>.tsv", instead.
#   wikiname      -- used for the output file name and in error messages
#   url           -- base URL of the wiki; the API path is appended to it
#   output_dir    -- directory for the main TSV file
#   blocks_output -- directory for the block-event TSV file, or None
#   wikitype      -- "wikia" selects the Wikia URL layout; any other value is
#                    treated as Wikipedia
89 def get_events_for_wiki(wikiname, url, output_dir, blocks_output=None, wikitype="wikia"):
90 out = open("{0}/{1}.tsv".format(output_dir, wikiname), "w")
# Header row; the row writers must emit their fields in this column order.
92 out.write("\t".join(['comment', 'logid', 'ns', 'pageid', 'timestamp',
93 'title', 'type', 'user', 'ancient', 'rights-new',
# Wikia serves the API at <url>/api.php, other wikis at <url>/w/api.php.
96 if wikitype == "wikia":
97 api_url = url + '/api.php'
98 else: # wikitype == wikipedia
99 api_url = url + "/w/api.php"
# With a blocks directory, block and unblock log types are requested in
# addition to rights, and a second TSV with the same header is opened.
103 if blocks_output is not None:
104 letype = 'rights|block|unblock'
105 blockout = open("{0}/{1}.tsv".format(blocks_output, wikiname), "w")
106 blockout.write("\t".join(['comment', 'logid', 'ns', 'pageid', 'timestamp',
107 'title', 'type', 'user', 'ancient', 'rights-new',
110 query = {'action': 'query',
118 response = requests.get(api_url, params=query)
# The response body was not valid JSON. The URL the response finally came
# from is compared against the central Wikia / Wikipedia host patterns; a
# match is reported as the wiki no longer existing.
121 except (JSONDecodeError):
122 api_url = response.url
123 # print api_url # debug
# NOTE(review): the dots in both patterns are unescaped, so each one matches
# any character, and the Wikia pattern only accepts http:// -- confirm this
# is intended.
124 if wikitype == "wikia":
125 re_str = "^http://(community|www).wikia.com/"
126 else: # wikitype == "wikipedia"
127 re_str = "^(http|https)://.*wikipedia.org/"
129 if re.match(re_str, api_url):
131 # 'http://community.wikia.com/wiki/Community_Central:Not_a_valid_Wikia':
132 print("ERROR: %s no longer exists" % wikiname)
# Retry the same query against the URL taken from the first response.
135 response = requests.get(api_url, params=query)
139 logevents = rv['query']['logevents']
# Separate block/unblock events (recognised by either their 'action' or their
# 'type') from the remaining events.
141 blockevents = [e for e in logevents
142 if (e['action'] in ['block', 'unblock'])
143 or (e['type'] in ['block', 'unblock'])]
# NOTE(review): `e not in blockevents` scans the list for every event, so
# this filter is quadratic in the page size.
145 logevents = [e for e in logevents if e not in blockevents]
147 write_logevents(logevents, out)
149 write_blockevents(blockevents, blockout)
152 print("ERROR: %s contains no logevent data" % wikiname)
# Pagination: the API signals further results with either a 'query-continue'
# or a 'continue' key. The continuation values are copied into the query and
# the next page is requested until neither key is present.
155 while 'query-continue' in rv or 'continue' in rv:
156 if 'query-continue' in rv:
157 query['lestart'] = rv['query-continue']['logevents']['lestart']
# NOTE(review): rv['continue'] is indexed with 'lecontinue' on the following
# line, so it is a mapping; str() sends its repr as the 'continue' parameter
# -- TODO confirm that rv['continue']['continue'] was not meant.
159 query['continue'] = str(rv['continue'])
160 query['lecontinue'] = str(rv['continue']['lecontinue'])
162 response = requests.get(api_url, params=query)
# NOTE(review): events from continuation pages are handed to write_logevents
# as-is; the block/unblock split applied to the first page does not appear
# here -- confirm block events are not meant to go to blockout as well.
164 logevents = rv['query']['logevents']
165 write_logevents(logevents, out)
171 # %run userroles_from_logevents.py --sep=\\t --nuke-old --blocks-output=/com/projects/messagewalls/userroles/blockevents ../identifyWikis/wikiteamWikilist.tsv /com/projects/messagewalls/userroles/listusers
172 if __name__ == '__main__':
173 parser = argparse.ArgumentParser(
174 description="Get user roles for Wikis from the Mediawiki list users API")
176 parser = add_parser_arguments(parser)
178 parser.add_argument('--blocks-output',
180 help='Path to output block event logs. If empty, blocks are ignored.'
183 args = parser.parse_args()
184 output_path = args.output
185 blocks_output = args.blocks_output
186 header = not args.no_header
188 prepare_output(output_path, args.nuke_old)
190 if blocks_output is not None:
191 prepare_output(blocks_output, args.nuke_old)
193 wikilist = read_wikilist(args)
197 files = [os.path.join(output_path, i) for i in os.listdir(output_path)]
199 # iterate through the list of wikis
200 # for line in ["anime,http://anime.wikia.com/"]:
201 # for line in ["blogging,http://blogging.wikia.com/"]:
202 wikilist = read_wikilist(args)
204 # for line in open("list_of_wikis.csv", "r").readlines():
206 for wiki, url, wikitype in wikilist:
207 if "{0}.{1}".format(path.join(output_path, wiki), 'tsv') in files:
208 print("SKIPPING: file \"%s\" already exists)" % wiki)
212 print("SKIPPING: file \"%s\" already exists)" % wiki)
215 print("Processing wiki: %s" % wiki)
219 output_dir=output_path,
220 blocks_output=blocks_output,