#!/usr/bin/env python3
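"""Scrape user roles from a list of Wikia and MediaWiki wikis.

For each wiki in the input list, query the user-list API (Wikia's
ListUsers ajax endpoint or MediaWiki's allusers API) and write the
users holding privileged roles to a per-wiki TSV file in the output
directory.
"""
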
import argparse
import csv
import json
import os
import re
import time
from json.decoder import JSONDecodeError
from os import path

import requests

from scraper_utils import prepare_output, read_wikilist, add_parser_arguments

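# privileged user groups to request from the API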
roles = ['bot', 'sysop', 'bureaucrat', 'staff', 'rollback',  # 'util',
         'helper', 'vstf', 'checkuser-global', 'bot-global',
         'council', 'authenticated', 'checkuser', 'chatmoderator',
         'adminmentor', 'steward', 'oversight', 'founder', 'rollbacker',
         'researcher']


class ListUserAPI:
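    """Thin client for the two user-list endpoints this script supports:
    Wikia's ListUsers ajax endpoint and the standard MediaWiki api.php.
    """
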
    def __init__(self, url_root, wikitype):
        self.wikitype = wikitype
        if self.wikitype == "wikia":
            self._api_url = url_root + 'index.php?action=ajax&rs=ListusersAjax::axShowUsers'
        else:  # wikitype == "wikipedia"
            self._api_url = url_root + 'api.php'

    def _fetch_http(self, url, params):
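        """GET url; return the body text (Wikia) or the Response (Wikipedia)."""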
        if self.wikitype == "wikia":
            response = requests.get(url=url, params=params,
                                    headers={'Accept-encoding': 'gzip'})
            return response.text
        else:  # wikitype == "wikipedia"
            response = requests.get(url=url, params=params)
            return response

    def call(self, params):
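        """Issue one request against the API and return the parsed JSON."""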
        response = self._fetch_http(self._api_url, params)
        if self.wikitype == "wikia":
            return json.loads(response)
        else:
            return response.json()

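# Illustrative usage (the URL here is an example, not taken from this
# script's input list):
#   wiki = ListUserAPI('https://en.wikipedia.org/w/', wikitype='wikipedia')
#   rv = wiki.call({'action': 'query', 'list': 'allusers', 'format': 'json'})
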
def write_user_csvfile(output_file, user_list):
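    """Write the header row and one tab-separated row per user."""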
    csvfile = csv.writer(output_file, delimiter='\t',
                         quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

    # construct and output the header
    csvfile.writerow(['username', 'groups',
                      'edits', 'last.logged', 'last.edited'])

    for user in user_list:
        csvfile.writerow(user)


def get_administrators_for_wiki(wikiname, url_root, wikitype="wikia"):
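    """Fetch the privileged users of one wiki and write them to a TSV file.

    Returns "deleted" if the wiki no longer exists and "notauthorized"
    if the user list could not be fetched or parsed.
    """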
    increment_size = 500
    offset = 0

    if wikitype == "wikia":
        query = {'groups': 'bot,sysop,bureaucrat,',
                 'edits': 0,
                 'limit': increment_size,
                 'offset': offset,
                 'numOrder': 1,
                 'order': 'username:asc'}

    else:  # wikitype == "wikipedia"
        query = {'action': 'query',
                 'list': 'allusers',
                 'augroup': "|".join(roles),
                 'auprop': 'groups',
                 'aulimit': 500,
                 'format': 'json'}

    # FIND THE CORRECT URL (there may be redirects)
    if wikitype == "wikia":
        url_root = requests.get(url_root).url
        # deleted wikis redirect to a notice page, e.g.
        # http://community.wikia.com/wiki/Community_Central:Not_a_valid_Wikia
        re_str = r"^http://(community|www)\.wikia\.com/"
        if re.match(re_str, url_root):
            print("ERROR: %s no longer exists" % wikiname)
            return "deleted"

    try:
        wiki = ListUserAPI(url_root, wikitype=wikitype)
        rv = wiki.call(query)
    except (requests.ConnectionError, JSONDecodeError):
        # the caller records this wiki as unreadable
        print("ERROR: cannot read the user list for: %s" % wikiname)
        return "notauthorized"

    output_file = open("{0}/{1}.tsv".format(output_path, wikiname), 'w')
    if wikitype == "wikia":
        raw_userlist = rv['aaData']

        while (rv['iTotalRecords'] + offset) < rv['iTotalDisplayRecords']:
            # increment the offset and make a new query
            offset = offset + increment_size
            query['offset'] = offset
            rv = wiki.call(query)
            raw_userlist.extend(rv['aaData'])
            print("Fetched another page: offset is %s" % offset)

        # strip the HTML markup that the ajax endpoint wraps around each field
        processed_userlist = []
        for row in raw_userlist:
            # the username is wrapped in a link; keep only the link text
            row[0] = re.sub(r'^.*?<a href=.*?>(.*?)<.*$', r'\1', row[0])

            # work around change in wikia api that removed last.logged
            if len(row) < 5:
                row.append(row[3])
                row[3] = None

            # keep only the revision id from the last-edited link
            row[4] = re.sub(r'^.*oldid=(\d+)".*$', r'\1', row[4])
            row[4] = re.sub(r'^-$', r'', row[4])
            processed_userlist.append(row)

        write_user_csvfile(output_file, processed_userlist)
        output_file.close()

    else:
        raw_userlist = rv['query']['allusers']
        outlines = ['\t'.join(["username", "groups"])]
        outlines.extend(['\t'.join([q['name'], ','.join(q['groups'])])
                         for q in raw_userlist])
        while 'continue' in rv:
            # copy the continuation parameters into the next request
            query.update(rv['continue'])
            rv = wiki.call(query)
            raw_userlist = rv['query']['allusers']
            outlines.extend(['\t'.join([q['name'], ','.join(q['groups'])])
                             for q in raw_userlist])
            output_file.write('\n'.join(outlines) + '\n')
            output_file.flush()
            outlines = []

        # write whatever is left (everything, if there was no continuation)
        if outlines:
            output_file.write('\n'.join(outlines) + '\n')
        output_file.close()

# Example invocation (from IPython):
# %run userroles_from_listusers.py --sep=\\t --nuke_old ../identifyWikis/wikiteamWikilist.tsv /com/projects/messagewalls/userroles/listusers

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Get user roles for wikis from the MediaWiki list users API")

    parser = add_parser_arguments(parser)
    args = parser.parse_args()
    output_path = args.output
    header = not args.no_header

    prepare_output(output_path, args.nuke_old)

    wikilist = read_wikilist(args)
    deleted = []
    notauthorized = []

    # output files that already exist, so finished wikis can be skipped
    files = [os.path.join(output_path, i) for i in os.listdir(output_path)]

    for wiki, url, wikitype in wikilist:
        if "{0}.{1}".format(path.join(output_path, wiki), 'tsv') in files:
            print("SKIPPING: file \"%s\" already exists" % wiki)
            continue
        print("Processing wiki: %s" % wiki)

        result = get_administrators_for_wiki(wiki, url, wikitype=wikitype)
        if result == "deleted":
            deleted.append(wiki)
        elif result == "notauthorized":
            notauthorized.append(wiki)
        time.sleep(1)  # rate-limit between wikis

    # record the wikis that could not be scraped
    with open("allusers_error_deleted.txt", 'w') as df:
        df.write('\n'.join(deleted))

    with open("allusers_error_notauthorized.txt", 'w') as na:
        na.write('\n'.join(notauthorized))
