3 # Copyright (C) 2018 Nathan TeBlunthuis
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation, either version 3 of the License, or
8 # (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program. If not, see <https://www.gnu.org/licenses/>.
20 from importlib import reload
28 from pprint import pprint
29 from itertools import islice
# Privilege groups whose members we collect, for both Wikia and Wikipedia
# (used verbatim in the 'augroup' API parameter via "|".join(roles)).
# 'util' is intentionally excluded.
# Fixed: 'checkuser' was listed twice, which bloated the API query string.
roles = ['bot', 'sysop', 'bureaucrat', 'staff', 'rollback',  # 'util',
         'helper', 'vstf', 'checkuser-global', 'bot-global',
         'council', 'authenticated', 'checkuser', 'chatmoderator',
         'adminmentor', 'steward', 'oversight', 'founder', 'rollbacker',
         'researcher']

# directory where the per-wiki .tsv output files are written
output_path = "userlist-2017/"
38 def __init__(self, url_root,wikitype):
39 self.wikitype = wikitype
40 if self.wikitype=="wikia":
41 self._api_url = url_root + 'index.php?action=ajax&rs=ListusersAjax::axShowUsers'
42 else: # wikitype == "wikipedia"
43 self._api_url = url_root + 'api.php'
45 def _fetch_http(self, url, params):
46 if self.wikitype == "wikia":
47 response = requests.get(url=url, params=params,headers={'Accept-encoding':'gzip'})
49 else: #wikitype == "wikipedia"
50 response = requests.get(url=url, params=params)
53 def call(self, params):
54 response = self._fetch_http(self._api_url, params)
55 if self.wikitype == "wikia":
56 return json.loads(response)
58 return response.json()
def write_user_csvfile(output_file, user_list):
    """Write `user_list` rows to `output_file` as tab-separated values.

    Non-numeric fields are double-quoted (csv.QUOTE_NONNUMERIC); a fixed
    header row is emitted first.
    """
    writer = csv.writer(output_file, delimiter='\t', quotechar='"',
                        quoting=csv.QUOTE_NONNUMERIC)
    # header first, then one line per user
    writer.writerow(['username', 'groups',
                     'edits', 'last.logged', 'last.edited'])
    writer.writerows(user_list)
def get_administrators_for_wiki(wikiname, url_root, wikitype="wikia"):
    """Download the privileged-user list for one wiki and write it to
    output_path/<wikiname>.tsv.

    Returns "notauthorized" on connection failure.

    NOTE(review): several lines of this function appear to be elided in
    this chunk — no visible `try:` before the `except`, no initialisation
    of `offset`/`increment_size`, `notauthorized` is used but not defined
    here, the wikia query dict references `increment_size`, and the
    wikipedia query dict and the `else:` introducing the wikipedia output
    branch are cut short.  Confirm against the full file before editing.
    """
    if wikitype == "wikia":
        # Wikia's ListusersAjax endpoint: filter by group, page by
        # offset/limit, order by username.
        query = {'groups' :'bot,sysop,bureaucrat,',
                 'limit' : increment_size,
                 'order' : 'username:asc' }
    else: # wikitype == "wikipedia"
        # MediaWiki list=allusers query restricted to the roles of interest.
        query = {'action': 'query',
                 'augroup' : "|".join(roles),

    ## FIND THE CORRECT URL (there may be redirects)
    url_root = requests.get(url_root).url
    re_str = "^http://(community|www).wikia.com/"
    if re.match(re_str, url_root):
        # Wikia redirects requests for dead wikis to the community/www
        # central pages, so landing there means the wiki is gone.
        print("ERROR: %s no longer exists" % wikiname)

        wiki = ListUserAPI(url_root,wikitype=wikitype)
        rv = wiki.call(query)
    except requests.ConnectionError as e:
        print("ERROR: cannot read the event log: %s" % wikiname)
        notauthorized.append(wikiname)
        return "notauthorized"

    if wikitype == "wikia":
        raw_userlist = rv['aaData']

        # keep paging until every display record has been fetched
        while (rv['iTotalRecords'] + offset) < rv['iTotalDisplayRecords']:
            # increment the offset and make a new query
            offset = offset + increment_size
            query['offset'] = offset
            rv = wiki.call(query)
            raw_userlist.extend(rv['aaData'])
            print("Another one: offset is %s" % offset)

        # go through and edit the html output of the json
        processed_userlist = []
        for row in raw_userlist:
            # strip the "<a href=...>name</a>" markup down to the bare username
            row[0] = re.sub(r'^.*?<a href=.*?>(.*?)<.*$', r'\1', row[0])
            # reduce the "last edited" cell to the bare oldid number
            row[4] = re.sub(r'^.*oldid=(\d+)".*$', r'\1', row[4])
            # a lone "-" means the user never edited; blank it out
            row[4] = re.sub(r'^\-$', r'', row[4])
            processed_userlist.append(row)

        output_file = open("{0}/{1}.tsv".format(output_path, wikiname),'w')
        write_user_csvfile(output_file, processed_userlist)

        # wikipedia output branch (its `else:` line is not visible in this
        # chunk): write a simple username/groups TSV by hand.
        output_file = open("{0}/{1}.tsv".format(output_path, wikiname),'w')
        raw_userlist = rv['query']['allusers']
        outlines = ['\t'.join(["username","groups"])]
        outlines.extend(['\t'.join([q['name'],','.join(q['groups'])]) for q in raw_userlist])
        output_file.write('\n'.join(outlines))

        # follow MediaWiki continuation tokens until the listing is exhausted
        while 'continue' in rv:
            query['continue'] = str(rv['continue'])
            query['aufrom']= str(rv['continue']['aufrom'])
            rv = wiki.call(query)
            raw_userlist = rv['query']['allusers']
            outlines.extend(['\t'.join([q['name'],','.join(q['groups'])]) for q in raw_userlist])
            output_file.write('\n'.join(outlines))
# open and then send data to the output data file

# read in the list of files already present so we can skip wikis that
# were processed on an earlier run
files = [os.path.join(output_path, i) for i in os.listdir(output_path)]

# iterate through the list of files
# for line in open("list_of_wikis.csv", "r").readlines():
# next line useful for working with a reduced list:
# NOTE(review): rows are (wiki, url) pairs; islice(..., 1, None) skips the
# CSV header row.  The for-loop header that binds `wiki`/`url`, and the
# initialisation of `deleted`/`notauthorized`, appear to be elided from
# this chunk — confirm against the full file.
d = [(line.split(",")[0], line.split(",")[1]) for line in islice(open("../wikis.needing.userroles.csv"),1,None)]

    # skip wikis whose output TSV already exists from a previous run
    # NOTE(review): the message string has a stray ')' — "already exists)"
    if os.path.join(output_path,wiki+".tsv") in files:
        print("SKIPPING: file \"%s\" already exists)" % wiki)

    print("Processing wiki: %s" % wiki)
    # choose the API flavor from the wiki's URL
    if "wikipedia.org" in url:
        wikitype = "wikipedia"

    if "wikia.com" in url:

    result = get_administrators_for_wiki(wiki, url, wikitype=wikitype)
    if result == "deleted":
    elif result == "notauthorized":
        notauthorized.append(wiki)

# record the failures so they can be inspected or retried later
df = open("allusers_WP_error_deleted.txt",'w')
df.write('\n'.join(deleted))

na = open("allusers_WP_error_notauthorized.txt",'w')
na.write('\n'.join(notauthorized))