]> code.communitydata.science - rises_declines_wikia_code.git/blob - userroles_scraper_scripts/userroles_from_listusers.py
add copy of the GPL
[rises_declines_wikia_code.git] / userroles_scraper_scripts / userroles_from_listusers.py
1 #!/usr/bin/env python3
2
3 # Copyright (C) 2018  Nathan TeBlunthuis
4
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation, either version 3 of the License, or
8 # (at your option) any later version.
9
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 # GNU General Public License for more details.
14
15 # You should have received a copy of the GNU General Public License
16 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
17
import time, re, os
import sys
from importlib import reload
# NOTE(review): `reload(sys)` is a Python 2 idiom (used before
# sys.setdefaultencoding); it has no useful effect under Python 3 and
# looks safe to drop along with the `reload` import -- confirm and remove.
reload(sys)
22
23 import urllib
24 import requests
25 import json
26 import gzip
27
28 from pprint import pprint
29 from itertools import islice
30 import csv
31
# User-rights groups requested from the MediaWiki allusers API.
# Fix: the original list contained 'checkuser' twice; the duplicate was
# redundant in the API's augroup parameter, so the list is de-duplicated.
roles = ['bot', 'sysop', 'bureaucrat', 'staff', 'rollback',  # 'util',
         'helper', 'vstf', 'checkuser-global', 'bot-global',
         'council', 'authenticated', 'checkuser', 'chatmoderator',
         'adminmentor', 'steward', 'oversight', 'founder', 'rollbacker',
         'researcher']

# Directory where one TSV file of user roles is written per wiki.
output_path = "userlist-2017/"
class ListUserAPI():
    """Tiny HTTP client for listing a wiki's users.

    Depending on ``wikitype`` ("wikia", or anything else which is treated
    as "wikipedia"), requests go either to Wikia's ListusersAjax endpoint
    or to the standard MediaWiki ``api.php``.
    """

    def __init__(self, url_root, wikitype):
        self.wikitype = wikitype
        if wikitype == "wikia":
            suffix = 'index.php?action=ajax&rs=ListusersAjax::axShowUsers'
        else:  # wikitype == "wikipedia"
            suffix = 'api.php'
        self._api_url = url_root + suffix

    def _fetch_http(self, url, params):
        # Wikia's ajax endpoint is fetched with an explicit gzip header and
        # returns the decoded body text; for Wikipedia the Response object
        # itself is returned so .json() can be called on it later.
        if self.wikitype == "wikia":
            reply = requests.get(url=url, params=params,
                                 headers={'Accept-encoding': 'gzip'})
            return reply.text
        return requests.get(url=url, params=params)

    def call(self, params):
        """Issue one API request and return the decoded JSON payload."""
        reply = self._fetch_http(self._api_url, params)
        if self.wikitype == "wikia":
            return json.loads(reply)
        return reply.json()
59
60
def write_user_csvfile(output_file, user_list):
    """Write ``user_list`` to ``output_file`` as a quoted, tab-separated table.

    A header row is emitted first; non-numeric fields are quoted
    (csv.QUOTE_NONNUMERIC) so the files load cleanly downstream.
    """
    writer = csv.writer(output_file, delimiter='\t',
                        quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

    # Header, then one row per user.
    writer.writerow(['username', 'groups',
                     'edits', 'last.logged', 'last.edited'])
    writer.writerows(user_list)
71         
72
def get_administrators_for_wiki(wikiname, url_root, wikitype="wikia"):
    """Fetch the users holding privileged roles on one wiki, write a TSV file.

    Writes ``{output_path}/{wikiname}.tsv``. Returns "deleted" if the wiki
    redirects to Wikia's central site (i.e. it no longer exists),
    "notauthorized" if the HTTP request failed, and None on success.
    The caller is responsible for recording failures; this function no
    longer appends to the module-level ``notauthorized`` list itself
    (previously the wiki was appended both here and by the caller,
    double-counting it).
    """
    increment_size = 500
    offset = 0

    if wikitype == "wikia":
        query = {'groups': 'bot,sysop,bureaucrat,',
                 'edits': 0,
                 'limit': increment_size,
                 'offset': offset,
                 'numOrder': 1,
                 'order': 'username:asc'}
    else:  # wikitype == "wikipedia"
        query = {'action': 'query',
                 'list': 'allusers',
                 'augroup': "|".join(roles),
                 'auprop': 'groups',
                 'aulimit': 500,
                 'format': 'json'}

    # Find the correct URL (there may be redirects). Wikia redirects dead
    # wikis to community/www.wikia.com, which we treat as deleted.
    if wikitype == "wikia":
        url_root = requests.get(url_root).url
        re_str = "^http://(community|www).wikia.com/"
        if re.match(re_str, url_root):
            print("ERROR: %s no longer exists" % wikiname)
            return "deleted"

    try:
        wiki = ListUserAPI(url_root, wikitype=wikitype)
        rv = wiki.call(query)
    except requests.ConnectionError:
        print("ERROR: could not fetch user list for: %s" % wikiname)
        return "notauthorized"

    if wikitype == "wikia":
        raw_userlist = rv['aaData']

        # NOTE(review): pagination condition kept from the original; it
        # assumes iTotalRecords is the size of the current page -- confirm
        # against the Wikia ListusersAjax response format.
        while (rv['iTotalRecords'] + offset) < rv['iTotalDisplayRecords']:
            # increment the offset and make a new query
            offset = offset + increment_size
            query['offset'] = offset
            rv = wiki.call(query)
            raw_userlist.extend(rv['aaData'])
            print("Another one: offset is %s" % offset)

        # Strip the HTML markup Wikia wraps around each field.
        processed_userlist = []
        for row in raw_userlist:
            row[0] = re.sub(r'^.*?<a href=.*?>(.*?)<.*$', r'\1', row[0])
            # Extract the oldid of the last edit; a bare '-' means never edited.
            row[4] = re.sub(r'^.*oldid=(\d+)".*$', r'\1', row[4])
            row[4] = re.sub(r'^\-$', r'', row[4])
            processed_userlist.append(row)

        with open("{0}/{1}.tsv".format(output_path, wikiname), 'w') as output_file:
            write_user_csvfile(output_file, processed_userlist)

    else:
        with open("{0}/{1}.tsv".format(output_path, wikiname), 'w') as output_file:
            raw_userlist = rv['query']['allusers']
            outlines = ['\t'.join(["username", "groups"])]
            outlines.extend(['\t'.join([q['name'], ','.join(q['groups'])])
                             for q in raw_userlist])
            # Fix: terminate each chunk with '\n' -- the original wrote
            # chunks back-to-back, fusing the last line of one chunk with
            # the first line of the next.
            output_file.write('\n'.join(outlines) + '\n')

            # Follow API continuation until the full user list is exhausted.
            while 'continue' in rv:
                query['continue'] = str(rv['continue'])
                query['aufrom'] = str(rv['continue']['aufrom'])
                rv = wiki.call(query)
                raw_userlist = rv['query']['allusers']
                outlines = ['\t'.join([q['name'], ','.join(q['groups'])])
                            for q in raw_userlist]
                output_file.write('\n'.join(outlines) + '\n')
                output_file.flush()
# Build the list of already-downloaded output files so finished wikis can
# be skipped on a re-run.
files = [os.path.join(output_path, i) for i in os.listdir(output_path)]

# Read the (wiki, url) pairs to process, skipping the CSV header row.
# for line in open("list_of_wikis.csv", "r").readlines():
# next line useful for working with a reduced list:
with open("../wikis.needing.userroles.csv") as wiki_list:
    d = [(line.split(",")[0], line.split(",")[1])
         for line in islice(wiki_list, 1, None)]

deleted = []
notauthorized = []
for wiki, url in d:
    wiki = wiki.strip()
    url = url.strip()
    print(url)
    if os.path.join(output_path, wiki + ".tsv") in files:
        print("SKIPPING: file \"%s\" already exists)" % wiki)
        continue

    print("Processing wiki: %s" % wiki)

    # Decide which API flavor to use. Fix: previously an unrecognized URL
    # raised NameError on the first iteration or silently reused the
    # wikitype left over from the previous iteration; skip such rows
    # explicitly instead.
    if "wikipedia.org" in url:
        wikitype = "wikipedia"
        url = url + '/w/'
    elif "wikia.com" in url:
        wikitype = "wikia"
    else:
        print("ERROR: cannot determine wiki type for %s" % url)
        continue

    result = get_administrators_for_wiki(wiki, url, wikitype=wikitype)
    if result == "deleted":
        deleted.append(wiki)
    elif result == "notauthorized":
        notauthorized.append(wiki)
    time.sleep(1)

# Record failures so they can be inspected after the run.
with open("allusers_WP_error_deleted.txt", 'w') as df:
    df.write('\n'.join(deleted))

with open("allusers_WP_error_notauthorized.txt", 'w') as na:
    na.write('\n'.join(notauthorized))

Community Data Science Collective || Want to submit a patch?