]> code.communitydata.science - rises_declines_wikia_code.git/blob - userroles_scraper_scripts/userroles_from_listusers.py
add copy of the GPL
[rises_declines_wikia_code.git] / userroles_scraper_scripts / userroles_from_listusers.py
1 #!/usr/bin/env python3
2
3 # Copyright (C) 2018  Nathan TeBlunthuis
4
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation, either version 3 of the License, or
8 # (at your option) any later version.
9
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 # GNU General Public License for more details.
14
15 # You should have received a copy of the GNU General Public License
16 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
17
import time, re, os
import sys
from importlib import reload
# NOTE(review): `reload(sys)` is a Python 2 idiom (used before
# sys.setdefaultencoding); it has no useful effect under Python 3 and
# looks safe to drop along with the `reload` import -- confirm and remove.
reload(sys)
22
23 import urllib
24 import requests
25 import json
26 import gzip
27
28 from pprint import pprint
29 from itertools import islice
30 import csv
31
# User-rights groups requested from the MediaWiki allusers API.
# Fix: the original list contained 'checkuser' twice; the duplicate was
# redundant in the API's augroup parameter, so the list is de-duplicated.
roles = ['bot', 'sysop', 'bureaucrat', 'staff', 'rollback',  # 'util',
         'helper', 'vstf', 'checkuser-global', 'bot-global',
         'council', 'authenticated', 'checkuser', 'chatmoderator',
         'adminmentor', 'steward', 'oversight', 'founder', 'rollbacker',
         'researcher']

# Directory where one TSV file of user roles is written per wiki.
output_path = "userlist-2017/"
class ListUserAPI():
    """Tiny HTTP client for listing a wiki's users.

    Depending on ``wikitype`` ("wikia", or anything else which is treated
    as "wikipedia"), requests go either to Wikia's ListusersAjax endpoint
    or to the standard MediaWiki ``api.php``.
    """

    def __init__(self, url_root, wikitype):
        self.wikitype = wikitype
        if wikitype == "wikia":
            suffix = 'index.php?action=ajax&rs=ListusersAjax::axShowUsers'
        else:  # wikitype == "wikipedia"
            suffix = 'api.php'
        self._api_url = url_root + suffix

    def _fetch_http(self, url, params):
        # Wikia's ajax endpoint is fetched with an explicit gzip header and
        # returns the decoded body text; for Wikipedia the Response object
        # itself is returned so .json() can be called on it later.
        if self.wikitype == "wikia":
            reply = requests.get(url=url, params=params,
                                 headers={'Accept-encoding': 'gzip'})
            return reply.text
        return requests.get(url=url, params=params)

    def call(self, params):
        """Issue one API request and return the decoded JSON payload."""
        reply = self._fetch_http(self._api_url, params)
        if self.wikitype == "wikia":
            return json.loads(reply)
        return reply.json()
59
60
def write_user_csvfile(output_file, user_list):
    """Write ``user_list`` to ``output_file`` as a quoted, tab-separated table.

    A header row is emitted first; non-numeric fields are quoted
    (csv.QUOTE_NONNUMERIC) so the files load cleanly downstream.
    """
    writer = csv.writer(output_file, delimiter='\t',
                        quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

    # Header, then one row per user.
    writer.writerow(['username', 'groups',
                     'edits', 'last.logged', 'last.edited'])
    writer.writerows(user_list)
71         
72
def get_administrators_for_wiki(wikiname, url_root, wikitype="wikia"):
    """Fetch the users holding privileged roles on one wiki, write a TSV file.

    Writes ``{output_path}/{wikiname}.tsv``. Returns "deleted" if the wiki
    redirects to Wikia's central site (i.e. it no longer exists),
    "notauthorized" if the HTTP request failed, and None on success.
    The caller is responsible for recording failures; this function no
    longer appends to the module-level ``notauthorized`` list itself
    (previously the wiki was appended both here and by the caller,
    double-counting it).
    """
    increment_size = 500
    offset = 0

    if wikitype == "wikia":
        query = {'groups': 'bot,sysop,bureaucrat,',
                 'edits': 0,
                 'limit': increment_size,
                 'offset': offset,
                 'numOrder': 1,
                 'order': 'username:asc'}
    else:  # wikitype == "wikipedia"
        query = {'action': 'query',
                 'list': 'allusers',
                 'augroup': "|".join(roles),
                 'auprop': 'groups',
                 'aulimit': 500,
                 'format': 'json'}

    # Find the correct URL (there may be redirects). Wikia redirects dead
    # wikis to community/www.wikia.com, which we treat as deleted.
    if wikitype == "wikia":
        url_root = requests.get(url_root).url
        re_str = "^http://(community|www).wikia.com/"
        if re.match(re_str, url_root):
            print("ERROR: %s no longer exists" % wikiname)
            return "deleted"

    try:
        wiki = ListUserAPI(url_root, wikitype=wikitype)
        rv = wiki.call(query)
    except requests.ConnectionError:
        print("ERROR: could not fetch user list for: %s" % wikiname)
        return "notauthorized"

    if wikitype == "wikia":
        raw_userlist = rv['aaData']

        # NOTE(review): pagination condition kept from the original; it
        # assumes iTotalRecords is the size of the current page -- confirm
        # against the Wikia ListusersAjax response format.
        while (rv['iTotalRecords'] + offset) < rv['iTotalDisplayRecords']:
            # increment the offset and make a new query
            offset = offset + increment_size
            query['offset'] = offset
            rv = wiki.call(query)
            raw_userlist.extend(rv['aaData'])
            print("Another one: offset is %s" % offset)

        # Strip the HTML markup Wikia wraps around each field.
        processed_userlist = []
        for row in raw_userlist:
            row[0] = re.sub(r'^.*?<a href=.*?>(.*?)<.*$', r'\1', row[0])
            # Extract the oldid of the last edit; a bare '-' means never edited.
            row[4] = re.sub(r'^.*oldid=(\d+)".*$', r'\1', row[4])
            row[4] = re.sub(r'^\-$', r'', row[4])
            processed_userlist.append(row)

        with open("{0}/{1}.tsv".format(output_path, wikiname), 'w') as output_file:
            write_user_csvfile(output_file, processed_userlist)

    else:
        with open("{0}/{1}.tsv".format(output_path, wikiname), 'w') as output_file:
            raw_userlist = rv['query']['allusers']
            outlines = ['\t'.join(["username", "groups"])]
            outlines.extend(['\t'.join([q['name'], ','.join(q['groups'])])
                             for q in raw_userlist])
            # Fix: terminate each chunk with '\n' -- the original wrote
            # chunks back-to-back, fusing the last line of one chunk with
            # the first line of the next.
            output_file.write('\n'.join(outlines) + '\n')

            # Follow API continuation until the full user list is exhausted.
            while 'continue' in rv:
                query['continue'] = str(rv['continue'])
                query['aufrom'] = str(rv['continue']['aufrom'])
                rv = wiki.call(query)
                raw_userlist = rv['query']['allusers']
                outlines = ['\t'.join([q['name'], ','.join(q['groups'])])
                            for q in raw_userlist]
                output_file.write('\n'.join(outlines) + '\n')
                output_file.flush()
# Build the list of already-downloaded output files so finished wikis can
# be skipped on a re-run.
files = [os.path.join(output_path, i) for i in os.listdir(output_path)]

# Read the (wiki, url) pairs to process, skipping the CSV header row.
# for line in open("list_of_wikis.csv", "r").readlines():
# next line useful for working with a reduced list:
with open("../wikis.needing.userroles.csv") as wiki_list:
    d = [(line.split(",")[0], line.split(",")[1])
         for line in islice(wiki_list, 1, None)]

deleted = []
notauthorized = []
for wiki, url in d:
    wiki = wiki.strip()
    url = url.strip()
    print(url)
    if os.path.join(output_path, wiki + ".tsv") in files:
        print("SKIPPING: file \"%s\" already exists)" % wiki)
        continue

    print("Processing wiki: %s" % wiki)

    # Decide which API flavor to use. Fix: previously an unrecognized URL
    # raised NameError on the first iteration or silently reused the
    # wikitype left over from the previous iteration; skip such rows
    # explicitly instead.
    if "wikipedia.org" in url:
        wikitype = "wikipedia"
        url = url + '/w/'
    elif "wikia.com" in url:
        wikitype = "wikia"
    else:
        print("ERROR: cannot determine wiki type for %s" % url)
        continue

    result = get_administrators_for_wiki(wiki, url, wikitype=wikitype)
    if result == "deleted":
        deleted.append(wiki)
    elif result == "notauthorized":
        notauthorized.append(wiki)
    time.sleep(1)

# Record failures so they can be inspected after the run.
with open("allusers_WP_error_deleted.txt", 'w') as df:
    df.write('\n'.join(deleted))

with open("allusers_WP_error_notauthorized.txt", 'w') as na:
    na.write('\n'.join(notauthorized))

Community Data Science Collective || Want to submit a patch?