#!/usr/bin/env python3
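"""Scrape user roles from a list of Wikia and MediaWiki wikis.

For each wiki in the input list, query the user-list API (Wikia's
ListUsers ajax endpoint or MediaWiki's allusers API) and write the
users holding privileged roles to a per-wiki TSV file in the output
directory.
"""
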
import argparse
import csv
import json
import os
import re
import time
from json.decoder import JSONDecodeError
from os import path

import requests

from scraper_utils import prepare_output, read_wikilist, add_parser_arguments

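# privileged user groups to request from the API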
roles = ['bot', 'sysop', 'bureaucrat', 'staff', 'rollback',  # 'util',
         'helper', 'vstf', 'checkuser-global', 'bot-global',
         'council', 'authenticated', 'checkuser', 'chatmoderator',
         'adminmentor', 'steward', 'oversight', 'founder', 'rollbacker',
         'researcher']


class ListUserAPI:
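    """Thin client for the two user-list endpoints this script supports:
    Wikia's ListUsers ajax endpoint and the standard MediaWiki api.php.
    """
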
    def __init__(self, url_root, wikitype):
        self.wikitype = wikitype
        if self.wikitype == "wikia":
            self._api_url = url_root + 'index.php?action=ajax&rs=ListusersAjax::axShowUsers'
        else:  # wikitype == "wikipedia"
            self._api_url = url_root + 'api.php'

    def _fetch_http(self, url, params):
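        """GET url; return the body text (Wikia) or the Response (Wikipedia)."""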
        if self.wikitype == "wikia":
            response = requests.get(url=url, params=params,
                                    headers={'Accept-encoding': 'gzip'})
            return response.text
        else:  # wikitype == "wikipedia"
            response = requests.get(url=url, params=params)
            return response

    def call(self, params):
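        """Issue one request against the API and return the parsed JSON."""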
        response = self._fetch_http(self._api_url, params)
        if self.wikitype == "wikia":
            return json.loads(response)
        else:
            return response.json()

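# Illustrative usage (the URL here is an example, not taken from this
# script's input list):
#   wiki = ListUserAPI('https://en.wikipedia.org/w/', wikitype='wikipedia')
#   rv = wiki.call({'action': 'query', 'list': 'allusers', 'format': 'json'})
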
def write_user_csvfile(output_file, user_list):
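    """Write the header row and one tab-separated row per user."""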
    csvfile = csv.writer(output_file, delimiter='\t',
                         quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

    # construct and output the header
    csvfile.writerow(['username', 'groups',
                      'edits', 'last.logged', 'last.edited'])

    for user in user_list:
        csvfile.writerow(user)


def get_administrators_for_wiki(wikiname, url_root, wikitype="wikia"):
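    """Fetch the privileged users of one wiki and write them to a TSV file.

    Returns "deleted" if the wiki no longer exists and "notauthorized"
    if the user list could not be fetched or parsed.
    """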
    increment_size = 500
    offset = 0

    if wikitype == "wikia":
        query = {'groups': 'bot,sysop,bureaucrat,',
                 'edits': 0,
                 'limit': increment_size,
                 'offset': offset,
                 'numOrder': 1,
                 'order': 'username:asc'}

    else:  # wikitype == "wikipedia"
        query = {'action': 'query',
                 'list': 'allusers',
                 'augroup': "|".join(roles),
                 'auprop': 'groups',
                 'aulimit': 500,
                 'format': 'json'}

    # FIND THE CORRECT URL (there may be redirects)
    if wikitype == "wikia":
        url_root = requests.get(url_root).url
        # deleted wikis redirect to a notice page, e.g.
        # http://community.wikia.com/wiki/Community_Central:Not_a_valid_Wikia
        re_str = r"^http://(community|www)\.wikia\.com/"
        if re.match(re_str, url_root):
            print("ERROR: %s no longer exists" % wikiname)
            return "deleted"

    try:
        wiki = ListUserAPI(url_root, wikitype=wikitype)
        rv = wiki.call(query)
    except (requests.ConnectionError, JSONDecodeError):
        # the caller records this wiki as unreadable
        print("ERROR: cannot read the user list for: %s" % wikiname)
        return "notauthorized"

    output_file = open("{0}/{1}.tsv".format(output_path, wikiname), 'w')
    if wikitype == "wikia":
        raw_userlist = rv['aaData']

        while (rv['iTotalRecords'] + offset) < rv['iTotalDisplayRecords']:
            # increment the offset and make a new query
            offset = offset + increment_size
            query['offset'] = offset
            rv = wiki.call(query)
            raw_userlist.extend(rv['aaData'])
            print("Fetched another page: offset is %s" % offset)

        # strip the HTML markup that the ajax endpoint wraps around each field
        processed_userlist = []
        for row in raw_userlist:
            # the username is wrapped in a link; keep only the link text
            row[0] = re.sub(r'^.*?<a href=.*?>(.*?)<.*$', r'\1', row[0])

            # work around change in wikia api that removed last.logged
            if len(row) < 5:
                row.append(row[3])
                row[3] = None

            # keep only the revision id from the last-edited link
            row[4] = re.sub(r'^.*oldid=(\d+)".*$', r'\1', row[4])
            row[4] = re.sub(r'^-$', r'', row[4])
            processed_userlist.append(row)

        write_user_csvfile(output_file, processed_userlist)
        output_file.close()

    else:
        raw_userlist = rv['query']['allusers']
        outlines = ['\t'.join(["username", "groups"])]
        outlines.extend(['\t'.join([q['name'], ','.join(q['groups'])])
                         for q in raw_userlist])
        while 'continue' in rv:
            # copy the continuation parameters into the next request
            query.update(rv['continue'])
            rv = wiki.call(query)
            raw_userlist = rv['query']['allusers']
            outlines.extend(['\t'.join([q['name'], ','.join(q['groups'])])
                             for q in raw_userlist])
            output_file.write('\n'.join(outlines) + '\n')
            output_file.flush()
            outlines = []

        # write whatever is left (everything, if there was no continuation)
        if outlines:
            output_file.write('\n'.join(outlines) + '\n')
        output_file.close()

# Example invocation (from IPython):
# %run userroles_from_listusers.py --sep=\\t --nuke_old ../identifyWikis/wikiteamWikilist.tsv /com/projects/messagewalls/userroles/listusers

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Get user roles for wikis from the MediaWiki list users API")

    parser = add_parser_arguments(parser)
    args = parser.parse_args()
    output_path = args.output
    header = not args.no_header

    prepare_output(output_path, args.nuke_old)

    wikilist = read_wikilist(args)
    deleted = []
    notauthorized = []

    # output files that already exist, so finished wikis can be skipped
    files = [os.path.join(output_path, i) for i in os.listdir(output_path)]

    for wiki, url, wikitype in wikilist:
        if "{0}.{1}".format(path.join(output_path, wiki), 'tsv') in files:
            print("SKIPPING: file \"%s\" already exists" % wiki)
            continue
        print("Processing wiki: %s" % wiki)

        result = get_administrators_for_wiki(wiki, url, wikitype=wikitype)
        if result == "deleted":
            deleted.append(wiki)
        elif result == "notauthorized":
            notauthorized.append(wiki)
        time.sleep(1)  # rate-limit between wikis

    # record the wikis that could not be scraped
    with open("allusers_error_deleted.txt", 'w') as df:
        df.write('\n'.join(deleted))

    with open("allusers_error_notauthorized.txt", 'w') as na:
        na.write('\n'.join(notauthorized))
