#!/usr/bin/env python3

# Copyright (C) 2018 Nathan TeBlunthuis
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import os
import re
import time

import requests
from json.decoder import JSONDecodeError
from itertools import islice


def write_logevents(logevents, out):
    """Write one tab-separated row per log event to the open file handle `out`."""
    for logevent in logevents:
        # If any part of the event is hidden, there is nothing to report, so skip it.
        if 'userhidden' in logevent or 'actionhidden' in logevent or 'commenthidden' in logevent:
            continue

        le_output = [logevent['comment'],
                     str(logevent['logid']),
                     str(logevent['ns']),
                     str(logevent['pageid']),
                     logevent['timestamp'],
                     logevent['title'],
                     logevent['type'],
                     str(logevent['user'])]

        params = logevent.get("params", {})

        # Group changes may appear under logevent['rights'] or under
        # params['newgroups']/['oldgroups']. The 'ancient' column is 'true'
        # when the event records no old/new group information at all.
        if "rights" in logevent:
            le_output.extend(['false',
                              logevent['rights']['new'],
                              logevent['rights']['old']])
        elif "newgroups" in params and "oldgroups" in params:
            le_output.extend(['false',
                              ','.join(params['newgroups']),
                              ','.join(params['oldgroups'])])
        else:
            le_output.extend(['true', '', ''])

        out.write("\t".join(le_output) + "\n")

    # flush the output data after each batch of events
    out.flush()


def get_events_for_wiki(wikiname, url, wikitype="wikia"):
    """Fetch all 'rights' log events for one wiki and write them to a TSV file."""
    if not url.endswith('/'):
        url = url + '/'

    #out = open("../wikipedias/adminlist_output/logevents/nobackup/%s.tsv" % wikiname, "w")
    out = open("logevents-2017/%s.tsv" % wikiname, "w")
    out.write("\t".join(['comment', 'logid', 'ns', 'pageid', 'timestamp', 'title',
                         'type', 'user', 'ancient', 'rights-new', 'rights-old']) + "\n")

    if wikitype == "wikia":
        api_url = url + 'api.php'
    else:  # wikitype == "wikipedia"
        api_url = url + 'w/api.php'

    query = {'action': 'query',
             'list': 'logevents',
             'letype': 'rights',
             'lelimit': '500',
             'format': 'json',
             'ledir': 'newer'}

    response = requests.get(api_url, params=query)
    hit_url = response.url

    if wikitype == "wikia":
        # Requests for dead wikis are redirected to community/www.wikia.com, e.g.
        # 'http://community.wikia.com/wiki/Community_Central:Not_a_valid_Wikia'
        if re.match(r"^http://(community|www)\.wikia\.com/", hit_url):
            print("ERROR: %s no longer exists" % wikiname)
            out.close()
            return

        if re.match(r"^(http|https)://.*\.wikia\.com/api\.php", hit_url):
            ## this is the only way out: we hit the right wiki's api.php
            try:
                rv = response.json()
            except JSONDecodeError:
                print("ERROR: could not decode the response from %s" % hit_url)
                out.close()
                return
        else:
            # Redirected to a different wikia subdomain: retry against that URL.
            new_url = re.findall(r"^((http|https)://.*\.wikia\.com)", hit_url)[0][0]
            out.close()
            return get_events_for_wiki(wikiname, new_url, wikitype=wikitype)
    else:
        # Wikipedia-style wikis do not need the redirect checks.
        try:
            rv = response.json()
        except JSONDecodeError:
            print("ERROR: could not decode the response from %s" % hit_url)
            out.close()
            return

    try:
        logevents = rv['query']['logevents']
        write_logevents(logevents, out)
    except KeyError as e:
        print("ERROR: %s contains no logevent data" % wikiname)
        print(e)
        out.close()
        return

    # Follow API continuation until all log events have been fetched.
    while 'query-continue' in rv or 'continue' in rv:
        if 'query-continue' in rv:
            # older continuation style
            query['lestart'] = rv['query-continue']['logevents']['lestart']
        else:
            # newer continuation style: copy the continuation parameters back into the query
            query.update(rv['continue'])

        response = requests.get(api_url, params=query)
        rv = response.json()
        logevents = rv['query']['logevents']
        write_logevents(logevents, out)

    out.close()


# Wikis that already have an output file will be skipped.
files = [re.sub(r'\.tsv$', '', i) for i in os.listdir("logevents-2017")]

# iterate through the list of wikis
#for line in ["anime,http://anime.wikia.com/"]:
#for line in ["blogging,http://blogging.wikia.com/"]:
header = True
if header:
    i = 1
else:
    i = 0

# for line in open("list_of_wikis.csv", "r").readlines():
for line in islice(open("../wikis.needing.userroles.csv", "r"), i, None):
    (wiki, url) = line.split(",")
    url = url.strip()
    print("Processing wiki: %s" % wiki)

    if wiki in files:
        print("SKIPPING: file \"%s\" already exists" % wiki)
        continue

    if "wikia.com" in url:
        wikitype = "wikia"
    else:  # assume "wikipedia.org" in url
        wikitype = "wikipedia"

    get_events_for_wiki(wiki, url, wikitype=wikitype)
    time.sleep(1)