#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Script to scrape a list of EasyChair review data and save it as a CSV file."""
#
# (C) Benjamin Mako Hill, 2018
# (C) Federico Leva, 2016
#
# Distributed under the terms of the MIT license.
#

__version__ = '0.2.0'

# NOTE: change all copies of FIXME below (session cookie values and conference id)

import re

import requests
from lxml import html
import pandas as pd

# EasyChair session cookies, copied from a logged-in browser session
cj = requests.utils.cookiejar_from_dict({
    "cool2": "FIXME",
    "cool1": "FIXME",
})

headers = {"User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:42.0) "
                         "Gecko/20100101 Firefox/42.0"}

# fetch the conference status page and collect links to per-submission review pages
index = requests.get("https://easychair.org/conferences/status.cgi?a=FIXME",
                     cookies=cj, headers=headers)
indexdata = html.fromstring(index.text)
urls = indexdata.xpath('//a[contains(@href,"review_for_paper.cgi")]/@href')


def empty_to_none(s):
    """Return None for empty strings so missing cells become NA in pandas."""
    return None if s == "" else s


rows = []
for url in urls:
    sub_html = html.fromstring(requests.get("https://easychair.org/conferences/" + url,
                                            cookies=cj, headers=headers).text)

    # capture the submission number from the page title,
    # e.g. "Reviews and Comments on Submission 42" -> "42"
    sub_id = sub_html.xpath('//title')[0].text
    sub_id = re.sub(r'^Reviews and Comments on Submission (\d+)$', r'\1', sub_id)

    score_labels = ['label', 'date', 'reviewer', 'subreviewer',
                    'score', 'confidence', 'overall']

    # each row of the review table is one review; empty cells become None
    for tr in sub_html.xpath('//th[text()="PC member"]/../../../tbody/tr'):
        score = [empty_to_none(td.text_content()) for td in tr.xpath('td')]
        score_dict = dict(zip(score_labels, score))
        score_dict["sub_id"] = sub_id
        rows.append(score_dict)

reviews = pd.DataFrame(rows)

# EasyChair omits the year from review dates; append the conference year by hand
reviews["date"] = pd.to_datetime(reviews["date"] + ", 2017")

reviews.to_csv("opensym-reviews-20180113.csv", index=False)