#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Bot to scrape a list of EasyChair submissions and export them to CSV (e.g. for upload to a wiki)"""
#
# (C) Benjamin Mako Hill, 2018
# (C) Federico Leva, 2016
#
# Distributed under the terms of the MIT license.
#
__version__ = '0.2.0'

# NOTE: change all copies of FIXME

import re

import pandas as pd
import requests
from lxml import html

# Session cookies copied from a logged-in EasyChair browser session.
cj = requests.utils.cookiejar_from_dict({"cool2": "FIXME", "cool1": "FIXME"})
headers = {"User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0"}

# Fetch the "show all submissions" page and collect the link to each submission.
index = requests.get("https://easychair.org/conferences/submission_show_all.cgi?a=FIXME",
                     cookies=cj, headers=headers)
indexdata = html.fromstring(index.text)
urls = indexdata.xpath('//a[contains(@href,"submission_info_show.cgi")]/@href')

# NOTE: DataFrame.append() was removed in pandas 2.0; this script assumes an
# older pandas (see the sketch of the replacement pattern at the bottom).
submissions = pd.DataFrame()
authors = pd.DataFrame()
reviewers = pd.DataFrame()
author_keywords = pd.DataFrame()
easychair_keywords = pd.DataFrame()
bids = pd.DataFrame()

def parse_bid_tbl(tbl):
    """Turn one bid table into a (bid type, list of bidders) pair."""
    key = re.sub(r'^\s*([a-z]+):\s*$', r'\1', tbl[0][0].text)
    return (key, tbl[0][1].text.split(", "))

for url in urls:
    sub_html = html.fromstring(requests.get("https://easychair.org/conferences/" + url,
                                            cookies=cj, headers=headers).text)

    # capture features of submissions
    sub_id = sub_html.xpath('//title')[0].text
    sub_id = re.sub(r'^Submission (\d+)$', r'\1', sub_id)
    final_type = sub_html.xpath('//td[text()="Category"]/../td[2]')[0].text
    title = sub_html.xpath('//td[text()="Title:"]/../td[2]/text()')[0].strip()

    # it's possible to submit papers w/o topics
    try:
        topic = sub_html.xpath('//span[text()="Topics:"]/../../td[2]/text()')[0].strip()
    except IndexError:
        topic = None

    abstract = sub_html.xpath('//td[text()="Abstract:"]/../td[2]')[0].text.strip()
    result = sub_html.xpath('//td[text()="Decision:"]/../td[2]')[0].text_content().strip()

    submissions = submissions.append(pd.DataFrame({'sub_id': sub_id,
                                                   'type': final_type,
                                                   'title': title,
                                                   'topic': topic,
                                                   'abstract': abstract,
                                                   'result': result}, index=[0]))

    # create a list of authors
    names = sub_html.xpath('//b[text()="Authors"]/../../..//tr[@id!="row37"]/td[1]/text()')
    surnames = sub_html.xpath('//b[text()="Authors"]/../../..//tr[@id!="row37"]/td[2]/text()')
    countries = sub_html.xpath('//b[text()="Authors"]/../../..//tr[@id!="row37"]/td[4]/text()')

    # index 0 is the table's header row rather than an author, so start at 1
    for i in range(1, len(names)):
        authors = authors.append(pd.DataFrame({'sub_id': sub_id,
                                               'author': " ".join([names[i], surnames[i]]),
                                               'country': countries[i]}, index=[0]))

    # add the list of reviewers
    assigned_to = sub_html.xpath('//span[text()="Assigned to:"]/../../td[2]')[0].text.strip().split(", ")
    reviewers = reviewers.append(pd.DataFrame({'sub_id': sub_id,
                                               'reviewer': assigned_to,
                                               'type': 'normal'}))

    # strip the trailing "<email>" portion so only the senior PC member's name is kept
    senior_pc = sub_html.xpath('//span[text()="Senior PC member:"]/../../td[2]')[0].text
    senior_pc = re.sub(r'^(.+?) <.*$', r'\1', senior_pc)
    reviewers = reviewers.append(pd.DataFrame({'sub_id': sub_id,
                                               'reviewer': senior_pc,
                                               'type': 'senior'}, index=[0]))

    # add author keywords
    sub_author_keywords = sub_html.xpath('//div[parent::td[@class="value"]]/text()')
    sub_author_keywords = [x.lower() for x in sub_author_keywords]
    author_keywords = author_keywords.append(pd.DataFrame({'sub_id': sub_id,
                                                           'keyword': sub_author_keywords}))

    # easychair keywords, each formatted as "keyphrase (count)"
    sub_easychair_keywords = sub_html.xpath('//span[text()="EasyChair keyphrases:"]/../../td[2]')[0].text.strip()
    sub_easychair_keywords = sub_easychair_keywords.split(", ")
    for kw in sub_easychair_keywords:
        g = re.match(r'^\s*([A-Za-z0-9 ]+) \((\d+)\)\s*$', kw).groups()
        easychair_keywords = easychair_keywords.append(pd.DataFrame({'sub_id': sub_id,
                                                                     'keyword': g[0].lower(),
                                                                     'number': g[1]}, index=[0]))

    # coi = sub_html.xpath('//span[text()="Conflict of interest:"]/../../td[2]')[0].text.strip()
    # if coi == "nobody":
    #     coi = []
    # else:  # TODO this is not tested on /any/ data
    #     coi = coi.split(", ")

    sub_bids = dict([parse_bid_tbl(x) for x in
                     sub_html.xpath('//td[text()="Bid:"]/../td[2]/table[*]')])

    for bid_type in sub_bids:
        bids = bids.append(pd.DataFrame({'sub_id': sub_id,
                                         'bid': bid_type,
                                         'bidder': sub_bids[bid_type]}))

submissions.to_csv("opensym-submissions-20180113.csv", index=False)
authors.to_csv("opensym-authors-20180113.csv", index=False)
reviewers.to_csv("opensym-reviewers-20180113.csv", index=False)
author_keywords.to_csv("opensym-author_keywords-20180113.csv", index=False)
easychair_keywords.to_csv("opensym-easychair_keywords-20180113.csv", index=False)
bids.to_csv("opensym-bids-20180113.csv", index=False)
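
# A minimal sketch of the accumulation pattern that replaces the
# DataFrame.append() calls above on pandas >= 2.0: collect plain dicts in a
# list and build each frame once at the end. The function name and sample
# rows below are illustrative only and are not called anywhere in this script.
def append_free_accumulation_sketch():
    rows = []
    for sub_id, title in [("101", "First paper"), ("102", "Second paper")]:
        rows.append({"sub_id": sub_id, "title": title})
    # one DataFrame construction instead of repeated appends
    return pd.DataFrame(rows)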