]> code.communitydata.science - opensym2017_postmortem.git/blob - easychair-review-scraper.py
final set of changes before uploading to website
[opensym2017_postmortem.git] / easychair-review-scraper.py
1 #!/usr/bin/python3
2 # -*- coding: utf-8  -*-
3 """ script to scrape a list of EasyChair review data and save them as CSV files """
4 #
5 # (C) Benjamin Mako Hill, 2018
6 # (C) Federico Leva, 2016
7 #
8 # Distributed under the terms of the MIT license.
9 #
10 __version__ = '0.2.0'
11
12 # NOTE: change all copies of FIXME
13
14 import requests
15 from lxml import html
16 import re
17 from kitchen.text.converters import to_bytes
18 import pandas as pd
19
# Cookies sent with every request; the FIXME placeholders must be replaced
# before the script is run (see the NOTE near the top of the file).
cj = requests.utils.cookiejar_from_dict({
    "cool2": "FIXME",
    "cool1": "FIXME",
})

# Browser-like User-Agent header sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0",
}

# Fetch the status page and pull out the href of every link that points at a
# "review_for_paper.cgi" page; these hrefs are visited one by one further down.
index = requests.get(
    "https://easychair.org/conferences/status.cgi?a=FIXME",
    cookies=cj,
    headers=headers,
)
indexdata = html.fromstring(index.text)
urls = indexdata.xpath('//a[contains(@href,"review_for_paper.cgi")]/@href')

# Starts out empty; the scraped review rows end up in this frame.
reviews = pd.DataFrame()
27
def empty_to_none(s):
    """Return None when s equals the empty string; otherwise return s unchanged."""
    return None if s == "" else s
32
# Column names for the cells of each score-table row, in cell order.
# NOTE: the original list was missing the comma between 'confidence' and
# 'overall'; Python concatenated the two literals into a single label
# 'confidenceoverall', so zip() silently dropped the last cell of every row.
score_labels = ['label', 'date', 'reviewer', 'subreviewer', 'score', 'confidence', 'overall']

# Rows are collected in a plain list and converted to a DataFrame once at the
# end: DataFrame.append() was removed in pandas 2.0 and copied the whole frame
# on every call.
review_rows = []

for url in urls:
    # Each href is relative to the conferences/ directory on easychair.org.
    page = requests.get("https://easychair.org/conferences/" + url,
                        cookies=cj, headers=headers)
    sub_html = html.fromstring(page.text)

    # capture features of submissions
    # The page title is reduced to the bare submission number; a title that
    # does not match the pattern is kept as-is.
    sub_id = sub_html.xpath('//title')[0].text
    sub_id = re.sub(r'^Reviews and Comments on Submission (\d+)$', r'\1', sub_id)

    # One <tr> per row of the table whose header contains a "PC member" cell.
    for tr in sub_html.xpath('//th[text()="PC member"]/../../../tbody/tr'):
        # Empty cells become None so they are written as missing values.
        score = [empty_to_none(td.text_content()) for td in tr.xpath('td')]
        score_dict = dict(zip(score_labels, score))
        score_dict["sub_id"] = sub_id
        review_rows.append(score_dict)

# Fixing the columns keeps the expected schema (including "date") even when
# no review rows were found.
reviews = pd.DataFrame(review_rows, columns=score_labels + ["sub_id"])
48
# Append ", 2017" to every scraped date string (presumably because the scraped
# dates carry no year -- confirm against the site) and parse the result.
reviews["date"] = pd.to_datetime(reviews["date"] + ", 2017")

# Write all reviews to a CSV file without the DataFrame index.
reviews.to_csv("opensym-reviews-20180113.csv", index=False, index_label=False)
53

Community Data Science Collective || Want to submit a patch?