easychair-submissions-scraper.py

   1 #!/usr/bin/python3
   2 # -*- coding: utf-8  -*-
   3 """ script to scrape a list of EasyChair submission metadata and save them as CSV files """
   4 #
   5 # (C) Benjamin Mako Hill, 2018
   6 # (C) Federico Leva, 2016
   7 #
   8 # Distributed under the terms of the MIT license.
   9 #
  10 __version__ = '0.2.0'
  11
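# NOTE: before running, replace every FIXME placeholder below: the values of
# the "cool1" and "cool2" cookies and the "a=" parameter of the index URL.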
  13
  14 import requests
  15 from lxml import html
  16 import re
  17 from kitchen.text.converters import to_bytes
  18 import pandas as pd
  19
  20 cj = requests.utils.cookiejar_from_dict( { "cool2": "FIXME", "cool1": "FIXME" } )
  21 headers = {"User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0" }
  22 index = requests.get("https://easychair.org/conferences/submission_show_all.cgi?a=FIXME", cookies=cj, headers=headers)
  23 indexdata = html.fromstring(index.text)
  24 urls = indexdata.xpath('//a[contains(@href,"submission_info_show.cgi")]/@href')
  25
  26 submissions = pd.DataFrame()
  27 authors = pd.DataFrame()
  28 reviewers = pd.DataFrame()
  29 author_keywords = pd.DataFrame()
  30 easychair_keywords = pd.DataFrame()
  31 bids = pd.DataFrame()
  32
  33 for url in urls:
  34     sub_html = html.fromstring(requests.get("https://easychair.org/conferences/" + url,
  35                                             cookies=cj, headers=headers).text)
  36
  37     # capture features of submissions
  38     sub_id = sub_html.xpath('//title')[0].text
  39     sub_id = re.sub(r'^Submission (\d+)$', r'\1', sub_id)
  40
  41     final_type = sub_html.xpath('//td[text()="Category"]/../td[2]')[0].text
  42     title = sub_html.xpath('//td[text()="Title:"]/../td[2]/text()')[0].strip()
  43
  44     # it's possible to submit papers w/o topics
  45     try:
  46         topic = sub_html.xpath('//span[text()="Topics:"]/../../td[2]/text()')[0].strip()
  47     except IndexError:
  48         topic = None
  49
  50     abstract = sub_html.xpath('//td[text()="Abstract:"]/../td[2]')[0].text.strip()
  51     result = sub_html.xpath('//td[text()="Decision:"]/../td[2]')[0].text_content().strip()
  52
  53     submissions = submissions.append(pd.DataFrame({ 'sub_id' : sub_id,
  54                                                     'type' : final_type,
  55                                                     'title' : title,
  56                                                     'topic' : topic,
  57                                                     'abstract' : abstract,
  58                                                     'result' : result},
  59                                                    index=[0]))
  60
  61     # create a list of authors
  62     names = sub_html.xpath('//b[text()="Authors"]/../../..//tr[@id!="row37"]/td[1]/text()')
  63     surnames = sub_html.xpath('//b[text()="Authors"]/../../..//tr[@id!="row37"]/td[2]/text()')
  64     countries = sub_html.xpath('//b[text()="Authors"]/../../..//tr[@id!="row37"]/td[4]/text()')
  65
  66     for i in range(1, len(names)):
  67         authors = authors.append(pd.DataFrame({ 'sub_id' : sub_id,
  68                                                 'author' : " ".join([names[i], surnames[i]]),
  69                                                 'country' : countries[i] },
  70                                               index=[0]))
  71
  72     # add the list of reviewers
  73     assigned_to = sub_html.xpath('//span[text()="Assigned to:"]/../../td[2]')[0].text.strip().split(", ")
  74
  75     reviewers = reviewers.append(pd.DataFrame({ 'sub_id' : sub_id,
  76                                                 'reviewer' : assigned_to,
  77                                                 'type' : 'normal' }))
  78
  79     senior_pc = sub_html.xpath('//span[text()="Senior PC member:"]/../../td[2]')[0].text
  80     senior_pc = re.sub(r'^(.+?) \<.*$', r'\1', senior_pc)
  81
  82     reviewers = reviewers.append(pd.DataFrame({ 'sub_id' : sub_id,
  83                                                 'reviewer' : senior_pc,
  84                                                 'type' : 'senior' },
  85                                               index=[0]))
  86
  87     # add author keywords
  88     sub_author_keywords = sub_html.xpath('//div[parent::td[@class="value"]]/text()')
  89     sub_author_keywords = [x.lower() for x in sub_author_keywords]
  90
  91     author_keywords = author_keywords.append(pd.DataFrame({ 'sub_id' : sub_id,
  92                                                             'keyword' : sub_author_keywords}))
  93
  94
  95     # easychair keywords
  96     sub_easychair_keywords = sub_html.xpath('//span[text()="EasyChair keyphrases:"]/../../td[2]')[0].text.strip()
  97     sub_easychair_keywords = sub_easychair_keywords.split(", ")
  98
  99     for kw in sub_easychair_keywords:
 100         g = re.match(r'^\s*([A-Za-z1-9 ]+) \((\d+)\)\s*$', kw).groups()
 101         easychair_keywords = easychair_keywords.append(pd.DataFrame({ 'sub_id' : sub_id,
 102                                                                       'keyword' : g[0].lower(),
 103                                                                       'number' : g[1]},
 104                                                                      index=[0]))
 105
 106     #coi = sub_html.xpath('//span[text()="Conflict of interest:"]/../../td[2]')[0].text.strip()
 107     #if coi == "nobody":
 108     #    coi = []
 109     #else: # TODO this is not tested on /any/ data
 110     #    coi = coi.split(", ")
 111
 112     def parse_bid_tbl(tbl):
 113         key = re.sub(r'^\s*([a-z]+):\s*$', r'\1', tbl[0][0].text)
 114         return((key, tbl[0][1].text.split(", ")))
 115
 116     sub_bids = dict([ parse_bid_tbl(x) for x in sub_html.xpath('//td[text()="Bid:"]/../td[2]/table[*]') ])
 117
 118     for bid_type in sub_bids:
 119         bids = bids.append(pd.DataFrame({ 'sub_id' : sub_id,
 120                                           'bid' : bid_type,
 121                                           'bidder' : sub_bids[bid_type] }))
 122
 123
 124 submissions.to_csv("opensym-submissions-20180113.csv", index=False, index_label=False)
 125 authors.to_csv("opensym-authors-20180113.csv", index=False, index_label=False)
 126 reviewers.to_csv("opensym-reviewers-20180113.csv", index=False, index_label=False)
 127 author_keywords.to_csv("opensym-author_keywords-20180113.csv", index=False, index_label=False)
 128 easychair_keywords.to_csv("opensym-easychair_keywords-20180113.csv", index=False, index_label=False)
 129 bids.to_csv("opensym-bids-20180113.csv", index=False, index_label=False)
 130