code/data_collection/request_functions.py

   1 import requests
   2 from datetime import datetime
   3 from scopus_api import key as API_KEY
   4 import json
   5 import os
   6 import logging
   7 import re
   8
   9 logging.basicConfig(level=logging.DEBUG)
  10
  11 RETRY_COUNT = 5
  12 TIMEOUT_SECS = 10
  13
  14 # Initialize a global session object
  15 s = requests.Session()
  16 s.headers.update({'X-ELS-APIKey' : API_KEY,
  17             'X-ELS-ResourceVersion' : 'XOCS',
  18             'Accept' : 'application/json'})
  19
  20 def get_token(location_id = None):
  21     '''Given a location_id, gets an authentication token'''
  22     print('Getting a token')
  23     api_resource = 'http://api.elsevier.com/authenticate'
  24     # Parameters
  25     payload = {'platform':'SCOPUS',
  26             'choice': location_id}
  27     r = s.get(api_resource, params = payload)
  28     r.raise_for_status()
  29     s.headers['X-ELS-AuthToken'] = r.json()['authenticate-response']['authtoken']
  30
  31 def get_search_results(query, output_file, results_per_call = 200,
  32         tot_results=None, year=None, sort='+title', citation_call=False):
  33     '''Handles getting search results. Takes a query and an output
  34     file. Writes as many of the search results as possible to the
  35     output file as JSON dictionaries, one per line.'''
  36     result_set = []
  37     results_added = 0
  38     def curr_call(start=0, count=results_per_call):
  39         '''Shorthand for the current call: DRY'''
  40         return make_search_call(query, start=start,
  41             count=count, year=year, sort=sort)
  42     if tot_results == None:
  43         # Call the API initially to figure out how many results there are, and write the results
  44         initial_results = curr_call(count=results_per_call)
  45         tot_results = int(initial_results['search-results']['opensearch:totalResults'])
  46         result_set.append((initial_results, sort))
  47         results_added += results_per_call
  48     logging.debug("Total results: {}".format(tot_results))
  49
  50     if tot_results == 0:
  51         return None
  52     if tot_results > 5000:
  53             # If this is just one year, we can't get any more granular, and
  54             # we need to return what we can.
  55         if tot_results > 10000:
  56             print("{} results for {}. We can only retrieve 10,000".format(tot_results, year))
  57             first_half = last_half = 5000
  58         else:
  59             # Get half, and correct for odd # of results
  60             first_half = tot_results//2 + tot_results % 2
  61             last_half = tot_results//2
  62         # Break the search into the first half and the bottom half of results.
  63         get_search_results(query, output_file,
  64                year = year,
  65                tot_results=first_half)
  66          # Get the other half
  67         get_search_results(query, output_file,
  68                 year = year,
  69                 tot_results = last_half, sort='-title')
  70 # If there are 5000 or fewer to retrieve, then get them
  71     else:
  72         logging.debug('Retrieving {} results'.format(tot_results))
  73         # As long as there are more citations to retrieve, then do it, and write
  74         # them to the file
  75         while results_added < tot_results:
  76             # If we are near the end, then only get as many results as are left.
  77             to_retrieve = min(results_per_call, (tot_results - results_added))
  78             curr_results = curr_call(start=results_added, count=to_retrieve)
  79             result_set.append((curr_results, sort))
  80             results_added += results_per_call
  81     # This is hacky, but I'm doing it
  82     # If this is a citation call, then construct metadata to be written with the result
  83     if citation_call:
  84         metadata = {'parent_eid': re.match(r'refeid\((.*)\)', query).group(1)}
  85     else:
  86         metadata = {}
  87     write_results(result_set, output_file, metadata)
  88
  89 def write_results(result_set, output_file, metadata={}):
  90     for x in result_set:
  91         search_json = x[0]
  92         to_reverse = x[1].startswith('-')
  93         try:
  94             results = [x for x in search_json['search-results']['entry']]
  95         except KeyError:
  96             raise
  97         if to_reverse:
  98             results = results[::-1]
  99         for x in results:
 100             for k, v in metadata.items():
 101                 x[k] = v
 102             json.dump(x, output_file)
 103             output_file.write('\n')
 104
 105
 106 def make_search_call(query, start=0, count=200,
 107         sort='+title', year=None,
 108         retry_limit = RETRY_COUNT,
 109         timeout_secs = TIMEOUT_SECS):
 110     api_resource = "https://api.elsevier.com/content/search/scopus"
 111     # Parameters
 112     payload = {'query':query,
 113             'count':count,
 114             'start':start,
 115             'sort': sort,
 116             'date': year}
 117     for _ in range(retry_limit):
 118         try:
 119             r = s.get(api_resource,
 120                     params = payload,
 121                     timeout = timeout_secs)
 122             logging.debug(r.url)
 123             if r.status_code == 401:
 124                 get_token()
 125                 continue
 126             if r.status_code == 400:
 127                 raise requests.exceptions.HTTPError('Bad request; possibly you aren\'t connected to an institution with Scopus acces?')
 128             break
 129         except requests.exceptions.Timeout:
 130             pass
 131     else:
 132         raise requests.exceptions.Timeout('Timeout Error')
 133
 134     r.raise_for_status()
 135     return r.json()
 136
 137
 138 def get_cited_by(eid, output_file):
 139     return get_search_results('refeid({})'.format(eid), output_file, results_per_call=200,
 140             citation_call = True)
 141
 142
 143 def get_abstract(eid, retry_limit = RETRY_COUNT,
 144         timeout_secs = TIMEOUT_SECS):
 145     api_resource = "http://api.elsevier.com/content/abstract/eid/{}".format(eid)
 146     # Parameters
 147     payload = {}
 148     for _ in range(retry_limit):
 149         try:
 150             r = s.get(api_resource,
 151                     params = payload,
 152                     timeout = timeout_secs)
 153             if r.status_code == 401:
 154                 get_token()
 155                 continue
 156             if r.status_code == 400:
 157                 raise requests.exceptions.HTTPError('Bad request; possibly you aren\'t connected to an institution with Scopus acces?')
 158             break
 159         except requests.exceptions.Timeout:
 160             pass
 161     else:
 162         raise requests.exceptions.Timeout('Timeout Error')
 163     if r.status_code == 404:
 164         return None
 165     r.raise_for_status()
 166     return r.content.decode('utf-8')