import json
import logging
import re
import requests
from datetime import datetime
from scopus_api import key as API_KEY

logging.basicConfig(level=logging.DEBUG)

# Retry/timeout defaults (assumed values; the originals are not shown in this excerpt)
RETRY_COUNT = 5
TIMEOUT_SECS = 10

# Initialize a global session object
s = requests.Session()
s.headers.update({'X-ELS-APIKey': API_KEY,
                  'X-ELS-ResourceVersion': 'XOCS',
                  'Accept': 'application/json'})
def get_token(location_id=None):
    '''Given a location_id, gets an authentication token'''
    print('Getting a token')
    api_resource = 'http://api.elsevier.com/authenticate'
    payload = {'platform': 'SCOPUS',
               'choice': location_id}
    r = s.get(api_resource, params=payload)
    # Store the token on the shared session so that subsequent calls are authenticated
    s.headers['X-ELS-AuthToken'] = r.json()['authenticate-response']['authtoken']
def get_search_results(query, output_file, results_per_call=200,
                       tot_results=None, year=None, sort='+title', citation_call=False):
    '''Handles getting search results. Takes a query and an output
    file. Writes as many of the search results as possible to the
    output file as JSON dictionaries, one per line.'''
    result_set = []
    results_added = 0

    def curr_call(start=0, count=results_per_call):
        '''Shorthand for the current call: DRY'''
        return make_search_call(query, start=start,
                                count=count, year=year, sort=sort)

    if tot_results is None:
        # Call the API once to find out how many results there are, and keep that first page
        initial_results = curr_call(count=results_per_call)
        tot_results = int(initial_results['search-results']['opensearch:totalResults'])
        result_set.append((initial_results, sort))
        results_added += results_per_call
    logging.debug("Total results: {}".format(tot_results))

    if tot_results > 5000:
        # If this is just one year, we can't get any more granular, and
        # we need to return what we can.
        if tot_results > 10000:
            print("{} results for {}. We can only retrieve 10,000".format(tot_results, year))
            first_half = last_half = 5000
        else:
            # Get half, and correct for an odd number of results
            first_half = tot_results // 2 + tot_results % 2
            last_half = tot_results // 2
        # Break the search into the first half and the bottom half of the results,
        # fetching the bottom half in reverse title order.
        get_search_results(query, output_file, results_per_call,
                           tot_results=first_half, year=year)
        get_search_results(query, output_file, results_per_call,
                           tot_results=last_half, year=year, sort='-title')
    # If there are 5000 or fewer to retrieve, then get them
    else:
        logging.debug('Retrieving {} results'.format(tot_results))
        # As long as there are more results to retrieve, get them and queue them for writing
        while results_added < tot_results:
            # If we are near the end, then only get as many results as are left.
            to_retrieve = min(results_per_call, (tot_results - results_added))
            curr_results = curr_call(start=results_added, count=to_retrieve)
            result_set.append((curr_results, sort))
            results_added += results_per_call
    # This is hacky, but I'm doing it:
    # if this is a citation call, construct metadata to be written with each result
    if citation_call:
        metadata = {'parent_eid': re.match(r'refeid\((.*)\)', query).group(1)}
    else:
        metadata = {}
    write_results(result_set, output_file, metadata)
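# Note on the limits above: a query can only be paged through up to 5,000 results
# for a given sort order, so larger result sets are split into an ascending
# '+title' pass over the first half and a descending '-title' pass over the rest,
# which caps retrieval at 10,000 records per query.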
def write_results(result_set, output_file, metadata={}):
    '''Writes each search result to the output file as a JSON dictionary, one per line.'''
    for search_json, sort_order in result_set:
        # Results fetched with a descending sort ('-...') come back in reverse order
        to_reverse = sort_order.startswith('-')
        results = [entry for entry in search_json['search-results']['entry']]
        if to_reverse:
            results = results[::-1]
        for entry in results:
            # Attach any metadata (e.g., the parent EID for citation calls) to each entry
            for k, v in metadata.items():
                entry[k] = v
            json.dump(entry, output_file)
            output_file.write('\n')
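# The output file ends up with one JSON object per line (JSON Lines), so it can be
# read back with something like the following (the file name is illustrative):
#
#     with open('results.json') as f:
#         records = [json.loads(line) for line in f]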
def make_search_call(query, start=0, count=200,
                     sort='+title', year=None,
                     retry_limit=RETRY_COUNT,
                     timeout_secs=TIMEOUT_SECS):
    '''Makes one call to the Scopus Search API and returns the parsed JSON response.'''
    api_resource = "https://api.elsevier.com/content/search/scopus"
    payload = {'query': query,
               'count': count,
               'start': start,
               'date': year,
               'sort': sort}
    for _ in range(retry_limit):
        try:
            r = s.get(api_resource,
                      params=payload,
                      timeout=timeout_secs)
            if r.status_code == 401:
                # Auth token missing or expired; get a new one and retry
                get_token()
                continue
            if r.status_code == 400:
                raise requests.exceptions.HTTPError("Bad request; possibly you aren't connected to an institution with Scopus access?")
            return r.json()
        except requests.exceptions.Timeout:
            # Timed out; try again until the retry limit is exhausted
            pass
    raise requests.exceptions.Timeout('Timeout Error')
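# The dict returned above is the parsed Scopus Search API response; the individual
# records used elsewhere in this script live under ['search-results']['entry'].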
def get_cited_by(eid, output_file):
    '''Retrieves all of the results that cite the given EID and writes them to output_file.'''
    return get_search_results('refeid({})'.format(eid), output_file, results_per_call=200,
                              citation_call=True)
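# Example of fetching the records that cite a given paper (the EID and the file
# name are illustrative, not values from the original script):
#
#     with open('cited_by.json', 'w') as f:
#         get_cited_by('2-s2.0-0000000000', f)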
def get_abstract(eid, retry_limit=RETRY_COUNT,
                 timeout_secs=TIMEOUT_SECS):
    '''Retrieves the abstract record for the given EID, returning it as a decoded string.'''
    api_resource = "http://api.elsevier.com/content/abstract/eid/{}".format(eid)
    for _ in range(retry_limit):
        try:
            r = s.get(api_resource,
                      timeout=timeout_secs)
            if r.status_code == 401:
                # Auth token missing or expired; get a new one and retry
                get_token()
                continue
            if r.status_code == 400:
                raise requests.exceptions.HTTPError("Bad request; possibly you aren't connected to an institution with Scopus access?")
            break
        except requests.exceptions.Timeout:
            # Timed out; try again until the retry limit is exhausted
            pass
    else:
        raise requests.exceptions.Timeout('Timeout Error')
    if r.status_code == 404:
        # No abstract record exists for this EID
        return None
    return r.content.decode('utf-8')
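

# Minimal usage sketch; the query string and output file name are illustrative
# examples, not values taken from the original script.
if __name__ == '__main__':
    get_token()
    with open('scopus_results.json', 'w') as out:
        get_search_results('TITLE-ABS-KEY("peer production")', out)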