1 from collections import Counter
2 from datetime import datetime
12 parser = argparse.ArgumentParser(description='Change a big ugly abstract file to a nice CSV')
13 parser.add_argument('-i', help='Abstract file')
14 parser.add_argument('-o', help='TSV output file')
15 args = parser.parse_args()
17 with open(args.i, 'r') as i:
18 with open(args.o, 'w') as o:
19 # Have to get the field names
20 first_line = clean_abstract(json.loads(next(i)))
21 fieldnames = first_line.keys()
22 output = csv.DictWriter(o, fieldnames, delimiter='\t')
24 output.writerow(first_line)
26 output.writerow(clean_abstract(json.loads(line)))
29 def clean_abstract(json_response):
30 result = json_response['abstracts-retrieval-response']
31 head = result['item']['bibrecord']['head']
34 'modal_country': get_country(head),
35 'abstract' : get_abstract(result),
36 'title' : get_title(result),
37 'source_title': get_source_title(head),
38 'language': result['language']['@xml:lang'],
39 'first_ASJC_subject_area': get_subject(result, '$'),
40 'first_ASJC_classification': get_subject(result, '@code'),
41 'first_CPX_class': get_CPX_class(head, 'classification-description'),
42 'date': to_date(result['coredata']['prism:coverDate']),
43 'aggregation_type' : if_exists('prism:aggregationType',result['coredata'],else_val='NA'),
44 'eid' : result['coredata']['eid'],
45 'cited_by_count': result['coredata']['citedby-count'],
46 'num_citations': get_citation_count(result)
55 def get_citation_count(result):
57 return result['item']['bibrecord']['tail']['bibliography']['@refcount']
61 def get_title(result):
63 return result['coredata']['dc:title']
68 def get_source_title(head):
70 return head['source']['sourcetitle']
74 def get_abstract(result):
76 abstract = result['coredata']['dc:description']
77 abstract = abstract.replace('\n',' ')
82 def get_auth_names(head):
84 auth_info = [x['author'] for x in make_list(head['author-group'])]
88 for auth_group in auth_info:
89 for auth in make_list(auth_group):
90 auth_names.append('{} {}'.format(
91 auth['preferred-name']['ce:given-name'],
92 auth['preferred-name']['ce:surname']))
def get_country(head):
    """Return the most frequent affiliation country for a record.

    The list of countries is obtained from ``get_aff_info(head, 'country')``.
    When several countries tie for the highest count, one of the tied
    values is picked at random.

    Args:
        head: The ``head`` section of a bibrecord, passed through unchanged
            to ``get_aff_info``.

    Returns:
        One of the values from the country list that occurs most often.

    Raises:
        ValueError: if the country list is empty (``max`` of no values).
    """
    all_countries = get_aff_info(head, 'country')

    # Tally every value in a single pass rather than calling list.count()
    # twice for each distinct country.
    counts = Counter(all_countries)
    max_count = max(counts.values())

    # Find the mode. If there's more than one, choose randomly
    modes = [country for country, count in counts.items()
             if count == max_count]
    return random.choice(modes)
105 def get_aff_info(head, affiliation_key):
108 authors = make_list(head['author-group'])
113 num_auth = len(make_list(x['author']))
115 # Apparently there are things called "collaborations", which don't have affiliation info.
116 # I'm just skipping them
119 # And apparently "None" appears in the author list for no reason. :)
122 curr_inst = x['affiliation'][affiliation_key]
123 # Add one instance for each author from this institution
124 aff_info += [curr_inst] * num_auth
126 # If there isn't affiliation info for these authors, return empty str
127 aff_info += [''] * num_auth
130 def get_keywords(head):
131 cite_info = head['citation-info']
133 keywords = [x for x in
134 make_list(cite_info['author-keywords']['author-keyword'])]
135 # When there's only one keyword, it's a string. Otherwise, we will
136 # have a list of dictionaries
137 if len(keywords) == 1:
140 return [x['$'] for x in keywords]
144 def get_subject(result, key):
146 return [x[key] for x in make_list(result['subject-areas']['subject-area'])][0]
151 def get_CPX_class(head, class_key):
153 for x in head['enhancement']['classificationgroup']['classifications']:
154 if x['@type'] == 'CPXCLASS':
156 return [y[class_key] for y in make_list(x['classification'])][0]
157 except (KeyError, TypeError):
160 print(head['enhancement']['classificationgroup'])
def to_date(date_string, date_format='%Y-%m-%d'):
    """Parse a date string into a ``datetime`` object.

    Args:
        date_string: Text to parse, e.g. ``'2015-03-04'``.
        date_format: ``strptime`` pattern describing ``date_string``.
            Defaults to ``'%Y-%m-%d'``, so existing single-argument calls
            behave exactly as before.

    Returns:
        The naive ``datetime`` produced by ``datetime.strptime``.

    Raises:
        ValueError: if ``date_string`` does not match ``date_format``.
    """
    return datetime.strptime(date_string, date_format)
167 def if_exists(key, dictionary, else_val = None):
169 return dictionary[key]
def make_list(list_or_dict):
    """Return the argument as a list.

    A value that is already a ``list`` is handed back as-is (the same
    object, not a copy); anything else is wrapped in a new one-element list.
    """
    if isinstance(list_or_dict, list):
        return list_or_dict
    return [list_or_dict]
176 if __name__ == '__main__':