code.communitydata.science - social-media-chapter.git/blob - code/data_processing/00_abstracts_to_tsv.py
initial import of material for public archive into git
[social-media-chapter.git] / code / data_processing / 00_abstracts_to_tsv.py
1 from collections import Counter
2 from datetime import datetime
3 import json
4 import argparse
5 import csv
6 import random
7
# Seed the module-level RNG with a fixed value. get_country() below calls
# random.choice() to break ties between equally common countries.
random.seed(2017)
9
def main():
    """Convert a file of JSON abstract records into a tab-separated file.

    The input file (``-i``) is read one line at a time; each non-blank line
    is parsed as JSON and flattened with ``clean_abstract``. The flattened
    records are written as rows of the output file (``-o``), using the keys
    of the first record as the header.

    An input with no records produces an empty output file.
    """
    parser = argparse.ArgumentParser(description='Change a big ugly abstract file to a nice CSV')
    parser.add_argument('-i', help='Abstract file')
    parser.add_argument('-o', help='TSV output file')
    args = parser.parse_args()

    # The csv module writes its own line terminators, so the output file must
    # be opened with newline='' to avoid doubled carriage returns on platforms
    # that translate '\n' on write.
    with open(args.i, 'r') as infile, open(args.o, 'w', newline='') as outfile:
        # Blank lines (e.g. a trailing newline at end of file) hold no record.
        records = (clean_abstract(json.loads(line))
                   for line in infile if line.strip())

        # The field names are only known once the first record is cleaned.
        first_record = next(records, None)
        if first_record is None:
            # Nothing to write; avoid leaking a bare StopIteration.
            return

        writer = csv.DictWriter(outfile, list(first_record), delimiter='\t')
        writer.writeheader()
        writer.writerow(first_record)
        writer.writerows(records)
27
28
def clean_abstract(json_response):
    """Flatten one parsed abstract-retrieval response into a single dict.

    Args:
        json_response: Parsed JSON object with an
            'abstracts-retrieval-response' entry.

    Returns:
        A dict with one entry per output column, in the order the columns
        are assigned below.

    Raises:
        KeyError, TypeError: Propagated unchanged when an expected part of
            the record is missing or has an unexpected type.
    """
    result = json_response['abstracts-retrieval-response']
    head = result['item']['bibrecord']['head']

    # Fields are filled in one at a time, in output-column order.
    attributes = {}
    attributes['modal_country'] = get_country(head)
    attributes['abstract'] = get_abstract(result)
    attributes['title'] = get_title(result)
    attributes['source_title'] = get_source_title(head)
    attributes['language'] = result['language']['@xml:lang']
    attributes['first_ASJC_subject_area'] = get_subject(result, '$')
    attributes['first_ASJC_classification'] = get_subject(result, '@code')
    attributes['first_CPX_class'] = get_CPX_class(head, 'classification-description')
    attributes['date'] = to_date(result['coredata']['prism:coverDate'])
    attributes['aggregation_type'] = if_exists(
        'prism:aggregationType', result['coredata'], else_val='NA')
    attributes['eid'] = result['coredata']['eid']
    attributes['cited_by_count'] = result['coredata']['citedby-count']
    attributes['num_citations'] = get_citation_count(result)
    return attributes
54
def get_citation_count(result):
    """Return the bibliography's '@refcount' value for a record, if any.

    Args:
        result: The 'abstracts-retrieval-response' part of a record.

    Returns:
        The value stored under item/bibrecord/tail/bibliography/@refcount,
        or None when any step of that path is missing or is not a mapping
        (for example when 'tail' is None).
    """
    try:
        return result['item']['bibrecord']['tail']['bibliography']['@refcount']
    except (KeyError, TypeError):
        # A record without a usable bibliography has no known count.
        return None
60
def get_title(result):
    """Return the 'dc:title' entry of the record's 'coredata'.

    Raises:
        KeyError: If 'coredata' or 'dc:title' is missing.
    """
    coredata = result['coredata']
    return coredata['dc:title']
66
67
def get_source_title(head):
    """Return the 'sourcetitle' entry of the head's 'source'.

    Raises:
        KeyError: If 'source' or 'sourcetitle' is missing.
    """
    source = head['source']
    return source['sourcetitle']
73
def get_abstract(result):
    """Return the record's abstract text on a single line.

    Looks up coredata/'dc:description' and replaces every newline with a
    space. Returns None when either key is missing.
    """
    try:
        raw_text = result['coredata']['dc:description']
    except KeyError:
        return None
    return raw_text.replace('\n', ' ')
81
def get_auth_names(head):
    """Return the preferred names of all authors as 'given surname' strings.

    Args:
        head: The bibrecord 'head' mapping of a record.

    Returns:
        A list of names, one per author, in author-group order. The list is
        empty when the record has no 'author-group' entry.

    Raises:
        KeyError: If an author entry lacks 'preferred-name' or one of its
            'ce:given-name' / 'ce:surname' parts.
    """
    try:
        groups = make_list(head['author-group'])
    except KeyError:
        # No author information at all for this record.
        return []

    auth_names = []
    for group in groups:
        try:
            authors = make_list(group['author'])
        except (KeyError, TypeError):
            # Groups without an 'author' entry, or entries that are not
            # mappings (e.g. None), carry no author names; skip them.
            continue
        for auth in authors:
            auth_names.append('{} {}'.format(
                auth['preferred-name']['ce:given-name'],
                auth['preferred-name']['ce:surname']))
    return auth_names
94
def get_country(head):
    """Return the most common author country of a record.

    Country values come from ``get_aff_info(head, 'country')``, which yields
    one value per author (an empty string where a group has no affiliation
    info). Empty strings are counted like any other value.

    Returns:
        The modal value. When several values share the highest count, one
        of them is picked with ``random.choice``. Returns None when there
        is no country information.
    """
    all_countries = get_aff_info(head, 'country')
    if not all_countries:
        return None

    counts = Counter(all_countries)
    max_count = max(counts.values())
    # Sort the tied candidates so their order does not depend on hash/set
    # iteration order. String hashing may be randomized per process, which
    # would otherwise make the pick differ between runs even with a fixed
    # random seed.
    modes = sorted(
        (country for country, count in counts.items() if count == max_count),
        key=str)
    return random.choice(modes)
104
def get_aff_info(head, affiliation_key):
    """Collect one affiliation value per author across all author groups.

    Args:
        head: The bibrecord 'head' mapping of a record.
        affiliation_key: Key to read from each group's 'affiliation' entry.

    Returns:
        A list with the group's affiliation value repeated once per author
        in that group ('' where the group has no such affiliation entry),
        or None when 'author-group' is missing from ``head``.
    """
    try:
        groups = make_list(head['author-group'])
    except KeyError:
        return None

    collected = []
    for group in groups:
        try:
            n_authors = len(make_list(group['author']))
        except (KeyError, TypeError):
            # KeyError: group without an 'author' entry (the original notes
            # these are "collaborations" with no affiliation info).
            # TypeError: a non-mapping entry such as None in the list.
            continue

        try:
            value = group['affiliation'][affiliation_key]
        except KeyError:
            # No affiliation info for these authors: record empty strings.
            value = ''
        # One instance for each author of this group.
        collected.extend([value] * n_authors)
    return collected
129
def get_keywords(head):
    """Return the author keywords of a record as a list of values.

    Args:
        head: The bibrecord 'head' mapping of a record.

    Returns:
        A list of keywords, or None when the record has no
        author-keywords/author-keyword entry (or a keyword mapping lacks
        its '$' value).

    Raises:
        KeyError: If ``head`` has no 'citation-info' entry.
    """
    cite_info = head['citation-info']
    try:
        keywords = make_list(cite_info['author-keywords']['author-keyword'])
        # A keyword may be a bare value or a mapping holding the text under
        # '$'. Decide per item rather than from the number of keywords, so a
        # single mapping is unwrapped too and bare values never get indexed.
        return [kw['$'] if isinstance(kw, dict) else kw for kw in keywords]
    except KeyError:
        return None
143
def get_subject(result, key):
    """Return ``key`` from the first subject area of a record.

    ``key`` is read from every entry under subject-areas/subject-area, and
    the first value is returned. On a KeyError the whole record is printed
    before the error is re-raised.
    """
    try:
        areas = make_list(result['subject-areas']['subject-area'])
        values = [area[key] for area in areas]
    except KeyError:
        print(result)
        raise
    return values[0]
150
def get_CPX_class(head, class_key):
    """Return ``class_key`` from the first 'CPXCLASS' classification.

    Walks enhancement/classificationgroup/classifications and stops at the
    first entry whose '@type' is 'CPXCLASS'. Returns None when there is no
    such entry, or when its classification values cannot be read. On a
    KeyError elsewhere, the classification group is printed and the error
    is re-raised.
    """
    try:
        for group in head['enhancement']['classificationgroup']['classifications']:
            if group['@type'] != 'CPXCLASS':
                continue
            try:
                entries = make_list(group['classification'])
                return [entry[class_key] for entry in entries][0]
            except (KeyError, TypeError):
                return None
    except KeyError:
        print(head['enhancement']['classificationgroup'])
        raise
    return None
162
def to_date(date_string):
    """Parse a 'YYYY-MM-DD' string into a datetime object."""
    day_format = '%Y-%m-%d'
    parsed = datetime.strptime(date_string, day_format)
    return parsed
165
166
def if_exists(key, dictionary, else_val=None):
    """Return ``dictionary[key]``, or ``else_val`` if the lookup raises KeyError."""
    try:
        found = dictionary[key]
    except KeyError:
        return else_val
    else:
        return found
172
def make_list(list_or_dict):
    """Return the argument unchanged if it is a list, else wrap it in a one-item list."""
    if isinstance(list_or_dict, list):
        return list_or_dict
    return [list_or_dict]
175
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()

Community Data Science Collective || Want to submit a patch?