code/data_processing/00_abstracts_to_tsv.py

   1 from collections import Counter
   2 from datetime import datetime
   3 import json
   4 import argparse
   5 import csv
   6 import random
   7
# Fix the seed of the module-level random generator at import time.
# get_country() draws from this generator (random.choice) when it has to
# pick between several equally frequent countries.
random.seed(2017)
   9
  10 def main():
  11
  12     parser = argparse.ArgumentParser(description='Change a big ugly abstract file to a nice CSV')
  13     parser.add_argument('-i', help='Abstract file')
  14     parser.add_argument('-o', help='TSV output file')
  15     args = parser.parse_args()
  16
  17     with open(args.i, 'r') as i:
  18         with open(args.o, 'w') as o:
  19             # Have to get the field names
  20             first_line = clean_abstract(json.loads(next(i)))
  21             fieldnames = first_line.keys()
  22             output = csv.DictWriter(o, fieldnames, delimiter='\t')
  23             output.writeheader()
  24             output.writerow(first_line)
  25             for line in i:
  26                 output.writerow(clean_abstract(json.loads(line)))
  27
  28
  29 def clean_abstract(json_response):
  30     result = json_response['abstracts-retrieval-response']
  31     head = result['item']['bibrecord']['head']
  32     try:
  33         attributes = {
  34                 'modal_country': get_country(head),
  35                 'abstract' : get_abstract(result),
  36                 'title' : get_title(result),
  37                 'source_title': get_source_title(head),
  38                 'language': result['language']['@xml:lang'],
  39                 'first_ASJC_subject_area': get_subject(result, '$'),
  40                 'first_ASJC_classification': get_subject(result, '@code'),
  41                 'first_CPX_class': get_CPX_class(head, 'classification-description'),
  42                 'date': to_date(result['coredata']['prism:coverDate']),
  43                 'aggregation_type' : if_exists('prism:aggregationType',result['coredata'],else_val='NA'),
  44                 'eid' : result['coredata']['eid'],
  45                 'cited_by_count': result['coredata']['citedby-count'],
  46                 'num_citations': get_citation_count(result)
  47                 }
  48     except KeyError:
  49         raise
  50     except TypeError:
  51        # print(result)
  52         raise
  53     return attributes
  54
  55 def get_citation_count(result):
  56     try:
  57         return result['item']['bibrecord']['tail']['bibliography']['@refcount']
  58     except TypeError:
  59         return None
  60
  61 def get_title(result):
  62     try:
  63         return result['coredata']['dc:title']
  64     except KeyError:
  65         raise
  66
  67
  68 def get_source_title(head):
  69     try:
  70         return head['source']['sourcetitle']
  71     except KeyError:
  72         raise
  73
  74 def get_abstract(result):
  75     try:
  76         abstract = result['coredata']['dc:description']
  77         abstract = abstract.replace('\n',' ')
  78         return abstract
  79     except KeyError:
  80         return None
  81
  82 def get_auth_names(head):
  83     try:
  84         auth_info = [x['author'] for x in make_list(head['author-group'])]
  85     except KeyError:
  86         print(head)
  87     auth_names = []
  88     for auth_group in auth_info:
  89         for auth in make_list(auth_group):
  90             auth_names.append('{} {}'.format(
  91                 auth['preferred-name']['ce:given-name'],
  92                 auth['preferred-name']['ce:surname']))
  93     return auth_names
  94
  95 def get_country(head):
  96     all_countries = get_aff_info(head, 'country')
  97     if all_countries:
  98         # Find the mode. If there's more than one, choose randomly
  99         modes = Counter
 100         s = set(all_countries)
 101         max_count = max([all_countries.count(x) for x in s])
 102         modes = [x for x in s if all_countries.count(x) == max_count]
 103         return random.choice(modes)
 104
 105 def get_aff_info(head, affiliation_key):
 106     aff_info = []
 107     try:
 108         authors = make_list(head['author-group'])
 109     except KeyError:
 110         return None
 111     for x in authors:
 112         try:
 113             num_auth = len(make_list(x['author']))
 114         except KeyError:
 115             # Apparently there are things called "collaborations", which don't have affiliation info.
 116             # I'm just skipping them
 117             continue
 118         except TypeError:
 119             # And apparently "None" appears in the author list for no reason. :)
 120             continue
 121         try:
 122             curr_inst = x['affiliation'][affiliation_key]
 123             # Add one instance for each author from this institution
 124             aff_info += [curr_inst] * num_auth
 125         except KeyError:
 126             # If there isn't affiliation info for these authors, return empty str
 127             aff_info += [''] * num_auth
 128     return aff_info
 129
 130 def get_keywords(head):
 131     cite_info = head['citation-info']
 132     try:
 133         keywords = [x for x in
 134                 make_list(cite_info['author-keywords']['author-keyword'])]
 135         # When there's only one keyword, it's a string. Otherwise, we will
 136         # have a list of dictionaries
 137         if len(keywords) == 1:
 138             return keywords
 139         else:
 140             return [x['$'] for x in keywords]
 141     except KeyError:
 142         return None
 143
 144 def get_subject(result, key):
 145     try:
 146         return [x[key] for x in make_list(result['subject-areas']['subject-area'])][0]
 147     except KeyError:
 148         print(result)
 149         raise
 150
 151 def get_CPX_class(head, class_key):
 152     try:
 153         for x in head['enhancement']['classificationgroup']['classifications']:
 154             if x['@type'] == 'CPXCLASS':
 155                 try:
 156                     return [y[class_key] for y in make_list(x['classification'])][0]
 157                 except (KeyError, TypeError):
 158                     return None
 159     except KeyError:
 160         print(head['enhancement']['classificationgroup'])
 161         raise
 162
 163 def to_date(date_string):
 164     return datetime.strptime(date_string, '%Y-%m-%d')
 165
 166
 167 def if_exists(key, dictionary, else_val = None):
 168     try:
 169         return dictionary[key]
 170     except KeyError:
 171         return else_val
 172
 173 def make_list(list_or_dict):
 174     return list_or_dict if isinstance(list_or_dict, list) else [list_or_dict]
 175
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()