"""Generate a paper-to-affiliation TSV mapping from an abstracts file.

The input file holds one JSON document per line; each document is expected
to carry an ``abstracts-retrieval-response`` object. For every author group
found in a document, one TSV row is written with the paper EID, the
affiliation id, the organization name and the country.
"""
import json
import argparse
import csv


def main():
    """Parse command-line arguments and write the TSV mapping file."""
    parser = argparse.ArgumentParser(
        description='Generate paper to affiliation mapping file from abstracts file')
    # Both paths are mandatory: without them open() would fail on None with
    # an unhelpful TypeError instead of a usage message.
    parser.add_argument('-i', required=True, help='Abstract file')
    parser.add_argument('-o', required=True, help='TSV output file')
    args = parser.parse_args()

    # newline='' is required by the csv module so that the writer controls
    # line endings itself (otherwise '\r\r\n' is produced on Windows).
    # An explicit encoding keeps the result independent of the locale.
    with open(args.i, 'r', encoding='utf-8') as abstracts, \
            open(args.o, 'w', encoding='utf-8', newline='') as tsv:
        output = csv.writer(tsv, delimiter='\t')
        output.writerow(['paper_eid', 'affiliation_id', 'organization', 'country'])
        for line in abstracts:
            # Blank lines (e.g. a trailing empty line) are not JSON records.
            if not line.strip():
                continue
            output.writerows(get_entries(line))


def get_entries(l):
    """Return the TSV rows for one JSON-encoded abstract record.

    Args:
        l: A JSON string containing an ``abstracts-retrieval-response``
            object with ``item.bibrecord.head`` and ``coredata.eid``.

    Returns:
        A list of ``[eid, affiliation_id, organization, country]`` rows, one
        per author group, or an empty list when the record has no
        ``author-group`` (or no usable author groups).

    Raises:
        KeyError: If the expected top-level structure is missing.
        ValueError: If ``l`` is not valid JSON.
    """
    json_response = json.loads(l)
    full = json_response['abstracts-retrieval-response']
    head = full['item']['bibrecord']['head']
    eid = full['coredata']['eid']

    countries = get_aff_info(head, 'country')
    affiliation_ids = get_aff_info(head, '@afid')
    org_names = get_aff_info(head, 'organization')

    # get_aff_info yields None (no 'author-group') or an empty list (no
    # usable groups); either way there is nothing to report.
    if not countries:
        return []

    # The three lists are built from the same author groups, so they are
    # parallel and of equal length.
    return [
        [eid, afid, org, country]
        for afid, org, country in zip(affiliation_ids, org_names, countries)
    ]


def get_aff_info(head, affiliation_key):
    """Collect one affiliation attribute for every author group in ``head``.

    Args:
        head: The ``head`` dict of a bibrecord.
        affiliation_key: Key to read from each group's ``affiliation`` dict
            (the script uses ``'country'``, ``'@afid'`` and
            ``'organization'``).

    Returns:
        ``None`` when ``head`` has no ``'author-group'`` key. Otherwise a
        list with one entry per non-``None`` author group: the value found,
        or ``''`` when the group has no affiliation or the key is absent.
    """
    try:
        affiliations = make_list(head['author-group'])
    except KeyError:
        return None

    aff_info = []
    for group in affiliations:
        if group is None:
            continue
        try:
            value = group['affiliation'][affiliation_key]
            if isinstance(value, list):
                # A list holds {'$': text} wrappers; the final element is
                # the base organization. An empty list carries no value.
                value = value[-1]['$'] if value else ''
            elif isinstance(value, dict):
                # NOTE(review): a lone {'$': text} wrapper is assumed to be
                # the single-element form of the list case - confirm
                # against the data.
                value = value['$']
            aff_info.append(value)
        except KeyError:
            # If there isn't affiliation info for these authors, use an
            # empty string so the result stays aligned across keys.
            aff_info.append('')
    return aff_info


def make_list(list_or_dict):
    """Return ``list_or_dict`` unchanged if it is a list, else wrap it in one."""
    return list_or_dict if isinstance(list_or_dict, list) else [list_or_dict]


if __name__ == '__main__':
    main()