"""Generate a paper-to-subject mapping TSV from a file of abstract records.

The input file is read line by line; each non-blank line is parsed as one
JSON document containing an 'abstracts-retrieval-response' object. For every
subject area listed in a record, one row ``paper_eid, subject, subject_code``
is written to the tab-separated output file.
"""
import argparse
import csv
import json
import sys


def main():
    """Parse command-line arguments and write the paper/subject TSV file."""
    parser = argparse.ArgumentParser(
        description='Generate paper to subject mapping file from abstracts file')
    # Both paths are mandatory: without them open() would fail on None.
    parser.add_argument('-i', required=True, help='Abstract file')
    parser.add_argument('-o', required=True, help='TSV output file')
    args = parser.parse_args()

    # newline='' is what the csv module requires of files it writes to, so
    # that its own line terminator is not translated a second time.
    with open(args.i, 'r') as source, open(args.o, 'w', newline='') as target:
        output = csv.writer(target, delimiter='\t')
        output.writerow(['paper_eid', 'subject', 'subject_code'])
        for line in source:
            # Blank lines (e.g. a trailing newline) are not JSON documents.
            if not line.strip():
                continue
            output.writerows(get_entries(line))


def get_entries(l):
    """Return the output rows for one line of the abstracts file.

    Args:
        l: A string holding one JSON document with an
            'abstracts-retrieval-response' object.

    Returns:
        A list of ``[eid, subject, subject_code]`` lists, one per subject
        area of the record.

    Raises:
        KeyError: If an expected key is missing from the record.
        json.JSONDecodeError: If ``l`` is not valid JSON.
    """
    full = json.loads(l)['abstracts-retrieval-response']
    eid = full['coredata']['eid']
    # Prepend the eid to every (subject, code) pair.
    return [[eid, name, code] for name, code in get_subjects(full)]


def get_subjects(abstract_response):
    """Extract the subject names and codes from an abstract response.

    Args:
        abstract_response: The mapping stored under
            'abstracts-retrieval-response' in a record.

    Returns:
        A list of ``[subject, subject_code]`` lists, taken from the '$' and
        '@code' keys of each subject-area entry.

    Raises:
        KeyError: If the subject-area information or one of its keys is
            missing. When the 'subject-areas'/'subject-area' lookup fails,
            the offending record is printed to stderr before re-raising.
    """
    try:
        subject_info = make_list(
            abstract_response['subject-areas']['subject-area'])
    except KeyError:
        # Show the record that lacks subject information, then propagate the
        # original KeyError unchanged.
        print(abstract_response, file=sys.stderr)
        raise
    return [[s['$'], s['@code']] for s in subject_info]


def make_list(list_or_dict):
    """Return ``list_or_dict`` unchanged if it is a list, else wrap it in one."""
    return list_or_dict if isinstance(list_or_dict, list) else [list_or_dict]


if __name__ == '__main__':
    main()