"""Output JSON of all articles which cite the articles passed in.

Reads a JSON-lines file of records carrying ``eid`` and ``citedby-count``
and calls ``get_cited_by`` for every eid whose count is not ``'0'``,
appending to the output file.  The script is resumable: eids that already
appear as ``parent_eid`` in the output file are skipped on the next run.
"""
# NOTE(review): get_cited_by is expected to come from this star import;
# it is kept first so later imports shadow it exactly as before.
from request_functions import *
import argparse
import json
import os
import subprocess
from os import remove


def _load_citation_counts(path):
    """Return a dict mapping eid -> citedby-count from a JSON-lines file.

    Each line of *path* must be a JSON object with ``eid`` and
    ``citedby-count`` keys.  A later line with the same eid overrides an
    earlier one.
    """
    counts = {}
    with open(path, 'r') as in_file:
        for line in in_file:
            record = json.loads(line)
            counts[record['eid']] = record['citedby-count']
    return counts


def _drop_results_for(out_path, eid):
    """Rewrite *out_path* without the lines whose ``parent_eid`` equals *eid*.

    The filtered copy is written next to the original and then renamed over
    it, so an interruption during the rewrite leaves the original intact.
    The comparison is done on the parsed value, so only exact matches are
    removed (no prefix or regex matching).
    """
    tmp_path = '{}.tmp'.format(out_path)
    with open(out_path, 'r') as src, open(tmp_path, 'w') as dst:
        for line in src:
            if json.loads(line)['parent_eid'] != eid:
                dst.write(line)
    # On POSIX, rename silently replaces an existing destination file.
    os.rename(tmp_path, out_path)


def _load_completed_eids(out_path):
    """Return the set of parent eids whose results are already complete.

    The eid of the last line in *out_path* is treated as incomplete (the
    previous run may have been interrupted while writing its results): all
    of its lines are removed from the file and it is left out of the
    returned set so that it is fetched again.

    Returns an empty set when the output file cannot be opened or is empty.
    """
    try:
        with open(out_path, 'r') as out_file:
            parent_eids = [json.loads(line)['parent_eid'] for line in out_file]
    except IOError:
        # If the file doesn't exist, then there aren't any completed eids.
        # Only the read is guarded, so a failure while rewriting the file
        # below is not mistaken for "nothing completed yet".
        return set()

    if not parent_eids:
        return set()

    last_eid = parent_eids[-1]
    _drop_results_for(out_path, last_eid)

    # One eid normally appears on several lines; discard every occurrence of
    # the last one, not just the final line, so it is re-fetched.
    completed = set(parent_eids)
    completed.discard(last_eid)
    return completed


def main():
    """Parse the command line and fetch citing articles for pending eids."""
    parser = argparse.ArgumentParser(description='Output JSON of all articles which cite the articles passed in')
    parser.add_argument('-i', help='JSON file which includes eids and citedby-count')
    parser.add_argument('-o', help='Where to append JSON results')
    args = parser.parse_args()

    eids = _load_citation_counts(args.i)

    # If the script gets interrupted, we need to start where we left off
    completed_eids = _load_completed_eids(args.o)

    with open(args.o, 'a') as out_file:
        for eid, citation_count in eids.items():
            # citedby-count is compared as a string, as stored in the input.
            if citation_count != '0' and eid not in completed_eids:
                get_cited_by(eid, out_file)


if __name__ == '__main__':
    main()