code/data_collection/02_get_cited_by.py

   1 from request_functions import *
   2 import argparse
   3 import json
   4 import subprocess
   5 from os import remove
   6
   7 def main():
   8
   9     parser = argparse.ArgumentParser(description='Output JSON of all articles which cite the articles passed in')
  10     parser.add_argument('-i', help='JSON file which includes eids and citedby-count')
  11     parser.add_argument('-o', help='Where to append JSON results')
  12     args = parser.parse_args()
  13
  14     with open(args.i, 'r') as f:
  15         # Make a dictionary of eid:citation count for each line in the file
  16         eids = {}
  17         for line in f:
  18             l = json.loads(line)
  19             eids[l['eid']] = l['citedby-count']
  20
  21     # If the script gets interrupted, we need to start where we left off
  22     try:
  23         # Open the output file, and grab all of the eids which are already completed
  24         with open(args.o, 'r') as f:
  25             completed_eids = [json.loads(l)['parent_eid'] for l in f]
  26         # Remove those which came from the last id (since we may have missed some)
  27         if len(completed_eids) > 0:
  28             last_eid = completed_eids.pop()
  29             # Remove all of the lines which came from the last eid
  30             subprocess.call(['sed', '-i.bak', '/parent_eid": "{}/d'.format(last_eid), args.o])
  31             # Hopefully everything has worked out, because here we blow away the backup
  32             remove('{}.bak'.format(args.o))
  33     except IOError:
  34         # If the file doesn't exist, then there aren't any completed eids
  35         completed_eids = []
  36
  37     with open(args.o, 'a') as out_file:
  38         for eid, citation_count in eids.items():
  39             if citation_count != '0' and eid not in completed_eids:
  40                 get_cited_by(eid, out_file)
  41
  42 if __name__ == '__main__':
  43     main()  # Run the collection only when executed as a script, not on import.