code/data_processing/02_filter_edgelist.py

   1 import argparse
   2 import csv
   3
   4
   5 def main():
   6
   7     parser = argparse.ArgumentParser(description='Take the edgelist, and reduce it to just the papers which are in our search')
   8     parser.add_argument('-i', help='Full edgelist file')
   9     parser.add_argument('-o', help='Edgelist output file')
  10     args = parser.parse_args()
  11
  12     with open(args.i, 'r') as in_file:
  13         i = csv.reader(in_file, delimiter= '\t')
  14         next(i) # Discard header
  15         # Get the list of nodes to keep
  16         nodes = set([x[0] for x in i])
  17         in_file.seek(0) # Start over at the beginning
  18         with open(args.o, 'w') as o:
  19             output = csv.writer(o, delimiter = '\t')
  20             output.writerow(['to','from', 'date'])
  21             for line in i:
  22                 # If the both items are in nodes, then keep the line
  23                 if line[1] in nodes:
  24                     output.writerow(line)
  25
  26
  27 if __name__ == '__main__':
  28     main()
  29