translations/src/find_entities.py

   1 # generate a list of wikidata entities related to keywords
   2 from os import path
   3 from sys import stdout
   4 from wikidata_api_calls import search_wikidata, get_wikidata_api
   5
   6 class Wikidata_ResultSet(object):
   7     def __init__(self):
   8         self.results = []
   9
  10     def extend(self, term, results):
  11         self.results.extend([Wikidata_Result(term, result, i)
  12                                     for i, result in enumerate(results)])
  13
  14     def to_csv(self, outfile=None):
  15
  16         header = ','.join(['search_term', 'entityid', 'pageid', 'search_position','timestamp']) + '\n'
  17         if outfile is None:
  18             of = stdout
  19
  20         else:
  21             of = open(outfile,'w')
  22
  23         of.write(header)
  24         for result in self.results:
  25             of.write(result.to_csv())
  26
  27         of.close()
  28
  29
  30 class Wikidata_Result(object):
  31     # store unique entities found in the search results, the position in the search result, and the date
  32     __slots__=['search_term','entityid','pageid','search_position','timestamp']
  33
  34     def __init__(self,
  35                  term,
  36                  search_result,
  37                  position):
  38
  39         self.search_term = term.strip()
  40         self.entityid = search_result['title']
  41         self.pageid = search_result['pageid']
  42         self.search_position = position
  43         self.timestamp = search_result['timestamp']
  44
  45     def to_csv(self):
  46         return ','.join([self.search_term,
  47                          self.entityid,
  48                          str(self.pageid),
  49                          str(self.search_position),
  50                          str(self.timestamp)]) + '\n'
  51
  52 def run_wikidata_searches(terms_file = '../data/input/base_terms.txt', outfile="../data/output/wikidata_search_results.csv"):
  53
  54     resultset = Wikidata_ResultSet()
  55     for term in open(terms_file,'r'):
  56         api = get_wikidata_api()
  57         search_results = search_wikidata(api, term)
  58         resultset.extend(term, search_results)
  59
  60     resultset.to_csv(outfile)
  61
  62
  63     ## search each of the base terms in wikidata
  64
  65     # store unique entities found in the search results, the position in the search result, and the date
  66
# Run the searches with the default input and output paths when this file is
# executed as a script rather than imported.
if __name__ == "__main__":
    run_wikidata_searches()