transliterations/src/wikidata_search.py

   1 # generate a list of wikidata entities related to keywords
   2 from os import path
   3 from sys import stdout
   4 from wikidata_api_calls import search_wikidata, get_wikidata_api
   5 import csv
   6 from itertools import chain
   7
   8 class Wikidata_ResultSet:
   9     def __init__(self):
  10         self.results = []
  11
  12     def extend(self, term, results):
  13         self.results.append(
  14             (Wikidata_Result(term, result, i)
  15              for i, result in enumerate(results))
  16         )
  17
  18     def to_csv(self, outfile=None):
  19         if outfile is None:
  20             of = stdout
  21
  22         else:
  23             of = open(outfile,'w',newline='')
  24         writer = csv.writer(of)
  25         writer.writerow(Wikidata_Result.__slots__)
  26         writer.writerows(map(Wikidata_Result.to_list, chain(* self.results)))
  27
  28
  29 class Wikidata_Result:
  30     # store unique entities found in the search results, the position in the search result, and the date
  31     __slots__=['search_term','entityid','pageid','search_position','timestamp']
  32
  33     def __init__(self,
  34                  term,
  35                  search_result,
  36                  position):
  37
  38         self.search_term = term.strip()
  39         self.entityid = search_result['title']
  40         self.pageid = int(search_result['pageid'])
  41         self.search_position = int(position)
  42         self.timestamp = search_result['timestamp']
  43
  44     def to_list(self):
  45         return [self.search_term,
  46                 self.entityid,
  47                 self.pageid,
  48                 self.search_position,
  49                 self.timestamp]
  50
  51 def run_wikidata_searches(terms):
  52     api = get_wikidata_api()
  53     resultset = Wikidata_ResultSet()
  54     for term in terms:
  55         search_results = search_wikidata(api, term)
  56         resultset.extend(term, search_results)
  57     return resultset
  58
  59 def read_google_trends_files(terms_files):
  60     def _read_file(infile):
  61         return csv.DictReader(open(infile,'r',newline=''))
  62
  63     for row in chain(* [_read_file(terms_file) for terms_file in terms_files]):
  64         yield row['query']
  65
  66
  67 def trawl_google_trends(terms_files, outfile = None):
  68     terms = read_google_trends_files(terms_files)
  69     resultset = run_wikidata_searches(terms)
  70     resultset.to_csv(outfile)
  71
  72 def trawl_base_terms(infiles, outfile = None):
  73     terms = chain(* (open(infile,'r') for infile in infiles))
  74     resultset = run_wikidata_searches(terms)
  75     resultset.to_csv(outfile)
  76
  77     ## search each of the base terms in wikidata
  78
  79     # store unique entities found in the search results, the position in the search result, and the date
  80
  81 if __name__ == "__main__":
  82     import argparse
  83     parser = argparse.ArgumentParser("Search wikidata for entities related to a set of terms.")
  84     parser.add_argument('inputs', type=str, nargs='+', help='one or more files to read')
  85     parser.add_argument('--use-gtrends', action='store_true', help = 'toggle whether the input is the output from google trends')
  86     parser.add_argument('--output', type=str, help='an output file. defaults to stdout')
  87     args = parser.parse_args()
  88     if args.use_gtrends:
  89         trawl_google_trends(args.inputs, args.output)
  90     else:
  91         trawl_base_terms(args.inputs, args.output)