1 # generate a list of wikidata entities related to keywords
import argparse
import csv
import sys
from itertools import chain

from wikidata_api_calls import search_wikidata, get_wikidata_api
class Wikidata_ResultSet:
    """Accumulate Wikidata search hits for several search terms and dump them as CSV."""

    def __init__(self):
        # One inner list of Wikidata_Result objects per extend() call.
        self.results = []

    def extend(self, term, results):
        """Record the search results obtained for one term.

        Each item of ``results`` is wrapped in a Wikidata_Result together with
        ``term`` and its 0-based position in ``results``.
        """
        # Materialise the batch as a list (not a one-shot generator) so that
        # to_csv() can be called more than once and construction errors
        # surface here rather than at write time.
        self.results.append(
            [Wikidata_Result(term, result, i) for i, result in enumerate(results)]
        )

    def to_csv(self, outfile=None):
        """Write a header row plus one row per stored result as CSV.

        ``outfile`` is a path to (over)write; when it is None the rows go to
        standard output.
        """
        if outfile is None:
            self._write_rows(sys.stdout)
            return

        # The context manager guarantees the file is flushed and closed.
        with open(outfile, 'w', newline='') as of:
            self._write_rows(of)

    def _write_rows(self, stream):
        """Emit the header and all stored results to an open text stream."""
        writer = csv.writer(stream)
        writer.writerow(Wikidata_Result.__slots__)
        writer.writerows(
            map(Wikidata_Result.to_list, chain.from_iterable(self.results))
        )
class Wikidata_Result:
    """One Wikidata search hit for one search term."""

    # store unique entities found in the search results, the position in the search result, and the date
    __slots__=['search_term','entityid','pageid','search_position','timestamp']

    def __init__(self, term, search_result, position):
        """Build a result from a single search hit.

        ``term`` is the searched string (surrounding whitespace is stripped),
        ``search_result`` is a mapping providing the 'title', 'pageid' and
        'timestamp' keys, and ``position`` is the hit's index in the result
        list.
        """
        self.search_term = term.strip()
        self.entityid = search_result['title']
        self.pageid = int(search_result['pageid'])
        self.search_position = int(position)
        self.timestamp = search_result['timestamp']

    def to_list(self):
        """Return the field values as a list, in ``__slots__`` order."""
        # Deriving the row from __slots__ keeps it aligned with the CSV header,
        # which is written from the same attribute list.
        return [getattr(self, field) for field in self.__slots__]
def run_wikidata_searches(terms):
    """Search Wikidata for every term and collect the hits.

    ``terms`` is any iterable of strings. Returns a Wikidata_ResultSet holding
    the results of one search_wikidata() call per non-blank term.
    """
    api = get_wikidata_api()
    resultset = Wikidata_ResultSet()
    for term in terms:
        # Raw file lines may be blank; there is nothing to search for then.
        if not term.strip():
            continue
        search_results = search_wikidata(api, term)
        resultset.extend(term, search_results)
    return resultset
def read_google_trends_files(terms_files, column='query'):
    """Yield one search term per data row of each Google Trends CSV file.

    ``terms_files`` is an iterable of paths to CSV files with a header row.
    Files are read one after another, and each is closed as soon as its rows
    have been consumed.

    NOTE(review): the body of the original row loop was not visible when this
    was rewritten; the yielded column is assumed to be 'query' -- confirm
    against the Google Trends export this script is fed.
    """
    for terms_file in terms_files:
        with open(terms_file, 'r', newline='') as infile:
            for row in csv.DictReader(infile):
                yield row[column]
def trawl_google_trends(terms_files, outfile = None):
    """Search Wikidata for the terms read from Google Trends files and write the hits as CSV.

    ``terms_files`` is passed to read_google_trends_files(); ``outfile`` is
    passed through to the result set's to_csv().
    """
    search_terms = read_google_trends_files(terms_files)
    run_wikidata_searches(search_terms).to_csv(outfile)
def _iter_base_terms(infiles):
    """Yield every line of every file in ``infiles``, closing each file when done."""
    for infile in infiles:
        with open(infile, 'r') as term_file:
            yield from term_file


def trawl_base_terms(infiles, outfile = None):
    """Search Wikidata for the terms listed one per line in ``infiles`` and write the hits as CSV.

    ``outfile`` is passed through to the result set's to_csv().
    """
    terms = _iter_base_terms(infiles)
    resultset = run_wikidata_searches(terms)
    resultset.to_csv(outfile)
# Outline of what this script does:
#   1. search each of the base terms in Wikidata
#   2. store the unique entities found in the search results, their position
#      in the search results, and the date
def build_arg_parser():
    """Return the command-line parser for this script."""
    # The sentence is the description; passing it positionally would set
    # ``prog`` and show it as the program name in the usage line.
    parser = argparse.ArgumentParser(description="Search wikidata for entities related to a set of terms.")
    parser.add_argument('inputs', type=str, nargs='+', help='one or more files to read')
    parser.add_argument('--use-gtrends', action='store_true', help = 'toggle whether the input is the output from google trends')
    parser.add_argument('--output', type=str, help='an output file. defaults to stdout')
    return parser


def main(argv=None):
    """Parse ``argv`` (default: the process arguments) and run the matching trawl."""
    args = build_arg_parser().parse_args(argv)
    if args.use_gtrends:
        trawl_google_trends(args.inputs, args.output)
    else:
        trawl_base_terms(args.inputs, args.output)


if __name__ == "__main__":
    main()