code.communitydata.science - covid19.git/blob - translations/src/find_entities.py
6edce057508d1bf676cfa4009086d7f0d425f75e
[covid19.git] / translations / src / find_entities.py
1 # generate a list of wikidata entities related to keywords
2 from os import path
3 from sys import stdout
4 from wikidata_api_calls import search_wikidata, get_wikidata_api
5
class Wikidata_ResultSet(object):
    """Accumulate search hits across several search terms and dump them as CSV."""

    def __init__(self):
        # Flat list of result objects, in the order they were added.
        self.results = []

    def extend(self, term, results):
        """Append one Wikidata_Result for every item in *results*.

        term: the search term that produced *results*.
        results: iterable of search-result records; each item's index in the
            iterable is stored as its search position.
        """
        self.results.extend([Wikidata_Result(term, result, position)
                             for position, result in enumerate(results)])

    def to_csv(self, outfile=None):
        """Write a header row followed by one CSV line per stored result.

        outfile: path of the file to (over)write. When None, the rows are
            written to standard output instead.
        """
        if outfile is None:
            # Look stdout up at call time so redirection is honoured, and
            # never close it: the stream belongs to the whole process.
            import sys
            self._write_rows(sys.stdout)
        else:
            # The context manager closes the file even if a write fails.
            with open(outfile, 'w') as of:
                self._write_rows(of)

    def _write_rows(self, of):
        """Write the header and every result line to the open text stream *of*."""
        header = ','.join(['search_term', 'entityid', 'pageid', 'search_position', 'timestamp']) + '\n'
        of.write(header)
        for result in self.results:
            # Each result renders its own newline-terminated CSV line.
            of.write(result.to_csv())
28
29
class Wikidata_Result(object):
    """One search hit: the entity found, where it ranked, and its timestamp."""

    # store unique entities found in the search results, the position in the search result, and the date
    __slots__ = ['search_term', 'entityid', 'pageid', 'search_position', 'timestamp']

    def __init__(self,
                 term,
                 search_result,
                 position):
        """Build a result from one search-result record.

        term: the search term; surrounding whitespace (including a trailing
            newline) is stripped before it is stored.
        search_result: mapping that provides the 'title', 'pageid' and
            'timestamp' keys.
        position: index of this hit within the results for *term*.
        """
        self.search_term = term.strip()
        self.entityid = search_result['title']
        self.pageid = search_result['pageid']
        self.search_position = position
        self.timestamp = search_result['timestamp']

    def to_csv(self):
        """Return this result as one newline-terminated CSV line.

        Fields appear in the order search_term, entityid, pageid,
        search_position, timestamp. Values containing a comma, quote or
        newline are quoted by the csv module so the row stays parseable;
        plain values are emitted unchanged.
        """
        import csv
        import io

        buffer = io.StringIO()
        # lineterminator='\n' keeps the line ending the file has always used
        # (the csv default would be '\r\n').
        csv.writer(buffer, lineterminator='\n').writerow([
            self.search_term,
            self.entityid,
            str(self.pageid),
            str(self.search_position),
            str(self.timestamp),
        ])
        return buffer.getvalue()
51     
def run_wikidata_searches(terms_file = '../data/input/base_terms.txt', outfile="../data/output/wikidata_search_results.csv"):
    """Search Wikidata for every term in *terms_file* and write the hits to *outfile*.

    terms_file: path of a text file with one search term per line.
    outfile: path of the CSV file to write; it is passed straight to
        Wikidata_ResultSet.to_csv.
    """
    resultset = Wikidata_ResultSet()

    # The context manager closes the terms file once all terms have been read.
    with open(terms_file, 'r') as terms:
        for term in terms:
            # Skip blank lines (e.g. a trailing empty line) rather than
            # issuing a search for an empty term.
            if not term.strip():
                continue

            # NOTE(review): an API handle is requested for every term, as in
            # the original; hoist it out of the loop if get_wikidata_api()
            # returns something that is safe to reuse -- confirm.
            api = get_wikidata_api()
            search_results = search_wikidata(api, term)
            resultset.extend(term, search_results)

    resultset.to_csv(outfile)
61
62
63     ## search each of the base terms in wikidata
64
65     # store unique entities found in the search results, the position in the search result, and the date
66
# Script entry point: run the searches with the default input and output paths.
if __name__ == '__main__':
    run_wikidata_searches()

Community Data Science Collective || Want to submit a patch?