From: Nathan TeBlunthuis
Date: Tue, 24 Mar 2020 22:03:47 +0000 (-0700)
Subject: Python code to find wikidata entities to translate. Here we search the api for entit...
X-Git-Url: https://code.communitydata.science/covid19.git/commitdiff_plain/836098461e0e39542ddd89859a45b5c6ddaab9b4

Python code to find Wikidata entities to translate. Here we search the API for entities that have COVID keywords.

Building a system for finding translations from Wikidata.
---

diff --git a/translations/data/input/base_terms.txt b/translations/data/input/base_terms.txt
new file mode 100644
index 0000000..cd45abc
--- /dev/null
+++ b/translations/data/input/base_terms.txt
@@ -0,0 +1,2 @@
+coronavirus
+covid-19
diff --git a/translations/src/__init__.py b/translations/src/__init__.py
new file mode 100644
index 0000000..5211ac6
--- /dev/null
+++ b/translations/src/__init__.py
@@ -0,0 +1,2 @@
+from wikidata_api_calls import *
+from find_entities import *
diff --git a/translations/src/defaults.py b/translations/src/defaults.py
new file mode 100644
index 0000000..3d6168f
--- /dev/null
+++ b/translations/src/defaults.py
@@ -0,0 +1 @@
+user_agent = "COVID-19 Digital Observatory, a Community Data Science Collective project. (https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory)"
diff --git a/translations/src/find_entities.py b/translations/src/find_entities.py
new file mode 100644
index 0000000..eafb42a
--- /dev/null
+++ b/translations/src/find_entities.py
@@ -0,0 +1,68 @@
+# generate a list of wikidata entities related to keywords
+from os import path
+from sys import stdout
+from wikidata_api_calls import get_wikidata_api, search_wikidata
+
+
+class Wikidata_ResultSet(object):
+    # collects Wikidata_Result objects across all search terms and writes them out as CSV
+    def __init__(self):
+        self.results = []
+
+    def extend(self, term, results):
+        self.results.extend([Wikidata_Result(term, result, i)
+                             for i, result in enumerate(results)])
+
+    def to_csv(self, outfile=None):
+        header = ','.join(['search_term', 'entityid', 'pageid', 'search_position', 'timestamp']) + '\n'
+        if outfile is None:
+            of = stdout
+        else:
+            of = open(outfile, 'w')
+
+        of.write(header)
+        for result in self.results:
+            of.write(result.to_csv())
+
+        if outfile is not None:
+            of.close()
+
+
+class Wikidata_Result(object):
+    # store a unique entity found in the search results, its position in the search results, and the date
+    __slots__ = ['search_term', 'entityid', 'pageid', 'search_position', 'timestamp']
+
+    def __init__(self, term, search_result, position):
+        self.search_term = term.strip()
+        self.entityid = search_result['title']
+        self.pageid = search_result['pageid']
+        self.search_position = position
+        self.timestamp = search_result['timestamp']
+
+    def to_csv(self):
+        return ','.join([self.search_term,
+                         self.entityid,
+                         str(self.pageid),
+                         str(self.search_position),
+                         str(self.timestamp)]) + '\n'
+
+
+def run_wikidata_searches(terms_file='../data/input/base_terms.txt',
+                          outfile="../data/output/wikidata_search_results.csv"):
+    # search each of the base terms in wikidata and accumulate the results
+    resultset = Wikidata_ResultSet()
+    api = get_wikidata_api()
+    for term in open(terms_file, 'r'):
+        search_results = search_wikidata(api, term)
+        resultset.extend(term, search_results)
+
+    resultset.to_csv(outfile)
+
+
+if __name__ == "__main__":
+    run_wikidata_searches()
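For orientation, a minimal usage sketch of find_entities.py, assuming it is run from translations/src/ so the default relative paths resolve and that ../data/output/ already exists; the function and paths are the defaults from the patch above:

    from find_entities import run_wikidata_searches

    # search every term in base_terms.txt and write one CSV row per
    # (search_term, entity) pair returned by the Wikidata search API
    run_wikidata_searches(terms_file='../data/input/base_terms.txt',
                          outfile='../data/output/wikidata_search_results.csv')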
diff --git a/translations/src/wikidata_api_calls.py b/translations/src/wikidata_api_calls.py
new file mode 100644
index 0000000..022a9d4
--- /dev/null
+++ b/translations/src/wikidata_api_calls.py
@@ -0,0 +1,29 @@
+# This file defines functions for making API calls to find translations and transliterations for key terms.
+
+import mwapi
+import sys
+sys.path.append("..")
+from defaults import user_agent
+
+
+def get_wikidata_api():
+    # mwapi appends the api path itself, so pass only the host
+    session = mwapi.Session(host="https://www.wikidata.org", user_agent=user_agent)
+    return session
+
+
+def search_wikidata(session, term, *args, **kwargs):
+    search_results = session.get(action='query',
+                                 list='search',
+                                 srsearch=term,
+                                 # srqiprofile='popular_inclinks_pv',
+                                 srlimit='max',
+                                 srnamespace=0,
+                                 *args,
+                                 **kwargs)
+
+    results = search_results.get('query', {}).get('search', None)
+
+    if results is None:
+        raise RuntimeError(f"No results for query: {term}")
+
+    return results
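A quick interactive check of the two helpers above, as a sketch run from translations/src/; each hit is a dict from the MediaWiki list=search response, with 'title' holding the Wikidata entity id and 'pageid' the numeric page id, as used by find_entities.py:

    from wikidata_api_calls import get_wikidata_api, search_wikidata

    session = get_wikidata_api()
    hits = search_wikidata(session, "covid-19")
    for position, hit in enumerate(hits[:5]):
        # print each entity id, its page id, and its rank in the search results
        print(position, hit['title'], hit['pageid'])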