code.communitydata.science - covid19.git/commitdiff
Python code to find wikidata entities to translate. Here we search the api for entities related to a set of base terms.
author: Nathan TeBlunthuis <nathante@uw.edu>
Tue, 24 Mar 2020 22:03:47 +0000 (15:03 -0700)
committer: Nathan TeBlunthuis <nathante@uw.edu>
Tue, 24 Mar 2020 22:03:47 +0000 (15:03 -0700)
Building system for finding translations from Wikidata.

translations/data/input/base_terms.txt [new file with mode: 0644]
translations/src/__init__.py [new file with mode: 0644]
translations/src/defaults.py [new file with mode: 0644]
translations/src/find_entities.py [new file with mode: 0644]
translations/src/wikidata_api_calls.py [new file with mode: 0644]

diff --git a/translations/data/input/base_terms.txt b/translations/data/input/base_terms.txt
new file mode 100644 (file)
index 0000000..cd45abc
--- /dev/null
@@ -0,0 +1,2 @@
+coronavirus
+covid-19
diff --git a/translations/src/__init__.py b/translations/src/__init__.py
new file mode 100644 (file)
index 0000000..5211ac6
--- /dev/null
@@ -0,0 +1,2 @@
+from wikidata_api_calls import *
+from find_entities import *
diff --git a/translations/src/defaults.py b/translations/src/defaults.py
new file mode 100644 (file)
index 0000000..3d6168f
--- /dev/null
@@ -0,0 +1 @@
+user_agent = "COVID-19 Digital Observatory, a Community Data Science Collective project. (https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory)"
diff --git a/translations/src/find_entities.py b/translations/src/find_entities.py
new file mode 100644 (file)
index 0000000..eafb42a
--- /dev/null
@@ -0,0 +1,68 @@
+# generate a list of wikidata entities related to keywords
from os import path
from sys import stdout

from wikidata_api_calls import get_wikidata_api, search_wikidata
+
class Wikidata_ResultSet(object):
    """Accumulate Wikidata search results across terms and write them as CSV."""

    def __init__(self):
        # One Wikidata_Result per (term, hit) pair, in search-rank order.
        self.results = []

    def extend(self, term, results):
        """Wrap each raw API hit for ``term`` in a Wikidata_Result, preserving rank."""
        self.results.extend(Wikidata_Result(term, result, i)
                            for i, result in enumerate(results))

    def to_csv(self, outfile=None):
        """Write all accumulated results as CSV to ``outfile`` (stdout if None).

        Fixes vs. the original: the header row is newline-terminated so the
        first data row no longer runs onto the header line, and stdout is
        left open when no outfile is given (the original closed it).
        """
        header = ','.join(['search_term', 'entityid', 'pageid',
                           'search_position', 'timestamp']) + '\n'

        if outfile is None:
            of = stdout
            close_after = False
        else:
            of = open(outfile, 'w')
            close_after = True

        try:
            of.write(header)
            for result in self.results:
                of.write(result.to_csv())
        finally:
            # Only close handles we opened ourselves; never close stdout.
            if close_after:
                of.close()
+
+
class Wikidata_Result(object):
    """One Wikidata search hit: the term that found it, the entity, its rank, and when.

    Stores the unique entity found in the search results, its position in
    the result list, and the timestamp of the search.
    """

    __slots__ = ['search_term', 'entityid', 'pageid', 'search_position', 'timestamp']

    def __init__(self, term, search_result, position):
        # Terms typically arrive as raw file lines, so strip the newline.
        self.search_term = term.strip()
        self.entityid = search_result['title']
        self.pageid = search_result['pageid']
        self.search_position = position
        self.timestamp = search_result['timestamp']

    def to_csv(self):
        """Render this hit as a single newline-terminated CSV row."""
        fields = (self.search_term,
                  self.entityid,
                  self.pageid,
                  self.search_position,
                  self.timestamp)
        return ','.join(str(field) for field in fields) + '\n'
+    
def run_wikidata_searches(terms_file = '../data/input/base_terms.txt', outfile="../data/output/wikidata_search_results.csv"):
    """Search Wikidata for every term in ``terms_file``; write all hits to ``outfile`` as CSV.

    Fixes vs. the original: ``get_wikidata_api`` was called but never
    imported (NameError); the API session was rebuilt once per term (now
    created once and reused); the terms file handle was never closed; and
    blank lines (e.g. a trailing newline) were searched verbatim.
    """
    resultset = Wikidata_ResultSet()
    api = get_wikidata_api()

    with open(terms_file, 'r') as terms:
        for term in terms:
            if not term.strip():
                continue  # skip blank lines rather than searching for ""
            search_results = search_wikidata(api, term)
            resultset.extend(term, search_results)

    resultset.to_csv(outfile)


if __name__ == "__main__":
    run_wikidata_searches()
diff --git a/translations/src/wikidata_api_calls.py b/translations/src/wikidata_api_calls.py
new file mode 100644 (file)
index 0000000..022a9d4
--- /dev/null
@@ -0,0 +1,29 @@
+# File defines functions for making api calls to find translations and transliterations for key terms.
+
+import mwapi
+import sys
+sys.path.append("..")
+from defaults import user_agent
+
def get_wikidata_api():
    """Return an mwapi Session for querying the Wikidata API.

    mwapi.Session appends its ``api_path`` (default ``/w/api.php``) to the
    host, so the host must be the bare site URL. The original passed
    ``https://wikidata.org/w/api.php``, which would produce requests
    against ``.../w/api.php/w/api.php``.
    """
    session = mwapi.Session(host="https://www.wikidata.org", user_agent=user_agent)
    return session
+
def search_wikidata(session, term, *args, **kwargs):
    """Run a full-text search for ``term`` against ``session`` and return the raw hit list.

    Extra positional/keyword arguments are passed through to ``session.get``.
    Raises an APIError when the response contains no search results.
    """
    search_results = session.get(action='query',
                                 list='search',
                                 srsearch=term,
                                 srlimit='max',
                                 srnamespace=0,
                                 *args,
                                 **kwargs)

    # Guard the 'query' level too: the original called .get on a
    # possibly-missing value, raising AttributeError on None instead of the
    # intended APIError below.
    query = search_results.get('query') or {}
    results = query.get('search')

    if results is None:
        # NOTE(review): mwapi's APIError normally takes (code, info, content);
        # confirm this one-argument form produces a useful message.
        raise mwapi.session.APIError(f"No results for query: {term}")

    return results

Community Data Science Collective || Want to submit a patch?