X-Git-Url: https://code.communitydata.science/covid19.git/blobdiff_plain/308d462e767920ef541f8ccef2942e87eb854be8..36167295ecfe297780be2ec42ef1213e26d014e0:/transliterations/src/wikidata_transliterations.py diff --git a/transliterations/src/wikidata_transliterations.py b/transliterations/src/wikidata_transliterations.py new file mode 100644 index 0000000..e947422 --- /dev/null +++ b/transliterations/src/wikidata_transliterations.py @@ -0,0 +1,79 @@ +from wikidata_api_calls import run_sparql_query +from itertools import chain, islice +import csv +from json import JSONDecodeError + +class LabelData: + __slots__ = ['entityid','label','langcode','is_alt'] + + def __init__(self, wd_res, entityid, is_alt): + obj = wd_res.get('label',None) + self.label = obj.get('value',None) + self.langcode = obj.get('xml:lang',None) + self.entityid = entityid + self.is_alt = is_alt + + def to_list(self): + return [self.entityid, + self.label, + self.langcode, + self.is_alt] + + +def GetAllLabels(in_csv, outfile, topN): + + def load_entity_ids(in_csv, topN=5): + with open(in_csv,'r',newline='') as infile: + reader = csv.DictReader(infile) + for row in reader: + if int(row['search_position']) < topN: + yield row["entityid"] + + ids = set(load_entity_ids(in_csv, topN)) + + labeldata = chain(* map(GetEntityLabels, ids)) + + with open(outfile, 'w', newline='') as of: + writer = csv.writer(of) + writer.writerow(LabelData.__slots__) + writer.writerows(map(LabelData.to_list,labeldata)) + + +def GetEntityLabels(entityid): + + def run_query_and_parse(query, entityid, is_alt): + results = run_sparql_query(query % entityid) + try: + jobj = results.json() + res = jobj.get('results',None) + if res is not None: + res = res.get('bindings',None) + if res is None: + raise requests.APIError(f"got invalid response from wikidata for {query % entityid}") + for info in res: + yield LabelData(info, entityid, is_alt) + + except JSONDecodeError as e: + print(e) + print(query % entityid) + + + label_base_query = """ + SELECT DISTINCT ?label WHERE { + wd:%s rdfs:label ?label; + }""" + + altLabel_base_query = """ + SELECT DISTINCT ?label WHERE { + wd:%s skos:altLabel ?label; + }""" + + label_results = run_query_and_parse(label_base_query, entityid, is_alt=False) + + altLabel_results = run_query_and_parse(altLabel_base_query, entityid, is_alt=True) + + return chain(label_results, altLabel_results) + + +if __name__ == "__main__": + GetAllLabels("../data/output/wikidata_search_results.csv","../data/output/wikidata_entity_labels.csv", topN=20)