X-Git-Url: https://code.communitydata.science/covid19.git/blobdiff_plain/09d171608f699eafe6528325938a892937a65302..98b07b8098611287eaa775b09622d1f3514303c8:/keywords/src/wikidata_transliterations.py

diff --git a/keywords/src/wikidata_transliterations.py b/keywords/src/wikidata_transliterations.py
new file mode 100644
index 0000000..1ac956c
--- /dev/null
+++ b/keywords/src/wikidata_transliterations.py
@@ -0,0 +1,107 @@
+from wikidata_api_calls import run_sparql_query
+from itertools import chain, islice
+import csv
+from json import JSONDecodeError
+from os import path
+
+class LabelData:
+    __slots__ = ['entityid', 'label', 'langcode', 'is_alt']
+
+    def __init__(self, wd_res, is_alt):
+        # default to {} so a missing 'label' or 'entity' binding doesn't crash
+        obj = wd_res.get('label', {})
+        self.label = obj.get('value', None)
+        self.langcode = obj.get('xml:lang', None)
+        self.entityid = wd_res.get('entity', {}).get('value', None)
+        self.is_alt = is_alt
+
+    def to_list(self):
+        return [self.entityid,
+                self.label,
+                self.langcode,
+                self.is_alt]
+
+def GetAllLabels(in_csvs, outfile, topNs):
+
+    def load_entity_ids(in_csv, topN=5):
+        with open(in_csv, 'r', newline='') as infile:
+            for row in csv.DictReader(infile):
+                if int(row['search_position']) < topN:
+                    yield row["entityid"]
+
+    # map() pairs each input csv with its topN; the two sequences must be
+    # the same length (see the broadcasting in __main__ below).
+    ids = set(chain.from_iterable(map(load_entity_ids, in_csvs, topNs)))
+
+    labeldata = GetEntityLabels(ids)
+
+    with open(outfile, 'w', newline='') as of:
+        writer = csv.writer(of)
+        writer.writerow(LabelData.__slots__)
+        writer.writerows(map(LabelData.to_list, labeldata))
+
+
+def GetEntityLabels(entityids):
+
+    def run_query_and_parse(query, is_alt):
+        results = run_sparql_query(query)
+        try:
+            jobj = results.json()
+
+            res = jobj.get('results', None)
+            if res is not None:
+                res = res.get('bindings', None)
+            if res is None:
+                raise ValueError(f"got invalid response from wikidata for {query}")
+
+            for info in res:
+                yield LabelData(info, is_alt)
+
+        except JSONDecodeError as e:
+            print(e)
+            print(query)
+
+    def prep_query(query, prop, entityids):
+        values = ' '.join('wd:{0}'.format(id) for id in entityids)
+        return query.format(prop, values)
+
+    base_query = """
+    SELECT DISTINCT ?entity ?label WHERE {{
+        ?entity {0} ?label.
+        VALUES ?entity {{ {1} }}
+    }}"""
+
+    # we can't get all the entities at once. how about 100 at a time?
+    chunksize = 100
+    entityids = iter(entityids)
+    chunk = list(islice(entityids, chunksize))
+    calls = []
+    while len(chunk) > 0:
+        label_query = prep_query(base_query, "rdfs:label", chunk)
+        altLabel_query = prep_query(base_query, "skos:altLabel", chunk)
+        # run_query_and_parse is a generator, so the queries only execute
+        # once the chained results are consumed (in GetAllLabels above).
+        label_results = run_query_and_parse(label_query, is_alt=False)
+        altLabel_results = run_query_and_parse(altLabel_query, is_alt=True)
+        calls.extend([label_results, altLabel_results])
+        chunk = list(islice(entityids, chunksize))
+
+    return chain(*calls)
+
+
+def find_new_output_file(output, i=1):
+    # append _1, _2, ... before the extension until the name is unused;
+    # note that path.splitext keeps the leading dot in ext.
+    name, ext = path.splitext(output)
+    candidate = output
+    while path.exists(candidate):
+        candidate = f"{name}_{i}{ext}"
+        i += 1
+    return candidate
+
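+# Editorial sketch, not part of the original commit: run_sparql_query is
+# imported from wikidata_api_calls, which this diff does not show. The
+# parsing above assumes it returns a requests.Response whose .json() follows
+# the SPARQL 1.1 JSON results layout ({'results': {'bindings': [...]}}).
+# A minimal stand-in satisfying that contract against the public Wikidata
+# endpoint might look like the function below; the name is hypothetical,
+# chosen so it cannot shadow the real import.
+def _sketch_run_sparql_query(query):
+    import requests
+    return requests.get("https://query.wikidata.org/sparql",
+                        params={"query": query, "format": "json"})
+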
+if __name__ == "__main__":
+    import argparse
+    # the first positional argument to ArgumentParser is prog, not the
+    # description, so pass description= explicitly.
+    parser = argparse.ArgumentParser(description="Use wikidata to find transliterations of terms")
+    parser.add_argument('inputs', type=str, nargs='+', help='one or more files to read. the inputs are generated by wikidata_search.py')
+    parser.add_argument('--topN', type=int, nargs='+', default=[5], help='limit the number of wikidata search results to use; pass either one value for all inputs or one value per input.')
+    parser.add_argument('--output', type=str, required=True, help='path to the output file.')
+
+    args = parser.parse_args()
+
+    # broadcast a single --topN value across all inputs
+    topNs = args.topN * len(args.inputs) if len(args.topN) == 1 else args.topN
+
+    output = find_new_output_file(args.output)
+
+    GetAllLabels(args.inputs, output, topNs=topNs)
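
A typical invocation, assuming input CSVs produced by wikidata_search.py
(with entityid and search_position columns; the filenames below are
hypothetical), would be:

    python3 wikidata_transliterations.py results_en.csv results_zh.csv --topN 10 --output transliterations.csv

find_new_output_file ensures an existing output file is never overwritten:
it appends _1, _2, ... before the extension until it finds an unused name.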