From 8bb3db8b46e22311a5b7a1b0f88c8cc84c649699 Mon Sep 17 00:00:00 2001
From: Nathan TeBlunthuis
Date: Tue, 31 Mar 2020 16:56:59 -0700
Subject: [PATCH] add examples using the translations data

---
 keywords/analysis/translations_example.R  | 17 +++++++++++++++++
 keywords/analysis/translations_example.py | 13 +++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 keywords/analysis/translations_example.R
 create mode 100644 keywords/analysis/translations_example.py

diff --git a/keywords/analysis/translations_example.R b/keywords/analysis/translations_example.R
new file mode 100644
index 0000000..0f0342c
--- /dev/null
+++ b/keywords/analysis/translations_example.R
@@ -0,0 +1,17 @@
+## example reading latest file straight from the server
+df <- read.csv("https://covid19.communitydata.science/datasets/keywords/csv/latest.csv")
+
+## make the data more R-friendly (works whether is_alt was read as text or as logical)
+df$is.alt <- toupper(as.character(df$is_alt)) == "TRUE"
+df$is_alt <- NULL
+
+## find the item ids of every entry whose English label is "coronavirus"
+coronavirus.itemids <- df[ (tolower(df$label) == "coronavirus") &
+                           (df$langcode == 'en')
+                          ,"itemid"]
+
+## there are actually 5 item ids. The one referring to the family of virus is Q57751738
+coronavirus.translations <- df[df$itemid == "http://www.wikidata.org/entity/Q57751738",]
+
+## let's only look at non-aliases
+print(coronavirus.translations[c(coronavirus.translations$is.alt == FALSE), c("label","langcode")])
diff --git a/keywords/analysis/translations_example.py b/keywords/analysis/translations_example.py
new file mode 100644
index 0000000..8df986b
--- /dev/null
+++ b/keywords/analysis/translations_example.py
@@ -0,0 +1,13 @@
+import pandas as pd
+
+# read the latest dataset straight from the server
+df = pd.read_csv("https://covid19.communitydata.science/datasets/keywords/csv/latest.csv")
+
+# find the item ids of every entry whose English label is "coronavirus"
+coronavirus_itemids = df.loc[(df.label.str.lower() == "coronavirus") & (df.langcode == "en"), "itemid"]
+
+# there are actually 5 item ids. The one referring to the family of virus is Q57751738
+coronavirus_translations = df.loc[df.itemid == "http://www.wikidata.org/entity/Q57751738"]
+
+# let's only look at non-aliases (the mask is built from the filtered frame, not from df)
+print(coronavirus_translations.loc[coronavirus_translations.is_alt == False, ['label', 'langcode']])
-- 
2.39.5