From: Nathan TeBlunthuis <nathante@uw.edu>
Date: Tue, 31 Mar 2020 23:56:59 +0000 (-0700)
Subject: add examples using the translations data
X-Git-Url: https://code.communitydata.science/covid19.git/commitdiff_plain/8bb3db8b46e22311a5b7a1b0f88c8cc84c649699?hp=--cc

add examples using the translations data
---

8bb3db8b46e22311a5b7a1b0f88c8cc84c649699
diff --git a/keywords/analysis/translations_example.R b/keywords/analysis/translations_example.R
new file mode 100644
index 0000000..0f0342c
--- /dev/null
+++ b/keywords/analysis/translations_example.R
@@ -0,0 +1,17 @@
+## Example: read the latest keywords file straight from the server.
+df <- read.csv("https://covid19.communitydata.science/datasets/keywords/csv/latest.csv")
+
+## make the data more R-friendly: read.csv() may already parse True/False as logical, so handle both forms
+df$is.alt <- toupper(as.character(df$is_alt)) == "TRUE"
+df$is_alt <- NULL
+
+## find the item ids of all English labels equal to "coronavirus" (which() skips NA labels/langcodes)
+coronavirus.itemids <- df[ which((tolower(df$label) == "coronavirus") &
+                               (df$langcode == 'en'))
+                       ,"itemid"]
+
+## when this example was written there were 5 item ids; the one for the family of virus is Q57751738
+coronavirus.translations <- df[df$itemid %in% "http://www.wikidata.org/entity/Q57751738",]
+
+## let's only look at non-aliases (which() keeps NA flags from producing all-NA rows)
+print(coronavirus.translations[which(!coronavirus.translations$is.alt), c("label","langcode")])
diff --git a/keywords/analysis/translations_example.py b/keywords/analysis/translations_example.py
new file mode 100644
index 0000000..8df986b
--- /dev/null
+++ b/keywords/analysis/translations_example.py
@@ -0,0 +1,13 @@
+import pandas as pd
+
+# read the latest keywords dataset straight from the server
+df = pd.read_csv("https://covid19.communitydata.science/datasets/keywords/csv/latest.csv")
+
+# find the item ids of all English labels equal to "coronavirus" (same query as the R example)
+coronavirus_itemids = df.loc[(df.label.str.lower() == "coronavirus") & (df.langcode == "en"), "itemid"]
+
+# when this example was written there were 5 item ids. The one referring to the family of virus is Q57751738
+coronavirus_translations = df.loc[df.itemid == "http://www.wikidata.org/entity/Q57751738"]
+
+# let's only look at non-aliases; the mask comes from the frame being filtered, not from df
+print(coronavirus_translations.loc[coronavirus_translations.is_alt == False, ['label', 'langcode']])