From: aaronshaw Date: Wed, 1 Apr 2020 21:52:22 +0000 (-0500) Subject: pointing at updated data url, adding explicit NA handling to factor, cutting unnecess... X-Git-Url: https://code.communitydata.science/covid19.git/commitdiff_plain/282588772e99e7df51523928c364247e9ad5a54b?ds=sidebyside;hp=4f8a698c62f878e248dc1c4d4bb8e048a79fb661 pointing at updated data url, adding explicit NA handling to factor, cutting unnecessary call to ggplot2, and updated corresponding output from new data file. May not work while kibo urls are getting resolved --- diff --git a/wikipedia_views/analysis/output/top10_views_by_project_date.csv b/wikipedia_views/analysis/output/top10_views_by_project_date.csv index 796af10..ce7eb5e 100644 --- a/wikipedia_views/analysis/output/top10_views_by_project_date.csv +++ b/wikipedia_views/analysis/output/top10_views_by_project_date.csv @@ -1,11 +1,11 @@ "article","project","timestamp","views" -"2019–20_coronavirus_pandemic","en.wikipedia","2020032600",1148284 -"2020_coronavirus_pandemic_in_India","en.wikipedia","2020032600",513901 -"Coronavirus","en.wikipedia","2020032600",397959 -"2020_coronavirus_pandemic_in_the_United_States","en.wikipedia","2020032600",337676 -"2019–20_coronavirus_pandemic_by_country_and_territory","en.wikipedia","2020032600",298603 -"2020_coronavirus_pandemic_in_Italy","en.wikipedia","2020032600",297687 -"Coronavirus_disease_2019","en.wikipedia","2020032600",292272 -"2020_coronavirus_pandemic_in_Spain","en.wikipedia","2020032600",114732 -"2020_coronavirus_pandemic_in_the_United_Kingdom","en.wikipedia","2020032600",111856 -"Anthony_Fauci","en.wikipedia","2020032600",103205 +"2019–20_coronavirus_pandemic","en.wikipedia","2020033100",831879 +"2020_coronavirus_pandemic_in_India","en.wikipedia","2020033100",323123 +"2019–20_coronavirus_pandemic_by_country_and_territory","en.wikipedia","2020033100",315572 +"2020_coronavirus_pandemic_in_the_United_States","en.wikipedia","2020033100",290535 +"Coronavirus_disease_2019","en.wikipedia","2020033100",211391 +"2020_coronavirus_pandemic_in_Italy","en.wikipedia","2020033100",209908 +"Coronavirus","en.wikipedia","2020033100",188921 +"USNS_Comfort_(T-AH-20)","en.wikipedia","2020033100",150422 +"USNS_Comfort_(T-AH-20)","en.wikipedia","2020033100",150422 +"WrestleMania_36","en.wikipedia","2020033100",137637 diff --git a/wikipedia_views/analysis/pageview_example.R b/wikipedia_views/analysis/pageview_example.R index 8a7aba3..fb5359a 100644 --- a/wikipedia_views/analysis/pageview_example.R +++ b/wikipedia_views/analysis/pageview_example.R @@ -4,13 +4,12 @@ ### Minimal example analysis file using pageview data library(tidyverse) -library(ggplot2) library(scales) -### Import and cleanup data +### Import and cleanup one datafile from the observatory DataURL <- - url("https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory/raw/master/wikipedia_views/data/dailyviews2020032600.tsv") + url("https://covid19.communitydata.science/datasets/wikipedia/digobs_covid19-wikipedia-enwiki_dailyviews-20200401.tsv") views <- read.table(DataURL, sep="\t", header=TRUE, stringsAsFactors=FALSE) @@ -30,12 +29,14 @@ views <- ### (see https://www.tidyverse.org for more info) views <- views[,c("article", "project", "timestamp", "views")] -views$timestamp <- factor(views$timestamp) +views$timestamp <- fct_explicit_na(views$timestamp) + ### Sorts and groups at the same time views.by.proj.date <- arrange(group_by(views, project, timestamp), desc(views)) + ### Export just the top 10 by pageviews write.table(head(views.by.proj.date, 10), file="output/top10_views_by_project_date.csv", sep=",",