From: aaronshaw Date: Wed, 1 Apr 2020 21:52:22 +0000 (-0500) Subject: pointing at updated data url, adding explicit NA handling to factor, cutting unnecess... X-Git-Url: https://code.communitydata.science/covid19.git/commitdiff_plain/282588772e99e7df51523928c364247e9ad5a54b?hp=-c pointing at updated data url, adding explicit NA handling to factor, cutting unnecessary call to ggplot2, and updated corresponding output from new data file. May not work while kibo urls are getting resolved --- 282588772e99e7df51523928c364247e9ad5a54b diff --git a/wikipedia_views/analysis/output/top10_views_by_project_date.csv b/wikipedia_views/analysis/output/top10_views_by_project_date.csv index 796af10..ce7eb5e 100644 --- a/wikipedia_views/analysis/output/top10_views_by_project_date.csv +++ b/wikipedia_views/analysis/output/top10_views_by_project_date.csv @@ -1,11 +1,11 @@ "article","project","timestamp","views" -"2019–20_coronavirus_pandemic","en.wikipedia","2020032600",1148284 -"2020_coronavirus_pandemic_in_India","en.wikipedia","2020032600",513901 -"Coronavirus","en.wikipedia","2020032600",397959 -"2020_coronavirus_pandemic_in_the_United_States","en.wikipedia","2020032600",337676 -"2019–20_coronavirus_pandemic_by_country_and_territory","en.wikipedia","2020032600",298603 -"2020_coronavirus_pandemic_in_Italy","en.wikipedia","2020032600",297687 -"Coronavirus_disease_2019","en.wikipedia","2020032600",292272 -"2020_coronavirus_pandemic_in_Spain","en.wikipedia","2020032600",114732 -"2020_coronavirus_pandemic_in_the_United_Kingdom","en.wikipedia","2020032600",111856 -"Anthony_Fauci","en.wikipedia","2020032600",103205 +"2019–20_coronavirus_pandemic","en.wikipedia","2020033100",831879 +"2020_coronavirus_pandemic_in_India","en.wikipedia","2020033100",323123 +"2019–20_coronavirus_pandemic_by_country_and_territory","en.wikipedia","2020033100",315572 +"2020_coronavirus_pandemic_in_the_United_States","en.wikipedia","2020033100",290535 +"Coronavirus_disease_2019","en.wikipedia","2020033100",211391 +"2020_coronavirus_pandemic_in_Italy","en.wikipedia","2020033100",209908 +"Coronavirus","en.wikipedia","2020033100",188921 +"USNS_Comfort_(T-AH-20)","en.wikipedia","2020033100",150422 +"USNS_Comfort_(T-AH-20)","en.wikipedia","2020033100",150422 +"WrestleMania_36","en.wikipedia","2020033100",137637 diff --git a/wikipedia_views/analysis/pageview_example.R b/wikipedia_views/analysis/pageview_example.R index 8a7aba3..fb5359a 100644 --- a/wikipedia_views/analysis/pageview_example.R +++ b/wikipedia_views/analysis/pageview_example.R @@ -4,13 +4,12 @@ ### Minimal example analysis file using pageview data library(tidyverse) -library(ggplot2) library(scales) -### Import and cleanup data +### Import and cleanup one datafile from the observatory DataURL <- - url("https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory/raw/master/wikipedia_views/data/dailyviews2020032600.tsv") + url("https://covid19.communitydata.science/datasets/wikipedia/digobs_covid19-wikipedia-enwiki_dailyviews-20200401.tsv") views <- read.table(DataURL, sep="\t", header=TRUE, stringsAsFactors=FALSE) @@ -30,12 +29,14 @@ views <- ### (see https://www.tidyverse.org for more info) views <- views[,c("article", "project", "timestamp", "views")] -views$timestamp <- factor(views$timestamp) +views$timestamp <- fct_explicit_na(views$timestamp) + ### Sorts and groups at the same time views.by.proj.date <- arrange(group_by(views, project, timestamp), desc(views)) + ### Export just the top 10 by pageviews write.table(head(views.by.proj.date, 10), file="output/top10_views_by_project_date.csv", sep=",",