wikipedia_views/analysis/pageview_example.R

   1 ### COVID-19 Digital Observatory
   2 ### 2020-03-28
   3 ###
   4 ### Minimal example analysis file using pageview data
   5
   6 library(tidyverse)
   7 library(ggplot2)
   8 library(scales)
   9
  10 ### Import and cleanup data
  11
  12 DataURL <-
  13     url("https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory/raw/master/wikipedia_views/data/dailyviews2020032600.tsv")
  14
  15 views <-
  16     read.table(DataURL, sep="\t", header=TRUE, stringsAsFactors=FALSE)
  17
  18 ### Alternatively, uncomment and run if working locally with full git
  19 ### tree
  20 ###
  21 ### Identify data source directory and file
  22 ## DataDir <- ("../data/")
  23 ## DataFile <- ("dailyviews2020032600.tsv")
  24
## views <- read.table(paste(DataDir,DataFile, sep=""),
##                     sep="\t", header=TRUE,
##                     stringsAsFactors=FALSE)
  28
  29 ### Cleanup and do the grouping with functions from the Tidyverse
  30 ### (see https://www.tidyverse.org for more info)
  31
  32 views <- views[,c("article", "project", "timestamp", "views")]
  33 views$timestamp <- factor(views$timestamp)
  34
  35 ### Sorts and groups at the same time
  36 views.by.proj.date <- arrange(group_by(views, project, timestamp),
  37                         desc(views))
  38
  39 ### Export just the top 10 by pageviews
  40 write.table(head(views.by.proj.date, 10),
  41             file="output/top10_views_by_project_date.csv", sep=",",
  42             row.names=FALSE)
  43
  44 ### A simple visualization
  45 p <- ggplot(data=views.by.proj.date, aes(views))
  46
  47 ## Density plot with log-transformed axis
  48 p + geom_density() + scale_x_log10(labels=comma)
  49
  50
  51