wikipedia/example_analysis/revisions_example.R

   1 ### COVID-19 Digital Observatory
   2 ###
   3 ### Minimal example analysis file using revisions data.
   4 ###
   5 ### Note: The revisions data files can get quite big and it may not make
   6 ### sense to try to load them into memory in R. This example file
   7 ### loads the first 1,000 rows of a sample file.
   8
   9 library(tidyverse)
  10 library(lubridate)
  11 library(scales)
  12
  ### Import and clean up the first 1,000 rows of one data file from the
  ### observatory.
  ###
  ### The URL is kept as a plain string rather than a url() connection:
  ### read.table() closes (and thereby invalidates) a connection that it had
  ### to open itself, so a connection object could not be reused to re-run
  ### the import in the same session.
  DataURL <-
    "https://covid19.communitydata.science/datasets/wikipedia/digobs_covid19-wikipedia-enwiki_revisions-20200401.tsv"

  ### quote = "\"": only double quotes delimit fields, so an apostrophe in
  ###   a text field is read literally instead of opening a quoted string.
  ### comment.char = "": a "#" inside a field no longer truncates the line.
  ### NOTE(review): assumes the TSV quotes fields (if at all) with double
  ### quotes only -- confirm against the dataset's writer.
  revisions <- read.table(DataURL, sep = "\t", header = TRUE,
                          quote = "\"", comment.char = "",
                          stringsAsFactors = FALSE,
                          nrows = 1000)

  ### Convert the timestamp column to date-times with lubridate.
  revisions$timestamp <- as_datetime(revisions$timestamp)
  23
  ### Group edits by editor to see edit counts.
  ### One row per (user, title) pair; `appearances` is the number of
  ### revisions in that pair, sorted from most to fewest.
  user.edit.counts <- revisions %>%
    group_by(user, title) %>% # for the example, there's only one
                              # article title
    summarize(appearances = n()) %>%
    ungroup() %>% # summarize() only drops the last grouping level; drop
                  # the remaining one so a plain table is stored
    arrange(desc(appearances))
  30
  ### Export that as a little table.
  ### Make sure the output directory exists first; otherwise the write
  ### fails because the file cannot be opened.
  dir.create("output", showWarnings = FALSE, recursive = TRUE)

  ### write.csv() doubles embedded double quotes (standard CSV), whereas
  ### write.table(sep = ",") would backslash-escape them.
  write.csv(user.edit.counts,
            file = "output/user_edit_counts-first1000.csv",
            row.names = FALSE)
  35
  ### A simple time series of edits.
  ### Rows are sorted by timestamp and numbered 1..n, so `count` is the
  ### cumulative number of edits up to and including each revision.
  plot <- revisions %>%
    arrange(timestamp) %>%
    ## Number the rows of the *sorted* data directly instead of borrowing
    ## the row names of the unsorted `revisions` data frame.
    mutate(count = row_number()) %>%
    ggplot(aes(x = timestamp, y = count)) +
    geom_jitter(alpha = 0.1) +
    labs(x = "Date",
         y = "Total edits (cumulative)",
         title = paste0("Revisions to article: ",
                        head(revisions$title, 1))) +
    theme_minimal()

  ### Make sure the output directory exists before saving the figure.
  dir.create("output", showWarnings = FALSE, recursive = TRUE)
  ggsave("output/first_1000_edit_timeseries.png", plot = plot)
  49
  50
  51