### COVID-19 Digital Observatory
###
### Minimal example analysis file using revisions data.
###
### Note: The revisions data files can get quite big and it may not make
### sense to try to load them into memory in R. This example file
### loads the first 1,000 rows of a sample file.
###
### Outputs (written relative to the working directory):
###   output/user_edit_counts-first1000.csv    edits per (user, title)
###   output/first_1000_edit_timeseries.png    cumulative edits over time

library(tidyverse)
library(lubridate)
library(scales)

### Make sure the output directory exists before anything is written to
### it; otherwise write.csv() / ggsave() fail on a fresh checkout.
dir.create("output", showWarnings = FALSE, recursive = TRUE)

### Import and cleanup first 1000 rows of one datafile from the observatory

DataURL <-
  url("https://covid19.communitydata.science/datasets/wikipedia/digobs_covid19-wikipedia-enwiki_revisions-20200401.tsv")

### read.table() defaults treat both ' and " as quote characters and # as
### a comment marker. For a TSV of free text (user names, titles, edit
### comments) an apostrophe or hash inside a field would silently merge
### or truncate columns, so only the double quote is kept as a quote
### character and comment handling is switched off (the same settings
### read.delim() uses).
### NOTE(review): assumes the file quotes fields with " only -- confirm
### against the script that generates the dataset.
revisions <- read.table(DataURL,
                        sep = "\t",
                        header = TRUE,
                        quote = "\"",
                        comment.char = "",
                        stringsAsFactors = FALSE,
                        nrows = 1000)

revisions$timestamp <- as_datetime(revisions$timestamp)

### Group edit by editor to see edit counts, most active editors first.
### For the example, there's only one article title.
user.edit.counts <- revisions %>%
  count(user, title, name = "appearances", sort = TRUE)

### Export that as a little table. write.csv() doubles embedded quotes,
### which is what CSV readers expect (write.table() would backslash-escape
### them instead).
write.csv(user.edit.counts,
          file = "output/user_edit_counts-first1000.csv",
          row.names = FALSE)

### A simple time series of edits: after sorting by timestamp, the row
### position is the cumulative number of edits made up to that revision.
plot <- revisions %>%
  arrange(timestamp) %>%
  mutate(count = row_number()) %>%
  ggplot(aes(x = timestamp, y = count)) +
  geom_jitter(alpha = 0.1) +
  xlab("Date") +
  ylab("Total edits (cumulative)") +
  ggtitle(paste0("Revisions to article: ", head(revisions$title, 1))) +
  theme_minimal()

ggsave("output/first_1000_edit_timeseries.png", plot = plot)