code.communitydata.science - covid19.git/blob - wikipedia/example_analysis/revisions_example.R
Merge pull request #20 from makoshark/master
[covid19.git] / wikipedia / example_analysis / revisions_example.R
1 ### COVID-19 Digital Observatory
2 ### 
3 ### Minimal example analysis file using revisions data.
4 ###
5 ### Note: The revisions data files can get quite big and it may not make
6 ### sense to try to load them into memory in R. This example file
7 ### loads the first 1,000 rows of a sample file.
8
9 library(tidyverse)
10 library(lubridate)
11 library(scales)
12
### Load and tidy the first 1000 rows of one observatory datafile

### Connection to a single revisions file (tab-separated) on the
### observatory's dataset server.
DataURL <- url(
  "https://covid19.communitydata.science/datasets/wikipedia/digobs_covid19-wikipedia-enwiki_revisions-20200401.tsv"
)

### Read only the first 1000 records; the first line of the file
### supplies the column names and strings are kept as character.
revisions <- read.table(
  DataURL,
  header = TRUE,
  sep = "\t",
  nrows = 1000,
  stringsAsFactors = FALSE
)

### Parse the timestamp column into date-times (lubridate).
revisions$timestamp <- as_datetime(revisions$timestamp)
23
### Group edits by editor to see edit counts
###
### count() tallies the rows for each user/title pair, stores the tally
### in a column named "appearances", sorts by it in descending order and
### returns an ungrouped result. For the example there is only one
### article title.
user.edit.counts <- revisions %>%
  count(user, title, name = "appearances", sort = TRUE)

### Export that as a little table
###
### Create the output directory first so the write does not fail on a
### fresh checkout (no-op when it already exists).
dir.create("output", showWarnings = FALSE, recursive = TRUE)

### qmethod = "double" doubles any embedded quote characters, which is
### what CSV readers expect; the write.table default backslash-escapes
### them instead.
write.table(user.edit.counts,
            file = "output/user_edit_counts-first1000.csv",
            sep = ",",
            qmethod = "double",
            row.names = FALSE)
35
### A simple time series of edits
###
### Revisions are sorted by timestamp and then numbered 1..n in that
### order, so `count` is the cumulative number of edits at each
### revision. row_number() is computed on the piped (sorted) data, so it
### does not rely on the row names of the global `revisions` object.
plot <- revisions %>%
  arrange(timestamp) %>%
  mutate(count = row_number()) %>%
  ggplot(aes(x = timestamp, y = count)) +
  geom_jitter(alpha = 0.1) +
  xlab("Date") +
  ylab("Total edits (cumulative)") +
  # The title uses the first article title in the loaded rows.
  ggtitle(paste0("Revisions to article: ", head(revisions$title, 1))) +
  theme_minimal()

### Create the output directory first so saving does not fail on a
### fresh checkout (no-op when it already exists).
dir.create("output", showWarnings = FALSE, recursive = TRUE)

ggsave("output/first_1000_edit_timeseries.png", plot = plot)
49
50
51

Community Data Science Collective || Want to submit a patch?