## PC2
## You'll need to edit these first lines to work on your own machine
##
## Note that for working with .Rmd files interactively in Rstudio you may find it easier to do this
## using the drop down menus: "Session" → "Set Working Directory" → "To Source File Location"
##
## setwd("~/Documents/Teaching/2019/stats/")
## list.files("data/week_04")

## Load the two datasets: per-OS mobile session stats and the
## top-5000 seattle.gov domain pageview stats.
## (The bare "PC2" token on the first line was not commented out and
## would abort the script with "object 'PC2' not found" when sourced.)
mobile <- read.csv("data/week_04/COS-Statistics-Mobile_Sessions.csv")
total <- read.csv("data/week_04/COS-Statistics-Gov-Domains-Only.csv")
## Quick inspection helper for a data frame: prints its row count,
## column count, the first six rows, and a random sample of rows.
## Returns (invisibly) the sampled rows, because print() returns its
## argument and that is the last expression evaluated.
summary.df <- function (d) {
  print(nrow(d))
  print(ncol(d))
  print(head(d))
  ## seq_len() is safe when nrow(d) == 0 (seq(1, 0) would yield c(1, 0));
  ## min() avoids "cannot take a sample larger than the population"
  ## when d has fewer than 5 rows.
  print(d[sample(seq_len(nrow(d)), min(5, nrow(d))), ])
}
## run these two lines a few times to look at the numbers
## (the sampled rows change on every call, so repeated runs show
## different slices of the data)
summary.df(mobile)
## [1] 231
## [1] 8
## Operating_System Sessions New_Sessions New_Users Bounce_Rate
## 1 iOS 332291 47.75 158674 60.79
## 2 Android 170107 45.53 77453 58.14
## 3 Windows 27325 44.76 12231 44.60
## 4 Windows Phone 10109 45.71 4621 59.01
## 5 BlackBerry 1375 39.27 540 62.98
## 6 (not set) 408 83.09 339 72.30
## PagesPerSession AvgSessionDuration Month
## 1 2.34 0:02:11 01/01/2015 12:00:00 AM
## 2 2.98 0:03:53 01/01/2015 12:00:00 AM
## 3 3.26 0:02:40 01/01/2015 12:00:00 AM
## 4 2.14 0:01:45 01/01/2015 12:00:00 AM
## 5 2.10 0:02:24 01/01/2015 12:00:00 AM
## 6 1.82 0:01:01 01/01/2015 12:00:00 AM
## Operating_System Sessions New_Sessions New_Users Bounce_Rate
## 51 Samsung 59 44.07 26 77.97
## 160 Nokia 12 100.00 12 100.00
## 162 Firefox OS 6 100.00 6 0.00
## 224 (not set) 419 87.11 365 74.22
## 172 LG 6 100.00 6 0.00
## PagesPerSession AvgSessionDuration Month
## 51 1.54 0:03:01 04/01/2015 12:00:00 AM
## 160 1.00 0:00:00 10/01/2015 12:00:00 AM
## 162 2.00 0:00:02 10/01/2015 12:00:00 AM
## 224 1.37 0:01:09 08/01/2016 12:00:00 AM
## 172 1.83 0:01:37 12/01/2015 12:00:00 AM

## same inspection for the domain-level dataset
summary.df(total)
## [1] 1242
## [1] 7
## domain pageviews unique.pageviews average.time.on.page
## 1 www.seattle.gov/ 3525737 2689843 0:01:19
## 2 www2.seattle.gov/ 2158182 125984 0:01:12
## 3 web6.seattle.gov/ 367871 204803 0:01:18
## 4 spdblotter.seattle.gov/ 117645 91076 0:01:14
## 5 web1.seattle.gov/ 79529 32258 0:01:09
## 6 find.seattle.gov/ 78611 62516 0:00:39
## bounce.rate exit.percent month
## 1 50.86 36.53 04/01/2015 12:00:00 AM
## 2 41.69 4.53 04/01/2015 12:00:00 AM
## 3 40.66 23.23 04/01/2015 12:00:00 AM
## 4 69.29 46.42 04/01/2015 12:00:00 AM
## 5 59.57 18.76 04/01/2015 12:00:00 AM
## 6 25.67 21.74 04/01/2015 12:00:00 AM
## domain pageviews unique.pageviews
## 963 sdotperformance.seattle.gov/ 20 15
## 1022 obrien.seattle.gov/ 283 241
## 914 lc317web.light.seattle.gov/ 4 4
## 455 mayormurray.seattle.gov/ 9896 8925
## 744 councilconnection.seattle.gov/ 2 2
## average.time.on.page bounce.rate exit.percent month
## 963 0:00:13 81.82 55.00 01/01/2016 12:00:00 AM
## 1022 0:01:51 0.61 0.34 02/01/2016 12:00:00 AM
## 914 0:00:04 0.00 50.00 01/01/2016 12:00:00 AM
## 455 0:03:23 90.59 87.12 06/01/2015 12:00:00 AM
## 744 0:00:00 1.00 1.00 09/01/2015 12:00:00 AM
## PC3. Using the top 5000 dataset, build a data frame with one column
## per month (as recorded in the data) and a second column giving the
## total number of views across all pages in the dataset that month.

## Step 1: sum pageviews within each month; tapply() returns a named
## array keyed by the month strings.
total.views.bymonth.tbl <- with(total, tapply(pageviews, month, sum))
total.views.bymonth.tbl
## 01/01/2015 12:00:00 AM 01/01/2016 12:00:00 AM
## NA 6350440 3471121
## 02/01/2015 12:00:00 AM 02/01/2016 12:00:00 AM 03/01/2015 12:00:00 AM
## 5820453 3366834 6609602
## 03/01/2016 12:00:00 AM 04/01/2015 12:00:00 AM 04/01/2016 12:00:00 AM
## 4087054 6481483 3644750
## 05/01/2015 12:00:00 AM 06/01/2015 12:00:00 AM 07/01/2015 12:00:00 AM
## 6544055 6952488 8084318
## 08/01/2015 12:00:00 AM 09/01/2015 12:00:00 AM 10/01/2015 12:00:00 AM
## 7045189 3067760 2961681
## 12/01/2015 12:00:00 AM
## 5745045

## Step 2: turn the named array into a proper two-column data frame.
total.views <- data.frame(months = names(total.views.bymonth.tbl),
                          total = total.views.bymonth.tbl)

## Strip the month-name rownames inherited from the array. Purely
## cosmetic — everything downstream works the same without this.
rownames(total.views) <- NULL
head(total.views)
## months total
## 1 NA
## 2 01/01/2015 12:00:00 AM 6350440
## 3 01/01/2016 12:00:00 AM 3471121
## 4 02/01/2015 12:00:00 AM 5820453
## 5 02/01/2016 12:00:00 AM 3366834
## 6 03/01/2015 12:00:00 AM 6609602
## PC4. Using the mobile dataset, build a data frame where one column
## is each month described in the data and the second is an estimate of
## the total views made from mobile devices (all platforms) that month.
## This takes two steps because total views are not given directly: we
## first reconstruct a per-row view count from what is there.

## Estimate total pages viewed per row: sessions × pages-per-session.
mobile$total.pages <- with(mobile, Sessions * PagesPerSession)

## Same tapply() pattern as PC3, now summing the estimated page views
## within each month.
mobile.views.bymonth.tbl <- with(mobile, tapply(total.pages, Month, sum))
mobile.views.bymonth.tbl
## 01/01/2015 12:00:00 AM 01/01/2016 12:00:00 AM
## NA 1399185.6 668275.2
## 02/01/2015 12:00:00 AM 02/01/2016 12:00:00 AM 03/01/2015 12:00:00 AM
## 1275315.2 592607.8 1402086.4
## 03/01/2016 12:00:00 AM 04/01/2015 12:00:00 AM 04/01/2016 12:00:00 AM
## 800842.8 1381295.1 788533.7
## 05/01/2015 12:00:00 AM 06/01/2015 12:00:00 AM 07/01/2015 12:00:00 AM
## 1605914.9 1722519.5 1988848.0
## 07/01/2016 12:00:00 AM 08/01/2015 12:00:00 AM 08/01/2016 12:00:00 AM
## 878142.6 1741067.8 912435.4
## 09/01/2015 12:00:00 AM 10/01/2015 12:00:00 AM 12/01/2015 12:00:00 AM
## 564453.5 1285288.0 1223414.0

## Turn the named array into a data frame (rownames left as-is; they
## are harmless and merge() ignores them).
mobile.views <- data.frame(months = names(mobile.views.bymonth.tbl),
                           mobile = mobile.views.bymonth.tbl)
## PC5. Merge the two monthly series into one dataset with columns for
## month, total views (across the top 5000 pages), and mobile views.
## Some months appear in only one dataset, so a full outer join
## (all = TRUE, equivalent to all.x = TRUE plus all.y = TRUE) keeps
## every month and fills the missing side with NA — that is where the
## missing data comes from.
### TODO cleanup variable names to match
views <- merge(mobile.views, total.views, by = "months", all = TRUE)

## The month strings don't sort chronologically, so parse them into
## real Date objects. "%H:%M:%S" consumes the "12:00:00" part; the
## trailing " AM" is ignored by the parser, and the time of day is
## dropped anyway since the result is a Date.
views$months <- as.Date(views$months, format = "%m/%d/%Y %H:%M:%S")

## ...then sort chronologically. order() is the idiomatic form of the
## original sort.list(); both put NA months last by default.
views <- views[order(views$months), ]

## One row is entirely NA (an unparseable month label); drop any row
## where every column is missing. rowSums(is.na(...)) avoids apply()
## on a data frame, which would coerce it to a character matrix.
views <- views[rowSums(is.na(views)) < ncol(views), ]

## Inspecting shows partially-missing rows remain (months present in
## only one dataset); complete.cases() keeps rows with both measures.
views.complete <- views[complete.cases(views), ]
## PC6. Add a column holding the best estimate of the proportion of
## views that came from mobile. Assumption: the top-5000-pages total is
## a reasonable stand-in for all site traffic, so mobile / total gives
## a proportion. The months column is already a Date object (see PC5).
views.complete$pct.mobile <- with(views.complete, mobile / total)

## PC6. Graph the mobile proportion over time, to describe (a) the
## estimated share of views from mobiles to the Seattle City website
## over time and (b) whether it is going up or down.
library(ggplot2)
ggplot(views.complete, aes(x = months, y = pct.mobile)) +
  geom_point() +
  scale_y_continuous(limits = c(0, 1))