## PC2
## You'll need to edit these first lines to work on your own machine
##
## Note that for working with .Rmd files interactively in Rstudio you may find it easier to do this
## using the drop down menus: "Session" → "Set Working Directory" → "To Source File Location"
##
## setwd("~/Documents/Teaching/2019/stats/")
## list.files("data/week_04")

## Load the two datasets: per-OS mobile session stats and the
## top-5000 seattle.gov domain pageview stats.
## (The bare "PC2" token on the first line was not commented out and
## would abort the script with "object 'PC2' not found" when sourced.)
mobile <- read.csv("data/week_04/COS-Statistics-Mobile_Sessions.csv")
total <- read.csv("data/week_04/COS-Statistics-Gov-Domains-Only.csv")
## Quick inspection helper for a data frame: prints its row count,
## column count, the first six rows, and a random sample of rows.
## Returns (invisibly) the sampled rows, because print() returns its
## argument and that is the last expression evaluated.
summary.df <- function (d) {
  print(nrow(d))
  print(ncol(d))
  print(head(d))
  ## seq_len() is safe when nrow(d) == 0 (seq(1, 0) would yield c(1, 0));
  ## min() avoids "cannot take a sample larger than the population"
  ## when d has fewer than 5 rows.
  print(d[sample(seq_len(nrow(d)), min(5, nrow(d))), ])
}
## run these two lines a few times to look at the numbers
## (the sampled rows change on every call, so repeated runs show
## different slices of the data)
summary.df(mobile)
## [1] 231
## [1] 8
## Operating_System Sessions New_Sessions New_Users Bounce_Rate
## 1 iOS 332291 47.75 158674 60.79
## 2 Android 170107 45.53 77453 58.14
## 3 Windows 27325 44.76 12231 44.60
## 4 Windows Phone 10109 45.71 4621 59.01
## 5 BlackBerry 1375 39.27 540 62.98
## 6 (not set) 408 83.09 339 72.30
## PagesPerSession AvgSessionDuration Month
## 1 2.34 0:02:11 01/01/2015 12:00:00 AM
## 2 2.98 0:03:53 01/01/2015 12:00:00 AM
## 3 3.26 0:02:40 01/01/2015 12:00:00 AM
## 4 2.14 0:01:45 01/01/2015 12:00:00 AM
## 5 2.10 0:02:24 01/01/2015 12:00:00 AM
## 6 1.82 0:01:01 01/01/2015 12:00:00 AM
## Operating_System Sessions New_Sessions New_Users Bounce_Rate
## 51 Samsung 59 44.07 26 77.97
## 160 Nokia 12 100.00 12 100.00
## 162 Firefox OS 6 100.00 6 0.00
## 224 (not set) 419 87.11 365 74.22
## 172 LG 6 100.00 6 0.00
## PagesPerSession AvgSessionDuration Month
## 51 1.54 0:03:01 04/01/2015 12:00:00 AM
## 160 1.00 0:00:00 10/01/2015 12:00:00 AM
## 162 2.00 0:00:02 10/01/2015 12:00:00 AM
## 224 1.37 0:01:09 08/01/2016 12:00:00 AM
## 172 1.83 0:01:37 12/01/2015 12:00:00 AM

## same inspection for the domain-level dataset
summary.df(total)
## [1] 1242
## [1] 7
## domain pageviews unique.pageviews average.time.on.page
## 1 www.seattle.gov/ 3525737 2689843 0:01:19
## 2 www2.seattle.gov/ 2158182 125984 0:01:12
## 3 web6.seattle.gov/ 367871 204803 0:01:18
## 4 spdblotter.seattle.gov/ 117645 91076 0:01:14
## 5 web1.seattle.gov/ 79529 32258 0:01:09
## 6 find.seattle.gov/ 78611 62516 0:00:39
## bounce.rate exit.percent month
## 1 50.86 36.53 04/01/2015 12:00:00 AM
## 2 41.69 4.53 04/01/2015 12:00:00 AM
## 3 40.66 23.23 04/01/2015 12:00:00 AM
## 4 69.29 46.42 04/01/2015 12:00:00 AM
## 5 59.57 18.76 04/01/2015 12:00:00 AM
## 6 25.67 21.74 04/01/2015 12:00:00 AM
## domain pageviews unique.pageviews
## 963 sdotperformance.seattle.gov/ 20 15
## 1022 obrien.seattle.gov/ 283 241
## 914 lc317web.light.seattle.gov/ 4 4
## 455 mayormurray.seattle.gov/ 9896 8925
## 744 councilconnection.seattle.gov/ 2 2
## average.time.on.page bounce.rate exit.percent month
## 963 0:00:13 81.82 55.00 01/01/2016 12:00:00 AM
## 1022 0:01:51 0.61 0.34 02/01/2016 12:00:00 AM
## 914 0:00:04 0.00 50.00 01/01/2016 12:00:00 AM
## 455 0:03:23 90.59 87.12 06/01/2015 12:00:00 AM
## 744 0:00:00 1.00 1.00 09/01/2015 12:00:00 AM
## PC3. Using the top 5000 dataset, build a data frame with one column
## per month (as recorded in the data) and a second column giving the
## total number of views across all pages in the dataset that month.

## Step 1: sum pageviews within each month; tapply() returns a named
## array keyed by the month strings.
total.views.bymonth.tbl <- with(total, tapply(pageviews, month, sum))
total.views.bymonth.tbl
## 01/01/2015 12:00:00 AM 01/01/2016 12:00:00 AM
## NA 6350440 3471121
## 02/01/2015 12:00:00 AM 02/01/2016 12:00:00 AM 03/01/2015 12:00:00 AM
## 5820453 3366834 6609602
## 03/01/2016 12:00:00 AM 04/01/2015 12:00:00 AM 04/01/2016 12:00:00 AM
## 4087054 6481483 3644750
## 05/01/2015 12:00:00 AM 06/01/2015 12:00:00 AM 07/01/2015 12:00:00 AM
## 6544055 6952488 8084318
## 08/01/2015 12:00:00 AM 09/01/2015 12:00:00 AM 10/01/2015 12:00:00 AM
## 7045189 3067760 2961681
## 12/01/2015 12:00:00 AM
## 5745045

## Step 2: turn the named array into a proper two-column data frame.
total.views <- data.frame(months = names(total.views.bymonth.tbl),
                          total = total.views.bymonth.tbl)

## Strip the month-name rownames inherited from the array. Purely
## cosmetic — everything downstream works the same without this.
rownames(total.views) <- NULL
head(total.views)
## months total
## 1 NA
## 2 01/01/2015 12:00:00 AM 6350440
## 3 01/01/2016 12:00:00 AM 3471121
## 4 02/01/2015 12:00:00 AM 5820453
## 5 02/01/2016 12:00:00 AM 3366834
## 6 03/01/2015 12:00:00 AM 6609602
## PC4. Using the mobile dataset, build a data frame where one column
## is each month described in the data and the second is an estimate of
## the total views made from mobile devices (all platforms) that month.
## This takes two steps because total views are not given directly: we
## first reconstruct a per-row view count from what is there.

## Estimate total pages viewed per row: sessions × pages-per-session.
mobile$total.pages <- with(mobile, Sessions * PagesPerSession)

## Same tapply() pattern as PC3, now summing the estimated page views
## within each month.
mobile.views.bymonth.tbl <- with(mobile, tapply(total.pages, Month, sum))
mobile.views.bymonth.tbl
## 01/01/2015 12:00:00 AM 01/01/2016 12:00:00 AM
## NA 1399185.6 668275.2
## 02/01/2015 12:00:00 AM 02/01/2016 12:00:00 AM 03/01/2015 12:00:00 AM
## 1275315.2 592607.8 1402086.4
## 03/01/2016 12:00:00 AM 04/01/2015 12:00:00 AM 04/01/2016 12:00:00 AM
## 800842.8 1381295.1 788533.7
## 05/01/2015 12:00:00 AM 06/01/2015 12:00:00 AM 07/01/2015 12:00:00 AM
## 1605914.9 1722519.5 1988848.0
## 07/01/2016 12:00:00 AM 08/01/2015 12:00:00 AM 08/01/2016 12:00:00 AM
## 878142.6 1741067.8 912435.4
## 09/01/2015 12:00:00 AM 10/01/2015 12:00:00 AM 12/01/2015 12:00:00 AM
## 564453.5 1285288.0 1223414.0

## Turn the named array into a data frame (rownames left as-is; they
## are harmless and merge() ignores them).
mobile.views <- data.frame(months = names(mobile.views.bymonth.tbl),
                           mobile = mobile.views.bymonth.tbl)
## PC5. Merge the two monthly series into one dataset with columns for
## month, total views (across the top 5000 pages), and mobile views.
## Some months appear in only one dataset, so a full outer join
## (all = TRUE, equivalent to all.x = TRUE plus all.y = TRUE) keeps
## every month and fills the missing side with NA — that is where the
## missing data comes from.
### TODO cleanup variable names to match
views <- merge(mobile.views, total.views, by = "months", all = TRUE)

## The month strings don't sort chronologically, so parse them into
## real Date objects. "%H:%M:%S" consumes the "12:00:00" part; the
## trailing " AM" is ignored by the parser, and the time of day is
## dropped anyway since the result is a Date.
views$months <- as.Date(views$months, format = "%m/%d/%Y %H:%M:%S")

## ...then sort chronologically. order() is the idiomatic form of the
## original sort.list(); both put NA months last by default.
views <- views[order(views$months), ]

## One row is entirely NA (an unparseable month label); drop any row
## where every column is missing. rowSums(is.na(...)) avoids apply()
## on a data frame, which would coerce it to a character matrix.
views <- views[rowSums(is.na(views)) < ncol(views), ]

## Inspecting shows partially-missing rows remain (months present in
## only one dataset); complete.cases() keeps rows with both measures.
views.complete <- views[complete.cases(views), ]
## PC6. Add a column holding the best estimate of the proportion of
## views that came from mobile. Assumption: the top-5000-pages total is
## a reasonable stand-in for all site traffic, so mobile / total gives
## a proportion. The months column is already a Date object (see PC5).
views.complete$pct.mobile <- with(views.complete, mobile / total)

## PC6. Graph the mobile proportion over time, to describe (a) the
## estimated share of views from mobiles to the Seattle City website
## over time and (b) whether it is going up or down.
library(ggplot2)
ggplot(views.complete, aes(x = months, y = pct.mobile)) +
  geom_point() +
  scale_y_continuous(limits = c(0, 1))