Programming challenges

PC2

## You'll need to edit these first lines to work on your own machine
##
## Note that for working with .Rmd files interactively in Rstudio you may find it easier to do this 
## using the drop down menus: "Session" → "Set Working Directory" → "To Source File Location" 
##

## setwd("~/Documents/Teaching/2019/stats/")
## list.files("data/week_04")

## load the two CSV exports used throughout this script; both paths are
## relative to the current working directory
mobile <- read.csv(file = "data/week_04/COS-Statistics-Mobile_Sessions.csv")
total <- read.csv(file = "data/week_04/COS-Statistics-Gov-Domains-Only.csv")


## Print a quick overview of a data frame: its number of rows, its
## number of columns, its first rows (head) and a random sample of up
## to five rows.
##
## d: the data frame to inspect.
##
## Returns (invisibly) the randomly sampled rows that were printed.
##
## The sample size is capped at nrow(d) so that data frames with fewer
## than five rows (including empty ones) do not make the sampling step
## fail, and drop=FALSE keeps the sample a data frame even when d has
## only one column.
summary.df <- function (d) {
    n.rows <- nrow(d)
    print(n.rows)
    print(ncol(d))
    print(head(d))

    ## sample.int() draws row indices from 1..n.rows without replacement
    picked <- sample.int(n.rows, min(5, n.rows))
    sampled <- d[picked, , drop=FALSE]
    print(sampled)
    invisible(sampled)
}

## run these two calls (this one and summary.df(total) further down) a
## few times to look at the numbers: each call prints the dimensions,
## the first rows, and a fresh random sample of five rows
summary.df(mobile)
## [1] 231
## [1] 8
##   Operating_System Sessions New_Sessions New_Users Bounce_Rate
## 1              iOS   332291        47.75    158674       60.79
## 2          Android   170107        45.53     77453       58.14
## 3          Windows    27325        44.76     12231       44.60
## 4    Windows Phone    10109        45.71      4621       59.01
## 5       BlackBerry     1375        39.27       540       62.98
## 6        (not set)      408        83.09       339       72.30
##   PagesPerSession AvgSessionDuration                  Month
## 1            2.34            0:02:11 01/01/2015 12:00:00 AM
## 2            2.98            0:03:53 01/01/2015 12:00:00 AM
## 3            3.26            0:02:40 01/01/2015 12:00:00 AM
## 4            2.14            0:01:45 01/01/2015 12:00:00 AM
## 5            2.10            0:02:24 01/01/2015 12:00:00 AM
## 6            1.82            0:01:01 01/01/2015 12:00:00 AM
##     Operating_System Sessions New_Sessions New_Users Bounce_Rate
## 51           Samsung       59        44.07        26       77.97
## 160            Nokia       12       100.00        12      100.00
## 162       Firefox OS        6       100.00         6        0.00
## 224        (not set)      419        87.11       365       74.22
## 172               LG        6       100.00         6        0.00
##     PagesPerSession AvgSessionDuration                  Month
## 51             1.54            0:03:01 04/01/2015 12:00:00 AM
## 160            1.00            0:00:00 10/01/2015 12:00:00 AM
## 162            2.00            0:00:02 10/01/2015 12:00:00 AM
## 224            1.37            0:01:09 08/01/2016 12:00:00 AM
## 172            1.83            0:01:37 12/01/2015 12:00:00 AM
## same overview for the per-domain page view data
summary.df(total)
## [1] 1242
## [1] 7
##                    domain pageviews unique.pageviews average.time.on.page
## 1        www.seattle.gov/   3525737          2689843              0:01:19
## 2       www2.seattle.gov/   2158182           125984              0:01:12
## 3       web6.seattle.gov/    367871           204803              0:01:18
## 4 spdblotter.seattle.gov/    117645            91076              0:01:14
## 5       web1.seattle.gov/     79529            32258              0:01:09
## 6       find.seattle.gov/     78611            62516              0:00:39
##   bounce.rate exit.percent                  month
## 1       50.86        36.53 04/01/2015 12:00:00 AM
## 2       41.69         4.53 04/01/2015 12:00:00 AM
## 3       40.66        23.23 04/01/2015 12:00:00 AM
## 4       69.29        46.42 04/01/2015 12:00:00 AM
## 5       59.57        18.76 04/01/2015 12:00:00 AM
## 6       25.67        21.74 04/01/2015 12:00:00 AM
##                              domain pageviews unique.pageviews
## 963    sdotperformance.seattle.gov/        20               15
## 1022            obrien.seattle.gov/       283              241
## 914     lc317web.light.seattle.gov/         4                4
## 455        mayormurray.seattle.gov/      9896             8925
## 744  councilconnection.seattle.gov/         2                2
##      average.time.on.page bounce.rate exit.percent                  month
## 963               0:00:13       81.82        55.00 01/01/2016 12:00:00 AM
## 1022              0:01:51        0.61         0.34 02/01/2016 12:00:00 AM
## 914               0:00:04        0.00        50.00 01/01/2016 12:00:00 AM
## 455               0:03:23       90.59        87.12 06/01/2015 12:00:00 AM
## 744               0:00:00        1.00         1.00 09/01/2015 12:00:00 AM
## PC3. Using the top 5000 dataset, create a new data frame where one
## column is each month (as described in the data) and a second column
## is the total number of views made to all pages in the dataset over
## that month.

## sum pageviews within each month; tapply() returns an array named by
## the distinct values of total$month (a month whose pageviews include
## a missing value sums to NA, since na.rm is not set)
total.views.bymonth.tbl <- with(total, tapply(pageviews, month, sum))
print(total.views.bymonth.tbl)
##                        01/01/2015 12:00:00 AM 01/01/2016 12:00:00 AM 
##                     NA                6350440                3471121 
## 02/01/2015 12:00:00 AM 02/01/2016 12:00:00 AM 03/01/2015 12:00:00 AM 
##                5820453                3366834                6609602 
## 03/01/2016 12:00:00 AM 04/01/2015 12:00:00 AM 04/01/2016 12:00:00 AM 
##                4087054                6481483                3644750 
## 05/01/2015 12:00:00 AM 06/01/2015 12:00:00 AM 07/01/2015 12:00:00 AM 
##                6544055                6952488                8084318 
## 08/01/2015 12:00:00 AM 09/01/2015 12:00:00 AM 10/01/2015 12:00:00 AM 
##                7045189                3067760                2961681 
## 12/01/2015 12:00:00 AM 
##                5745045
## assemble the per-month totals into a two-column data frame: the
## month labels (taken from the table's names) and the summed pageviews
total.views <- data.frame(
    months = names(total.views.bymonth.tbl),
    total = total.views.bymonth.tbl
)

## drop the row names so the printed frame is simply numbered; this is
## purely cosmetic and everything would work the same without it
row.names(total.views) <- NULL

head(total.views, n = 6)
##                   months   total
## 1                             NA
## 2 01/01/2015 12:00:00 AM 6350440
## 3 01/01/2016 12:00:00 AM 3471121
## 4 02/01/2015 12:00:00 AM 5820453
## 5 02/01/2016 12:00:00 AM 3366834
## 6 03/01/2015 12:00:00 AM 6609602
## PC4. Using the mobile dataset, create a new data frame where one
## column is each month described in the data and the second is a
## measure (estimate?) of the total number of views made by mobiles
## (all platforms) over each month. This will involve at least
## two steps since total views are not included. You'll need to first use
## the data there to create a measure of the total views per platform.

## Sessions * PagesPerSession gives an estimate of the number of pages
## viewed for each row of the mobile data
mobile$total.pages <- with(mobile, Sessions * PagesPerSession)

## then total those estimates within each month, the same way the
## pageviews were totalled for the other dataset
mobile.views.bymonth.tbl <- with(mobile, tapply(total.pages, Month, sum))
print(mobile.views.bymonth.tbl)
##                        01/01/2015 12:00:00 AM 01/01/2016 12:00:00 AM 
##                     NA              1399185.6               668275.2 
## 02/01/2015 12:00:00 AM 02/01/2016 12:00:00 AM 03/01/2015 12:00:00 AM 
##              1275315.2               592607.8              1402086.4 
## 03/01/2016 12:00:00 AM 04/01/2015 12:00:00 AM 04/01/2016 12:00:00 AM 
##               800842.8              1381295.1               788533.7 
## 05/01/2015 12:00:00 AM 06/01/2015 12:00:00 AM 07/01/2015 12:00:00 AM 
##              1605914.9              1722519.5              1988848.0 
## 07/01/2016 12:00:00 AM 08/01/2015 12:00:00 AM 08/01/2016 12:00:00 AM 
##               878142.6              1741067.8               912435.4 
## 09/01/2015 12:00:00 AM 10/01/2015 12:00:00 AM 12/01/2015 12:00:00 AM 
##               564453.5              1285288.0              1223414.0
## two-column data frame: the month labels (the table's names) and the
## estimated mobile page views for each month
mobile.views <- data.frame(
    months = names(mobile.views.bymonth.tbl),
    mobile = mobile.views.bymonth.tbl
)
## PC5. Merge your two datasets together into a new dataset with
## columns for each month, total views (across the top 5000 pages) and
## total mobile views. Are there any missing data? Can you tell why?

### TODO cleanup variable names to match

## full outer join on the month label, so months that appear in only
## one of the two tables are kept (with NA on the other side)
views <- merge(mobile.views, total.views, by = "months", all = TRUE)

## the month labels are not really dates yet, so they don't sort well;
## recode them as Date objects first ...
views$months <- as.Date(views$months, format = "%m/%d/%Y %H:%M:%S")

## ... and then put the rows in date order
views <- views[order(views$months), ]

## there's one row that is missing in every column, so keep only the
## rows that have at least one non-missing value
views <- views[rowSums(!is.na(views)) > 0, ]

## inspecting the result shows some remaining missing data. there are a
## few ways to drop it, but complete.cases() probably makes the most
## sense: it keeps only the rows with no missing values at all
views.complete <- views[complete.cases(views), ]
## PC6. Create a new column in your merged dataset that describes your
## best estimate of the proportion (or percentage, if you really
## must!) of views that comes from mobile. Be able to talk about the
## assumptions you've made here. Make sure that date, in this final
## column, is a date or datetime object in R.

## estimated share of all page views that came from mobile devices,
## stored as a proportion (mobile estimate / total for the same month)
views.complete[["pct.mobile"]] <-
    views.complete[["mobile"]] / views.complete[["total"]]
    
## PC7. Graph this over time and be ready to describe: (a) your best
## estimate of the proportion of views from mobiles to the Seattle
## City website over time and (b) an indication of whether it's going
## up or down.

library(ggplot2)
## scatter plot of the estimated mobile proportion by month, with the
## y axis pinned to the full 0-1 range
ggplot(data = views.complete, mapping = aes(x = months, y = pct.mobile)) +
    geom_point() +
    scale_y_continuous(limits = c(0, 1))

Statistical questions

Empirical paper questions