X-Git-Url: https://code.communitydata.science/stats_class_2019.git/blobdiff_plain/bd9eba025b1137bb6497a56ad3f5634ff411058e..f21e8d18685e0ed07df48b7eb5050a616ee316dd:/problem_sets/week_04/ps4-worked_solution.html diff --git a/problem_sets/week_04/ps4-worked_solution.html b/problem_sets/week_04/ps4-worked_solution.html new file mode 100644 index 0000000..cbb90a6 --- /dev/null +++ b/problem_sets/week_04/ps4-worked_solution.html @@ -0,0 +1,426 @@ + + + + + + + + + + + + + + + +Week 4 Problem set: Worked solutions + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Programming challenges

+
+

PC2

+
## You'll need to edit these first lines to work on your own machine
##
## Note: when working with .Rmd files interactively in RStudio you may
## find it easier to set the working directory through the menus:
## "Session" → "Set Working Directory" → "To Source File Location"
##

## setwd("~/Documents/Teaching/2019/stats/")
## list.files("data/week_04")

## load the two Seattle analytics exports used throughout this problem set
mobile <- read.csv("data/week_04/COS-Statistics-Mobile_Sessions.csv")
total <- read.csv("data/week_04/COS-Statistics-Gov-Domains-Only.csv")
+
## Print a quick structural summary of a data frame: its number of
## rows, number of columns, the first few rows, and a small random
## sample of rows (so you see more than just the top of the file).
##
## d: a data frame
## Returns (invisibly) the sampled rows, since print() returns its
## argument invisibly.
summary.df <- function (d) {
    print(nrow(d))
    print(ncol(d))
    print(head(d))
    ## seq_len() is safer than seq(1, nrow(d)) for empty data frames,
    ## capping at nrow(d) keeps sample() from erroring when there are
    ## fewer than 5 rows, and drop = FALSE keeps one-column data
    ## frames from collapsing to a vector
    print(d[sample(seq_len(nrow(d)), min(5, nrow(d))), , drop = FALSE])
}
+
## run these two lines a few times to get a feel for the numbers
summary.df(mobile)

## [1] 231
## [1] 8
##   Operating_System Sessions New_Sessions New_Users Bounce_Rate
## 1              iOS   332291        47.75    158674       60.79
## 2          Android   170107        45.53     77453       58.14
## 3          Windows    27325        44.76     12231       44.60
## 4    Windows Phone    10109        45.71      4621       59.01
## 5       BlackBerry     1375        39.27       540       62.98
## 6        (not set)      408        83.09       339       72.30
##   PagesPerSession AvgSessionDuration                  Month
## 1            2.34            0:02:11 01/01/2015 12:00:00 AM
## 2            2.98            0:03:53 01/01/2015 12:00:00 AM
## 3            3.26            0:02:40 01/01/2015 12:00:00 AM
## 4            2.14            0:01:45 01/01/2015 12:00:00 AM
## 5            2.10            0:02:24 01/01/2015 12:00:00 AM
## 6            1.82            0:01:01 01/01/2015 12:00:00 AM
##     Operating_System Sessions New_Sessions New_Users Bounce_Rate
## 51           Samsung       59        44.07        26       77.97
## 160            Nokia       12       100.00        12      100.00
## 162       Firefox OS        6       100.00         6        0.00
## 224        (not set)      419        87.11       365       74.22
## 172               LG        6       100.00         6        0.00
##     PagesPerSession AvgSessionDuration                  Month
## 51             1.54            0:03:01 04/01/2015 12:00:00 AM
## 160            1.00            0:00:00 10/01/2015 12:00:00 AM
## 162            2.00            0:00:02 10/01/2015 12:00:00 AM
## 224            1.37            0:01:09 08/01/2016 12:00:00 AM
## 172            1.83            0:01:37 12/01/2015 12:00:00 AM

summary.df(total)

## [1] 1242
## [1] 7
##                    domain pageviews unique.pageviews average.time.on.page
## 1        www.seattle.gov/   3525737          2689843              0:01:19
## 2       www2.seattle.gov/   2158182           125984              0:01:12
## 3       web6.seattle.gov/    367871           204803              0:01:18
## 4 spdblotter.seattle.gov/    117645            91076              0:01:14
## 5       web1.seattle.gov/     79529            32258              0:01:09
## 6       find.seattle.gov/     78611            62516              0:00:39
##   bounce.rate exit.percent                  month
## 1       50.86        36.53 04/01/2015 12:00:00 AM
## 2       41.69         4.53 04/01/2015 12:00:00 AM
## 3       40.66        23.23 04/01/2015 12:00:00 AM
## 4       69.29        46.42 04/01/2015 12:00:00 AM
## 5       59.57        18.76 04/01/2015 12:00:00 AM
## 6       25.67        21.74 04/01/2015 12:00:00 AM
##                              domain pageviews unique.pageviews
## 963    sdotperformance.seattle.gov/        20               15
## 1022            obrien.seattle.gov/       283              241
## 914     lc317web.light.seattle.gov/         4                4
## 455        mayormurray.seattle.gov/      9896             8925
## 744  councilconnection.seattle.gov/         2                2
##      average.time.on.page bounce.rate exit.percent                  month
## 963               0:00:13       81.82        55.00 01/01/2016 12:00:00 AM
## 1022              0:01:51        0.61         0.34 02/01/2016 12:00:00 AM
## 914               0:00:04        0.00        50.00 01/01/2016 12:00:00 AM
## 455               0:03:23       90.59        87.12 06/01/2015 12:00:00 AM
## 744               0:00:00        1.00         1.00 09/01/2015 12:00:00 AM
+
## PC3. Using the top 5000 dataset, create a new data frame that has
## one column per month (as described in the data) and a second column
## with the total number of views made to all pages in the dataset
## over that month.

## tapply() sums pageviews within each level of month, producing a
## named array with one entry per month
views.per.month <- tapply(total$pageviews, total$month, sum)
views.per.month

##                        01/01/2015 12:00:00 AM 01/01/2016 12:00:00 AM 
##                     NA                6350440                3471121 
## 02/01/2015 12:00:00 AM 02/01/2016 12:00:00 AM 03/01/2015 12:00:00 AM 
##                5820453                3366834                6609602 
## 03/01/2016 12:00:00 AM 04/01/2015 12:00:00 AM 04/01/2016 12:00:00 AM 
##                4087054                6481483                3644750 
## 05/01/2015 12:00:00 AM 06/01/2015 12:00:00 AM 07/01/2015 12:00:00 AM 
##                6544055                6952488                8084318 
## 08/01/2015 12:00:00 AM 09/01/2015 12:00:00 AM 10/01/2015 12:00:00 AM 
##                7045189                3067760                2961681 
## 12/01/2015 12:00:00 AM 
##                5745045

## turn the named array into a proper data frame
total.views <- data.frame(months=names(views.per.month),
                          total=views.per.month)

## clear the row names copied over from the array so printing looks a
## bit cleaner (everything below would work the same without this step)
rownames(total.views) <- NULL

head(total.views)

##                   months   total
## 1                             NA
## 2 01/01/2015 12:00:00 AM 6350440
## 3 01/01/2016 12:00:00 AM 3471121
## 4 02/01/2015 12:00:00 AM 5820453
## 5 02/01/2016 12:00:00 AM 3366834
## 6 03/01/2015 12:00:00 AM 6609602
+
## PC4. Using the mobile dataset, create a new data frame where one
## column is each month described in the data and the second is a
## measure (estimate?) of the total number of views made by mobiles
## (all platforms) over each month. This will involve at least two
## steps since total views are not included directly. You'll need to
## first use the data there to create a measure of the total views per
## platform.

## first, multiply sessions by pages-per-session to estimate the total
## pages viewed per platform per month
mobile$total.pages <- mobile$Sessions * mobile$PagesPerSession 

## then sum those estimates by month, exactly as in PC3 above
mobile.per.month <- tapply(mobile$total.pages, mobile$Month, sum)
mobile.per.month

##                        01/01/2015 12:00:00 AM 01/01/2016 12:00:00 AM 
##                     NA              1399185.6               668275.2 
## 02/01/2015 12:00:00 AM 02/01/2016 12:00:00 AM 03/01/2015 12:00:00 AM 
##              1275315.2               592607.8              1402086.4 
## 03/01/2016 12:00:00 AM 04/01/2015 12:00:00 AM 04/01/2016 12:00:00 AM 
##               800842.8              1381295.1               788533.7 
## 05/01/2015 12:00:00 AM 06/01/2015 12:00:00 AM 07/01/2015 12:00:00 AM 
##              1605914.9              1722519.5              1988848.0 
## 07/01/2016 12:00:00 AM 08/01/2015 12:00:00 AM 08/01/2016 12:00:00 AM 
##               878142.6              1741067.8               912435.4 
## 09/01/2015 12:00:00 AM 10/01/2015 12:00:00 AM 12/01/2015 12:00:00 AM 
##               564453.5              1285288.0              1223414.0

mobile.views <- data.frame(months=names(mobile.per.month),
                           mobile=mobile.per.month)
+
## PC5. Merge your two datasets together into a new dataset with
## columns for each month, total views (across the top 5000 pages) and
## total mobile views. Are there any missing data? Can you tell why?

### TODO cleanup variable names to match

## full outer join on month: all=TRUE is the idiomatic shorthand for
## all.x=TRUE, all.y=TRUE and keeps rows present in either dataset
views <- merge(mobile.views, total.views, all=TRUE, by="months")

## the month strings don't sort chronologically because they're not
## really dates, so recode them as Date objects (the time-of-day part
## is parsed but discarded by as.Date)
views$months <- as.Date(views$months, format="%m/%d/%Y %H:%M:%S")

## and then sort by date; order() gives the sorting permutation for
## data frame rows
views <- views[order(views$months),]

## there's one row that is all missing, so drop it; rowSums() over
## !is.na() avoids apply()'s coercion of the mixed Date/numeric data
## frame to a character matrix
views <- views[rowSums(!is.na(views)) > 0,]

## inspect it, looks like there's some missing data, so drop it too.
## there are a few ways to do this but complete.cases() probably makes
## the most sense here
views.complete <- views[complete.cases(views),]
+
## PC6. Create a new column in your merged dataset that describes your
## best estimate of the proportion (or percentage, if you really
## must!) of views that comes from mobile. Be able to talk about the
## assumptions you've made here. Make sure that date, in this final
## column, is a date or datetime object in R.

## the estimated mobile page views divided by the top-5000 total
views.complete$pct.mobile <- with(views.complete, mobile / total)
+    
## PC6. Graph this over time and be ready to describe: (a) your best
## estimate of the proportion of views from mobiles to the Seattle
## City website over time and (b) an indication of whether it's going
## up or down.

library(ggplot2)
## a scatterplot of the mobile share by month, with the y axis fixed
## to [0, 1] so the proportion is read against its full possible range
ggplot(data=views.complete, aes(x=months, y=pct.mobile)) +
    geom_point() +
    scale_y_continuous(limits=c(0, 1))
+

+
+
+
+

Statistical questions

+
+
+

Empirical paper questions

+
+ + + + +
+ + + + + + + +