X-Git-Url: https://code.communitydata.science/stats_class_2019.git/blobdiff_plain/bd9eba025b1137bb6497a56ad3f5634ff411058e..f21e8d18685e0ed07df48b7eb5050a616ee316dd:/problem_sets/week_04/ps4-worked_solution.html diff --git a/problem_sets/week_04/ps4-worked_solution.html b/problem_sets/week_04/ps4-worked_solution.html new file mode 100644 index 0000000..cbb90a6 --- /dev/null +++ b/problem_sets/week_04/ps4-worked_solution.html @@ -0,0 +1,426 @@ + + + + +
## You'll need to edit these first lines to work on your own machine
##
## Note that for working with .Rmd files interactively in Rstudio you may find it easier to do this
## using the drop down menus: "Session" -> "Set Working Directory" -> "To Source File Location"
##

## setwd("~/Documents/Teaching/2019/stats/")
## list.files("data/week_04")

## Load the two Seattle.gov analytics datasets used below:
## - mobile: per-operating-system mobile session statistics by month
## - total: per-domain pageview statistics (top .gov domains) by month
mobile <- read.csv("data/week_04/COS-Statistics-Mobile_Sessions.csv")
total <- read.csv("data/week_04/COS-Statistics-Gov-Domains-Only.csv")
## Quick-look summary of a data frame: prints the number of rows, the
## number of columns, the first few rows, and a random sample of up to
## five rows.
##
## d: a data frame.
## Returns (invisibly) the sampled rows, because print() returns its
## argument invisibly.
summary.df <- function (d) {
  print(nrow(d))
  print(ncol(d))
  print(head(d))
  ## seq_len() is safer than seq(1, nrow(d)), which yields c(1, 0) on an
  ## empty data frame; min() keeps sample() from erroring when the data
  ## frame has fewer than 5 rows
  print(d[sample(seq_len(nrow(d)), min(5, nrow(d))), ])
}
+
## run these two lines a few times to look at the numbers
## (each call prints the dimensions, head(), and a *fresh* random
## sample of five rows, so repeated runs show different rows)
summary.df(mobile)
## [1] 231
## [1] 8
## Operating_System Sessions New_Sessions New_Users Bounce_Rate
## 1 iOS 332291 47.75 158674 60.79
## 2 Android 170107 45.53 77453 58.14
## 3 Windows 27325 44.76 12231 44.60
## 4 Windows Phone 10109 45.71 4621 59.01
## 5 BlackBerry 1375 39.27 540 62.98
## 6 (not set) 408 83.09 339 72.30
## PagesPerSession AvgSessionDuration Month
## 1 2.34 0:02:11 01/01/2015 12:00:00 AM
## 2 2.98 0:03:53 01/01/2015 12:00:00 AM
## 3 3.26 0:02:40 01/01/2015 12:00:00 AM
## 4 2.14 0:01:45 01/01/2015 12:00:00 AM
## 5 2.10 0:02:24 01/01/2015 12:00:00 AM
## 6 1.82 0:01:01 01/01/2015 12:00:00 AM
## Operating_System Sessions New_Sessions New_Users Bounce_Rate
## 51 Samsung 59 44.07 26 77.97
## 160 Nokia 12 100.00 12 100.00
## 162 Firefox OS 6 100.00 6 0.00
## 224 (not set) 419 87.11 365 74.22
## 172 LG 6 100.00 6 0.00
## PagesPerSession AvgSessionDuration Month
## 51 1.54 0:03:01 04/01/2015 12:00:00 AM
## 160 1.00 0:00:00 10/01/2015 12:00:00 AM
## 162 2.00 0:00:02 10/01/2015 12:00:00 AM
## 224 1.37 0:01:09 08/01/2016 12:00:00 AM
## 172 1.83 0:01:37 12/01/2015 12:00:00 AM
## same quick look at the top-5000 .gov domains dataset
summary.df(total)
## [1] 1242
## [1] 7
## domain pageviews unique.pageviews average.time.on.page
## 1 www.seattle.gov/ 3525737 2689843 0:01:19
## 2 www2.seattle.gov/ 2158182 125984 0:01:12
## 3 web6.seattle.gov/ 367871 204803 0:01:18
## 4 spdblotter.seattle.gov/ 117645 91076 0:01:14
## 5 web1.seattle.gov/ 79529 32258 0:01:09
## 6 find.seattle.gov/ 78611 62516 0:00:39
## bounce.rate exit.percent month
## 1 50.86 36.53 04/01/2015 12:00:00 AM
## 2 41.69 4.53 04/01/2015 12:00:00 AM
## 3 40.66 23.23 04/01/2015 12:00:00 AM
## 4 69.29 46.42 04/01/2015 12:00:00 AM
## 5 59.57 18.76 04/01/2015 12:00:00 AM
## 6 25.67 21.74 04/01/2015 12:00:00 AM
## domain pageviews unique.pageviews
## 963 sdotperformance.seattle.gov/ 20 15
## 1022 obrien.seattle.gov/ 283 241
## 914 lc317web.light.seattle.gov/ 4 4
## 455 mayormurray.seattle.gov/ 9896 8925
## 744 councilconnection.seattle.gov/ 2 2
## average.time.on.page bounce.rate exit.percent month
## 963 0:00:13 81.82 55.00 01/01/2016 12:00:00 AM
## 1022 0:01:51 0.61 0.34 02/01/2016 12:00:00 AM
## 914 0:00:04 0.00 50.00 01/01/2016 12:00:00 AM
## 455 0:03:23 90.59 87.12 06/01/2015 12:00:00 AM
## 744 0:00:00 1.00 1.00 09/01/2015 12:00:00 AM
## PC3. Using the top 5000 dataset, create a new data frame that one
## column per month (as described in the data) and a second column is
## the total number of views made to all pages in the dataset over
## that month.

## sum pageviews within each month; tapply() returns a named array
## with one entry per month label
total.views.bymonth.tbl <- with(total, tapply(pageviews, month, sum))
total.views.bymonth.tbl
## 01/01/2015 12:00:00 AM 01/01/2016 12:00:00 AM
## NA 6350440 3471121
## 02/01/2015 12:00:00 AM 02/01/2016 12:00:00 AM 03/01/2015 12:00:00 AM
## 5820453 3366834 6609602
## 03/01/2016 12:00:00 AM 04/01/2015 12:00:00 AM 04/01/2016 12:00:00 AM
## 4087054 6481483 3644750
## 05/01/2015 12:00:00 AM 06/01/2015 12:00:00 AM 07/01/2015 12:00:00 AM
## 6544055 6952488 8084318
## 08/01/2015 12:00:00 AM 09/01/2015 12:00:00 AM 10/01/2015 12:00:00 AM
## 7045189 3067760 2961681
## 12/01/2015 12:00:00 AM
## 5745045

## now build a two-column data frame from the named array;
## row.names = NULL replaces the month-label row names with sequential
## ones so it looks a bit better (this would all work the same without
## it)
total.views <- data.frame(months = names(total.views.bymonth.tbl),
                          total = total.views.bymonth.tbl,
                          row.names = NULL)

head(total.views)
## months total
## 1 NA
## 2 01/01/2015 12:00:00 AM 6350440
## 3 01/01/2016 12:00:00 AM 3471121
## 4 02/01/2015 12:00:00 AM 5820453
## 5 02/01/2016 12:00:00 AM 3366834
## 6 03/01/2015 12:00:00 AM 6609602
## PC4. Using the mobile dataset, create a new data frame where one
## column is each month described in the data and the second is a
## measure (estimate?) of the total number of views made by mobiles
## (all platforms) over each month. This will will involve at least
## two steps since total views are included. You'll need to first use
## the data there to create a measure of the total views per platform.

## estimate total page views per row: sessions times average pages
## viewed per session
mobile$total.pages <- with(mobile, Sessions * PagesPerSession)

## same tapply pattern as in PC3: sum the estimated views by month
mobile.views.bymonth.tbl <- with(mobile, tapply(total.pages, Month, sum))
mobile.views.bymonth.tbl
## 01/01/2015 12:00:00 AM 01/01/2016 12:00:00 AM
## NA 1399185.6 668275.2
## 02/01/2015 12:00:00 AM 02/01/2016 12:00:00 AM 03/01/2015 12:00:00 AM
## 1275315.2 592607.8 1402086.4
## 03/01/2016 12:00:00 AM 04/01/2015 12:00:00 AM 04/01/2016 12:00:00 AM
## 800842.8 1381295.1 788533.7
## 05/01/2015 12:00:00 AM 06/01/2015 12:00:00 AM 07/01/2015 12:00:00 AM
## 1605914.9 1722519.5 1988848.0
## 07/01/2016 12:00:00 AM 08/01/2015 12:00:00 AM 08/01/2016 12:00:00 AM
## 878142.6 1741067.8 912435.4
## 09/01/2015 12:00:00 AM 10/01/2015 12:00:00 AM 12/01/2015 12:00:00 AM
## 564453.5 1285288.0 1223414.0

## assemble the month/mobile-views data frame (row names are left
## as-is here, matching the original)
mobile.views <- data.frame(months = names(mobile.views.bymonth.tbl),
                           mobile = mobile.views.bymonth.tbl)
## PC5. Merge your two datasets together into a new dataset with
## columns for each month, total views (across the top 5000 pages) and
## total mobile views. Are there are missing data? Can you tell why?

### TODO cleanup variable names to match

## full outer join on month so months present in only one dataset are
## kept; all=TRUE is shorthand for all.x=TRUE, all.y=TRUE
views <- merge(mobile.views, total.views, all = TRUE, by = "months")

## these don't sort well at the moment because they're not really
## dates, so lets recode them (strptime-style format; the trailing
## " AM" is ignored and as.Date drops the time-of-day anyway)
views$months <- as.Date(views$months, format = "%m/%d/%Y %H:%M:%S")

## and then sort them; order() is the idiomatic equivalent of
## sort.list() and puts NA months last by default
views <- views[order(views$months), ]

## there's one line that is all missing, so lets drop that; keep rows
## with at least one non-NA value. rowSums(is.na(...)) avoids apply()
## silently coercing the data frame to a character matrix.
views <- views[rowSums(is.na(views)) < ncol(views), ]

## inspect it, looks like there's some missing data. lets drop
## that. there are a few ways but complete.cases() might make most
## cases
views.complete <- views[complete.cases(views), ]
## PC6. Create a new column in your merged dataset that describes your
## best estimate of the proportion (or percentage, if you really
## must!) of views that comes from mobile. Be able to talk about the
## assumptions you've made here. Make sure that date, in this final
## column, is a date or datetime object in R.

## proportion of estimated mobile views relative to total views over
## the top-5000 pages
views.complete$pct.mobile <- with(views.complete, mobile / total)

## PC6. Graph this over time and be ready to describe: (a) your best
## estimate of the proportion of views from mobiles to the Seattle
## City website over time and (b) an indication of whether it's going
## up or down.
## NOTE(review): this second prompt is also labeled "PC6" -- likely a
## typo for PC7 in the original problem set.

library(ggplot2)
ggplot(data = views.complete,
       aes(x = months, y = pct.mobile)) +
  geom_point() +
  scale_y_continuous(limits = c(0, 1))
+
+