X-Git-Url: https://code.communitydata.science/stats_class_2019.git/blobdiff_plain/94e323497df92322a9e05f1c820a38fb56e38b39..dbde6a6880af749afd92848e06128fe161159d0b:/problem_sets/week_06/ps6-worked-solution.html diff --git a/problem_sets/week_06/ps6-worked-solution.html b/problem_sets/week_06/ps6-worked-solution.html new file mode 100644 index 0000000..7a80c67 --- /dev/null +++ b/problem_sets/week_06/ps6-worked-solution.html @@ -0,0 +1,588 @@ + + + + +
+ + + + + + + + + + +PC0. First we import the data.
+raw_df = read.csv("~/Desktop/DeleteMe/Teaching/owan03.csv") # Note that I saved the file as a CSV for importing to R
+head(raw_df)
+## X1 X2 X3 X4
+## 1 70 49 30 34
+## 2 77 60 37 36
+## 3 83 63 56 48
+## 4 87 67 65 48
+## 5 92 70 76 65
+## 6 93 74 83 91
+PC1. Letâs reshape the data
+library(tidyverse)
+## ââ Attaching packages ââââââââââââââââââââââââââââââââââ tidyverse 1.2.1 ââ
+## â ggplot2 3.1.0 â purrr 0.2.5
+## â tibble 2.1.1 â dplyr 0.7.7
+## â tidyr 0.8.2 â stringr 1.3.1
+## â readr 1.1.1 â forcats 0.3.0
+## ââ Conflicts âââââââââââââââââââââââââââââââââââââ tidyverse_conflicts() ââ
+## â dplyr::filter() masks stats::filter()
+## â dplyr::lag() masks stats::lag()
+# Rename the columns
+colnames(raw_df) <- c("None", "Low", "Medium","High")
+
+# Gather gets all of the columns from the dataframe and
+# use them as keys, with the value as the value from that cell.
+# This gives us a "long" dataframe
+df <- gather(raw_df, dose, weeks_alive)
+
+# We want to treat dose as a factor, so we can group by it easier
+df$dose <- as.factor(df$dose)
+
+# Finally, when we look at it, we see some missing data (NA); this is because not all group sizes were the same.
+# We can safely remove these.
+df <- df[complete.cases(df),]
+PC2: Now weâre goint to get statistics and create some visualizations
+tapply(df$weeks_alive, df$dose, summary)
+## $High
+## Min. 1st Qu. Median Mean 3rd Qu. Max.
+## 34.00 45.00 56.50 65.25 92.75 102.00
+##
+## $Low
+## Min. 1st Qu. Median Mean 3rd Qu. Max.
+## 49.00 63.00 70.00 69.89 77.00 89.00
+##
+## $Medium
+## Min. 1st Qu. Median Mean 3rd Qu. Max.
+## 30.00 58.25 79.50 71.50 89.25 97.00
+##
+## $None
+## Min. 1st Qu. Median Mean 3rd Qu. Max.
+## 70.00 85.00 93.00 91.36 101.00 103.00
+# Alternative way to do this using tidyverse
+
+df %>% group_by(dose) %>% summarize_all(c('min','max','mean', 'IQR'))
+## # A tibble: 4 x 5
+## dose min max mean IQR
+## <fct> <dbl> <dbl> <dbl> <dbl>
+## 1 High 34 102 65.2 47.8
+## 2 Low 49 89 69.9 14
+## 3 Medium 30 97 71.5 31
+## 4 None 70 103 91.4 16
+When it comes to visualizations, we definitely want to use ggplot. We have lots of options for what to do.
+# Histograms
+
+h_plot = df %>% ggplot(aes(x=weeks_alive, # What to summarize
+ fill = dose # How to group by color
+ )) + geom_histogram(position = 'dodge', bins = 5)
+h_plot
+
+# In this case, faceted histograms is probably better
+
+h_facet = df %>% ggplot(aes(x=weeks_alive, # What to summarize
+ )) + geom_histogram(bins = 5) + facet_grid(~dose)
+
+h_facet
+
+# Density plots
+d_plot = df %>% ggplot(aes(x=weeks_alive, # What to summarize
+ fill = dose # How to group by color
+ )) + geom_density(alpha = .2)
+d_plot
+
+# Boxplots
+box_plot = df %>% ggplot(aes(y=weeks_alive,
+ x = dose
+ )) + geom_boxplot()
+box_plot
+
+# My favorite - ridgeline plots
+
+# install.packages('ggridges')
+
+library(ggridges)
+##
+## Attaching package: 'ggridges'
+## The following object is masked from 'package:ggplot2':
+##
+## scale_discrete_manual
+ridge_plot = df %>% ggplot(aes(x=weeks_alive, y = dose)) +
+ geom_density_ridges(jittered_points = T)
+ridge_plot
+## Picking joint bandwidth of 10.5
+
+Itâs a bit tough to tell, but the overall assumptions of normality and equal variance seem reasonable.
+The global mean is
+mean(df$weeks_alive)
+## [1] 75.55263
+PC3. T-test between None and Any, and between None and High.
+t.test(df[df$dose == 'None', 'weeks_alive'], # Samples with no dose
+ df[df$dose != 'None','weeks_alive'] # Samples with any dose
+ )
+##
+## Welch Two Sample t-test
+##
+## data: df[df$dose == "None", "weeks_alive"] and df[df$dose != "None", "weeks_alive"]
+## t = 4.2065, df = 33.732, p-value = 0.0001806
+## alternative hypothesis: true difference in means is not equal to 0
+## 95 percent confidence interval:
+## 11.49879 33.00626
+## sample estimates:
+## mean of x mean of y
+## 91.36364 69.11111
+# Or, using formula notation
+t.test(weeks_alive ~ dose == 'None', data = df)
+##
+## Welch Two Sample t-test
+##
+## data: weeks_alive by dose == "None"
+## t = -4.2065, df = 33.732, p-value = 0.0001806
+## alternative hypothesis: true difference in means is not equal to 0
+## 95 percent confidence interval:
+## -33.00626 -11.49879
+## sample estimates:
+## mean in group FALSE mean in group TRUE
+## 69.11111 91.36364
+# T-test between None and High
+
+t.test(df[df$dose == 'None', 'weeks_alive'], # Samples with no dose
+ df[df$dose == 'High','weeks_alive'] # Samples with high dose
+ )
+##
+## Welch Two Sample t-test
+##
+## data: df[df$dose == "None", "weeks_alive"] and df[df$dose == "High", "weeks_alive"]
+## t = 2.4958, df = 8.5799, p-value = 0.0353
+## alternative hypothesis: true difference in means is not equal to 0
+## 95 percent confidence interval:
+## 2.266399 49.960874
+## sample estimates:
+## mean of x mean of y
+## 91.36364 65.25000
+# Formula notation is a bit tricker. I would probably create a temprorary dataframe
+
+tmp = df %>% filter(dose %in% c('None', 'High'))
+
+t.test(weeks_alive ~ dose, data = tmp)
+##
+## Welch Two Sample t-test
+##
+## data: weeks_alive by dose
+## t = -2.4958, df = 8.5799, p-value = 0.0353
+## alternative hypothesis: true difference in means is not equal to 0
+## 95 percent confidence interval:
+## -49.960874 -2.266399
+## sample estimates:
+## mean in group High mean in group None
+## 65.25000 91.36364
+The t-test supports the idea that receiving a dose of RD40 reduces lifespan
+PC4. Anova
+summary(aov(weeks_alive ~ dose, data = df))
+## Df Sum Sq Mean Sq F value Pr(>F)
+## dose 3 4052 1350.7 3.55 0.0245 *
+## Residuals 34 12937 380.5
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+This provides evidence that the group means are different.
+Q1. a) It is a sample statistic, because it comes from a sample. b) Confidence intervals for proportions are equal to
+\[p \pm z * \sqrt{ \frac{p*(1-p)}{n}}\]
+For a 95% confidence interval, \(z = 1.96\), so we can calculate it like this:
+lower = .48 - 1.96 * sqrt(.48 * .52 / 1259)
+
+upper = .48 + 1.96 * sqrt(.48 * .52 / 1259)
+
+ci = c(lower, upper)
+
+print(ci)
+## [1] 0.4524028 0.5075972
+This means that we are 95% confident that the true proportion of Americans who support legalizing marijuana is between ~45% and ~51%.
+6.20 We can use the point estimate of the poll to estimate how large a sample we would need to have a confidence interval of a given width.
+Basically, we want each half of the confidence interval to be 1%, i.e., \(1.96 * \sqrt{\frac{.48 * .52}{n}} = .01\)
+We can solve for \(n\):
+\[\sqrt{\frac{.48 * .52}{n}} = .01/1.96\] \[\frac{.48 * .52}{n} = (.01/1.96)^2\] \[n = (.48 * .52)/(.01/1.96)^2\]
+(.48 * .52)/(.01/1.96)^2
+## [1] 9588.634
+So, we need a sample of approximately 9,589
+6.38
+The question is whether there has been a change in the proportion over time. While the tools we have learned could allow you to answer that question, they assume that responses are independent. In this case, they are obviously not independent as they come from the same students.
+6.50
+chisq.test(x = c(83,121,193,103), p = c(.18,.22,.37,.23))
+##
+## Chi-squared test for given probabilities
+##
+## data: c(83, 121, 193, 103)
+## X-squared = 3.2426, df = 3, p-value = 0.3557
+# Or manually
+
+# Calculate expected values
+500 * c(.18,.22,.37,.23)
+## [1] 90 110 185 115
+# Use the formula for chi-squared
+chisq = (83-90)^2/90 + (121-110)^2/110 + (193-185)^2/185 + (103-115)^2/115
+The p-value for this is large, meaning that we donât have evidence that the sample differs from the census distribution.
+x <- matrix(c(29,54,44,77,62,131,36,67), nrow = 4, # this makes a matrix with 4 rows
+ byrow=T) # And this says that we've entered it row by row
+chisq.test(x)
+##
+## Pearson's Chi-squared test
+##
+## data: x
+## X-squared = 0.66724, df = 3, p-value = 0.8809
+