X-Git-Url: https://code.communitydata.science/stats_class_2019.git/blobdiff_plain/6daad5c6d32c7e66729a5444709e1dd4de82b1e0..fb2e2b741c48f616bb32e074de43048e315875a7:/problem_sets/week_08/ps8-worked-solutions.html diff --git a/problem_sets/week_08/ps8-worked-solutions.html b/problem_sets/week_08/ps8-worked-solutions.html new file mode 100644 index 0000000..99a9c82 --- /dev/null +++ b/problem_sets/week_08/ps8-worked-solutions.html @@ -0,0 +1,492 @@ + + + + +
+ + + + + + + + + + +w3 <- read.csv(url("https://communitydata.cc/~ads/teaching/2019/stats/data/week_03/group_08.csv"))
+
+## Cleanup and recoding
+w3$j <- as.logical(w3$j)
+w3$l <- as.logical(w3$l)
+
+w3$k <- factor(w3$k,
+ levels=c(0,1,2,3),
+ labels=c("none", "some", "lots", "all"))
+
+summary(w3)
+## x j l k
+## Min. :0.001657 Mode :logical Mode :logical none: 0
+## 1st Qu.:0.691977 FALSE:55 FALSE:48 some:31
+## Median :2.059659 TRUE :45 TRUE :52 lots:39
+## Mean :2.400375 all :30
+## 3rd Qu.:3.534149
+## Max. :9.170684
+## NA's :5
+## y
+## Min. :-4.40
+## 1st Qu.: 3.02
+## Median : 6.08
+## Mean : 7.47
+## 3rd Qu.:11.95
+## Max. :29.51
+## NA's :5
+t.test(w3$x, w3$y)
+##
+## Welch Two Sample t-test
+##
+## data: w3$x and w3$y
+## t = -7.0336, df = 111.6, p-value = 1.711e-10
+## alternative hypothesis: true difference in means is not equal to 0
+## 95 percent confidence interval:
+## -6.497250 -3.641157
+## sample estimates:
+## mean of x mean of y
+## 2.400375 7.469579
+cor(w3$x, w3$y, use="complete.obs")
+## [1] 0.9063767
+f <- formula(y ~ x)
+m <- lm(f, data=w3) # 'lm' stands for 'linear model', and is used for linear regression
+
+summary(m)
+##
+## Call:
+## lm(formula = f, data = w3)
+##
+## Residuals:
+## Min 1Q Median 3Q Max
+## -5.3044 -2.2549 0.1348 2.2098 4.8179
+##
+## Coefficients:
+## Estimate Std. Error t value Pr(>|t|)
+## (Intercept) 0.3898 0.4502 0.866 0.389
+## x 2.9494 0.1426 20.690 <2e-16 ***
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+##
+## Residual standard error: 2.852 on 93 degrees of freedom
+## (5 observations deleted due to missingness)
+## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8196
+## F-statistic: 428.1 on 1 and 93 DF, p-value: < 2.2e-16
+## Using base R graphing tools
+plot(m$residuals)
+
+hist(m$residuals)
+
+plot(m$fitted.values, m$residuals)
+
+qqnorm(m$residuals)
+
+library(ggplot2)
+## Registered S3 methods overwritten by 'ggplot2':
+## method from
+## [.quosures rlang
+## c.quosures rlang
+## print.quosures rlang
+
+library(ggfortify)
+
+autoplot(m)
+
+library(stargazer)
+##
+## Please cite as:
+## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
+## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
+stargazer(m, type="text")
+##
+## ===============================================
+## Dependent variable:
+## ---------------------------
+## y
+## -----------------------------------------------
+## x 2.949***
+## (0.143)
+##
+## Constant 0.390
+## (0.450)
+##
+## -----------------------------------------------
+## Observations 95
+## R2 0.822
+## Adjusted R2 0.820
+## Residual Std. Error 2.852 (df = 93)
+## F Statistic 428.063*** (df = 1; 93)
+## ===============================================
+## Note: *p<0.1; **p<0.05; ***p<0.01
+library(readstata13)
+df <- read.dta13('~/Documents/Teaching/2019/stats/data/week_07/Halloween2012-2014-2015_PLOS.dta')
+df <- df[complete.cases(df),] # Remove rows with NA
+
+f1 <- formula(fruit ~ obama) # Create a formula for the regression
+
+m1 <- lm(f1, data=df) # Create a model and store it as m1
+
+summary(m1) # Display a summary of the regression model
+##
+## Call:
+## lm(formula = f1, data = df)
+##
+## Residuals:
+## Min 1Q Median 3Q Max
+## -0.2748 -0.2748 -0.2368 0.7252 0.7632
+##
+## Coefficients:
+## Estimate Std. Error t value Pr(>|t|)
+## (Intercept) 0.23681 0.01555 15.233 <2e-16 ***
+## obama 0.03797 0.02578 1.473 0.141
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+##
+## Residual standard error: 0.4333 on 1219 degrees of freedom
+## Multiple R-squared: 0.001776, Adjusted R-squared: 0.0009572
+## F-statistic: 2.169 on 1 and 1219 DF, p-value: 0.1411
+autoplot(m1)
+
+hist(residuals(m1))
+
+plot(df$obama, m1$residuals)
+
+I'll fit the models separately and then present them using Stargazer to facilitate readability:
+m2012 <- lm(f1, data=df[df$year == 2012,]) # Create a regression model for just 2012
+m2014 <- lm(f1, data=df[df$year == 2014,])
+m2015 <- lm(f1, data=df[df$year == 2015,])
+
+stargazer(m2012, m2014, m2015, column.labels = c("2012", "2014", "2015"), type="text")
+##
+## ================================================================================
+## Dependent variable:
+## ------------------------------------------------------------
+## fruit
+## 2012 2014 2015
+## (1) (2) (3)
+## --------------------------------------------------------------------------------
+## obama 0.050 0.017 0.065*
+## (0.067) (0.042) (0.038)
+##
+## Constant 0.203*** 0.219*** 0.253***
+## (0.051) (0.026) (0.021)
+##
+## --------------------------------------------------------------------------------
+## Observations 164 422 635
+## R2 0.003 0.0004 0.004
+## Adjusted R2 -0.003 -0.002 0.003
+## Residual Std. Error 0.424 (df = 162) 0.419 (df = 420) 0.445 (df = 633)
+## F Statistic 0.550 (df = 1; 162) 0.159 (df = 1; 420) 2.849* (df = 1; 633)
+## ================================================================================
+## Note: *p<0.1; **p<0.05; ***p<0.01
+Assumptions and how they look from the plots: Linear relationship between explanatory and outcome variables: Some outliers in the plots of residuals against each of the explanatory variables (IQ and gender) suggest that the fit is less than ideal. It does not look like there's a strong linear relationship from these plots. Normally distributed residuals: The quantile-quantile plot (top left) shows some systematic deviation from the normal distribution. Constant variability of residuals: There are some outliers (individual points with very large residual values) in the top right plot. There also seems to be some relationship between the residuals and gender (bottom plot). These features suggest that the assumption is not met. Independent residuals: None of the plots show obvious evidence of dependence between data points, but there are (again) some outliers that could be concerning.
+Overall, the data do not seem to meet several of the assumptions necessary to identify the model due to some outliers and some unobserved relationships structuring the data. It would be risky to rely on this model to draw valid inferences about the relationships between the variables as a result.
+The damaged O-rings almost all occurred at the lowest launch-time temperature.
The model suggests that lower launch-time temperatures result in a higher probability of O-ring damage. The coefficient of the "Temperature" term is negative with a very small (proportionally speaking) standard error. It is statistically significant with a p-value near 0 (\(H_0:~\beta_{temp} = 0\), \(H_A:~\beta_{temp}\neq 0\)), indicating that the data provide evidence that the coefficient is likely different from 0. Exponentiating the coefficient (see the R-code below), we see that a one degree Fahrenheit increase in launch-time temperature is associated with 81% as large odds of O-ring damage occurring.
exp(-.2162)
+## [1] 0.8055742
+my.fits <- function(x){
+ p.hat <- exp(11.663-(0.2162*x) ) # Function to get the odds
+ print(p.hat)
+ return(p.hat / (1 + p.hat)) # Convert the odds into a probability
+}
+
+vals <- c(51,52, 53, 55) # Vector of values we want probabilities for
+
+my.fits(vals)
+## [1] 1.8904218 1.5228750 1.2267888 0.7961243
+## [1] 0.6540297 0.6036268 0.5509228 0.4432456
+geom_smooth()
that fits a quadratic function (\(y = x^2\)) to the points. You might experiment with the different options for geom_smooth()
or, for a simpler solution, just try geom_line()
(with no arguments) instead.vals <- seq(51, 71, by=2) # This creates a vector from 51 to 71, counting by twos
+
+probs <- my.fits(vals) # store the probabilities as another vector
+## [1] 1.89042184 1.22678877 0.79612426 0.51664464 0.33527640 0.21757754
+## [7] 0.14119689 0.09162968 0.05946306 0.03858854 0.02504202
+pred <- data.frame(temp=vals, pred.probability=probs) # Change to a data frame for ggplot
+
+ggplot(data=pred, aes(x=temp, y=pred.probability)) +
+ geom_point(color="purple") + # Plot the points
+ geom_smooth(color="orange", # Add a smooth line
+ method="glm", # Create a line fit to the data
+ formula = y ~ poly(x, 2), # Using this formula
+ se=FALSE) # Don't show standard errors
+
+We will discuss these in class.
+