X-Git-Url: https://code.communitydata.science/stats_class_2020.git/blobdiff_plain/1c7b8b613996293fe6f47c598226c8ec4a2b142b..fb64f892792941055a14f285f859056f3afb6ede:/assessment/interactive_assessment.rmd diff --git a/assessment/interactive_assessment.rmd b/assessment/interactive_assessment.rmd index e81b938..0c8c64b 100644 --- a/assessment/interactive_assessment.rmd +++ b/assessment/interactive_assessment.rmd @@ -44,7 +44,7 @@ options(tutorial.event_recorder = tutorial_event_recorder) ## Overview -This is document contains R Markdown code for an *Interactive Self Assessment*. By completing this assessment, both students and the teaching can check in on learning progress. +This document contains R Markdown code for an *Interactive Self Assessment*. Through this assessment, both students and the teaching team can check in on learning progress. The Self Assessment is broken into six sections, described below. You can navigate throughout the document using the left-hand column. In general, completely this assessment should take about 60 minutes. @@ -57,7 +57,7 @@ The Self Assessment is broken into six sections, described below. You can naviga * Section 6, Helpful Formulas. Contains some helpful formulas that may be useful for Sections 3-5. * Section 7, Answer Report. Time estimate: 5 minutes. Here, you can use R code (some of which is prepopulated for you) to analyze (or visualize) your performance on the assessment. This provides a way for you to (1) practice exploratory analyses R with data you created yourself (by answering questions) and (2) get immediate feedback about your performance. -Note that you can clear **all** your answers to *all* questions by clicking "Start Over" in the left-hand sidebar, but doing that basically erases all progress in the document and your answers to any questions will be deleted. *Use with caution* (if at all)!
+Note that you can clear **all** your answers to *all* questions by clicking "Start Over" in the left-hand sidebar, but doing that basically erases all progress in the document and your answers to every question will be deleted. *Use with caution* (if at all)! ## Section 1, Warm-up Exercises @@ -106,6 +106,46 @@ quiz( ``` +## Useful Formulas +Sample Mean (sample statistic): +$\bar{x}=\frac{\sum_{i=1}^n x_i}{n}$ + +Standard deviation: +$s=\sqrt{\frac{\sum_{i=1}^n (x_i-\bar{x})^2}{n-1}}$ + +Variance: +$var = s^2$ + +Useful probability axioms: + +Complement: +$\mbox{Pr}(A^c)=1-\mbox{Pr}(A)$ + +Probability of two *independent* events both happening: +Pr(A and B) = Pr(A) $\times$ Pr(B) + +Probability of one of two events happening: +Pr(A or B) = Pr(A) + Pr(B) - Pr(A and B) + +Conditional probability: +$\mbox{Pr}(A|B)=\frac{\mbox{Pr(A and B)}}{\mbox{Pr(B)}}$ + +Population mean (population statistic): +$\mu = \sum_{i=1}^{n}x_i\mbox{Pr}(x_i)$ + +Z-score: +$z=\frac{x-\mu}{\sigma}$ + +Standard errors: + +$SE=\frac{\sigma}{\sqrt{n}}$ + +$SE_{proportion}=\sqrt{\frac{p(1-p)}{n}}$ + +Identifying outliers using Interquartile Range (IQR): +$Q_1 - 1.5 \times IQR, \quad Q_3 + 1.5 \times IQR$ + + ## Section 2: Writing and Debugging R Code ### Debugging a Function @@ -134,8 +174,8 @@ zeroToOneRescaler <- function() { } test_vector = c(1,2,3,4,5) -zeroToOneRescaler(test_vector) # Should print c(0, 0.25, 0.5, 0.75, 1.00) +zeroToOneRescaler(test_vector) ``` ```{r R_debug1-solution} @@ -146,8 +186,8 @@ zeroToOneRescaler <- function(x) { } test_vector = c(1,2,3,4,5) -zeroToOneRescaler(test_vector) # Should print c(0, 0.25, 0.5, 0.75, 1.00) +zeroToOneRescaler(test_vector) ``` ```{r R_debug1-response} @@ -268,7 +308,8 @@ m1 <- "" wolf <- "Think of the 'Boy who cried wolf', with a null hypothesis that no wolf exists. First the boy claims the alternative hypothesis: there is a wolf. The villagers believe this, and reject the correct null hypothesis.
Second, the villagers make an error by not believing the boy when he presents a correct alternative hypothesis." quiz( - question("A hypothesis is typically concerned with a:", + question("A hypothesis is typically written in terms of a:", + answer("p-value."), answer("population statistic.", correct = TRUE), answer("sample statistic.") ), @@ -283,40 +324,44 @@ quiz( answer("if an effect is causal or not.") ), question("A distribution that is right-skewed has a long tail to the:", - answer("right", correct = TRUE), - answer("left") + answer("right.", correct = TRUE), + answer("left.") ), question("A normal distribution can be characterized with only this many parameters:", - answer("1"), - answer("2", correct = TRUE), - answer("3") + answer("1."), + answer("2.", correct = TRUE), + answer("3.") ), - question("When we calculate standard error, we calculate", - answer("using a different formula for every type of variable."), - answer("the sample standard error, which is an estimate of the population standard error.", correct = TRUE), - answer("whether or not our result is causal.") - ), - question("When we calculate standard error, we calculate", - answer("using a different formula for every type of variable."), - answer("the sample standard error, which is an estimate of the population standard error.", correct = TRUE), + question("When we calculate a standard error, we look to understand", + answer("the spread of our observed data based on the spread of the population distribution."), + answer("the spread of the population distribution based on the spread of our observed data.", correct = TRUE), answer("whether or not our result is causal.") ), +# question("When we calculate standard error, we calculate", +# answer("using a different formula for every type of variable."), +# answer("the sample standard error, which is an estimate of the population standard error.", correct = TRUE), +# answer("whether or not our result is causal.") +# ), question("P values tell us 
about", - answer("the world in which our null hypothesis is true.", correct = TRUE), - answer("the world in which our null hypothesis is false."), - answer("the world in which our data describe a causal effect") + answer("the probability of observing the outcome."), + answer("the world in which the null hypothesis is true.", correct = TRUE), + answer("the world in which the null hypothesis is false."), + answer("the probability that a difference is due to chance.") +# answer("the world in which our data describe a causal effect.") ), question("P values are", answer("a conditional probability.", correct = TRUE), answer("completely misleading."), - answer("only useful when our data has a normal distribution.") + answer("an indication of the strength of an association"), + answer("most useful when our data has a normal distribution.") ), question("A type 1 error occurs when", answer("when we reject a correct null hypothesis (i.e. false positive).", correct = TRUE, message=wolf), answer("when we accept a correct null hypothesis", message=wolf), answer("when we accept an incorrect null hypothesis (i.e. false negative)", message=wolf) ), - question("Before we assume independence of two random samples, it is useful to check that", + question("Before we assume independence of two random samples, it can be useful to check whether", + answer("they are correlated."), answer("both samples include over 90% of the population."), answer("both samples include less than 10% of the population.", correct = TRUE) ) @@ -325,14 +370,13 @@ quiz( ```{r StatsConcepts_sampling} quiz( - question("A political scientist is interested in the effect of government type on economic development. -She wants to use a sample of 30 countries evenly represented among the Americas, Europe, -Asia, and Africa to conduct her analysis. What type of study should she use to ensure that -countries are selected from each region of the world? 
Assume a limitied research budget.", - answer("Observational - simple random sample"), - answer("Observational - cluster"), - answer("Observational - stratifed", correct=TRUE), - answer("Experimental") + question("A political scientist is interested in the effect of teaching style on standardized test performance +She plans to use a sample of 30 classes evenly spread among the Communication, Computer Science, and Business to conduct her analysis. What type of sampling strategy should she use to ensure that +classes are selected from each discipline equally? Assume a limited research budget.", + answer("A simple random sample"), + answer("A cluster random sample"), + answer("A stratifed random sample", correct=TRUE), + answer("A snowball sample") ) ) ``` @@ -349,7 +393,7 @@ For the following question, you may want to use this "scratch paper" code chunk. ```{r Distributions_quartile} quiz( question("Heights of boys in a high school are approximately normally distributed with mean of 175 cm -standard deviation of 5 cm. What is the first quartile of heights?", +standard deviation of 5 cm. What value most likely corresponds to the first quartile of heights?", answer("25 cm"), answer("167.3 cm"), answer("171.7 cm", correct=TRUE), @@ -407,7 +451,7 @@ m2 <- "Let H be the event of hypertension, M be event of being a male. We see he m3 <- "$P(HIV \\cap HCV) = P(HIV|HCV)\\cdot P(HCV) = 0.1\\cdot 0.02 = 0.002$" quiz( - question("Suppose in a population, half prefer coffee to tea, and assume that 10 percent of the population does not put milk in their coffee or tea. If coffee vs. tea preference and cow milk are independent, what fraction of the population both prefers coffee and does put milk in their coffee?", + question("Suppose in a population, half prefer coffee to tea, and assume that 10 percent of the population prefers no milk in their coffee or tea. If coffee vs. 
tea preference and milk use are independent, what fraction of the population both prefers coffee and puts milk in their coffee?", answer("40%", message=m1), answer("45%", correct = TRUE, message=m1), answer("50%", message=m1), @@ -448,13 +492,15 @@ breast cancer represent a rate of 1 in 30 women with undiagnosed cancer. The numbers in the table are realistic for US women in this age category. -Has Breast Cancer: 3,296 Positive Test Results and 37 negative test results (3,333 total) +| | Positive test result | Negative test result | +|--------------------|---------------------:|---------------------:| +| Have breast cancer | 3,296 | 37 | +| Do not have breast cancer | 8,313 | 88,354 | -Does not Have Breast Cancer: 8,313 Positive Test Results and 88,354 negative test results (96,667 total) First, compute the "margins" of the above contingency table. -Row margins: How many total women have breast cancer? How many total women do not have breast cancer? -Column margins: How many total positive test? How many total negative tests? +* Row margins: How many total women have breast cancer? How many total women do not have breast cancer? +* Column margins: How many total positive tests? How many total negative tests?
```{r Probabilities_mammogram-chunk, exercise=TRUE} ``` @@ -479,13 +525,13 @@ $\\dfrac{\\Pr(\\textrm{Cancer} \\cap \\textrm{Test}^+)} $\\dfrac{3,296}{11,609} = 0.284$" quiz( - question("Based on this data, what is the probability that a woman has a positive test given that women has cancer?", + question("Based on this data, what is the probability that a woman has a positive test given that a women has cancer?", answer("98.9%", correct = TRUE, message=m1), answer("99.9%",message=m1), answer("89.9%",message=m1), answer("88.9%",message=m1) ), - question("Based on this data, what is the probability that a woman has cancer receives a positive test?", + question("Based on this data, what is the probability that a woman who receives a positive test has cancer?", answer("28.4%", correct = TRUE,message=m2), answer("10.3%",message=m2), answer("50.7%",message=m2), @@ -500,44 +546,17 @@ quiz( -## Useful Formulas -Sample Mean (sample statistic): -$\bar{x}=\frac{\sum_{i=1}^n x_i}{n}$ | -Standard deviation: -$s=\sqrt{\frac{\sum_{i=1}^n (x_i-\bar{x})^2}{n-1}}$ | -Variance: -$var = s^2$ - -Useful probability axioms: -$\mbox{Pr}(A^c)=1-\mbox{Pr}(A)$ | Pr(A and B) = Pr(A) $\times$ Pr(B) | Pr(A or B) = Pr(A) + Pr(B) - Pr(A and B) - -$\mbox{Pr}(A|B)=\frac{\mbox{Pr(A and B)}}{\mbox{Pr(B)}}$\\ - -Population mean (population statistic): -$\mu = \sum_{i=1}^{n}x\mbox{Pr}(x)$ - -Z-score: -$z=\frac{x-\mu}{\sigma}$ - -$x=\mu + z\sigma$\\ - -$\mbox{P}(x)=\frac{n!}{x!(n-x)!}p^x(1-p)^{n-x}$ - ~for~ $x=0,1,2,...,n$ - -$\mu=np$, $\sigma=\sqrt{np(1-p)}$\\ - -$\sigma_{\bar{x}}=\frac{\sigma}{\sqrt{n}}$ - -$\sigma_{\hat{p}}=\sqrt{\frac{p(1-p)}{n}}$ - -$Q_1 - 1.5 \times IQR, \quad Q_3 + 1.5 \times IQR$ ## Answer Report Finally, let's generate a report that summarizes your answers to this evaluation. -Answers are written to a file that looks like this: `question_submission-{CURRENT TIME}.csv`. They're also saved in R Studio's global environment as a variable called `df`.
Run the below code chunk to see what `df` looks like. +Answers are written to a file that looks like this: `question_submission-{CURRENT TIME}.csv`. + +Take note of this CSV file: this is what you will submit to Canvas. + +Your answers are also saved in RStudio's global environment as a variable called `df`. Run the code chunk below to see what `df` looks like. ```{r report1, exercise=TRUE} df