-
Notifications
You must be signed in to change notification settings - Fork 0
/
lab5_code_Laura.R
111 lines (88 loc) · 3.33 KB
/
lab5_code_Laura.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Lab 5
library(ggplot2)
library(reshape2)
# Normal approximation to the binomial
# Try it with a reasonably large sample.
s <- 10000 # binomial size (trials per draw)
p <- .4 # success probability
n <- 10000 # number of simulated draws
set.seed(3)
binom_sample <- rbinom(n, s, p)
set.seed(2)
# rnorm() expects the standard deviation (not the variance) as its third
# argument, so take the square root of the binomial variance s * p * (1 - p).
norm_sample <- rnorm(n, mean = s * p, sd = sqrt(s * p * (1 - p)))
s * p # expected number of successes
s * (1 - p) # expected number of failures
hist(binom_sample)
mean(binom_sample < 3907) # tail probability using binomial sample
z <- (3907 - s * p) / sqrt(s * p * (1 - p))
pnorm(z) # tail probability using normal approximation
# Try it with a sample that barely fulfills the requirement.
s <- 100 # binomial size (trials per draw)
p <- .1 # success probability
n <- 10000 # number of simulated draws
set.seed(3)
binom_sample <- rbinom(n, s, p)
set.seed(2)
# rnorm() expects the standard deviation (not the variance) as its third
# argument, so take the square root of the binomial variance s * p * (1 - p).
norm_sample <- rnorm(n, mean = s * p, sd = sqrt(s * p * (1 - p)))
s * p # expected number of successes
s * (1 - p) # expected number of failures
hist(binom_sample)
# Let's find the probability of fewer than 5 successes using binom_sample and
# compare that to using the normal approximation.
mean(binom_sample < 5) # tail probability using binomial sample
z <- (5 - s * p) / sqrt(s * p * (1 - p))
pnorm(z) # tail probability using normal approximation
# Try it with an extreme example: a huge size with a tiny success probability.
s <- 10^10 # binomial size (trials per draw)
p <- 10^(-9) # success probability
n <- 10000 # number of simulated draws
set.seed(3)
binom_sample <- rbinom(n, s, p)
set.seed(2)
# rnorm() expects the standard deviation (not the variance) as its third
# argument, so take the square root of the binomial variance s * p * (1 - p).
norm_sample <- rnorm(n, mean = s * p, sd = sqrt(s * p * (1 - p)))
s * p # expected number of successes
s * (1 - p) # expected number of failures
hist(binom_sample)
# Let's find the probability of fewer than 5 successes using binom_sample and
# compare that to using the normal approximation.
mean(binom_sample < 5) # tail probability using binomial sample
z <- (5 - s * p) / sqrt(s * p * (1 - p))
pnorm(z) # tail probability using normal approximation
# Why is it not appropriate to always assume the np >= 10, n(1-p) >= 10 rule works?
# Regardless of p, does having s >= 30 allow us to use the normal approximation to the binomial? Try it out.
# Homework 3 question 4 simulation
# Simulate two binomial groups and report the average combined success
# proportion, first with equal group sizes and then with unequal ones.
set.seed(1) # fixed seed so the simulated proportions are reproducible
n_sims <- 1000 # number of simulated draws per group
p1 <- 4 / 33 # success probability in the first group
p2 <- 7 / 40 # success probability in the second group
# Scenario 1: 50 trials in each group
sample1 <- rbinom(n_sims, 50, p1)
sample2 <- rbinom(n_sims, 50, p2)
pooled_equal <- mean(sample1 + sample2) / (50 + 50)
pooled_equal
# Scenario 2: 40 trials in the first group and 60 in the second
sample1 <- rbinom(n_sims, 40, p1)
sample2 <- rbinom(n_sims, 60, p2)
pooled_unequal <- mean(sample1 + sample2) / (40 + 60)
pooled_unequal
# slide 15
# Differences and their standard deviations, entered by hand.
diff <- c(.005, .050, .035, .042, .043, .052)
sd_diff <- c(.043, .044, .074, .086, .106, .125)
# Square the standard deviations to get variances before averaging.
var_diff <- sd_diff^2
mean(diff)
# Square root of the average variance ...
sqrt(mean(var_diff))
# ... which is the same quantity written out in full.
sqrt(sum(sd_diff^2) / length(sd_diff))
# Compare to the plain average of the standard deviations.
mean(sd_diff)
# Challenge
# Pick up where left off last lab
# Create an Rmarkdown file
# generate a data frame with one variable called binomial generated by a binom(4, .3)
# and two variables called normal1, normal2 generated by norm(13, 2.3) and norm(2, 7.1)
# create a scatter plot of normal1 and normal2
# Describe the graph
# generate a pdf that shows the graph and your description, but not the code
# If you've done all that, go ahead and look at summary statistics
# if you consider the variables binomial as a grouping, how do the groups differ (if at all)?
# what are the mean, variance, and standard deviation of these groups?
# Use ggplot to graph the two normal samples created above and color by their value given
# by your binomial sample
# Create a new variable called "shifted normal1" that equals binomial + normal1, look at a scatter plot
# of shifted normal1 and normal2
# Assume you want to check if the proportion of dog owners in Ann Arbor is the same as that in Ypsilanti.
# You know that 30% of people in AA own dogs, and you took a random sample from Ypsi where 2 out of
# 10 are dog owners. What ways can you think of to check if these proportions are about the same or are
# too different to make that conclusion? What methods could you use that have been covered in class?
# Try them out.