# lab5_code_Laura.R

# Lab 5
library(ggplot2)
library(reshape2)

# Normal approximation to the binomial
# Try it with a reasonable sample: s*p = 4000 and s*(1-p) = 6000 are both large.
s <- 10000  # number of trials per binomial draw (the binomial "size")
p <- .4     # success probability
n <- 10000  # number of simulated draws
set.seed(3)
binom_sample <- rbinom(n, s, p)
set.seed(2)
# rnorm()'s third argument is a standard deviation, not a variance, so pass the
# square root of the binomial variance s*p*(1-p)
norm_sample <- rnorm(n, s*p, sqrt(s*p*(1-p)))

s*p      # expected number of successes
s*(1-p)  # expected number of failures

hist(binom_sample)
sum(binom_sample<3907)/n # tail probability using binomial sample

# Standardize the cutoff with the binomial mean and standard deviation
z <- (3907 - s*p)/sqrt(s*p*(1-p))
pnorm(z) # tail probability using normal approximation

# Try it with a sample that barely fulfills the requirement (s*p is exactly 10)
s <- 100    # number of trials per binomial draw
p <- .1     # success probability
n <- 10000  # number of simulated draws
set.seed(3)
binom_sample <- rbinom(n, s, p)
set.seed(2)
# rnorm()'s third argument is a standard deviation, not a variance, so pass the
# square root of the binomial variance s*p*(1-p)
norm_sample <- rnorm(n, s*p, sqrt(s*p*(1-p)))

s*p      # expected number of successes
s*(1-p)  # expected number of failures

hist(binom_sample)
# Let's find the probability of fewer than 5 successes using binom_sample and
# compare that to using the normal approximation
sum(binom_sample<5)/n # tail probability using binomial sample

# Standardize the cutoff with the binomial mean and standard deviation
z <- (5 - s*p)/sqrt(s*p*(1-p))
pnorm(z) # tail probability using normal approximation

# Try it with an extreme example: a huge number of trials with a tiny success
# probability, so that s*p is still exactly 10
s <- 10^10     # number of trials per binomial draw
p <- 10^(-9)   # success probability
n <- 10000     # number of simulated draws
set.seed(3)
binom_sample <- rbinom(n, s, p)
set.seed(2)
# rnorm()'s third argument is a standard deviation, not a variance, so pass the
# square root of the binomial variance s*p*(1-p)
norm_sample <- rnorm(n, s*p, sqrt(s*p*(1-p)))

s*p      # expected number of successes
s*(1-p)  # expected number of failures

hist(binom_sample)
# Let's find the probability of fewer than 5 successes using binom_sample and
# compare that to using the normal approximation
sum(binom_sample<5)/n # tail probability using binomial sample

# Standardize the cutoff with the binomial mean and standard deviation
z <- (5 - s*p)/sqrt(s*p*(1-p))
pnorm(z) # tail probability using normal approximation

# Why is it not appropriate to always assume the np >= 10, n(1-p) >= 10 rule works?
# Regardless of p, does having s >= 30 allow us to use the normal approximation to the binomial? Try it out

# Homework 3 question 4 simulation
# Estimates the pooled success proportion of two binomial groups by simulation.
# The seed is reset before each scenario so the printed estimates are
# reproducible, and the group sizes are named once so the denominator always
# matches the sizes passed to rbinom().

# Scenario 1: equal group sizes
size1 <- 50
size2 <- 50
set.seed(3)
sample1 <- rbinom(1000, size1, 4/33)
sample2 <- rbinom(1000, size2, 7/40)
mean(sample1 + sample2)/(size1 + size2)

# Scenario 2: unequal group sizes, same total
size1 <- 40
size2 <- 60
set.seed(3)
sample1 <- rbinom(1000, size1, 4/33)
sample2 <- rbinom(1000, size2, 7/40)
mean(sample1 + sample2)/(size1 + size2)

# Slide 15: two ways of averaging a set of standard deviations
diff <- c(0.005, 0.050, 0.035, 0.042, 0.043, 0.052)
sd_diff <- c(0.043, 0.044, 0.074, 0.086, 0.106, 0.125)

mean(diff)

# Average on the variance scale, then return to the sd scale
var_diff <- sd_diff^2
sqrt(mean(var_diff)) # square root of the mean variance
sqrt(sum(sd_diff^2) / length(sd_diff)) # same quantity, written out in full

# Compare to the plain arithmetic mean of the standard deviations
mean(sd_diff)


# Challenge
# Pick up where left off last lab

# Create an Rmarkdown file
# generate a data frame with one variable called binomial generated by a binom(4, .3)
# and two variables called normal1, normal2 generated by norm(13, 2.3) and norm(2, 7.1)
# create a scatter plot of normal1 and normal2 
# Describe the graph
# generate a pdf that shows the graph and your description, but not the code
# If you've done all that, go ahead and look at summary statistics
# if you consider the variable binomial as a grouping, how do the groups differ (if at all)?
# what is the mean, variance, and standard deviation of these groups?

# Use ggplot to graph the two normal samples created above and color by their value given 
# by your binomial sample
# Create a new variable called "shifted normal1" that equals binomial + normal1, look at a scatter plot 
# of shifted normal1 and normal2

# Assume you want to check if the proportion of dog owners in Ann Arbor is the same as that in Ypsilanti.
# You know that 30% of people in AA own dogs, and you took a random sample from Ypsi where 2 out of 
# 10 are dog owners. What ways can you think of to check if these proportions are about the same or are 
# too different to make that conclusion? What methods could you use that have been covered in class? 
# Try them out.