diff --git a/01_DataScientistToolbox/02_06_01_createNewRepo/index.Rmd b/01_DataScientistToolbox/02_06_01_createNewRepo/index.Rmd index 849298692..7a4263be6 100644 --- a/01_DataScientistToolbox/02_06_01_createNewRepo/index.Rmd +++ b/01_DataScientistToolbox/02_06_01_createNewRepo/index.Rmd @@ -74,7 +74,7 @@ __...OR...__ ``` $ mkdir ~/test-repo ``` -* Note: The tilda (`~`) symbol refers to your "home" directory, so this will create a directory called `test-repo` in your home directory +* Note: The tilde (`~`) symbol refers to your "home" directory, so this will create a directory called `test-repo` in your home directory * Navigate to this new directory using the following command (`cd` stands for "change directory"): ``` diff --git a/04_ExploratoryAnalysis/Colors/Colors.pdf b/04_ExploratoryAnalysis/Colors/Colors.pdf new file mode 100644 index 000000000..6140f8012 Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/Colors.pdf differ diff --git a/04_ExploratoryAnalysis/Colors/Plotting and Color in R.pdf b/04_ExploratoryAnalysis/Colors/Plotting and Color in R.pdf deleted file mode 100644 index 034ee30a1..000000000 Binary files a/04_ExploratoryAnalysis/Colors/Plotting and Color in R.pdf and /dev/null differ diff --git a/04_ExploratoryAnalysis/Colors/index.Rmd b/04_ExploratoryAnalysis/Colors/index.Rmd index 1b723a822..e7de83e9f 100644 --- a/04_ExploratoryAnalysis/Colors/index.Rmd +++ b/04_ExploratoryAnalysis/Colors/index.Rmd @@ -1,7 +1,7 @@ --- title : Plotting and Color in R -subtitle : Computing for Data Analysis -author : Roger Peng, Associate Professor +subtitle : +author : Roger D. Peng, Associate Professor of Biostatistics job : Johns Hopkins Bloomberg School of Public Health logo : bloomberg_shield.png framework : io2012 # {io2012, html5slides, shower, dzslides, ...} @@ -177,4 +177,4 @@ mode : selfcontained # {standalone, draft} - Careful use of colors in plots/maps/etc. can make it easier for the reader to get what you're trying to say (why make it harder?) - The `RColorBrewer` package is an R package that provides color palettes for sequential, categorical, and diverging data - The `colorRamp` and `colorRampPalette` functions can be used in conjunction with color palettes to connect data to colors -- Transparency can sometimes be used to clarify plots with many points \ No newline at end of file +- Transparency can sometimes be used to clarify plots with many points diff --git a/04_ExploratoryAnalysis/Colors/index.html b/04_ExploratoryAnalysis/Colors/index.html index 05ee21ecb..fea92522b 100644 --- a/04_ExploratoryAnalysis/Colors/index.html +++ b/04_ExploratoryAnalysis/Colors/index.html @@ -4,7 +4,7 @@ Plotting and Color in R - + @@ -19,6 +19,11 @@ + + + + + @@ -37,8 +42,8 @@

Plotting and Color in R

-

Computing for Data Analysis

-

Roger Peng, Associate Professor
Johns Hopkins Bloomberg School of Public Health

+

+

Roger D. Peng, Associate Professor of Biostatistics
Johns Hopkins Bloomberg School of Public Health

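
As a quick, hedged illustration of the `RColorBrewer`, `colorRamp`, and `colorRampPalette` functions named in the Colors lecture summary above (the palette name and the toy data here are my own choices, not taken from the slides):

```r
library(RColorBrewer)                           # sequential, diverging, and qualitative palettes

pal <- colorRampPalette(brewer.pal(4, "BuGn"))  # interpolate a Brewer palette
pal(10)                                         # 10 hex colors along the ramp

cr <- colorRamp(c("red", "blue"))               # maps values in [0, 1] to RGB
cr(seq(0, 1, len = 5))                          # 5 colors between red and blue

x <- rnorm(100); y <- rnorm(100)                # made-up data
plot(x, y, pch = 19, col = rgb(0, 0, 0, 0.2))   # transparency via the alpha channel
```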
diff --git a/04_ExploratoryAnalysis/Colors/index.md b/04_ExploratoryAnalysis/Colors/index.md index 531d196e9..e7de83e9f 100644 --- a/04_ExploratoryAnalysis/Colors/index.md +++ b/04_ExploratoryAnalysis/Colors/index.md @@ -1,7 +1,7 @@ --- title : Plotting and Color in R -subtitle : Computing for Data Analysis -author : Roger Peng, Associate Professor +subtitle : +author : Roger D. Peng, Associate Professor of Biostatistics job : Johns Hopkins Bloomberg School of Public Health logo : bloomberg_shield.png framework : io2012 # {io2012, html5slides, shower, dzslides, ...} diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide01.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide01.png new file mode 100644 index 000000000..ea02b5ac2 Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide01.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide02.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide02.png new file mode 100644 index 000000000..8d94f2039 Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide02.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide03.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide03.png new file mode 100644 index 000000000..d96ea2a6e Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide03.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide04.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide04.png new file mode 100644 index 000000000..28a8bcf4b Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide04.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide05.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide05.png new file mode 100644 index 000000000..aa5a8095e Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide05.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide06.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide06.png new file mode 100644 index 000000000..f1de98ae5 Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide06.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide07.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide07.png new file mode 100644 index 000000000..96a13ba2d Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide07.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide08.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide08.png new file mode 100644 index 000000000..278628473 Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide08.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide09.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide09.png new file mode 100644 index 000000000..c01a00107 Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide09.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide10.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide10.png new file mode 100644 index 000000000..26abf6441 Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide10.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide11.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide11.png new file mode 100644 index 000000000..4aa4844b8 Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide11.png differ diff --git 
a/04_ExploratoryAnalysis/Colors/slides/colors_slide12.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide12.png new file mode 100644 index 000000000..f283c0987 Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide12.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide13.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide13.png new file mode 100644 index 000000000..037449cfa Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide13.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide14.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide14.png new file mode 100644 index 000000000..288e30c08 Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide14.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide15.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide15.png new file mode 100644 index 000000000..6c0af87c1 Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide15.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide16.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide16.png new file mode 100644 index 000000000..73daf760e Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide16.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide17.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide17.png new file mode 100644 index 000000000..9c885ecfc Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide17.png differ diff --git a/04_ExploratoryAnalysis/Colors/slides/colors_slide18.png b/04_ExploratoryAnalysis/Colors/slides/colors_slide18.png new file mode 100644 index 000000000..bb18969ea Binary files /dev/null and b/04_ExploratoryAnalysis/Colors/slides/colors_slide18.png differ diff --git a/04_ExploratoryAnalysis/all_pdf_files.zip b/04_ExploratoryAnalysis/all_pdf_files.zip new file mode 100644 index 000000000..c35a59e9a Binary files /dev/null and b/04_ExploratoryAnalysis/all_pdf_files.zip differ diff --git a/04_ExploratoryAnalysis/announcements.html b/04_ExploratoryAnalysis/announcements.html deleted file mode 100644 index 3ebca0d07..000000000 --- a/04_ExploratoryAnalysis/announcements.html +++ /dev/null @@ -1,223 +0,0 @@ - - - - - -Week 1 Announcement - - - - - - - - - - - - -

Week 1 Announcement

- -

Welcome to Week 1 of Prediction and Machine Learning! This course -will focus on developing the tools and techniques for understanding, building, and testing prediction functions.

- -

These tools are at the center of the Data Science revolution. Many -researchers, companies, and governmental organizations would like to use the cheap and abundant data they are collecting to predict -what customers will like, what services to offer, or how to improve people's lives.

- -

The emphasis of this course is hands-on learning and implementation. If you like what you learn in this class, there are a large number of other Machine and Statistical Learning MOOCs that you can use to deepen your knowledge.

- -

Please see the course syllabus for information about the quizzes, the project, due dates, and grading. Don't forget to say hi on the message boards. The community developed around these courses is one of the best places to learn and one of the best things about taking a MOOC!

- -

Jeff Leek and the Data Science Track Team

- -
- -

Week 2 Announcement

- -

Welcome to Week 2 of Prediction and Machine Learning! This will be the most lecture-intensive week of the course. The primary goal is to introduce you to how to build predictors in practice.

- -

Remember that the course project is open and ongoing. With the skills you learn this week you should be able to start on the basic analyses that will form the beginnings of your project.

- -

Good luck and have a great week!

- -

Jeff Leek and the Data Science Track Team

- -
- -

Week 3 Announcement

- -

Welcome to Week 3 of Prediction and Machine Learning! This week we will start to reduce the number of lectures so you can spend more time focusing on your course project. Your project is due at the end of this week.

- -

If you have trouble or want to explore issues in more depth, please seek out answers on the message boards. They are a great resource! If you happen to be a superstar who already gets it, please take the time to help your classmates by answering their questions as well. This is one of the best ways to practice using and explaining your skills to others. These are two of the key characteristics of excellent data scientists.

- -

Good luck and have a great week!

- -

Jeff Leek and the Data Science Track Team

- -
- -

Week 4 Announcement

- -

Welcome to Week 4 of Prediction and Machine Learning! In this final week we will focus on peer grading of assignments.

- -

Participating in peer grading is an amazing learning opportunity. It gives you a chance to learn things from your fellow students, -pick up tips for explaining key ideas, and help others learn as well. We have focused our effort on making the rubric as objective and straightforward to implement as possible. If you have any issues, please report them in the forums as described in the syllabus.

- -

Thanks again for all of your efforts in the course; we are in the last stretch. Good luck and have a great week!

- -

Jeff Leek and the Data Science Track Team

- -
- -

Course wrap-up

- -

Congratulations on finishing Prediction and Machine Learning!

- -

We have finalized the grading and released the Statements of Accomplishment for the course. It might take a few hours or days for the statements to be distributed to accounts.

- -

A couple of other notes:

- - - -

Thanks again for all of your efforts during the course of the class and best of luck in your career!

- -

Jeff Leek and the Data Science Track Team

- - - - diff --git a/04_ExploratoryAnalysis/makefile b/04_ExploratoryAnalysis/makefile new file mode 100644 index 000000000..07c62cb1e --- /dev/null +++ b/04_ExploratoryAnalysis/makefile @@ -0,0 +1,24 @@ +DELAY = 1000 +RMD_FILES = $(wildcard */index.Rmd) +HTML_FILES = $(patsubst %.Rmd, %.html, $(RMD_FILES)) +PDF_FILES = $(patsubst %/index.html, lectures/%.pdf, $(HTML_FILES)) + +listfiles: + @echo $(RMD_FILES) + @echo $(HTML_FILES) + @echo $(PDF_FILES) + + +html: $(HTML_FILES) +pdf: $(PDF_FILES) +all: html pdf + +zip: $(PDF_FILES) + zip all_pdf_files.zip $^ + +lectures/%.pdf: %/index.html + casperjs makepdf.js $< $@ $(DELAY) + +%/index.html: %/index.Rmd + cd $(dir $<) && Rscript -e "slidify::slidify('index.Rmd')" && cd .. + diff --git a/04_ExploratoryAnalysis/makepdf.js b/04_ExploratoryAnalysis/makepdf.js new file mode 100644 index 000000000..c01526f94 --- /dev/null +++ b/04_ExploratoryAnalysis/makepdf.js @@ -0,0 +1,10 @@ +var casper = require('casper').create({viewportSize:{width:1500,height:1000}}); +var args = casper.cli.args; +var imgfile = (args[1] || Math.random().toString(36).slice(2)) +casper.start(args[0], function() { + this.wait(args[2], function(){ + this.captureSelector(imgfile, "slides"); + }); +}); + +casper.run(); \ No newline at end of file diff --git a/04_ExploratoryAnalysis/syllabus.html b/04_ExploratoryAnalysis/syllabus.html deleted file mode 100644 index 68557d913..000000000 --- a/04_ExploratoryAnalysis/syllabus.html +++ /dev/null @@ -1,338 +0,0 @@ - - - - - -Course Title - - - - - - - - - - - - -

Course Title

- -

Exploratory Data Analysis

- -
- -

Course Instructor(s)

- -

Roger D. Peng

- -
- -

Course Description

- -

This course covers the essential exploratory techniques for summarizing data. These techniques are typically applied before formal modeling commences and can help inform the development of more complex statistical models. Exploratory techniques are also important for eliminating or sharpening potential hypotheses about the world that can be addressed by the data. We will cover in detail the plotting systems in R as well as some of the basic principles of constructing data graphics. We will also cover some of the common multivariate statistical techniques used to visualize high-dimensional data.

- -
- -

Course Content

- - - -
- -

Lecture Materials

- -

Lecture videos will be released weekly and will remain available for that week and thereafter. You are welcome to view them at your convenience. Accompanying each video lecture will be a PDF copy of the slides and a link to an HTML5 version of the slides.

- -
- -

Weekly quizzes

- -

Quiz 1

- -

Assigned: Class open (1st of Month) -Due: 7th of the Month 12:00 AM UTC

- -

Quiz 2

- -

Assigned: 8th of the Month 12:01 AM UTC -Due: 14th of the Month 12:00 AM UTC

- -

Quiz 3

- -

Assigned: 15th of the Month 12:01 AM UTC -Due: 21st of the Month 12:00 AM UTC

- -

Quiz 4

- -

Assigned: 22nd of the Month 12:01 AM UTC -Due: 28th of the Month 12:00 AM UTC

- -
- -

Background lectures

- -

Background lectures about the content of the course with respect to other quantitative courses, course logistics, and the R programming language are provided as reference material. It is not necessary to watch the videos to complete the course; however, they may be useful for explaining background, the grading schemes used, and how to use R.

- -
- -

Quiz Scoring

- -

You may attempt each quiz up to 2 times. Only the score from your final attempt will count toward your grade.

- -
- -

Hard deadlines and soft deadlines

- -

The reported due date is the soft deadline for each quiz. You may turn in quizzes up to two days after the soft deadline. The hard deadline is the Tuesday after the Quiz is due at 23:30 UTC-5:00. Each day late will incur a 10% penalty, but if you use a late day, the penalty will not be applied to that day.

- -
- -

Late Days for Quizzes

- -

You are permitted 5 late days for quizzes in the course. If you use a late day, your quiz grade will not be affected.

- -
- -

Dates for the project

- -

Submission

- -

Assigned: Class open (1st of Month) -Due: 21st of the Month 12:00 AM UTC

- -

Review

- -

Assigned: 22nd of the Month 12:01 AM UTC -Due: 28th of the Month 12:00 AM UTC

- -
- -

Typos

- - - -
- -

Differences of opinion

- -

Keep in mind that currently data analysis is as much art as it is science - so we may have a difference of opinion - and that is ok! Please refrain from angry, sarcastic, or abusive comments on the message boards. Our goal is to create a supportive community -that helps the learning of all students, from the most advanced -to those who are just seeing this material for the first time.

- -
- -

Peer Review

- -

For many of the course projects, peer scoring will be necessary -to evaluate the completion of the assignments. We have created -and tested rubrics for each assignment. They are not perfect -and will not be perfectly applied. However, we believe that -the feedback from peer assessment adds value above simple multiple choice assessments.

- - - -
- -

Plagiarism

- -

Johns Hopkins University defines plagiarism as “…taking for one’s own use the words, ideas, concepts or data of another without proper attribution. Plagiarism includes both direct use or paraphrasing of the words, thoughts, or concepts of another without proper attribution.” We take plagiarism very seriously, as does Johns Hopkins University.

- -

We recognize that many students may not have a clear understanding of what plagiarism is or why it is wrong. Please see the following guide for more information on plagiarism:

- -

http://www.jhsph.edu/academics/degree-programs/master-of-public-health/current-students/JHSPH-ReferencingHandbook.pdf

- -

It is critically important that you give people/sources credit when you use their words or ideas. If you do not give proper credit – particularly when quoting directly from a source – you violate the trust of your fellow students.

- -

The Coursera Honor code includes an explicit statement about plagiarism:

- -

I will register for only one account. My answers to homework, quizzes and exams will be my own work (except for assignments that explicitly permit collaboration). I will not make solutions to homework, quizzes or exams available to anyone else. This includes both solutions written by me, as well as any official solutions provided by the course staff. I will not engage in any other activities that will dishonestly improve my results or dishonestly improve/hurt the results of others.

- -
- -

Reporting plagiarism on course projects

- -

One of the criteria in the project rubric focuses on plagiarism. -Keep in mind that some components of the projects will be very -similar across terms and so answers that appear similar may be -honest coincidences. However, we would appreciate if you do a -basic check for obvious plagiarism and report it during your -peer assessment phase.

- -

It is currently very difficult to prove or disprove a charge of plagiarism in the MOOC peer assessment setting. We are not in a position to evaluate whether or not a submission actually constitutes plagiarism, and we will not be able to entertain appeals or to alter any grades that have been assigned through the peer evaluation system.

- -

But if you take the time to report suspected plagiarism, this will help us to understand the extent of the problem and work -with Coursera to address critical issues with the current system.

- -
- -

Technical Information

- -

Regardless of your platform (Windows or Mac) you will need a high-speed Internet connection in order to watch the videos on the Coursera web site. It is possible to download the video files and watch them on your computer rather than stream them from Coursera and this may be preferable for some of you.

- -

Here is some platform-specific information:

- -

Windows

- -

The Coursera web site seems to work best with either the Chrome or the -Firefox web browsers. In particular, you may run into trouble if you -use Internet Explorer. The Chrome and Firefox browsers can be -downloaded from:

- - - -

Mac

- -

The Coursera site appears to work well with Safari, Chrome, or Firefox, so any of these browsers should be fine.

- - - - diff --git a/05_ReproducibleResearch/Checklist/index.Rmd b/05_ReproducibleResearch/Checklist/index.Rmd new file mode 100644 index 000000000..6b5187058 --- /dev/null +++ b/05_ReproducibleResearch/Checklist/index.Rmd @@ -0,0 +1,122 @@ +--- +title : Reproducible Research Checklist +subtitle : What to Do and What Not to Do +author : Roger D. Peng, Associate Professor of Biostatistics +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../libraries + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- + +## DO: Start With Good Science + +* Garbage in, garbage out + +* Coherent, focused question simplifies many problems + +* Working with good collaborators reinforces good practices + +* Something that's interesting to you will (hopefully) motivate good + habits + +--- + +## DON'T: Do Things By Hand + + + +* Editing spreadsheets of data to "clean it up" + + - Removing outliers + - QA / QC + - Validating + +* Editing tables or figures (e.g. rounding, formatting) + +* Downloading data from a web site (clicking links in a web browser) + +* Moving data around your computer; splitting / reformatting data files + +* "We're just going to do this once...." + + +Things done by hand need to be precisely documented (this is harder +than it sounds) + +--- + +## DON'T: Point And Click + + +--- + +## DO: Teach a Computer + + +--- + +## DO: Use Some Version Control + +* Slow things down + +* Add changes in small chunks (don't just do one massive commit) + +* Track / tag snapshots; revert to old versions + +* Software like GitHub / BitBucket / SourceForge make it easy to + publish results + +--- +## DO: Keep Track of Your Software Environment + + +--- + +## DO: Keep Track of Your Software Environment + + +```{r} +sessionInfo() +``` + +--- + +## DON'T: Save Output + + +--- + +## DO: Think About the Pipeline + +* Data analysis is a lengthy process + +* How you got the end is just as important as the end itself + +* The farther back in the pipeline you can "preserve" the better + +--- + +## Summary: Checklist + +* Are we doing good science? + +* Was any part of this analysis done by hand? + - If so, are those parts *precisely* document? + +* Have we taught a computer to do as much as possible (i.e. coded)? + +* Are we using a version control system? + +* Have we documented our software environment? + +* Have we saved any output that we cannot reconstruct from original + data + code? + +* How far back in the analysis pipeline can we go before our results + are no longer (automatically) reproducible? diff --git a/05_ReproducibleResearch/Checklist/index.html b/05_ReproducibleResearch/Checklist/index.html new file mode 100644 index 000000000..1d6400b56 --- /dev/null +++ b/05_ReproducibleResearch/Checklist/index.html @@ -0,0 +1,252 @@ + + + + Reproducible Research Checklist + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Reproducible Research Checklist

+

What to Do and What Not to Do

+

Roger D. Peng, Associate Professor of Biostatistics
Johns Hopkins Bloomberg School of Public Health

+
+
+ + + +
+

DO: Start With Good Science

+
+
+
    +
  • Garbage in, garbage out

  • +
  • Coherent, focused question simplifies many problems

  • +
  • Working with good collaborators reinforces good practices

  • +
  • Something that's interesting to you will (hopefully) motivate good +habits

  • +
+ +
+ +
+ + +
+

DON'T: Do Things By Hand

+
+
+
    +
  • Editing spreadsheets of data to "clean it up"

    + +
      +
    • Removing outliers
    • +
    • QA / QC
    • +
    • Validating
    • +
  • +
  • Editing tables or figures (e.g. rounding, formatting)

  • +
  • Downloading data from a web site (clicking links in a web browser)

  • +
  • Moving data around your computer; splitting / reformatting data files

  • +
  • "We're just going to do this once...."

  • +
+ +

Things done by hand need to be precisely documented (this is harder +than it sounds)

+ +
+ +
+ + +
+

DON'T: Point And Click

+
+
+ +
+ +
+ + +
+

DO: Teach a Computer

+
+
+ +
+ +
+ + +
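
The "DO: Teach a Computer" slide above is image-only in this deck; a minimal sketch of the idea in R, scripting a download rather than clicking in a browser, might look like the following (the URL and file name are placeholders, not from the lecture):

```r
# Hypothetical example of "teaching the computer" to fetch data instead of
# clicking a download link; the URL and file name are placeholders, not course data.
fileUrl <- "https://example.com/data/cameras.csv"
destfile <- "cameras.csv"

if (!file.exists(destfile)) {
    download.file(fileUrl, destfile = destfile)
}
dateDownloaded <- date()        # record when the data were obtained
cameras <- read.csv(destfile)
```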
+

DO: Use Some Version Control

+
+
+
    +
  • Slow things down

  • +
  • Add changes in small chunks (don't just do one massive commit)

  • +
  • Track / tag snapshots; revert to old versions

  • +
  • Software like GitHub / BitBucket / SourceForge makes it easy to +publish results

  • +
+ +
+ +
+ + +
+

DO: Keep Track of Your Software Environment

+
+
+ +
+ +
+ + +
+

DO: Keep Track of Your Software Environment

+
+
+
sessionInfo()
+
+ +
## R version 3.0.2 Patched (2013-12-30 r64600)
+## Platform: x86_64-apple-darwin13.0.0 (64-bit)
+## 
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+## 
+## attached base packages:
+## [1] stats     graphics  grDevices utils     datasets  base     
+## 
+## other attached packages:
+## [1] slidify_0.3.3
+## 
+## loaded via a namespace (and not attached):
+## [1] evaluate_0.5.1 formatR_0.10   knitr_1.5      markdown_0.6.3
+## [5] stringr_0.6.2  tools_3.0.2    whisker_0.3-2  yaml_2.1.8
+
+ +
+ +
+ + +
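
A hedged way to act on the slide above is to save the session information alongside the results; the file name below is an arbitrary choice, not something the lecture prescribes:

```r
# Write the session information out next to the analysis results so the
# software environment is documented ("session_info.txt" is an arbitrary name).
writeLines(capture.output(sessionInfo()), "session_info.txt")
```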
+

DON'T: Save Output

+
+
+ +
+ +
+ + +
+

DO: Think About the Pipeline

+
+
+
    +
  • Data analysis is a lengthy process

  • +
  • How you got to the end is just as important as the end itself

  • +
  • The farther back in the pipeline you can "preserve" things, the better

  • +
+ +
+ +
+ + +
+

Summary: Checklist

+
+
+
    +
  • Are we doing good science?

  • +
  • Was any part of this analysis done by hand?

    + +
      +
    • If so, are those parts precisely documented?
    • +
  • +
  • Have we taught a computer to do as much as possible (i.e. coded)?

  • +
  • Are we using a version control system?

  • +
  • Have we documented our software environment?

  • +
  • Have we saved any output that we cannot reconstruct from original +data + code?

  • +
  • How far back in the analysis pipeline can we go before our results +are no longer (automatically) reproducible?

  • +
+ +
+ +
+ + +
+ + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/05_ReproducibleResearch/Checklist/index.md b/05_ReproducibleResearch/Checklist/index.md new file mode 100644 index 000000000..e1d4bceb6 --- /dev/null +++ b/05_ReproducibleResearch/Checklist/index.md @@ -0,0 +1,142 @@ +--- +title : Reproducible Research Checklist +subtitle : What to Do and What Not to Do +author : Roger D. Peng, Associate Professor of Biostatistics +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../libraries + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- + +## DO: Start With Good Science + +* Garbage in, garbage out + +* Coherent, focused question simplifies many problems + +* Working with good collaborators reinforces good practices + +* Something that's interesting to you will (hopefully) motivate good + habits + +--- + +## DON'T: Do Things By Hand + + + +* Editing spreadsheets of data to "clean it up" + + - Removing outliers + - QA / QC + - Validating + +* Editing tables or figures (e.g. rounding, formatting) + +* Downloading data from a web site (clicking links in a web browser) + +* Moving data around your computer; splitting / reformatting data files + +* "We're just going to do this once...." + + +Things done by hand need to be precisely documented (this is harder +than it sounds) + +--- + +## DON'T: Point And Click + + +--- + +## DO: Teach a Computer + + +--- + +## DO: Use Some Version Control + +* Slow things down + +* Add changes in small chunks (don't just do one massive commit) + +* Track / tag snapshots; revert to old versions + +* Software like GitHub / BitBucket / SourceForge make it easy to + publish results + +--- +## DO: Keep Track of Your Software Environment + + +--- + +## DO: Keep Track of Your Software Environment + + + +```r +sessionInfo() +``` + +``` +## R version 3.0.2 Patched (2013-12-30 r64600) +## Platform: x86_64-apple-darwin13.0.0 (64-bit) +## +## locale: +## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 +## +## attached base packages: +## [1] stats graphics grDevices utils datasets base +## +## other attached packages: +## [1] slidify_0.3.3 +## +## loaded via a namespace (and not attached): +## [1] evaluate_0.5.1 formatR_0.10 knitr_1.5 markdown_0.6.3 +## [5] stringr_0.6.2 tools_3.0.2 whisker_0.3-2 yaml_2.1.8 +``` + + +--- + +## DON'T: Save Output + + +--- + +## DO: Think About the Pipeline + +* Data analysis is a lengthy process + +* How you got the end is just as important as the end itself + +* The farther back in the pipeline you can "preserve" the better + +--- + +## Summary: Checklist + +* Are we doing good science? + +* Was any part of this analysis done by hand? + - If so, are those parts *precisely* document? + +* Have we taught a computer to do as much as possible (i.e. coded)? + +* Are we using a version control system? + +* Have we documented our software environment? + +* Have we saved any output that we cannot reconstruct from original + data + code? + +* How far back in the analysis pipeline can we go before our results + are no longer (automatically) reproducible? 
diff --git a/05_ReproducibleResearch/lectures/Checklist.pdf b/05_ReproducibleResearch/lectures/Checklist.pdf new file mode 100644 index 000000000..195c0ebf9 Binary files /dev/null and b/05_ReproducibleResearch/lectures/Checklist.pdf differ diff --git a/05_ReproducibleResearch/lectures/EvidenceBasedDataAnalysis.pdf b/05_ReproducibleResearch/lectures/EvidenceBasedDataAnalysis.pdf new file mode 100644 index 000000000..625fdc263 Binary files /dev/null and b/05_ReproducibleResearch/lectures/EvidenceBasedDataAnalysis.pdf differ diff --git a/05_ReproducibleResearch/lectures/LevelsOfDetail.pdf b/05_ReproducibleResearch/lectures/LevelsOfDetail.pdf new file mode 100644 index 000000000..23022f593 Binary files /dev/null and b/05_ReproducibleResearch/lectures/LevelsOfDetail.pdf differ diff --git a/05_ReproducibleResearch/lectures/Markdown.pdf b/05_ReproducibleResearch/lectures/Markdown.pdf new file mode 100644 index 000000000..bd1a522eb Binary files /dev/null and b/05_ReproducibleResearch/lectures/Markdown.pdf differ diff --git a/05_ReproducibleResearch/lectures/ReproducibleResearchConcepts.pdf b/05_ReproducibleResearch/lectures/ReproducibleResearchConcepts.pdf new file mode 100644 index 000000000..f7743387c Binary files /dev/null and b/05_ReproducibleResearch/lectures/ReproducibleResearchConcepts.pdf differ diff --git a/05_ReproducibleResearch/lectures/knitr.pdf b/05_ReproducibleResearch/lectures/knitr.pdf new file mode 100644 index 000000000..faed3b86b Binary files /dev/null and b/05_ReproducibleResearch/lectures/knitr.pdf differ diff --git a/05_ReproducibleResearch/lectures/organizingADataAnalysis.pdf b/05_ReproducibleResearch/lectures/organizingADataAnalysis.pdf new file mode 100644 index 000000000..b1fa39029 Binary files /dev/null and b/05_ReproducibleResearch/lectures/organizingADataAnalysis.pdf differ diff --git a/05_ReproducibleResearch/lectures/structureOfADataAnalysis1.pdf b/05_ReproducibleResearch/lectures/structureOfADataAnalysis1.pdf new file mode 100644 index 000000000..1c770937d Binary files /dev/null and b/05_ReproducibleResearch/lectures/structureOfADataAnalysis1.pdf differ diff --git a/05_ReproducibleResearch/lectures/structureOfADataAnalysis2.pdf b/05_ReproducibleResearch/lectures/structureOfADataAnalysis2.pdf new file mode 100644 index 000000000..35a2181cd Binary files /dev/null and b/05_ReproducibleResearch/lectures/structureOfADataAnalysis2.pdf differ diff --git a/05_ReproducibleResearch/makefile b/05_ReproducibleResearch/makefile new file mode 100644 index 000000000..07c62cb1e --- /dev/null +++ b/05_ReproducibleResearch/makefile @@ -0,0 +1,24 @@ +DELAY = 1000 +RMD_FILES = $(wildcard */index.Rmd) +HTML_FILES = $(patsubst %.Rmd, %.html, $(RMD_FILES)) +PDF_FILES = $(patsubst %/index.html, lectures/%.pdf, $(HTML_FILES)) + +listfiles: + @echo $(RMD_FILES) + @echo $(HTML_FILES) + @echo $(PDF_FILES) + + +html: $(HTML_FILES) +pdf: $(PDF_FILES) +all: html pdf + +zip: $(PDF_FILES) + zip all_pdf_files.zip $^ + +lectures/%.pdf: %/index.html + casperjs makepdf.js $< $@ $(DELAY) + +%/index.html: %/index.Rmd + cd $(dir $<) && Rscript -e "slidify::slidify('index.Rmd')" && cd .. 
+ diff --git a/05_ReproducibleResearch/makepdf.js b/05_ReproducibleResearch/makepdf.js new file mode 100644 index 000000000..c01526f94 --- /dev/null +++ b/05_ReproducibleResearch/makepdf.js @@ -0,0 +1,10 @@ +var casper = require('casper').create({viewportSize:{width:1500,height:1000}}); +var args = casper.cli.args; +var imgfile = (args[1] || Math.random().toString(36).slice(2)) +casper.start(args[0], function() { + this.wait(args[2], function(){ + this.captureSelector(imgfile, "slides"); + }); +}); + +casper.run(); \ No newline at end of file diff --git a/06_StatisticalInference/01_01_Introduction/index.Rmd b/06_StatisticalInference/01_01_Introduction/index.Rmd index 0ece78b61..74e8b2a1e 100644 --- a/06_StatisticalInference/01_01_Introduction/index.Rmd +++ b/06_StatisticalInference/01_01_Introduction/index.Rmd @@ -1,158 +1,158 @@ ---- -title : Introduction to statistical inference -subtitle : -author : Brian Caffo, Jeff Leek, Roger Peng -job : Johns Hopkins Bloomberg School of Public Health -logo : bloomberg_shield.png -framework : io2012 # {io2012, html5slides, shower, dzslides, ...} -highlighter : highlight.js # {highlight.js, prettify, highlight} -hitheme : tomorrow # -url: - lib: ../../libraries - assets: ../../assets -widgets : [mathjax] # {mathjax, quiz, bootstrap} -mode : selfcontained # {standalone, draft} ---- -## Statistical inference defined - -Statistical inference is the process of drawing formal conclusions from -data. - -In our class, we wil define formal statistical inference as settings where one wants to infer facts about a population using noisy -statistical data where uncertainty must be accounted for. - ---- - -## Motivating example: who's going to win the election? - -In every major election, pollsters would like to know, ahead of the -actual election, who's going to win. Here, the target of -estimation (the estimand) is clear, the percentage of people in -a particular group (city, state, county, country or other electoral -grouping) who will vote for each candidate. - -We can not poll everyone. Even if we could, some polled -may change their vote by the time the election occurs. -How do we collect a reasonable subset of data and quantify the -uncertainty in the process to produce a good guess at who will win? - ---- - -## Motivating example: is hormone replacement therapy effective? - -A large clinical trial (the Women’s Health Initiative) published results in 2002 that contradicted prior evidence on the efficacy of hormone replacement therapy for post menopausal women and suggested a negative impact of HRT for several key health outcomes. **Based on a statistically based protocol, the study was stopped early due an excess number of negative events.** - -Here's there's two inferential problems. - -1. Is HRT effective? -2. How long should we continue the trial in the presence of contrary -evidence? - -See WHI writing group paper JAMA 2002, Vol 288:321 - 333. for the paper and Steinkellner et al. Menopause 2012, Vol 19:616 621 for adiscussion of the long term impacts - ---- - -## Motivating example: ECMO - -In 1985 a group at a major neonatal intensive care center published the results of a trial comparing a standard treatment and a promising new extracorporeal membrane oxygenation treatment (ECMO) for newborn infants with severe respiratory failure. 
**Ethical considerations lead to a statistical randomization scheme whereby one infant received the control therapy, thereby opening the study to sample-size based criticisms.** - -For a review and statistical discussion, see Royall Statistical Science 1991, Vol 6, No. 1, 52-88 - ---- - -## Summary - -- These examples illustrate many of the difficulties of trying -to use data to create general conclusions about a population. -- Paramount among our concerns are: - - Is the sample representative of the population that we'd like to draw inferences about? - - Are there known and observed, known and unobserved or unknown and unobserved variables that contaminate our conclusions? - - Is there systematic bias created by missing data or the design or conduct of the study? - - What randomness exists in the data and how do we use or adjust for it? Here randomness can either be explicit via randomization -or random sampling, or implicit as the aggregation of many complex uknown processes. - - Are we trying to estimate an underlying mechanistic model of phenomena under study? -- Statistical inference requires navigating the set of assumptions and -tools and subsequently thinking about how to draw conclusions from data. - ---- -## Example goals of inference - -1. Estimate and quantify the uncertainty of an estimate of -a population quantity (the proportion of people who will - vote for a candidate). -2. Determine whether a population quantity - is a benchmark value ("is the treatment effective?"). -3. Infer a mechanistic relationship when quantities are measured with - noise ("What is the slope for Hooke's law?") -4. Determine the impact of a policy? ("If we reduce polution levels, - will asthma rates decline?") - - ---- -## Example tools of the trade - -1. Randomization: concerned with balancing unobserved variables that may confound inferences of interest -2. Random sampling: concerned with obtaining data that is representative -of the population of interest -3. Sampling models: concerned with creating a model for the sampling -process, the most common is so called "iid". -4. Hypothesis testing: concerned with decision making in the presence of uncertainty -5. Confidence intervals: concerned with quantifying uncertainty in -estimation -6. Probability models: a formal connection between the data and a population of interest. Often probability models are assumed or are -approximated. -7. Study design: the process of designing an experiment to minimize biases and variability. -8. Nonparametric bootstrapping: the process of using the data to, - with minimal probability model assumptions, create inferences. -9. Permutation, randomization and exchangeability testing: the process -of using data permutations to perform inferences. - ---- -## Different thinking about probability leads to different styles of inference - -We won't spend too much time talking about this, but there are several different -styles of inference. Two broad categories that get discussed a lot are: - -1. Frequency probability: is the long run proportion of - times an event occurs in independent, identically distributed - repetitions. -2. Frequency inference: uses frequency interpretations of probabilities -to control error rates. Answers questions like "What should I decide -given my data controlling the long run proportion of mistakes I make at -a tolerable level." -3. Bayesian probability: is the probability calculus of beliefs, given that beliefs follow certain rules. -4. 
Bayesian inference: the use of Bayesian probability representation -of beliefs to perform inference. Answers questions like "Given my subjective beliefs and the objective information from the data, what -should I believe now?" - -Data scientists tend to fall within shades of gray of these and various other schools of inference. - ---- -## In this class - -* In this class, we will primarily focus on basic sampling models, -basic probability models and frequency style analyses -to create standard inferences. -* Being data scientists, we will also consider some inferential strategies that rely heavily on the observed data, such as permutation testing -and bootstrapping. -* As probability modeling will be our starting point, we first build -up basic probability. - ---- -## Where to learn more on the topics not covered - -1. Explicit use of random sampling in inferences: look in references -on "finite population statistics". Used heavily in polling and -sample surveys. -2. Explicit use of randomization in inferences: look in references -on "causal inference" especially in clinical trials. -3. Bayesian probability and Bayesian statistics: look for basic itroductory books (there are many). -4. Missing data: well covered in biostatistics and econometric -references; look for references to "multiple imputation", a popular tool for -addressing missing data. -5. Study design: consider looking in the subject matter area that - you are interested in; some examples with rich histories in design: - 1. The epidemiological literature is very focused on using study design to investigate public health. - 2. The classical development of study design in agriculture broadly covers design and design principles. - 3. The industrial quality control literature covers design thoroughly. - +--- +title : Introduction to statistical inference +subtitle : Statistical inference +author : Brian Caffo, Jeff Leek, Roger Peng +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../librariesNew + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- +## Statistical inference defined + +Statistical inference is the process of drawing formal conclusions from +data. + +In our class, we wil define formal statistical inference as settings where one wants to infer facts about a population using noisy +statistical data where uncertainty must be accounted for. + +--- + +## Motivating example: who's going to win the election? + +In every major election, pollsters would like to know, ahead of the +actual election, who's going to win. Here, the target of +estimation (the estimand) is clear, the percentage of people in +a particular group (city, state, county, country or other electoral +grouping) who will vote for each candidate. + +We can not poll everyone. Even if we could, some polled +may change their vote by the time the election occurs. +How do we collect a reasonable subset of data and quantify the +uncertainty in the process to produce a good guess at who will win? + +--- + +## Motivating example: is hormone replacement therapy effective? 
+ +A large clinical trial (the Women’s Health Initiative) published results in 2002 that contradicted prior evidence on the efficacy of hormone replacement therapy for post menopausal women and suggested a negative impact of HRT for several key health outcomes. **Based on a statistically based protocol, the study was stopped early due an excess number of negative events.** + +Here's there's two inferential problems. + +1. Is HRT effective? +2. How long should we continue the trial in the presence of contrary +evidence? + +See WHI writing group paper JAMA 2002, Vol 288:321 - 333. for the paper and Steinkellner et al. Menopause 2012, Vol 19:616 621 for adiscussion of the long term impacts + +--- + +## Motivating example: ECMO + +In 1985 a group at a major neonatal intensive care center published the results of a trial comparing a standard treatment and a promising new extracorporeal membrane oxygenation treatment (ECMO) for newborn infants with severe respiratory failure. **Ethical considerations lead to a statistical randomization scheme whereby one infant received the control therapy, thereby opening the study to sample-size based criticisms.** + +For a review and statistical discussion, see Royall Statistical Science 1991, Vol 6, No. 1, 52-88 + +--- + +## Summary + +- These examples illustrate many of the difficulties of trying +to use data to create general conclusions about a population. +- Paramount among our concerns are: + - Is the sample representative of the population that we'd like to draw inferences about? + - Are there known and observed, known and unobserved or unknown and unobserved variables that contaminate our conclusions? + - Is there systematic bias created by missing data or the design or conduct of the study? + - What randomness exists in the data and how do we use or adjust for it? Here randomness can either be explicit via randomization +or random sampling, or implicit as the aggregation of many complex uknown processes. + - Are we trying to estimate an underlying mechanistic model of phenomena under study? +- Statistical inference requires navigating the set of assumptions and +tools and subsequently thinking about how to draw conclusions from data. + +--- +## Example goals of inference + +1. Estimate and quantify the uncertainty of an estimate of +a population quantity (the proportion of people who will + vote for a candidate). +2. Determine whether a population quantity + is a benchmark value ("is the treatment effective?"). +3. Infer a mechanistic relationship when quantities are measured with + noise ("What is the slope for Hooke's law?") +4. Determine the impact of a policy? ("If we reduce polution levels, + will asthma rates decline?") + + +--- +## Example tools of the trade + +1. Randomization: concerned with balancing unobserved variables that may confound inferences of interest +2. Random sampling: concerned with obtaining data that is representative +of the population of interest +3. Sampling models: concerned with creating a model for the sampling +process, the most common is so called "iid". +4. Hypothesis testing: concerned with decision making in the presence of uncertainty +5. Confidence intervals: concerned with quantifying uncertainty in +estimation +6. Probability models: a formal connection between the data and a population of interest. Often probability models are assumed or are +approximated. +7. Study design: the process of designing an experiment to minimize biases and variability. +8. 
Nonparametric bootstrapping: the process of using the data to, + with minimal probability model assumptions, create inferences. +9. Permutation, randomization and exchangeability testing: the process +of using data permutations to perform inferences. + +--- +## Different thinking about probability leads to different styles of inference + +We won't spend too much time talking about this, but there are several different +styles of inference. Two broad categories that get discussed a lot are: + +1. Frequency probability: is the long run proportion of + times an event occurs in independent, identically distributed + repetitions. +2. Frequency inference: uses frequency interpretations of probabilities +to control error rates. Answers questions like "What should I decide +given my data controlling the long run proportion of mistakes I make at +a tolerable level." +3. Bayesian probability: is the probability calculus of beliefs, given that beliefs follow certain rules. +4. Bayesian inference: the use of Bayesian probability representation +of beliefs to perform inference. Answers questions like "Given my subjective beliefs and the objective information from the data, what +should I believe now?" + +Data scientists tend to fall within shades of gray of these and various other schools of inference. + +--- +## In this class + +* In this class, we will primarily focus on basic sampling models, +basic probability models and frequency style analyses +to create standard inferences. +* Being data scientists, we will also consider some inferential strategies that rely heavily on the observed data, such as permutation testing +and bootstrapping. +* As probability modeling will be our starting point, we first build +up basic probability. + +--- +## Where to learn more on the topics not covered + +1. Explicit use of random sampling in inferences: look in references +on "finite population statistics". Used heavily in polling and +sample surveys. +2. Explicit use of randomization in inferences: look in references +on "causal inference" especially in clinical trials. +3. Bayesian probability and Bayesian statistics: look for basic itroductory books (there are many). +4. Missing data: well covered in biostatistics and econometric +references; look for references to "multiple imputation", a popular tool for +addressing missing data. +5. Study design: consider looking in the subject matter area that + you are interested in; some examples with rich histories in design: + 1. The epidemiological literature is very focused on using study design to investigate public health. + 2. The classical development of study design in agriculture broadly covers design and design principles. + 3. The industrial quality control literature covers design thoroughly. + diff --git a/06_StatisticalInference/01_01_Introduction/index.html b/06_StatisticalInference/01_01_Introduction/index.html index 0d4286cc3..391528189 100644 --- a/06_StatisticalInference/01_01_Introduction/index.html +++ b/06_StatisticalInference/01_01_Introduction/index.html @@ -1,302 +1,358 @@ - - - - Introduction to statistical inference - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-

Introduction to statistical inference

-

-

Brian Caffo, Jeff Leek, Roger Peng
Johns Hopkins Bloomberg School of Public Health

-
-
- - - -
-

Statistical inference defined

-
-
-

Statistical inference is the process of drawing formal conclusions from -data.

- -

In our class, we will define formal statistical inference as settings where one wants to infer facts about a population using noisy -statistical data where uncertainty must be accounted for.

- -
- -
- - -
-

Motivating example: who's going to win the election?

-
-
-

In every major election, pollsters would like to know, ahead of the -actual election, who's going to win. Here, the target of -estimation (the estimand) is clear: the percentage of people in -a particular group (city, state, county, country or other electoral -grouping) who will vote for each candidate.

- -

We can not poll everyone. Even if we could, some polled -may change their vote by the time the election occurs. -How do we collect a reasonable subset of data and quantify the -uncertainty in the process to produce a good guess at who will win?

- -
- -
- - -
-

Motivating example: is hormone replacement therapy effective?

-
-
-

A large clinical trial (the Women’s Health Initiative) published results in 2002 that contradicted prior evidence on the efficacy of hormone replacement therapy for post-menopausal women and suggested a negative impact of HRT for several key health outcomes. Based on a statistically based protocol, the study was stopped early due to an excess number of negative events.

- -

Here there are two inferential problems.

- -
    -
  1. Is HRT effective?
  2. -
  3. How long should we continue the trial in the presence of contrary -evidence?
  4. -
- -

See the WHI writing group paper (JAMA 2002, Vol 288:321-333) and Steinkellner et al. (Menopause 2012, Vol 19:616-621) for a discussion of the long-term impacts.

- -
- -
- - -
-

Motivating example: ECMO

-
-
-

In 1985 a group at a major neonatal intensive care center published the results of a trial comparing a standard treatment and a promising new extracorporeal membrane oxygenation treatment (ECMO) for newborn infants with severe respiratory failure. Ethical considerations led to a statistical randomization scheme whereby one infant received the control therapy, thereby opening the study to sample-size based criticisms.

- -

For a review and statistical discussion, see Royall Statistical Science 1991, Vol 6, No. 1, 52-88

- -
- -
- - -
-

Summary

-
-
-
    -
  • These examples illustrate many of the difficulties of trying -to use data to create general conclusions about a population.
  • -
  • Paramount among our concerns are: - -
      -
    • Is the sample representative of the population that we'd like to draw inferences about?
    • -
    • Are there known and observed, known and unobserved or unknown and unobserved variables that contaminate our conclusions?
    • -
    • Is there systematic bias created by missing data or the design or conduct of the study?
    • -
    • What randomness exists in the data and how do we use or adjust for it? Here randomness can either be explicit via randomization -or random sampling, or implicit as the aggregation of many complex unknown processes.
    • -
    • Are we trying to estimate an underlying mechanistic model of phenomena under study?
    • -
  • -
  • Statistical inference requires navigating the set of assumptions and -tools and subsequently thinking about how to draw conclusions from data.
  • -
- -
- -
- - -
-

Example goals of inference

-
-
-
    -
  1. Estimate and quantify the uncertainty of an estimate of -a population quantity (the proportion of people who will -vote for a candidate).
  2. -
  3. Determine whether a population quantity -is a benchmark value ("is the treatment effective?").
  4. -
  5. Infer a mechanistic relationship when quantities are measured with -noise ("What is the slope for Hooke's law?")
  6. -
  7. Determine the impact of a policy ("If we reduce pollution levels, -will asthma rates decline?")
  8. -
- -
- -
- - -
-

Example tools of the trade

-
-
-
    -
  1. Randomization: concerned with balancing unobserved variables that may confound inferences of interest
  2. -
  3. Random sampling: concerned with obtaining data that is representative -of the population of interest
  4. -
  5. Sampling models: concerned with creating a model for the sampling -process, the most common being the so-called "iid" model.
  6. -
  7. Hypothesis testing: concerned with decision making in the presence of uncertainty
  8. -
  9. Confidence intervals: concerned with quantifying uncertainty in -estimation
  10. -
  11. Probability models: a formal connection between the data and a population of interest. Often probability models are assumed or are -approximated.
  12. -
  13. Study design: the process of designing an experiment to minimize biases and variability.
  14. -
  15. Nonparametric bootstrapping: the process of using the data to, -with minimal probability model assumptions, create inferences (a short R sketch follows this slide).
  16. -
  17. Permutation, randomization and exchangeability testing: the process -of using data permutations to perform inferences.
  18. -
- -
- -
- - -
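
The list above names the nonparametric bootstrap and permutation testing only in passing; a minimal R sketch of both ideas on simulated data (everything below is illustrative, not course code) might look like:

```r
# Illustrative only: a percentile-bootstrap interval for a mean on simulated data.
set.seed(1)
x <- rnorm(50, mean = 5)                               # made-up sample
boot_means <- replicate(10000, mean(sample(x, replace = TRUE)))
quantile(boot_means, c(0.025, 0.975))                  # bootstrap 95% interval

# Illustrative only: a permutation test for a difference in group means.
g1 <- rnorm(30, 0); g2 <- rnorm(30, 0.5)
obs <- mean(g2) - mean(g1)
pooled <- c(g1, g2)
perm <- replicate(10000, {
    idx <- sample(length(pooled))
    mean(pooled[idx[31:60]]) - mean(pooled[idx[1:30]])
})
mean(abs(perm) >= abs(obs))                            # two-sided permutation p-value
```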
-

Different thinking about probability leads to different styles of inference

-
-
-

We won't spend too much time talking about this, but there are several different -styles of inference. Two broad categories that get discussed a lot are:

- -
    -
  1. Frequency probability: is the long run proportion of -times an event occurs in independent, identically distributed -repetitions.
  2. -
  3. Frequency inference: uses frequency interpretations of probabilities -to control error rates. Answers questions like "What should I decide -given my data controlling the long run proportion of mistakes I make at -a tolerable level."
  4. -
  5. Bayesian probability: is the probability calculus of beliefs, given that beliefs follow certain rules.
  6. -
  7. Bayesian inference: the use of Bayesian probability representation -of beliefs to perform inference. Answers questions like "Given my subjective beliefs and the objective information from the data, what -should I believe now?"
  8. -
- -

Data scientists tend to fall within shades of gray of these and various other schools of inference.

- -
- -
- - -
-

In this class

-
-
-
    -
  • In this class, we will primarily focus on basic sampling models, -basic probability models and frequency style analyses -to create standard inferences.
  • -
  • Being data scientists, we will also consider some inferential strategies that rely heavily on the observed data, such as permutation testing -and bootstrapping.
  • -
  • As probability modeling will be our starting point, we first build -up basic probability.
  • -
- -
- -
- - -
-

Where to learn more on the topics not covered

-
-
-
    -
  1. Explicit use of random sampling in inferences: look in references -on "finite population statistics". Used heavily in polling and -sample surveys.
  2. -
  3. Explicit use of randomization in inferences: look in references -on "causal inference" especially in clinical trials.
  4. -
  5. Bayesian probability and Bayesian statistics: look for basic introductory books (there are many).
  6. -
  7. Missing data: well covered in biostatistics and econometric -references; look for references to "multiple imputation", a popular tool for -addressing missing data.
  8. -
  9. Study design: consider looking in the subject matter area that -you are interested in; some examples with rich histories in design: - -
      -
    1. The epidemiological literature is very focused on using study design to investigate public health.
    2. -
    3. The classical development of study design in agriculture broadly covers design and design principles.
    4. -
    5. The industrial quality control literature covers design thoroughly.
    6. -
  10. -
- -
- -
- - -
- - - - - - - - - - - - - - - - - \ No newline at end of file + + + + Introduction to statistical inference + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Introduction to statistical inference

+

Statistical inference

+

Brian Caffo, Jeff Leek, Roger Peng
Johns Hopkins Bloomberg School of Public Health

+
+
+
+ + + + +
+

Statistical inference defined

+
+
+

Statistical inference is the process of drawing formal conclusions from +data.

+ +

In our class, we will define formal statistical inference as settings where one wants to infer facts about a population using noisy statistical data, where uncertainty must be accounted for.

+ +
+ +
+ + +
+

Motivating example: who's going to win the election?

+
+
+

In every major election, pollsters would like to know, ahead of the actual election, who's going to win. Here, the target of estimation (the estimand) is clear: the percentage of people in a particular group (city, state, county, country, or other electoral grouping) who will vote for each candidate.

+ +

We cannot poll everyone. Even if we could, some of those polled may change their vote by the time the election occurs. How do we collect a reasonable subset of data and quantify the uncertainty in the process to produce a good guess at who will win?

+ +
+ +
+ + +
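As an aside, a minimal R sketch of the kind of calculation involved, using made-up poll numbers (520 of 1,000 respondents favoring a candidate) and a standard large-sample interval; the data and the interval choice are illustrative assumptions, not part of the original slides.

```r
# Hypothetical poll: 520 of 1000 sampled voters favor candidate A (made-up numbers)
x <- 520; n <- 1000
phat <- x / n                          # point estimate of the population proportion
se <- sqrt(phat * (1 - phat) / n)      # estimated standard error under iid sampling
phat + c(-1, 1) * qnorm(0.975) * se    # approximate 95% interval for the true proportion
binom.test(x, n)$conf.int              # exact binomial interval, as a cross-check
```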
+

Motivating example: is hormone replacement therapy effective?

+
+
+

A large clinical trial (the Women’s Health Initiative) published results in 2002 that contradicted prior evidence on the efficacy of hormone replacement therapy for postmenopausal women and suggested a negative impact of HRT for several key health outcomes. Following a statistically based protocol, the study was stopped early due to an excess number of negative events.

+ +

Here there are two inferential problems.

+ +
    +
  1. Is HRT effective?
  2. +
  3. How long should we continue the trial in the presence of contrary +evidence?
  4. +
+ +

See the WHI writing group paper (JAMA 2002, Vol 288:321-333) and Steinkellner et al. (Menopause 2012, Vol 19:616-621) for a discussion of the long-term impacts.

+ +
+ +
+ + +
+

Motivating example: ECMO

+
+
+

In 1985, a group at a major neonatal intensive care center published the results of a trial comparing a standard treatment and a promising new extracorporeal membrane oxygenation treatment (ECMO) for newborn infants with severe respiratory failure. Ethical considerations led to a statistical randomization scheme whereby one infant received the control therapy, thereby opening the study to sample-size-based criticisms.

+ +

For a review and statistical discussion, see Royall Statistical Science 1991, Vol 6, No. 1, 52-88

+ +
+ +
+ + +
+

Summary

+
+
+
    +
  • These examples illustrate many of the difficulties of trying +to use data to create general conclusions about a population.
  • +
  • Paramount among our concerns are: + +
      +
    • Is the sample representative of the population that we'd like to draw inferences about?
    • +
    • Are there known and observed, known and unobserved or unknown and unobserved variables that contaminate our conclusions?
    • +
    • Is there systematic bias created by missing data or the design or conduct of the study?
    • +
    • What randomness exists in the data and how do we use or adjust for it? Here randomness can either be explicit via randomization or random sampling, or implicit as the aggregation of many complex unknown processes.
    • +
    • Are we trying to estimate an underlying mechanistic model of phenomena under study?
    • +
  • +
  • Statistical inference requires navigating the set of assumptions and +tools and subsequently thinking about how to draw conclusions from data.
  • +
+ +
+ +
+ + +
+

Example goals of inference

+
+
+
    +
  1. Estimate and quantify the uncertainty of an estimate of +a population quantity (the proportion of people who will +vote for a candidate).
  2. +
  3. Determine whether a population quantity +is a benchmark value ("is the treatment effective?").
  4. +
  5. Infer a mechanistic relationship when quantities are measured with noise ("What is the slope for Hooke's law?"; see the sketch after this list)
  6. +
  7. Determine the impact of a policy ("If we reduce pollution levels, will asthma rates decline?")
  8. +
+ +
+ +
+ + +
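For goal 3, a minimal R sketch of inferring a slope from noisy measurements; the true slope, the amount of noise, and the data are all simulated assumptions used only for illustration.

```r
# Simulated Hooke's law data: force = k * stretch + measurement noise (k = 3 assumed)
set.seed(1)
stretch <- seq(0.1, 1, by = 0.1)
force <- 3 * stretch + rnorm(length(stretch), sd = 0.2)
fit <- lm(force ~ stretch)   # least-squares estimate of the slope
coef(fit)                    # estimated intercept and slope
confint(fit, "stretch")      # uncertainty in the estimated slope
```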
+

Example tools of the trade

+
+
+
    +
  1. Randomization: concerned with balancing unobserved variables that may confound inferences of interest
  2. +
  3. Random sampling: concerned with obtaining data that is representative +of the population of interest
  4. +
  5. Sampling models: concerned with creating a model for the sampling process; the most common is so-called "iid" sampling.
  6. +
  7. Hypothesis testing: concerned with decision making in the presence of uncertainty
  8. +
  9. Confidence intervals: concerned with quantifying uncertainty in +estimation
  10. +
  11. Probability models: a formal connection between the data and a population of interest. Often probability models are assumed or are +approximated.
  12. +
  13. Study design: the process of designing an experiment to minimize biases and variability.
  14. +
  15. Nonparametric bootstrapping: the process of using the data, with minimal probability model assumptions, to create inferences (see the sketch after this list).
  16. +
  17. Permutation, randomization and exchangeability testing: the process +of using data permutations to perform inferences.
  18. +
+ +
+ +
+ + +
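For item 15 above, a minimal nonparametric bootstrap sketch in R; the data, the sample size, and the choice of the median as the statistic of interest are made-up illustrative assumptions.

```r
# Nonparametric bootstrap of a sample median (data and sample size are made up)
set.seed(42)
x <- rexp(50)                                      # pretend these are the observed data
B <- 1000                                          # number of bootstrap resamples
boot_medians <- replicate(B, median(sample(x, replace = TRUE)))
quantile(boot_medians, c(0.025, 0.975))            # percentile interval for the median
```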
+

Different thinking about probability leads to different styles of inference

+
+
+

We won't spend too much time talking about this, but there are several different +styles of inference. Two broad categories that get discussed a lot are:

+ +
    +
  1. Frequency probability: the long-run proportion of times an event occurs in independent, identically distributed repetitions.
  2. +
  3. Frequency inference: uses frequency interpretations of probabilities to control error rates. Answers questions like "What should I decide, given my data, while controlling the long-run proportion of mistakes I make at a tolerable level?"
  4. +
  5. Bayesian probability: the probability calculus of beliefs, given that beliefs follow certain rules.
  6. +
  7. Bayesian inference: the use of a Bayesian probability representation of beliefs to perform inference. Answers questions like "Given my subjective beliefs and the objective information from the data, what should I believe now?" (A small sketch contrasting the two styles follows this list.)
  8. +
+ +

Data scientists tend to fall within shades of gray of these and various other schools of inference.

+ +
+ +
+ + +
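As a rough illustration of the two styles, here is a small R sketch comparing an exact frequentist interval with a Bayesian posterior interval for a coin's probability of heads; the data (13 heads in 20 flips) and the uniform prior are arbitrary assumptions chosen only for illustration.

```r
# Hypothetical data: 13 heads in 20 flips
x <- 13; n <- 20
# Frequency-style: exact 95% confidence interval for the probability of a head
binom.test(x, n)$conf.int
# Bayesian-style: with a uniform Beta(1, 1) prior the posterior is Beta(x + 1, n - x + 1);
# the 2.5% and 97.5% posterior quantiles give a 95% credible interval
qbeta(c(0.025, 0.975), x + 1, n - x + 1)
```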
+

In this class

+
+
+
    +
  • In this class, we will primarily focus on basic sampling models, basic probability models, and frequency-style analyses to create standard inferences.
  • +
  • Being data scientists, we will also consider some inferential strategies that rely heavily on the observed data, such as permutation testing and bootstrapping (a permutation-test sketch follows this list).
  • +
  • As probability modeling will be our starting point, we first build +up basic probability.
  • +
+ +
+ +
+ + +
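A minimal permutation-test sketch in R, using two simulated groups; the group sizes, the assumed effect, and the number of permutations are arbitrary illustrative choices.

```r
# Permutation test for a difference in group means (simulated data)
set.seed(7)
g1 <- rnorm(20, mean = 0); g2 <- rnorm(20, mean = 0.5)
observed <- mean(g2) - mean(g1)
combined <- c(g1, g2)
perm_diffs <- replicate(5000, {
  idx <- sample(length(combined), length(g1))   # randomly relabel the observations
  mean(combined[-idx]) - mean(combined[idx])
})
mean(abs(perm_diffs) >= abs(observed))          # two-sided permutation p-value
```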
+

Where to learn more on the topics not covered

+
+
+
    +
  1. Explicit use of random sampling in inferences: look in references +on "finite population statistics". Used heavily in polling and +sample surveys.
  2. +
  3. Explicit use of randomization in inferences: look in references +on "causal inference" especially in clinical trials.
  4. +
  5. Bayesian probability and Bayesian statistics: look for basic introductory books (there are many).
  6. +
  7. Missing data: well covered in biostatistics and econometric +references; look for references to "multiple imputation", a popular tool for +addressing missing data.
  8. +
  9. Study design: consider looking in the subject matter area that +you are interested in; some examples with rich histories in design: + +
      +
    1. The epidemiological literature is very focused on using study design to investigate public health.
    2. +
    3. The classical development of study design in agriculture broadly covers design and design principles.
    4. +
    5. The industrial quality control literature covers design thoroughly.
    6. +
  10. +
+ +
+ +
+ + +
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/06_StatisticalInference/01_01_Introduction/index.md b/06_StatisticalInference/01_01_Introduction/index.md index 0ece78b61..74e8b2a1e 100644 --- a/06_StatisticalInference/01_01_Introduction/index.md +++ b/06_StatisticalInference/01_01_Introduction/index.md @@ -1,158 +1,158 @@ ---- -title : Introduction to statistical inference -subtitle : -author : Brian Caffo, Jeff Leek, Roger Peng -job : Johns Hopkins Bloomberg School of Public Health -logo : bloomberg_shield.png -framework : io2012 # {io2012, html5slides, shower, dzslides, ...} -highlighter : highlight.js # {highlight.js, prettify, highlight} -hitheme : tomorrow # -url: - lib: ../../libraries - assets: ../../assets -widgets : [mathjax] # {mathjax, quiz, bootstrap} -mode : selfcontained # {standalone, draft} ---- -## Statistical inference defined - -Statistical inference is the process of drawing formal conclusions from -data. - -In our class, we wil define formal statistical inference as settings where one wants to infer facts about a population using noisy -statistical data where uncertainty must be accounted for. - ---- - -## Motivating example: who's going to win the election? - -In every major election, pollsters would like to know, ahead of the -actual election, who's going to win. Here, the target of -estimation (the estimand) is clear, the percentage of people in -a particular group (city, state, county, country or other electoral -grouping) who will vote for each candidate. - -We can not poll everyone. Even if we could, some polled -may change their vote by the time the election occurs. -How do we collect a reasonable subset of data and quantify the -uncertainty in the process to produce a good guess at who will win? - ---- - -## Motivating example: is hormone replacement therapy effective? - -A large clinical trial (the Women’s Health Initiative) published results in 2002 that contradicted prior evidence on the efficacy of hormone replacement therapy for post menopausal women and suggested a negative impact of HRT for several key health outcomes. **Based on a statistically based protocol, the study was stopped early due an excess number of negative events.** - -Here's there's two inferential problems. - -1. Is HRT effective? -2. How long should we continue the trial in the presence of contrary -evidence? - -See WHI writing group paper JAMA 2002, Vol 288:321 - 333. for the paper and Steinkellner et al. Menopause 2012, Vol 19:616 621 for adiscussion of the long term impacts - ---- - -## Motivating example: ECMO - -In 1985 a group at a major neonatal intensive care center published the results of a trial comparing a standard treatment and a promising new extracorporeal membrane oxygenation treatment (ECMO) for newborn infants with severe respiratory failure. **Ethical considerations lead to a statistical randomization scheme whereby one infant received the control therapy, thereby opening the study to sample-size based criticisms.** - -For a review and statistical discussion, see Royall Statistical Science 1991, Vol 6, No. 1, 52-88 - ---- - -## Summary - -- These examples illustrate many of the difficulties of trying -to use data to create general conclusions about a population. -- Paramount among our concerns are: - - Is the sample representative of the population that we'd like to draw inferences about? - - Are there known and observed, known and unobserved or unknown and unobserved variables that contaminate our conclusions? 
- - Is there systematic bias created by missing data or the design or conduct of the study? - - What randomness exists in the data and how do we use or adjust for it? Here randomness can either be explicit via randomization -or random sampling, or implicit as the aggregation of many complex uknown processes. - - Are we trying to estimate an underlying mechanistic model of phenomena under study? -- Statistical inference requires navigating the set of assumptions and -tools and subsequently thinking about how to draw conclusions from data. - ---- -## Example goals of inference - -1. Estimate and quantify the uncertainty of an estimate of -a population quantity (the proportion of people who will - vote for a candidate). -2. Determine whether a population quantity - is a benchmark value ("is the treatment effective?"). -3. Infer a mechanistic relationship when quantities are measured with - noise ("What is the slope for Hooke's law?") -4. Determine the impact of a policy? ("If we reduce polution levels, - will asthma rates decline?") - - ---- -## Example tools of the trade - -1. Randomization: concerned with balancing unobserved variables that may confound inferences of interest -2. Random sampling: concerned with obtaining data that is representative -of the population of interest -3. Sampling models: concerned with creating a model for the sampling -process, the most common is so called "iid". -4. Hypothesis testing: concerned with decision making in the presence of uncertainty -5. Confidence intervals: concerned with quantifying uncertainty in -estimation -6. Probability models: a formal connection between the data and a population of interest. Often probability models are assumed or are -approximated. -7. Study design: the process of designing an experiment to minimize biases and variability. -8. Nonparametric bootstrapping: the process of using the data to, - with minimal probability model assumptions, create inferences. -9. Permutation, randomization and exchangeability testing: the process -of using data permutations to perform inferences. - ---- -## Different thinking about probability leads to different styles of inference - -We won't spend too much time talking about this, but there are several different -styles of inference. Two broad categories that get discussed a lot are: - -1. Frequency probability: is the long run proportion of - times an event occurs in independent, identically distributed - repetitions. -2. Frequency inference: uses frequency interpretations of probabilities -to control error rates. Answers questions like "What should I decide -given my data controlling the long run proportion of mistakes I make at -a tolerable level." -3. Bayesian probability: is the probability calculus of beliefs, given that beliefs follow certain rules. -4. Bayesian inference: the use of Bayesian probability representation -of beliefs to perform inference. Answers questions like "Given my subjective beliefs and the objective information from the data, what -should I believe now?" - -Data scientists tend to fall within shades of gray of these and various other schools of inference. - ---- -## In this class - -* In this class, we will primarily focus on basic sampling models, -basic probability models and frequency style analyses -to create standard inferences. -* Being data scientists, we will also consider some inferential strategies that rely heavily on the observed data, such as permutation testing -and bootstrapping. 
-* As probability modeling will be our starting point, we first build -up basic probability. - ---- -## Where to learn more on the topics not covered - -1. Explicit use of random sampling in inferences: look in references -on "finite population statistics". Used heavily in polling and -sample surveys. -2. Explicit use of randomization in inferences: look in references -on "causal inference" especially in clinical trials. -3. Bayesian probability and Bayesian statistics: look for basic itroductory books (there are many). -4. Missing data: well covered in biostatistics and econometric -references; look for references to "multiple imputation", a popular tool for -addressing missing data. -5. Study design: consider looking in the subject matter area that - you are interested in; some examples with rich histories in design: - 1. The epidemiological literature is very focused on using study design to investigate public health. - 2. The classical development of study design in agriculture broadly covers design and design principles. - 3. The industrial quality control literature covers design thoroughly. - +--- +title : Introduction to statistical inference +subtitle : Statistical inference +author : Brian Caffo, Jeff Leek, Roger Peng +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../librariesNew + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- +## Statistical inference defined + +Statistical inference is the process of drawing formal conclusions from +data. + +In our class, we wil define formal statistical inference as settings where one wants to infer facts about a population using noisy +statistical data where uncertainty must be accounted for. + +--- + +## Motivating example: who's going to win the election? + +In every major election, pollsters would like to know, ahead of the +actual election, who's going to win. Here, the target of +estimation (the estimand) is clear, the percentage of people in +a particular group (city, state, county, country or other electoral +grouping) who will vote for each candidate. + +We can not poll everyone. Even if we could, some polled +may change their vote by the time the election occurs. +How do we collect a reasonable subset of data and quantify the +uncertainty in the process to produce a good guess at who will win? + +--- + +## Motivating example: is hormone replacement therapy effective? + +A large clinical trial (the Women’s Health Initiative) published results in 2002 that contradicted prior evidence on the efficacy of hormone replacement therapy for post menopausal women and suggested a negative impact of HRT for several key health outcomes. **Based on a statistically based protocol, the study was stopped early due an excess number of negative events.** + +Here's there's two inferential problems. + +1. Is HRT effective? +2. How long should we continue the trial in the presence of contrary +evidence? + +See WHI writing group paper JAMA 2002, Vol 288:321 - 333. for the paper and Steinkellner et al. 
Menopause 2012, Vol 19:616 621 for adiscussion of the long term impacts + +--- + +## Motivating example: ECMO + +In 1985 a group at a major neonatal intensive care center published the results of a trial comparing a standard treatment and a promising new extracorporeal membrane oxygenation treatment (ECMO) for newborn infants with severe respiratory failure. **Ethical considerations lead to a statistical randomization scheme whereby one infant received the control therapy, thereby opening the study to sample-size based criticisms.** + +For a review and statistical discussion, see Royall Statistical Science 1991, Vol 6, No. 1, 52-88 + +--- + +## Summary + +- These examples illustrate many of the difficulties of trying +to use data to create general conclusions about a population. +- Paramount among our concerns are: + - Is the sample representative of the population that we'd like to draw inferences about? + - Are there known and observed, known and unobserved or unknown and unobserved variables that contaminate our conclusions? + - Is there systematic bias created by missing data or the design or conduct of the study? + - What randomness exists in the data and how do we use or adjust for it? Here randomness can either be explicit via randomization +or random sampling, or implicit as the aggregation of many complex uknown processes. + - Are we trying to estimate an underlying mechanistic model of phenomena under study? +- Statistical inference requires navigating the set of assumptions and +tools and subsequently thinking about how to draw conclusions from data. + +--- +## Example goals of inference + +1. Estimate and quantify the uncertainty of an estimate of +a population quantity (the proportion of people who will + vote for a candidate). +2. Determine whether a population quantity + is a benchmark value ("is the treatment effective?"). +3. Infer a mechanistic relationship when quantities are measured with + noise ("What is the slope for Hooke's law?") +4. Determine the impact of a policy? ("If we reduce polution levels, + will asthma rates decline?") + + +--- +## Example tools of the trade + +1. Randomization: concerned with balancing unobserved variables that may confound inferences of interest +2. Random sampling: concerned with obtaining data that is representative +of the population of interest +3. Sampling models: concerned with creating a model for the sampling +process, the most common is so called "iid". +4. Hypothesis testing: concerned with decision making in the presence of uncertainty +5. Confidence intervals: concerned with quantifying uncertainty in +estimation +6. Probability models: a formal connection between the data and a population of interest. Often probability models are assumed or are +approximated. +7. Study design: the process of designing an experiment to minimize biases and variability. +8. Nonparametric bootstrapping: the process of using the data to, + with minimal probability model assumptions, create inferences. +9. Permutation, randomization and exchangeability testing: the process +of using data permutations to perform inferences. + +--- +## Different thinking about probability leads to different styles of inference + +We won't spend too much time talking about this, but there are several different +styles of inference. Two broad categories that get discussed a lot are: + +1. Frequency probability: is the long run proportion of + times an event occurs in independent, identically distributed + repetitions. +2. 
Frequency inference: uses frequency interpretations of probabilities +to control error rates. Answers questions like "What should I decide +given my data controlling the long run proportion of mistakes I make at +a tolerable level." +3. Bayesian probability: is the probability calculus of beliefs, given that beliefs follow certain rules. +4. Bayesian inference: the use of Bayesian probability representation +of beliefs to perform inference. Answers questions like "Given my subjective beliefs and the objective information from the data, what +should I believe now?" + +Data scientists tend to fall within shades of gray of these and various other schools of inference. + +--- +## In this class + +* In this class, we will primarily focus on basic sampling models, +basic probability models and frequency style analyses +to create standard inferences. +* Being data scientists, we will also consider some inferential strategies that rely heavily on the observed data, such as permutation testing +and bootstrapping. +* As probability modeling will be our starting point, we first build +up basic probability. + +--- +## Where to learn more on the topics not covered + +1. Explicit use of random sampling in inferences: look in references +on "finite population statistics". Used heavily in polling and +sample surveys. +2. Explicit use of randomization in inferences: look in references +on "causal inference" especially in clinical trials. +3. Bayesian probability and Bayesian statistics: look for basic itroductory books (there are many). +4. Missing data: well covered in biostatistics and econometric +references; look for references to "multiple imputation", a popular tool for +addressing missing data. +5. Study design: consider looking in the subject matter area that + you are interested in; some examples with rich histories in design: + 1. The epidemiological literature is very focused on using study design to investigate public health. + 2. The classical development of study design in agriculture broadly covers design and design principles. + 3. The industrial quality control literature covers design thoroughly. + diff --git a/06_StatisticalInference/01_02_Probability/index.Rmd b/06_StatisticalInference/01_02_Probability/index.Rmd index 56a9207e3..c925cc40e 100644 --- a/06_StatisticalInference/01_02_Probability/index.Rmd +++ b/06_StatisticalInference/01_02_Probability/index.Rmd @@ -1,276 +1,277 @@ ---- -title : Probability -subtitle : Statistical Inference -author : Brian Caffo, Jeff Leek, Roger Peng -job : Johns Hopkins Bloomberg School of Public Health -logo : bloomberg_shield.png -framework : io2012 # {io2012, html5slides, shower, dzslides, ...} -highlighter : highlight.js # {highlight.js, prettify, highlight} -hitheme : tomorrow # -url: - lib: ../../libraries - assets: ../../assets -widgets : [mathjax] # {mathjax, quiz, bootstrap} -mode : selfcontained # {standalone, draft} ---- - -## Notation - -- The **sample space**, $\Omega$, is the collection of possible outcomes of an experiment - - Example: die roll $\Omega = \{1,2,3,4,5,6\}$ -- An **event**, say $E$, is a subset of $\Omega$ - - Example: die roll is even $E = \{2,4,6\}$ -- An **elementary** or **simple** event is a particular result - of an experiment - - Example: die roll is a four, $\omega = 4$ -- $\emptyset$ is called the **null event** or the **empty set** - ---- - -## Interpretation of set operations - -Normal set operations have particular interpretations in this setting - -1. 
$\omega \in E$ implies that $E$ occurs when $\omega$ occurs -2. $\omega \not\in E$ implies that $E$ does not occur when $\omega$ occurs -3. $E \subset F$ implies that the occurrence of $E$ implies the occurrence of $F$ -4. $E \cap F$ implies the event that both $E$ and $F$ occur -5. $E \cup F$ implies the event that at least one of $E$ or $F$ occur -6. $E \cap F=\emptyset$ means that $E$ and $F$ are **mutually exclusive**, or cannot both occur -7. $E^c$ or $\bar E$ is the event that $E$ does not occur - ---- - -## Probability - -A **probability measure**, $P$, is a function from the collection of possible events so that the following hold - -1. For an event $E\subset \Omega$, $0 \leq P(E) \leq 1$ -2. $P(\Omega) = 1$ -3. If $E_1$ and $E_2$ are mutually exclusive events - $P(E_1 \cup E_2) = P(E_1) + P(E_2)$. - -Part 3 of the definition implies **finite additivity** - -$$ -P(\cup_{i=1}^n A_i) = \sum_{i=1}^n P(A_i) -$$ -where the $\{A_i\}$ are mutually exclusive. (Note a more general version of -additivity is used in advanced classes.) - - ---- - - -## Example consequences - -- $P(\emptyset) = 0$ -- $P(E) = 1 - P(E^c)$ -- $P(A \cup B) = P(A) + P(B) - P(A \cap B)$ -- if $A \subset B$ then $P(A) \leq P(B)$ -- $P\left(A \cup B\right) = 1 - P(A^c \cap B^c)$ -- $P(A \cap B^c) = P(A) - P(A \cap B)$ -- $P(\cup_{i=1}^n E_i) \leq \sum_{i=1}^n P(E_i)$ -- $P(\cup_{i=1}^n E_i) \geq \max_i P(E_i)$ - ---- - -## Example - -The National Sleep Foundation ([www.sleepfoundation.org](http://www.sleepfoundation.org/)) reports that around 3% of the American population has sleep apnea. They also report that around 10% of the North American and European population has restless leg syndrome. Does this imply that 13% of people will have at least one sleep problems of these sorts? - ---- - -## Example continued - -Answer: No, the events are not mutually exclusive. To elaborate let: - -$$ -\begin{eqnarray*} - A_1 & = & \{\mbox{Person has sleep apnea}\} \\ - A_2 & = & \{\mbox{Person has RLS}\} - \end{eqnarray*} -$$ - -Then - -$$ -\begin{eqnarray*} - P(A_1 \cup A_2 ) & = & P(A_1) + P(A_2) - P(A_1 \cap A_2) \\ - & = & 0.13 - \mbox{Probability of having both} - \end{eqnarray*} -$$ -Likely, some fraction of the population has both. - ---- - -## Random variables - -- A **random variable** is a numerical outcome of an experiment. -- The random variables that we study will come in two varieties, - **discrete** or **continuous**. -- Discrete random variable are random variables that take on only a -countable number of possibilities. - * $P(X = k)$ -- Continuous random variable can take any value on the real line or some subset of the real line. - * $P(X \in A)$ - ---- - -## Examples of variables that can be thought of as random variables - -- The $(0-1)$ outcome of the flip of a coin -- The outcome from the roll of a die -- The BMI of a subject four years after a baseline measurement -- The hypertension status of a subject randomly drawn from a population - ---- - -## PMF - -A probability mass function evaluated at a value corresponds to the -probability that a random variable takes that value. To be a valid -pmf a function, $p$, must satisfy - - 1. $p(x) \geq 0$ for all $x$ - 2. $\sum_{x} p(x) = 1$ - -The sum is taken over all of the possible values for $x$. - ---- - -## Example - -Let $X$ be the result of a coin flip where $X=0$ represents -tails and $X = 1$ represents heads. 
-$$ -p(x) = (1/2)^{x} (1/2)^{1-x} ~~\mbox{ for }~~x = 0,1 -$$ -Suppose that we do not know whether or not the coin is fair; Let -$\theta$ be the probability of a head expressed as a proportion -(between 0 and 1). -$$ -p(x) = \theta^{x} (1 - \theta)^{1-x} ~~\mbox{ for }~~x = 0,1 -$$ - ---- - -## PDF - -A probability density function (pdf), is a function associated with -a continuous random variable - - *Areas under pdfs correspond to probabilities for that random variable* - -To be a valid pdf, a function $f$ must satisfy - -1. $f(x) \geq 0$ for all $x$ - -2. The area under $f(x)$ is one. - ---- -## Example - -Suppose that the proportion of help calls that get addressed in -a random day by a help line is given by -$$ -f(x) = \left\{\begin{array}{ll} - 2 x & \mbox{ for } 1 > x > 0 \\ - 0 & \mbox{ otherwise} -\end{array} \right. -$$ - -Is this a mathematically valid density? - ---- - -```{r, fig.height = 5, fig.width = 5, echo = TRUE, fig.align='center'} -x <- c(-0.5, 0, 1, 1, 1.5); y <- c( 0, 0, 2, 0, 0) -plot(x, y, lwd = 3, frame = FALSE, type = "l") -``` - ---- - -## Example continued - -What is the probability that 75% or less of calls get addressed? - -```{r, fig.height = 5, fig.width = 5, echo = FALSE, fig.align='center'} -plot(x, y, lwd = 3, frame = FALSE, type = "l") -polygon(c(0, .75, .75, 0), c(0, 0, 1.5, 0), lwd = 3, col = "lightblue") -``` - ---- -```{r} -1.5 * .75 / 2 -pbeta(.75, 2, 1) -``` ---- - -## CDF and survival function - -- The **cumulative distribution function** (CDF) of a random variable $X$ is defined as the function -$$ -F(x) = P(X \leq x) -$$ -- This definition applies regardless of whether $X$ is discrete or continuous. -- The **survival function** of a random variable $X$ is defined as -$$ -S(x) = P(X > x) -$$ -- Notice that $S(x) = 1 - F(x)$ -- For continuous random variables, the PDF is the derivative of the CDF - ---- - -## Example - -What are the survival function and CDF from the exponential density considered before? - -For $1 \geq x \geq 0$ -$$ -F(x) = P(X \leq x) = \frac{1}{2} Base \times Height = \frac{1}{2} (x) \times (2 x) = x^2 -$$ - -$$ -S(x) = 1 - x^2 -$$ - -```{r} -pbeta(c(0.4, 0.5, 0.6), 2, 1) -``` - ---- - -## Quantiles - -- The $\alpha^{th}$ **quantile** of a distribution with distribution function $F$ is the point $x_\alpha$ so that -$$ -F(x_\alpha) = \alpha -$$ -- A **percentile** is simply a quantile with $\alpha$ expressed as a percent -- The **median** is the $50^{th}$ percentile - ---- -## Example -- We want to solve $0.5 = F(x) = x^2$ -- Resulting in the solution -```{r, echo = TRUE} -sqrt(0.5) -``` -- Therefore, about `r sqrt(0.5)` of calls being answered on a random day is the median. -- R can approximate quantiles for you for common distributions - -```{r} -qbeta(0.5, 2, 1) -``` - ---- - -## Summary - -- You might be wondering at this point "I've heard of a median before, it didn't require integration. Where's the data?" -- We're referring to are **population quantities**. Therefore, the median being - discussed is the **population median**. -- A probability model connects the data to the population using assumptions. 
-- Therefore the median we're discussing is the **estimand**, the sample median will be the **estimator** \ No newline at end of file +--- +title : Probability +subtitle : Statistical Inference +author : Brian Caffo, Jeff Leek, Roger Peng +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../librariesNew + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- + +## Notation + +- The **sample space**, $\Omega$, is the collection of possible outcomes of an experiment + - Example: die roll $\Omega = \{1,2,3,4,5,6\}$ +- An **event**, say $E$, is a subset of $\Omega$ + - Example: die roll is even $E = \{2,4,6\}$ +- An **elementary** or **simple** event is a particular result + of an experiment + - Example: die roll is a four, $\omega = 4$ +- $\emptyset$ is called the **null event** or the **empty set** + +--- + +## Interpretation of set operations + +Normal set operations have particular interpretations in this setting + +1. $\omega \in E$ implies that $E$ occurs when $\omega$ occurs +2. $\omega \not\in E$ implies that $E$ does not occur when $\omega$ occurs +3. $E \subset F$ implies that the occurrence of $E$ implies the occurrence of $F$ +4. $E \cap F$ implies the event that both $E$ and $F$ occur +5. $E \cup F$ implies the event that at least one of $E$ or $F$ occur +6. $E \cap F=\emptyset$ means that $E$ and $F$ are **mutually exclusive**, or cannot both occur +7. $E^c$ or $\bar E$ is the event that $E$ does not occur + +--- + +## Probability + +A **probability measure**, $P$, is a function from the collection of possible events so that the following hold + +1. For an event $E\subset \Omega$, $0 \leq P(E) \leq 1$ +2. $P(\Omega) = 1$ +3. If $E_1$ and $E_2$ are mutually exclusive events + $P(E_1 \cup E_2) = P(E_1) + P(E_2)$. + +Part 3 of the definition implies **finite additivity** + +$$ +P(\cup_{i=1}^n A_i) = \sum_{i=1}^n P(A_i) +$$ +where the $\{A_i\}$ are mutually exclusive. (Note a more general version of +additivity is used in advanced classes.) + + +--- + + +## Example consequences + +- $P(\emptyset) = 0$ +- $P(E) = 1 - P(E^c)$ +- $P(A \cup B) = P(A) + P(B) - P(A \cap B)$ +- if $A \subset B$ then $P(A) \leq P(B)$ +- $P\left(A \cup B\right) = 1 - P(A^c \cap B^c)$ +- $P(A \cap B^c) = P(A) - P(A \cap B)$ +- $P(\cup_{i=1}^n E_i) \leq \sum_{i=1}^n P(E_i)$ +- $P(\cup_{i=1}^n E_i) \geq \max_i P(E_i)$ + +--- + +## Example + +The National Sleep Foundation ([www.sleepfoundation.org](http://www.sleepfoundation.org/)) reports that around 3% of the American population has sleep apnea. They also report that around 10% of the North American and European population has restless leg syndrome. Does this imply that 13% of people will have at least one sleep problems of these sorts? + +--- + +## Example continued + +Answer: No, the events are not mutually exclusive. To elaborate let: + +$$ +\begin{eqnarray*} + A_1 & = & \{\mbox{Person has sleep apnea}\} \\ + A_2 & = & \{\mbox{Person has RLS}\} + \end{eqnarray*} +$$ + +Then + +$$ +\begin{eqnarray*} + P(A_1 \cup A_2 ) & = & P(A_1) + P(A_2) - P(A_1 \cap A_2) \\ + & = & 0.13 - \mbox{Probability of having both} + \end{eqnarray*} +$$ +Likely, some fraction of the population has both. + +--- + +## Random variables + +- A **random variable** is a numerical outcome of an experiment. 
+- The random variables that we study will come in two varieties, + **discrete** or **continuous**. +- Discrete random variable are random variables that take on only a +countable number of possibilities. + * $P(X = k)$ +- Continuous random variable can take any value on the real line or some subset of the real line. + * $P(X \in A)$ + +--- + +## Examples of variables that can be thought of as random variables + +- The $(0-1)$ outcome of the flip of a coin +- The outcome from the roll of a die +- The BMI of a subject four years after a baseline measurement +- The hypertension status of a subject randomly drawn from a population + +--- + +## PMF + +A probability mass function evaluated at a value corresponds to the +probability that a random variable takes that value. To be a valid +pmf a function, $p$, must satisfy + + 1. $p(x) \geq 0$ for all $x$ + 2. $\sum_{x} p(x) = 1$ + +The sum is taken over all of the possible values for $x$. + +--- + +## Example + +Let $X$ be the result of a coin flip where $X=0$ represents +tails and $X = 1$ represents heads. +$$ +p(x) = (1/2)^{x} (1/2)^{1-x} ~~\mbox{ for }~~x = 0,1 +$$ +Suppose that we do not know whether or not the coin is fair; Let +$\theta$ be the probability of a head expressed as a proportion +(between 0 and 1). +$$ +p(x) = \theta^{x} (1 - \theta)^{1-x} ~~\mbox{ for }~~x = 0,1 +$$ + +--- + +## PDF + +A probability density function (pdf), is a function associated with +a continuous random variable + + *Areas under pdfs correspond to probabilities for that random variable* + +To be a valid pdf, a function $f$ must satisfy + +1. $f(x) \geq 0$ for all $x$ + +2. The area under $f(x)$ is one. + +--- +## Example + +Suppose that the proportion of help calls that get addressed in +a random day by a help line is given by +$$ +f(x) = \left\{\begin{array}{ll} + 2 x & \mbox{ for } 1 > x > 0 \\ + 0 & \mbox{ otherwise} +\end{array} \right. +$$ + +Is this a mathematically valid density? + +--- + +```{r, fig.height = 5, fig.width = 5, echo = TRUE, fig.align='center'} +x <- c(-0.5, 0, 1, 1, 1.5); y <- c( 0, 0, 2, 0, 0) +plot(x, y, lwd = 3, frame = FALSE, type = "l") +``` + +--- + +## Example continued + +What is the probability that 75% or fewer of calls get addressed? + +```{r, fig.height = 5, fig.width = 5, echo = FALSE, fig.align='center'} +plot(x, y, lwd = 3, frame = FALSE, type = "l") +polygon(c(0, .75, .75, 0), c(0, 0, 1.5, 0), lwd = 3, col = "lightblue") +``` + +--- +```{r} +1.5 * .75 / 2 +pbeta(.75, 2, 1) +``` +--- + +## CDF and survival function + +- The **cumulative distribution function** (CDF) of a random variable $X$ is defined as the function +$$ +F(x) = P(X \leq x) +$$ +- This definition applies regardless of whether $X$ is discrete or continuous. +- The **survival function** of a random variable $X$ is defined as +$$ +S(x) = P(X > x) +$$ +- Notice that $S(x) = 1 - F(x)$ +- For continuous random variables, the PDF is the derivative of the CDF + +--- + +## Example + +What are the survival function and CDF from the density considered before? 
+ +For $1 \geq x \geq 0$ +$$ +F(x) = P(X \leq x) = \frac{1}{2} Base \times Height = \frac{1}{2} (x) \times (2 x) = x^2 +$$ + +$$ +S(x) = 1 - x^2 +$$ + +```{r} +pbeta(c(0.4, 0.5, 0.6), 2, 1) +``` + +--- + +## Quantiles + +- The $\alpha^{th}$ **quantile** of a distribution with distribution function $F$ is the point $x_\alpha$ so that +$$ +F(x_\alpha) = \alpha +$$ +- A **percentile** is simply a quantile with $\alpha$ expressed as a percent +- The **median** is the $50^{th}$ percentile + +--- +## Example +- We want to solve $0.5 = F(x) = x^2$ +- Resulting in the solution +```{r, echo = TRUE} +sqrt(0.5) +``` +- Therefore, about `r sqrt(0.5)` of calls being answered on a random day is the median. +- R can approximate quantiles for you for common distributions + +```{r} +qbeta(0.5, 2, 1) +``` + +--- + +## Summary + +- You might be wondering at this point "I've heard of a median before, it didn't require integration. Where's the data?" +- We're referring to are **population quantities**. Therefore, the median being + discussed is the **population median**. +- A probability model connects the data to the population using assumptions. +- Therefore the median we're discussing is the **estimand**, the sample median will be the **estimator** + diff --git a/06_StatisticalInference/01_02_Probability/index.html b/06_StatisticalInference/01_02_Probability/index.html index e6f6d97c1..8e224deef 100644 --- a/06_StatisticalInference/01_02_Probability/index.html +++ b/06_StatisticalInference/01_02_Probability/index.html @@ -1,507 +1,617 @@ - - - - Probability - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-

Probability

-

Statistical Inference

-

Brian Caffo, Jeff Leek, Roger Peng
Johns Hopkins Bloomberg School of Public Health

-
-
- - - -
-

Notation

-
-
-
    -
  • The sample space, \(\Omega\), is the collection of possible outcomes of an experiment - -
      -
    • Example: die roll \(\Omega = \{1,2,3,4,5,6\}\)
    • -
  • -
  • An event, say \(E\), is a subset of \(\Omega\) - -
      -
    • Example: die roll is even \(E = \{2,4,6\}\)
    • -
  • -
  • An elementary or simple event is a particular result -of an experiment - -
      -
    • Example: die roll is a four, \(\omega = 4\)
    • -
  • -
  • \(\emptyset\) is called the null event or the empty set
  • -
- -
- -
- - -
-

Interpretation of set operations

-
-
-

Normal set operations have particular interpretations in this setting

- -
    -
  1. \(\omega \in E\) implies that \(E\) occurs when \(\omega\) occurs
  2. -
  3. \(\omega \not\in E\) implies that \(E\) does not occur when \(\omega\) occurs
  4. -
  5. \(E \subset F\) implies that the occurrence of \(E\) implies the occurrence of \(F\)
  6. -
  7. \(E \cap F\) implies the event that both \(E\) and \(F\) occur
  8. -
  9. \(E \cup F\) implies the event that at least one of \(E\) or \(F\) occur
  10. -
  11. \(E \cap F=\emptyset\) means that \(E\) and \(F\) are mutually exclusive, or cannot both occur
  12. -
  13. \(E^c\) or \(\bar E\) is the event that \(E\) does not occur
  14. -
- -
- -
- - -
-

Probability

-
-
-

A probability measure, \(P\), is a function from the collection of possible events so that the following hold

- -
    -
  1. For an event \(E\subset \Omega\), \(0 \leq P(E) \leq 1\)
  2. -
  3. \(P(\Omega) = 1\)
  4. -
  5. If \(E_1\) and \(E_2\) are mutually exclusive events -\(P(E_1 \cup E_2) = P(E_1) + P(E_2)\).
  6. -
- -

Part 3 of the definition implies finite additivity

- -

\[ -P(\cup_{i=1}^n A_i) = \sum_{i=1}^n P(A_i) -\] -where the \(\{A_i\}\) are mutually exclusive. (Note a more general version of -additivity is used in advanced classes.)

- -
- -
- - -
-

Example consequences

-
-
-
    -
  • \(P(\emptyset) = 0\)
  • -
  • \(P(E) = 1 - P(E^c)\)
  • -
  • \(P(A \cup B) = P(A) + P(B) - P(A \cap B)\)
  • -
  • if \(A \subset B\) then \(P(A) \leq P(B)\)
  • -
  • \(P\left(A \cup B\right) = 1 - P(A^c \cap B^c)\)
  • -
  • \(P(A \cap B^c) = P(A) - P(A \cap B)\)
  • -
  • \(P(\cup_{i=1}^n E_i) \leq \sum_{i=1}^n P(E_i)\)
  • -
  • \(P(\cup_{i=1}^n E_i) \geq \max_i P(E_i)\)
  • -
- -
- -
- - -
-

Example

-
-
-

The National Sleep Foundation (www.sleepfoundation.org) reports that around 3% of the American population has sleep apnea. They also report that around 10% of the North American and European population has restless leg syndrome. Does this imply that 13% of people will have at least one of these sleep problems?

- -
- -
- - -
-

Example continued

-
-
-

Answer: No, the events are not mutually exclusive. To elaborate let:

- -

\[ -\begin{eqnarray*} - A_1 & = & \{\mbox{Person has sleep apnea}\} \\ - A_2 & = & \{\mbox{Person has RLS}\} - \end{eqnarray*} -\]

- -

Then

- -

\[ -\begin{eqnarray*} - P(A_1 \cup A_2 ) & = & P(A_1) + P(A_2) - P(A_1 \cap A_2) \\ - & = & 0.13 - \mbox{Probability of having both} - \end{eqnarray*} -\] -Likely, some fraction of the population has both.

- -
- -
- - -
-

Random variables

-
-
-
    -
  • A random variable is a numerical outcome of an experiment.
  • -
  • The random variables that we study will come in two varieties, -discrete or continuous.
  • -
  • Discrete random variables are random variables that take on only a countable number of possibilities. - -
      -
    • \(P(X = k)\)
    • -
  • -
  • Continuous random variables can take any value on the real line or some subset of the real line. - -
      -
    • \(P(X \in A)\)
    • -
  • -
- -
- -
- - -
-

Examples of variables that can be thought of as random variables

-
-
-
    -
  • The \((0-1)\) outcome of the flip of a coin
  • -
  • The outcome from the roll of a die
  • -
  • The BMI of a subject four years after a baseline measurement
  • -
  • The hypertension status of a subject randomly drawn from a population
  • -
- -
- -
- - -
-

PMF

-
-
-

A probability mass function evaluated at a value corresponds to the -probability that a random variable takes that value. To be a valid -pmf a function, \(p\), must satisfy

- -
    -
  1. \(p(x) \geq 0\) for all \(x\)
  2. -
  3. \(\sum_{x} p(x) = 1\)
  4. -
- -

The sum is taken over all of the possible values for \(x\).

- -
- -
- - -
-

Example

-
-
-

Let \(X\) be the result of a coin flip where \(X=0\) represents -tails and \(X = 1\) represents heads. -\[ -p(x) = (1/2)^{x} (1/2)^{1-x} ~~\mbox{ for }~~x = 0,1 -\] -Suppose that we do not know whether or not the coin is fair; Let -\(\theta\) be the probability of a head expressed as a proportion -(between 0 and 1). -\[ -p(x) = \theta^{x} (1 - \theta)^{1-x} ~~\mbox{ for }~~x = 0,1 -\]

- -
- -
- - -
-

PDF

-
-
-

A probability density function (pdf), is a function associated with -a continuous random variable

- -

Areas under pdfs correspond to probabilities for that random variable

- -

To be a valid pdf, a function \(f\) must satisfy

- -
    -
  1. \(f(x) \geq 0\) for all \(x\)

  2. -
  3. The area under \(f(x)\) is one.

  4. -
- -
- -
- - -
-

Example

-
-
-

Suppose that the proportion of help calls that get addressed in -a random day by a help line is given by -\[ -f(x) = \left\{\begin{array}{ll} - 2 x & \mbox{ for } 1 > x > 0 \\ - 0 & \mbox{ otherwise} -\end{array} \right. -\]

- -

Is this a mathematically valid density?

- -
- -
- - -
- -
-
-
x <- c(-0.5, 0, 1, 1, 1.5)
-y <- c(0, 0, 2, 0, 0)
-plot(x, y, lwd = 3, frame = FALSE, type = "l")
-
- -

plot of chunk unnamed-chunk-1

- -
- -
- - -
-

Example continued

-
-
-

What is the probability that 75% or fewer of calls get addressed?

- -

plot of chunk unnamed-chunk-2

- -
- -
- - -
- -
-
-
1.5 * 0.75/2
-
- -
## [1] 0.5625
-
- -
pbeta(0.75, 2, 1)
-
- -
## [1] 0.5625
-
- -
- -
- - -
-

CDF and survival function

-
-
-
    -
  • The cumulative distribution function (CDF) of a random variable \(X\) is defined as the function -\[ -F(x) = P(X \leq x) -\]
  • -
  • This definition applies regardless of whether \(X\) is discrete or continuous.
  • -
  • The survival function of a random variable \(X\) is defined as -\[ -S(x) = P(X > x) -\]
  • -
  • Notice that \(S(x) = 1 - F(x)\)
  • -
  • For continuous random variables, the PDF is the derivative of the CDF
  • -
- -
- -
- - -
-

Example

-
-
-

What are the survival function and CDF from the density considered before?

- -

For \(1 \geq x \geq 0\) -\[ -F(x) = P(X \leq x) = \frac{1}{2} Base \times Height = \frac{1}{2} (x) \times (2 x) = x^2 -\]

- -

\[ -S(x) = 1 - x^2 -\]

- -
pbeta(c(0.4, 0.5, 0.6), 2, 1)
-
- -
## [1] 0.16 0.25 0.36
-
- -
- -
- - -
-

Quantiles

-
-
-
    -
  • The \(\alpha^{th}\) quantile of a distribution with distribution function \(F\) is the point \(x_\alpha\) so that -\[ -F(x_\alpha) = \alpha -\]
  • -
  • A percentile is simply a quantile with \(\alpha\) expressed as a percent
  • -
  • The median is the \(50^{th}\) percentile
  • -
- -
- -
- - -
-

Example

-
-
-
    -
  • We want to solve \(0.5 = F(x) = x^2\)
  • -
  • Resulting in the solution
  • -
- -
sqrt(0.5)
-
- -
## [1] 0.7071
-
- -
    -
  • Therefore, the median proportion of calls answered on a random day is about 0.7071.
  • -
  • R can approximate quantiles for you for common distributions
  • -
- -
qbeta(0.5, 2, 1)
-
- -
## [1] 0.7071
-
- -
- -
- - -
-

Summary

-
-
-
    -
  • You might be wondering at this point "I've heard of a median before, it didn't require integration. Where's the data?"
  • -
  • We're referring to population quantities. Therefore, the median being discussed is the population median.
  • -
  • A probability model connects the data to the population using assumptions.
  • -
  • Therefore, the median we're discussing is the estimand; the sample median will be the estimator
  • -
- -
- -
- - -
- - - - - - - - - - - - - - - - - \ No newline at end of file + + + + Probability + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Probability

+

Statistical Inference

+

Brian Caffo, Jeff Leek, Roger Peng
Johns Hopkins Bloomberg School of Public Health

+
+
+
+ + + + +
+

Notation

+
+
+
    +
  • The sample space, \(\Omega\), is the collection of possible outcomes of an experiment + +
      +
    • Example: die roll \(\Omega = \{1,2,3,4,5,6\}\)
    • +
  • +
  • An event, say \(E\), is a subset of \(\Omega\) + +
      +
    • Example: die roll is even \(E = \{2,4,6\}\)
    • +
  • +
  • An elementary or simple event is a particular result +of an experiment + +
      +
    • Example: die roll is a four, \(\omega = 4\)
    • +
  • +
  • \(\emptyset\) is called the null event or the empty set
  • +
+ +
+ +
+ + +
+

Interpretation of set operations

+
+
+

Normal set operations have particular interpretations in this setting

+ +
    +
  1. \(\omega \in E\) implies that \(E\) occurs when \(\omega\) occurs
  2. +
  3. \(\omega \not\in E\) implies that \(E\) does not occur when \(\omega\) occurs
  4. +
  5. \(E \subset F\) implies that the occurrence of \(E\) implies the occurrence of \(F\)
  6. +
  7. \(E \cap F\) implies the event that both \(E\) and \(F\) occur
  8. +
  9. \(E \cup F\) implies the event that at least one of \(E\) or \(F\) occur
  10. +
  11. \(E \cap F=\emptyset\) means that \(E\) and \(F\) are mutually exclusive, or cannot both occur
  12. +
  13. \(E^c\) or \(\bar E\) is the event that \(E\) does not occur
  14. +
+ +
+ +
+ + +
+

Probability

+
+
+

A probability measure, \(P\), is a function from the collection of possible events so that the following hold

+ +
    +
  1. For an event \(E\subset \Omega\), \(0 \leq P(E) \leq 1\)
  2. +
  3. \(P(\Omega) = 1\)
  4. +
  5. If \(E_1\) and \(E_2\) are mutually exclusive events +\(P(E_1 \cup E_2) = P(E_1) + P(E_2)\).
  6. +
+ +

Part 3 of the definition implies finite additivity

+ +

\[ +P(\cup_{i=1}^n A_i) = \sum_{i=1}^n P(A_i) +\] +where the \(\{A_i\}\) are mutually exclusive. (Note a more general version of +additivity is used in advanced classes.)

+ +
+ +
+ + +
+

Example consequences

+
+
+
    +
  • \(P(\emptyset) = 0\)
  • +
  • \(P(E) = 1 - P(E^c)\)
  • +
  • \(P(A \cup B) = P(A) + P(B) - P(A \cap B)\)
  • +
  • if \(A \subset B\) then \(P(A) \leq P(B)\)
  • +
  • \(P\left(A \cup B\right) = 1 - P(A^c \cap B^c)\)
  • +
  • \(P(A \cap B^c) = P(A) - P(A \cap B)\)
  • +
  • \(P(\cup_{i=1}^n E_i) \leq \sum_{i=1}^n P(E_i)\)
  • +
  • \(P(\cup_{i=1}^n E_i) \geq \max_i P(E_i)\)
  • +
+ +
+ +
+ + +
+

Example

+
+
+

The National Sleep Foundation (www.sleepfoundation.org) reports that around 3% of the American population has sleep apnea. They also report that around 10% of the North American and European population has restless leg syndrome. Does this imply that 13% of people will have at least one of these sleep problems?

+ +
+ +
+ + +
+

Example continued

+
+
+

Answer: No, the events are not mutually exclusive. To elaborate let:

+ +

\[ +\begin{eqnarray*} + A_1 & = & \{\mbox{Person has sleep apnea}\} \\ + A_2 & = & \{\mbox{Person has RLS}\} + \end{eqnarray*} +\]

+ +

Then

+ +

\[ +\begin{eqnarray*} + P(A_1 \cup A_2 ) & = & P(A_1) + P(A_2) - P(A_1 \cap A_2) \\ + & = & 0.13 - \mbox{Probability of having both} + \end{eqnarray*} +\] +Likely, some fraction of the population has both.

+ +
+ +
+ + +
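A one-line R check of the inclusion-exclusion calculation above, using a purely hypothetical 1% overlap between the two conditions (the overlap value is not from the slides).

```r
p_apnea <- 0.03; p_rls <- 0.10; p_both <- 0.01   # the 0.01 overlap is a made-up assumption
p_apnea + p_rls - p_both                         # P(A1 union A2) = 0.12, not 0.13
```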
+

Random variables

+
+
+
    +
  • A random variable is a numerical outcome of an experiment.
  • +
  • The random variables that we study will come in two varieties, +discrete or continuous.
  • +
  • Discrete random variables are random variables that take on only a countable number of possibilities. + +
      +
    • \(P(X = k)\)
    • +
  • +
  • Continuous random variables can take any value on the real line or some subset of the real line. + +
      +
    • \(P(X \in A)\)
    • +
  • +
+ +
+ +
+ + +
+

Examples of variables that can be thought of as random variables

+
+
+
    +
  • The \((0-1)\) outcome of the flip of a coin
  • +
  • The outcome from the roll of a die
  • +
  • The BMI of a subject four years after a baseline measurement
  • +
  • The hypertension status of a subject randomly drawn from a population
  • +
+ +
+ +
+ + +
+

PMF

+
+
+

A probability mass function evaluated at a value corresponds to the +probability that a random variable takes that value. To be a valid +pmf a function, \(p\), must satisfy

+ +
    +
  1. \(p(x) \geq 0\) for all \(x\)
  2. +
  3. \(\sum_{x} p(x) = 1\)
  4. +
+ +

The sum is taken over all of the possible values for \(x\).

+ +
+ +
+ + +
+

Example

+
+
+

Let \(X\) be the result of a coin flip where \(X=0\) represents tails and \(X = 1\) represents heads.
\[
p(x) = (1/2)^{x} (1/2)^{1-x} ~~\mbox{ for }~~x = 0,1
\]
Suppose that we do not know whether or not the coin is fair; let \(\theta\) be the probability of a head expressed as a proportion (between 0 and 1).
\[
p(x) = \theta^{x} (1 - \theta)^{1-x} ~~\mbox{ for }~~x = 0,1
\]

+ +
+ +
+ + +
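In R, this pmf can be evaluated with dbinom (a Bernoulli variable is a binomial with size 1); the value of \(\theta\) below is an arbitrary assumption used only to show that the probabilities sum to one.

```r
theta <- 0.3                                  # hypothetical probability of a head
dbinom(c(0, 1), size = 1, prob = theta)       # p(0) and p(1)
sum(dbinom(c(0, 1), size = 1, prob = theta))  # sums to 1, so p is a valid pmf
```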
+

PDF

+
+
+

A probability density function (pdf), is a function associated with +a continuous random variable

+ +

Areas under pdfs correspond to probabilities for that random variable

+ +

To be a valid pdf, a function \(f\) must satisfy

+ +
    +
  1. \(f(x) \geq 0\) for all \(x\)

  2. +
  3. The area under \(f(x)\) is one.

  4. +
+ +
+ +
+ + +
+

Example

+
+
+

Suppose that the proportion of help calls that get addressed in +a random day by a help line is given by +\[ +f(x) = \left\{\begin{array}{ll} + 2 x & \mbox{ for } 1 > x > 0 \\ + 0 & \mbox{ otherwise} +\end{array} \right. +\]

+ +

Is this a mathematically valid density?

+ +
+ +
+ + +
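One quick way to check the requirements numerically in R; the density is the one from the slide, and using integrate is just one way to verify the area.

```r
f <- function(x) 2 * x              # the candidate density from above
integrate(f, lower = 0, upper = 1)  # area under f on (0, 1); should be 1
```

Since f is also nonnegative on (0, 1), it is a valid density.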
+
x <- c(-0.5, 0, 1, 1, 1.5)
+y <- c(0, 0, 2, 0, 0)
+plot(x, y, lwd = 3, frame = FALSE, type = "l")
+
+ +

plot of chunk unnamed-chunk-1

+ +
+ +
+ + +
+

Example continued

+
+
+

What is the probability that 75% or fewer of calls get addressed?

+ +

plot of chunk unnamed-chunk-2

+ +
+ +
+ + +
+
1.5 * 0.75/2
+
+ +
## [1] 0.5625
+
+ +
pbeta(0.75, 2, 1)
+
+ +
## [1] 0.5625
+
+ +
+ +
+ + +
+

CDF and survival function

+
+
+
    +
  • The cumulative distribution function (CDF) of a random variable \(X\) is defined as the function +\[ +F(x) = P(X \leq x) +\]
  • +
  • This definition applies regardless of whether \(X\) is discrete or continuous.
  • +
  • The survival function of a random variable \(X\) is defined as +\[ +S(x) = P(X > x) +\]
  • +
  • Notice that \(S(x) = 1 - F(x)\)
  • +
  • For continuous random variables, the PDF is the derivative of the CDF
  • +
+ +
+ +
+ + +
+

Example

+
+
+

What are the survival function and CDF from the density considered before?

+ +

For \(1 \geq x \geq 0\) +\[ +F(x) = P(X \leq x) = \frac{1}{2} Base \times Height = \frac{1}{2} (x) \times (2 x) = x^2 +\]

+ +

\[ +S(x) = 1 - x^2 +\]

+ +
pbeta(c(0.4, 0.5, 0.6), 2, 1)
+
+ +
## [1] 0.16 0.25 0.36
+
+ +
+ +
+ + +
+

Quantiles

+
+
+
    +
  • The \(\alpha^{th}\) quantile of a distribution with distribution function \(F\) is the point \(x_\alpha\) so that +\[ +F(x_\alpha) = \alpha +\]
  • +
  • A percentile is simply a quantile with \(\alpha\) expressed as a percent
  • +
  • The median is the \(50^{th}\) percentile
  • +
+ +
+ +
+ + +
+

Example

+
+
+
    +
  • We want to solve \(0.5 = F(x) = x^2\)
  • +
  • Resulting in the solution
  • +
+ +
sqrt(0.5)
+
+ +
## [1] 0.7071
+
+ +
    +
  • Therefore, the median proportion of calls answered on a random day is about 0.7071.
  • +
  • R can approximate quantiles for you for common distributions
  • +
+ +
qbeta(0.5, 2, 1)
+
+ +
## [1] 0.7071
+
+ +
+ +
+ + +
+
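For distributions without a closed-form quantile function, the same equation can be solved numerically (an illustrative sketch using base R's `uniroot`, not part of the original slides):

```r
cdf <- function(x) x^2                             # CDF derived on the previous slide
uniroot(function(x) cdf(x) - 0.5, c(0, 1))$root    # about 0.7071
```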

Summary

  • You might be wondering at this point "I've heard of a median before, it didn't require integration. Where's the data?"
  • The quantities we're referring to are population quantities. Therefore, the median being discussed is the population median.
  • A probability model connects the data to the population using assumptions.
  • Therefore, the median we're discussing is the estimand; the sample median will be the estimator.
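To make the estimand/estimator distinction concrete (a simulated sketch, not part of the original slides), the sample median of draws from this Beta(2, 1) model should sit close to the population median \(\sqrt{0.5} \approx 0.7071\):

```r
set.seed(1)
x <- rbeta(10000, 2, 1)  # simulated "data" from the assumed probability model
median(x)                # sample median (the estimator), close to 0.7071 (the estimand)
```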
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/06_StatisticalInference/01_02_Probability/index.md b/06_StatisticalInference/01_02_Probability/index.md index 86a05088d..61a470797 100644 --- a/06_StatisticalInference/01_02_Probability/index.md +++ b/06_StatisticalInference/01_02_Probability/index.md @@ -1,310 +1,311 @@ ---- -title : Probability -subtitle : Statistical Inference -author : Brian Caffo, Jeff Leek, Roger Peng -job : Johns Hopkins Bloomberg School of Public Health -logo : bloomberg_shield.png -framework : io2012 # {io2012, html5slides, shower, dzslides, ...} -highlighter : highlight.js # {highlight.js, prettify, highlight} -hitheme : tomorrow # -url: - lib: ../../libraries - assets: ../../assets -widgets : [mathjax] # {mathjax, quiz, bootstrap} -mode : selfcontained # {standalone, draft} ---- - -## Notation - -- The **sample space**, $\Omega$, is the collection of possible outcomes of an experiment - - Example: die roll $\Omega = \{1,2,3,4,5,6\}$ -- An **event**, say $E$, is a subset of $\Omega$ - - Example: die roll is even $E = \{2,4,6\}$ -- An **elementary** or **simple** event is a particular result - of an experiment - - Example: die roll is a four, $\omega = 4$ -- $\emptyset$ is called the **null event** or the **empty set** - ---- - -## Interpretation of set operations - -Normal set operations have particular interpretations in this setting - -1. $\omega \in E$ implies that $E$ occurs when $\omega$ occurs -2. $\omega \not\in E$ implies that $E$ does not occur when $\omega$ occurs -3. $E \subset F$ implies that the occurrence of $E$ implies the occurrence of $F$ -4. $E \cap F$ implies the event that both $E$ and $F$ occur -5. $E \cup F$ implies the event that at least one of $E$ or $F$ occur -6. $E \cap F=\emptyset$ means that $E$ and $F$ are **mutually exclusive**, or cannot both occur -7. $E^c$ or $\bar E$ is the event that $E$ does not occur - ---- - -## Probability - -A **probability measure**, $P$, is a function from the collection of possible events so that the following hold - -1. For an event $E\subset \Omega$, $0 \leq P(E) \leq 1$ -2. $P(\Omega) = 1$ -3. If $E_1$ and $E_2$ are mutually exclusive events - $P(E_1 \cup E_2) = P(E_1) + P(E_2)$. - -Part 3 of the definition implies **finite additivity** - -$$ -P(\cup_{i=1}^n A_i) = \sum_{i=1}^n P(A_i) -$$ -where the $\{A_i\}$ are mutually exclusive. (Note a more general version of -additivity is used in advanced classes.) - - ---- - - -## Example consequences - -- $P(\emptyset) = 0$ -- $P(E) = 1 - P(E^c)$ -- $P(A \cup B) = P(A) + P(B) - P(A \cap B)$ -- if $A \subset B$ then $P(A) \leq P(B)$ -- $P\left(A \cup B\right) = 1 - P(A^c \cap B^c)$ -- $P(A \cap B^c) = P(A) - P(A \cap B)$ -- $P(\cup_{i=1}^n E_i) \leq \sum_{i=1}^n P(E_i)$ -- $P(\cup_{i=1}^n E_i) \geq \max_i P(E_i)$ - ---- - -## Example - -The National Sleep Foundation ([www.sleepfoundation.org](http://www.sleepfoundation.org/)) reports that around 3% of the American population has sleep apnea. They also report that around 10% of the North American and European population has restless leg syndrome. Does this imply that 13% of people will have at least one sleep problems of these sorts? - ---- - -## Example continued - -Answer: No, the events are not mutually exclusive. 
To elaborate let: - -$$ -\begin{eqnarray*} - A_1 & = & \{\mbox{Person has sleep apnea}\} \\ - A_2 & = & \{\mbox{Person has RLS}\} - \end{eqnarray*} -$$ - -Then - -$$ -\begin{eqnarray*} - P(A_1 \cup A_2 ) & = & P(A_1) + P(A_2) - P(A_1 \cap A_2) \\ - & = & 0.13 - \mbox{Probability of having both} - \end{eqnarray*} -$$ -Likely, some fraction of the population has both. - ---- - -## Random variables - -- A **random variable** is a numerical outcome of an experiment. -- The random variables that we study will come in two varieties, - **discrete** or **continuous**. -- Discrete random variable are random variables that take on only a -countable number of possibilities. - * $P(X = k)$ -- Continuous random variable can take any value on the real line or some subset of the real line. - * $P(X \in A)$ - ---- - -## Examples of variables that can be thought of as random variables - -- The $(0-1)$ outcome of the flip of a coin -- The outcome from the roll of a die -- The BMI of a subject four years after a baseline measurement -- The hypertension status of a subject randomly drawn from a population - ---- - -## PMF - -A probability mass function evaluated at a value corresponds to the -probability that a random variable takes that value. To be a valid -pmf a function, $p$, must satisfy - - 1. $p(x) \geq 0$ for all $x$ - 2. $\sum_{x} p(x) = 1$ - -The sum is taken over all of the possible values for $x$. - ---- - -## Example - -Let $X$ be the result of a coin flip where $X=0$ represents -tails and $X = 1$ represents heads. -$$ -p(x) = (1/2)^{x} (1/2)^{1-x} ~~\mbox{ for }~~x = 0,1 -$$ -Suppose that we do not know whether or not the coin is fair; Let -$\theta$ be the probability of a head expressed as a proportion -(between 0 and 1). -$$ -p(x) = \theta^{x} (1 - \theta)^{1-x} ~~\mbox{ for }~~x = 0,1 -$$ - ---- - -## PDF - -A probability density function (pdf), is a function associated with -a continuous random variable - - *Areas under pdfs correspond to probabilities for that random variable* - -To be a valid pdf, a function $f$ must satisfy - -1. $f(x) \geq 0$ for all $x$ - -2. The area under $f(x)$ is one. - ---- -## Example - -Suppose that the proportion of help calls that get addressed in -a random day by a help line is given by -$$ -f(x) = \left\{\begin{array}{ll} - 2 x & \mbox{ for } 1 > x > 0 \\ - 0 & \mbox{ otherwise} -\end{array} \right. -$$ - -Is this a mathematically valid density? - ---- - - -```r -x <- c(-0.5, 0, 1, 1, 1.5) -y <- c(0, 0, 2, 0, 0) -plot(x, y, lwd = 3, frame = FALSE, type = "l") -``` - -![plot of chunk unnamed-chunk-1](figure/unnamed-chunk-1.png) - - ---- - -## Example continued - -What is the probability that 75% or less of calls get addressed? - -![plot of chunk unnamed-chunk-2](figure/unnamed-chunk-2.png) - - ---- - -```r -1.5 * 0.75/2 -``` - -``` -## [1] 0.5625 -``` - -```r -pbeta(0.75, 2, 1) -``` - -``` -## [1] 0.5625 -``` - ---- - -## CDF and survival function - -- The **cumulative distribution function** (CDF) of a random variable $X$ is defined as the function -$$ -F(x) = P(X \leq x) -$$ -- This definition applies regardless of whether $X$ is discrete or continuous. -- The **survival function** of a random variable $X$ is defined as -$$ -S(x) = P(X > x) -$$ -- Notice that $S(x) = 1 - F(x)$ -- For continuous random variables, the PDF is the derivative of the CDF - ---- - -## Example - -What are the survival function and CDF from the exponential density considered before? 
- -For $1 \geq x \geq 0$ -$$ -F(x) = P(X \leq x) = \frac{1}{2} Base \times Height = \frac{1}{2} (x) \times (2 x) = x^2 -$$ - -$$ -S(x) = 1 - x^2 -$$ - - -```r -pbeta(c(0.4, 0.5, 0.6), 2, 1) -``` - -``` -## [1] 0.16 0.25 0.36 -``` - - ---- - -## Quantiles - -- The $\alpha^{th}$ **quantile** of a distribution with distribution function $F$ is the point $x_\alpha$ so that -$$ -F(x_\alpha) = \alpha -$$ -- A **percentile** is simply a quantile with $\alpha$ expressed as a percent -- The **median** is the $50^{th}$ percentile - ---- -## Example -- We want to solve $0.5 = F(x) = x^2$ -- Resulting in the solution - -```r -sqrt(0.5) -``` - -``` -## [1] 0.7071 -``` - -- Therefore, about 0.7071 of calls being answered on a random day is the median. -- R can approximate quantiles for you for common distributions - - -```r -qbeta(0.5, 2, 1) -``` - -``` -## [1] 0.7071 -``` - - ---- - -## Summary - -- You might be wondering at this point "I've heard of a median before, it didn't require integration. Where's the data?" -- We're referring to are **population quantities**. Therefore, the median being - discussed is the **population median**. -- A probability model connects the data to the population using assumptions. -- Therefore the median we're discussing is the **estimand**, the sample median will be the **estimator** +--- +title : Probability +subtitle : Statistical Inference +author : Brian Caffo, Jeff Leek, Roger Peng +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../librariesNew + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- + +## Notation + +- The **sample space**, $\Omega$, is the collection of possible outcomes of an experiment + - Example: die roll $\Omega = \{1,2,3,4,5,6\}$ +- An **event**, say $E$, is a subset of $\Omega$ + - Example: die roll is even $E = \{2,4,6\}$ +- An **elementary** or **simple** event is a particular result + of an experiment + - Example: die roll is a four, $\omega = 4$ +- $\emptyset$ is called the **null event** or the **empty set** + +--- + +## Interpretation of set operations + +Normal set operations have particular interpretations in this setting + +1. $\omega \in E$ implies that $E$ occurs when $\omega$ occurs +2. $\omega \not\in E$ implies that $E$ does not occur when $\omega$ occurs +3. $E \subset F$ implies that the occurrence of $E$ implies the occurrence of $F$ +4. $E \cap F$ implies the event that both $E$ and $F$ occur +5. $E \cup F$ implies the event that at least one of $E$ or $F$ occur +6. $E \cap F=\emptyset$ means that $E$ and $F$ are **mutually exclusive**, or cannot both occur +7. $E^c$ or $\bar E$ is the event that $E$ does not occur + +--- + +## Probability + +A **probability measure**, $P$, is a function from the collection of possible events so that the following hold + +1. For an event $E\subset \Omega$, $0 \leq P(E) \leq 1$ +2. $P(\Omega) = 1$ +3. If $E_1$ and $E_2$ are mutually exclusive events + $P(E_1 \cup E_2) = P(E_1) + P(E_2)$. + +Part 3 of the definition implies **finite additivity** + +$$ +P(\cup_{i=1}^n A_i) = \sum_{i=1}^n P(A_i) +$$ +where the $\{A_i\}$ are mutually exclusive. (Note a more general version of +additivity is used in advanced classes.) 
+ + +--- + + +## Example consequences + +- $P(\emptyset) = 0$ +- $P(E) = 1 - P(E^c)$ +- $P(A \cup B) = P(A) + P(B) - P(A \cap B)$ +- if $A \subset B$ then $P(A) \leq P(B)$ +- $P\left(A \cup B\right) = 1 - P(A^c \cap B^c)$ +- $P(A \cap B^c) = P(A) - P(A \cap B)$ +- $P(\cup_{i=1}^n E_i) \leq \sum_{i=1}^n P(E_i)$ +- $P(\cup_{i=1}^n E_i) \geq \max_i P(E_i)$ + +--- + +## Example + +The National Sleep Foundation ([www.sleepfoundation.org](http://www.sleepfoundation.org/)) reports that around 3% of the American population has sleep apnea. They also report that around 10% of the North American and European population has restless leg syndrome. Does this imply that 13% of people will have at least one sleep problems of these sorts? + +--- + +## Example continued + +Answer: No, the events are not mutually exclusive. To elaborate let: + +$$ +\begin{eqnarray*} + A_1 & = & \{\mbox{Person has sleep apnea}\} \\ + A_2 & = & \{\mbox{Person has RLS}\} + \end{eqnarray*} +$$ + +Then + +$$ +\begin{eqnarray*} + P(A_1 \cup A_2 ) & = & P(A_1) + P(A_2) - P(A_1 \cap A_2) \\ + & = & 0.13 - \mbox{Probability of having both} + \end{eqnarray*} +$$ +Likely, some fraction of the population has both. + +--- + +## Random variables + +- A **random variable** is a numerical outcome of an experiment. +- The random variables that we study will come in two varieties, + **discrete** or **continuous**. +- Discrete random variable are random variables that take on only a +countable number of possibilities. + * $P(X = k)$ +- Continuous random variable can take any value on the real line or some subset of the real line. + * $P(X \in A)$ + +--- + +## Examples of variables that can be thought of as random variables + +- The $(0-1)$ outcome of the flip of a coin +- The outcome from the roll of a die +- The BMI of a subject four years after a baseline measurement +- The hypertension status of a subject randomly drawn from a population + +--- + +## PMF + +A probability mass function evaluated at a value corresponds to the +probability that a random variable takes that value. To be a valid +pmf a function, $p$, must satisfy + + 1. $p(x) \geq 0$ for all $x$ + 2. $\sum_{x} p(x) = 1$ + +The sum is taken over all of the possible values for $x$. + +--- + +## Example + +Let $X$ be the result of a coin flip where $X=0$ represents +tails and $X = 1$ represents heads. +$$ +p(x) = (1/2)^{x} (1/2)^{1-x} ~~\mbox{ for }~~x = 0,1 +$$ +Suppose that we do not know whether or not the coin is fair; Let +$\theta$ be the probability of a head expressed as a proportion +(between 0 and 1). +$$ +p(x) = \theta^{x} (1 - \theta)^{1-x} ~~\mbox{ for }~~x = 0,1 +$$ + +--- + +## PDF + +A probability density function (pdf), is a function associated with +a continuous random variable + + *Areas under pdfs correspond to probabilities for that random variable* + +To be a valid pdf, a function $f$ must satisfy + +1. $f(x) \geq 0$ for all $x$ + +2. The area under $f(x)$ is one. + +--- +## Example + +Suppose that the proportion of help calls that get addressed in +a random day by a help line is given by +$$ +f(x) = \left\{\begin{array}{ll} + 2 x & \mbox{ for } 1 > x > 0 \\ + 0 & \mbox{ otherwise} +\end{array} \right. +$$ + +Is this a mathematically valid density? + +--- + + +```r +x <- c(-0.5, 0, 1, 1, 1.5) +y <- c(0, 0, 2, 0, 0) +plot(x, y, lwd = 3, frame = FALSE, type = "l") +``` + +![plot of chunk unnamed-chunk-1](assets/fig/unnamed-chunk-1.png) + + +--- + +## Example continued + +What is the probability that 75% or fewer of calls get addressed? 
+ +![plot of chunk unnamed-chunk-2](assets/fig/unnamed-chunk-2.png) + + +--- + +```r +1.5 * 0.75/2 +``` + +``` +## [1] 0.5625 +``` + +```r +pbeta(0.75, 2, 1) +``` + +``` +## [1] 0.5625 +``` + +--- + +## CDF and survival function + +- The **cumulative distribution function** (CDF) of a random variable $X$ is defined as the function +$$ +F(x) = P(X \leq x) +$$ +- This definition applies regardless of whether $X$ is discrete or continuous. +- The **survival function** of a random variable $X$ is defined as +$$ +S(x) = P(X > x) +$$ +- Notice that $S(x) = 1 - F(x)$ +- For continuous random variables, the PDF is the derivative of the CDF + +--- + +## Example + +What are the survival function and CDF from the density considered before? + +For $1 \geq x \geq 0$ +$$ +F(x) = P(X \leq x) = \frac{1}{2} Base \times Height = \frac{1}{2} (x) \times (2 x) = x^2 +$$ + +$$ +S(x) = 1 - x^2 +$$ + + +```r +pbeta(c(0.4, 0.5, 0.6), 2, 1) +``` + +``` +## [1] 0.16 0.25 0.36 +``` + + +--- + +## Quantiles + +- The $\alpha^{th}$ **quantile** of a distribution with distribution function $F$ is the point $x_\alpha$ so that +$$ +F(x_\alpha) = \alpha +$$ +- A **percentile** is simply a quantile with $\alpha$ expressed as a percent +- The **median** is the $50^{th}$ percentile + +--- +## Example +- We want to solve $0.5 = F(x) = x^2$ +- Resulting in the solution + +```r +sqrt(0.5) +``` + +``` +## [1] 0.7071 +``` + +- Therefore, about 0.7071 of calls being answered on a random day is the median. +- R can approximate quantiles for you for common distributions + + +```r +qbeta(0.5, 2, 1) +``` + +``` +## [1] 0.7071 +``` + + +--- + +## Summary + +- You might be wondering at this point "I've heard of a median before, it didn't require integration. Where's the data?" +- We're referring to are **population quantities**. Therefore, the median being + discussed is the **population median**. +- A probability model connects the data to the population using assumptions. +- Therefore the median we're discussing is the **estimand**, the sample median will be the **estimator** + diff --git a/06_StatisticalInference/01_03_Expectations/index.Rmd b/06_StatisticalInference/01_03_Expectations/index.Rmd index fbb49e872..a2e25e813 100644 --- a/06_StatisticalInference/01_03_Expectations/index.Rmd +++ b/06_StatisticalInference/01_03_Expectations/index.Rmd @@ -1,237 +1,238 @@ ---- -title : Expected values -subtitle : Statistical Inference -author : Brian Caffo, Jeff Leek, Roger Peng -job : Johns Hopkins Bloomberg School of Public Health -logo : bloomberg_shield.png -framework : io2012 # {io2012, html5slides, shower, dzslides, ...} -highlighter : highlight.js # {highlight.js, prettify, highlight} -hitheme : tomorrow # -url: - lib: ../../libraries - assets: ../../assets -widgets : [mathjax] # {mathjax, quiz, bootstrap} -mode : selfcontained # {standalone, draft} ---- -## Expected values - -- The **expected value** or **mean** of a random variable is the center of its distribution -- For discrete random variable $X$ with PMF $p(x)$, it is defined as follows - $$ - E[X] = \sum_x xp(x). 
- $$ - where the sum is taken over the possible values of $x$ -- $E[X]$ represents the center of mass of a collection of locations and weights, $\{x, p(x)\}$ - ---- - -## Example -### Find the center of mass of the bars -```{r ,fig.height=3.5,fig.width=8, fig.align='center'} -library(UsingR); data(galton) -par(mfrow=c(1,2)) -hist(galton$child,col="blue",breaks=100) -hist(galton$parent,col="blue",breaks=100) -``` - ---- -## Using manipulate -``` -library(manipulate) -myHist <- function(mu){ - hist(galton$child,col="blue",breaks=100) - lines(c(mu, mu), c(0, 150),col="red",lwd=5) - mse <- mean((galton$child - mu)^2) - text(63, 150, paste("mu = ", mu)) - text(63, 140, paste("Imbalance = ", round(mse, 2))) -} -manipulate(myHist(mu), mu = slider(62, 74, step = 0.5)) -``` - ---- -## The center of mass is the empirical mean -```{r lsm, dependson="galton",fig.height=4,fig.width=4, fig.align='center'} - hist(galton$child,col="blue",breaks=100) - meanChild <- mean(galton$child) - lines(rep(meanChild,100),seq(0,150,length=100),col="red",lwd=5) -``` - ---- -## Example - -- Suppose a coin is flipped and $X$ is declared $0$ or $1$ corresponding to a head or a tail, respectively -- What is the expected value of $X$? - $$ - E[X] = .5 \times 0 + .5 \times 1 = .5 - $$ -- Note, if thought about geometrically, this answer is obvious; if two equal weights are spaced at 0 and 1, the center of mass will be $.5$ - -```{r, echo = FALSE, fig.height=3.5, fig.width = 3.5, fig.align='center'} -barplot(height = c(.5, .5), names = c(0, 1), border = "black", col = "lightblue", space = .75) -``` ---- - -## Example - -- Suppose that a die is rolled and $X$ is the number face up -- What is the expected value of $X$? - $$ - E[X] = 1 \times \frac{1}{6} + 2 \times \frac{1}{6} + - 3 \times \frac{1}{6} + 4 \times \frac{1}{6} + - 5 \times \frac{1}{6} + 6 \times \frac{1}{6} = 3.5 - $$ -- Again, the geometric argument makes this answer obvious without calculation. - ---- - -## Continuous random variables - -- For a continuous random variable, $X$, with density, $f$, the expected - value is defined as follows - $$ - E[X] = \mbox{the area under the function}~~~ t f(t) - $$ -- This definition borrows from the definition of center of mass for a continuous body - ---- - -## Example - -- Consider a density where $f(x) = 1$ for $x$ between zero and one -- (Is this a valid density?) -- Suppose that $X$ follows this density; what is its expected value? -```{r, fig.height=4, fig.width=8, echo=FALSE} -par(mfrow = c(1, 2)) -plot(c(-0.25, 0, 0, 1, 1, 1.25), c(0, 0, 1, 1, 0, 0), type = "l", lwd = 3, frame = FALSE, xlab="", ylab = ""); title('f(t)') -plot(c(-0.25, 0, 1, 1, 1.25), c(0, 0, 1, 0, 0), type = "l", lwd = 3, frame = FALSE, xlab="", ylab = ""); title('t f(t)') -``` - ---- - -## Rules about expected values - -- The expected value is a linear operator -- If $a$ and $b$ are not random and $X$ and $Y$ are two random variables then - - $E[aX + b] = a E[X] + b$ - - $E[X + Y] = E[X] + E[Y]$ - ---- - -## Example - -- You flip a coin, $X$ and simulate a uniform random number $Y$, what is the expected value of their sum? - $$ - E[X + Y] = E[X] + E[Y] = .5 + .5 = 1 - $$ -- Another example, you roll a die twice. What is the expected value of the average? -- Let $X_1$ and $X_2$ be the results of the two rolls - $$ - E[(X_1 + X_2) / 2] = \frac{1}{2}(E[X_1] + E[X_2]) - = \frac{1}{2}(3.5 + 3.5) = 3.5 - $$ - ---- - -## Example - -1. Let $X_i$ for $i=1,\ldots,n$ be a collection of random variables, each from a distribution with mean $\mu$ -2. 
Calculate the expected value of the sample average of the $X_i$ -$$ - \begin{eqnarray*} - E\left[ \frac{1}{n}\sum_{i=1}^n X_i\right] - & = & \frac{1}{n} E\left[\sum_{i=1}^n X_i\right] \\ - & = & \frac{1}{n} \sum_{i=1}^n E\left[X_i\right] \\ - & = & \frac{1}{n} \sum_{i=1}^n \mu = \mu. - \end{eqnarray*} -$$ - ---- - -## Remark - -- Therefore, the expected value of the **sample mean** is the population mean that it's trying to estimate -- When the expected value of an estimator is what its trying to estimate, we say that the estimator is **unbiased** - ---- - -## The variance - -- The variance of a random variable is a measure of {\em spread} -- If $X$ is a random variable with mean $\mu$, the variance of $X$ is defined as - -$$ -Var(X) = E[(X - \mu)^2] -$$ - -the expected (squared) distance from the mean -- Densities with a higher variance are more spread out than densities with a lower variance - ---- - -- Convenient computational form -$$ -Var(X) = E[X^2] - E[X]^2 -$$ -- If $a$ is constant then $Var(aX) = a^2 Var(X)$ -- The square root of the variance is called the **standard deviation** -- The standard deviation has the same units as $X$ - ---- - -## Example - -- What's the sample variance from the result of a toss of a die? - - - $E[X] = 3.5$ - - $E[X^2] = 1 ^ 2 \times \frac{1}{6} + 2 ^ 2 \times \frac{1}{6} + 3 ^ 2 \times \frac{1}{6} + 4 ^ 2 \times \frac{1}{6} + 5 ^ 2 \times \frac{1}{6} + 6 ^ 2 \times \frac{1}{6} = 15.17$ - -- $Var(X) = E[X^2] - E[X]^2 \approx 2.92$ - ---- - -## Example - -- What's the sample variance from the result of the toss of a coin with probability of heads (1) of $p$? - - - $E[X] = 0 \times (1 - p) + 1 \times p = p$ - - $E[X^2] = E[X] = p$ - -- $Var(X) = E[X^2] - E[X]^2 = p - p^2 = p(1 - p)$ - ---- - -## Interpreting variances - -- Chebyshev's inequality is useful for interpreting variances -- This inequality states that -$$ -P(|X - \mu| \geq k\sigma) \leq \frac{1}{k^2} -$$ -- For example, the probability that a random variable lies beyond $k$ standard deviations from its mean is less than $1/k^2$ -$$ -\begin{eqnarray*} - 2\sigma & \rightarrow & 25\% \\ - 3\sigma & \rightarrow & 11\% \\ - 4\sigma & \rightarrow & 6\% -\end{eqnarray*} -$$ -- Note this is only a bound; the actual probability might be quite a bit smaller - ---- - -## Example - -- IQs are often said to be distributed with a mean of $100$ and a sd of $15$ -- What is the probability of a randomly drawn person having an IQ higher than $160$ or below $40$? 
-- Thus we want to know the probability of a person being more than $4$ standard deviations from the mean -- Thus Chebyshev's inequality suggests that this will be no larger than 6\% -- IQs distributions are often cited as being bell shaped, in which case this bound is very conservative -- The probability of a random draw from a bell curve being $4$ standard deviations from the mean is on the order of $10^{-5}$ (one thousandth of one percent) - ---- - -## Example - -- A former buzz phrase in industrial quality control is Motorola's "Six Sigma" whereby businesses are suggested to control extreme events or rare defective parts -- Chebyshev's inequality states that the probability of a "Six Sigma" event is less than $1/6^2 \approx 3\%$ -- If a bell curve is assumed, the probability of a "six sigma" event is on the order of $10^{-9}$ (one ten millionth of a percent) \ No newline at end of file +--- +title : Expected values +subtitle : Statistical Inference +author : Brian Caffo, Jeff Leek, Roger Peng +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../librariesNew + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- +## Expected values + +- The **expected value** or **mean** of a random variable is the center of its distribution +- For discrete random variable $X$ with PMF $p(x)$, it is defined as follows + $$ + E[X] = \sum_x xp(x). + $$ + where the sum is taken over the possible values of $x$ +- $E[X]$ represents the center of mass of a collection of locations and weights, $\{x, p(x)\}$ + +--- + +## Example +### Find the center of mass of the bars +```{r ,fig.height=3.5,fig.width=8, fig.align='center', echo = FALSE} +library(UsingR); data(galton) +par(mfrow=c(1,2)) +hist(galton$child,col="blue",breaks=100) +hist(galton$parent,col="blue",breaks=100) +``` + +--- +## Using manipulate +``` +library(manipulate) +myHist <- function(mu){ + hist(galton$child,col="blue",breaks=100) + lines(c(mu, mu), c(0, 150),col="red",lwd=5) + mse <- mean((galton$child - mu)^2) + text(63, 150, paste("mu = ", mu)) + text(63, 140, paste("Imbalance = ", round(mse, 2))) +} +manipulate(myHist(mu), mu = slider(62, 74, step = 0.5)) +``` + +--- +## The center of mass is the empirical mean +```{r lsm, dependson="galton",fig.height=4,fig.width=4, fig.align='center'} + hist(galton$child,col="blue",breaks=100) + meanChild <- mean(galton$child) + lines(rep(meanChild,100),seq(0,150,length=100),col="red",lwd=5) +``` + +--- +## Example + +- Suppose a coin is flipped and $X$ is declared $0$ or $1$ corresponding to a head or a tail, respectively +- What is the expected value of $X$? + $$ + E[X] = .5 \times 0 + .5 \times 1 = .5 + $$ +- Note, if thought about geometrically, this answer is obvious; if two equal weights are spaced at 0 and 1, the center of mass will be $.5$ + +```{r, echo = FALSE, fig.height=3.5, fig.width = 3.5, fig.align='center'} +barplot(height = c(.5, .5), names = c(0, 1), border = "black", col = "lightblue", space = .75) +``` +--- + +## Example + +- Suppose that a die is rolled and $X$ is the number face up +- What is the expected value of $X$? 
+ $$ + E[X] = 1 \times \frac{1}{6} + 2 \times \frac{1}{6} + + 3 \times \frac{1}{6} + 4 \times \frac{1}{6} + + 5 \times \frac{1}{6} + 6 \times \frac{1}{6} = 3.5 + $$ +- Again, the geometric argument makes this answer obvious without calculation. + +--- + +## Continuous random variables + +- For a continuous random variable, $X$, with density, $f$, the expected + value is defined as follows + $$ + E[X] = \mbox{the area under the function}~~~ t f(t) + $$ +- This definition borrows from the definition of center of mass for a continuous body + +--- + +## Example + +- Consider a density where $f(x) = 1$ for $x$ between zero and one +- (Is this a valid density?) +- Suppose that $X$ follows this density; what is its expected value? +```{r, fig.height=4, fig.width=8, echo=FALSE} +par(mfrow = c(1, 2)) +plot(c(-0.25, 0, 0, 1, 1, 1.25), c(0, 0, 1, 1, 0, 0), type = "l", lwd = 3, frame = FALSE, xlab="", ylab = ""); title('f(t)') +plot(c(-0.25, 0, 1, 1, 1.25), c(0, 0, 1, 0, 0), type = "l", lwd = 3, frame = FALSE, xlab="", ylab = ""); title('t f(t)') +``` + +--- + +## Rules about expected values + +- The expected value is a linear operator +- If $a$ and $b$ are not random and $X$ and $Y$ are two random variables then + - $E[aX + b] = a E[X] + b$ + - $E[X + Y] = E[X] + E[Y]$ + +--- + +## Example + +- You flip a coin, $X$ and simulate a uniform random number $Y$, what is the expected value of their sum? + $$ + E[X + Y] = E[X] + E[Y] = .5 + .5 = 1 + $$ +- Another example, you roll a die twice. What is the expected value of the average? +- Let $X_1$ and $X_2$ be the results of the two rolls + $$ + E[(X_1 + X_2) / 2] = \frac{1}{2}(E[X_1] + E[X_2]) + = \frac{1}{2}(3.5 + 3.5) = 3.5 + $$ + +--- + +## Example + +1. Let $X_i$ for $i=1,\ldots,n$ be a collection of random variables, each from a distribution with mean $\mu$ +2. Calculate the expected value of the sample average of the $X_i$ +$$ + \begin{eqnarray*} + E\left[ \frac{1}{n}\sum_{i=1}^n X_i\right] + & = & \frac{1}{n} E\left[\sum_{i=1}^n X_i\right] \\ + & = & \frac{1}{n} \sum_{i=1}^n E\left[X_i\right] \\ + & = & \frac{1}{n} \sum_{i=1}^n \mu = \mu. + \end{eqnarray*} +$$ + +--- + +## Remark + +- Therefore, the expected value of the **sample mean** is the population mean that it's trying to estimate +- When the expected value of an estimator is what its trying to estimate, we say that the estimator is **unbiased** + +--- + +## The variance + +- The variance of a random variable is a measure of *spread* +- If $X$ is a random variable with mean $\mu$, the variance of $X$ is defined as + +$$ +Var(X) = E[(X - \mu)^2] +$$ + +the expected (squared) distance from the mean +- Densities with a higher variance are more spread out than densities with a lower variance + +--- + +- Convenient computational form +$$ +Var(X) = E[X^2] - E[X]^2 +$$ +- If $a$ is constant then $Var(aX) = a^2 Var(X)$ +- The square root of the variance is called the **standard deviation** +- The standard deviation has the same units as $X$ + +--- + +## Example + +- What's the sample variance from the result of a toss of a die? + + - $E[X] = 3.5$ + - $E[X^2] = 1 ^ 2 \times \frac{1}{6} + 2 ^ 2 \times \frac{1}{6} + 3 ^ 2 \times \frac{1}{6} + 4 ^ 2 \times \frac{1}{6} + 5 ^ 2 \times \frac{1}{6} + 6 ^ 2 \times \frac{1}{6} = 15.17$ + +- $Var(X) = E[X^2] - E[X]^2 \approx 2.92$ + +--- + +## Example + +- What's the sample variance from the result of the toss of a coin with probability of heads (1) of $p$? 
+ + - $E[X] = 0 \times (1 - p) + 1 \times p = p$ + - $E[X^2] = E[X] = p$ + +- $Var(X) = E[X^2] - E[X]^2 = p - p^2 = p(1 - p)$ + +--- + +## Interpreting variances + +- Chebyshev's inequality is useful for interpreting variances +- This inequality states that +$$ +P(|X - \mu| \geq k\sigma) \leq \frac{1}{k^2} +$$ +- For example, the probability that a random variable lies beyond $k$ standard deviations from its mean is less than $1/k^2$ +$$ +\begin{eqnarray*} + 2\sigma & \rightarrow & 25\% \\ + 3\sigma & \rightarrow & 11\% \\ + 4\sigma & \rightarrow & 6\% +\end{eqnarray*} +$$ +- Note this is only a bound; the actual probability might be quite a bit smaller + +--- + +## Example + +- IQs are often said to be distributed with a mean of $100$ and a sd of $15$ +- What is the probability of a randomly drawn person having an IQ higher than $160$ or below $40$? +- Thus we want to know the probability of a person being more than $4$ standard deviations from the mean +- Thus Chebyshev's inequality suggests that this will be no larger than 6\% +- IQs distributions are often cited as being bell shaped, in which case this bound is very conservative +- The probability of a random draw from a bell curve being $4$ standard deviations from the mean is on the order of $10^{-5}$ (one thousandth of one percent) + +--- + +## Example + +- A former buzz phrase in industrial quality control is Motorola's "Six Sigma" whereby businesses are suggested to control extreme events or rare defective parts +- Chebyshev's inequality states that the probability of a "Six Sigma" event is less than $1/6^2 \approx 3\%$ +- If a bell curve is assumed, the probability of a "six sigma" event is on the order of $10^{-9}$ (one ten millionth of a percent) + diff --git a/06_StatisticalInference/01_03_Expectations/index.html b/06_StatisticalInference/01_03_Expectations/index.html index 82f05d557..04a508ac3 100644 --- a/06_StatisticalInference/01_03_Expectations/index.html +++ b/06_StatisticalInference/01_03_Expectations/index.html @@ -1,449 +1,549 @@ - - - - Expected values - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-

Expected values

-

Statistical Inference

-

Brian Caffo, Jeff Leek, Roger Peng
Johns Hopkins Bloomberg School of Public Health

-
-
- - - -
-

Expected values

-
-
-
    -
  • The expected value or mean of a random variable is the center of its distribution
  • -
  • For discrete random variable \(X\) with PMF \(p(x)\), it is defined as follows -\[ -E[X] = \sum_x xp(x). -\] -where the sum is taken over the possible values of \(x\)
  • -
  • \(E[X]\) represents the center of mass of a collection of locations and weights, \(\{x, p(x)\}\)
  • -
- -
- -
- - -
-

Example

-
-
-

Find the center of mass of the bars

- -
library(UsingR)
-data(galton)
-par(mfrow = c(1, 2))
-hist(galton$child, col = "blue", breaks = 100)
-hist(galton$parent, col = "blue", breaks = 100)
-
- -

plot of chunk unnamed-chunk-1

- -
- -
- - -
-

Using manipulate

-
-
-
library(manipulate)
-myHist <- function(mu){
-  hist(galton$child,col="blue",breaks=100)
-  lines(c(mu, mu), c(0, 150),col="red",lwd=5)
-  mse <- mean((galton$child - mu)^2)
-  text(63, 150, paste("mu = ", mu))
-  text(63, 140, paste("Imbalance = ", round(mse, 2)))
-}
-manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))
-
- -
- -
- - -
-

The center of mass is the empirical mean

-
-
-
hist(galton$child, col = "blue", breaks = 100)
-meanChild <- mean(galton$child)
-lines(rep(meanChild, 100), seq(0, 150, length = 100), col = "red", lwd = 5)
-
- -

plot of chunk lsm

- -
- -
- - -
-

Example

-
-
-
    -
  • Suppose a coin is flipped and \(X\) is declared \(0\) or \(1\) corresponding to a head or a tail, respectively
  • -
  • What is the expected value of \(X\)? -\[ -E[X] = .5 \times 0 + .5 \times 1 = .5 -\]
  • -
  • Note, if thought about geometrically, this answer is obvious; if two equal weights are spaced at 0 and 1, the center of mass will be \(.5\)
  • -
- -

plot of chunk unnamed-chunk-2

- -
- -
- - -
-

Example

-
-
-
    -
  • Suppose that a die is rolled and \(X\) is the number face up
  • -
  • What is the expected value of \(X\)? -\[ -E[X] = 1 \times \frac{1}{6} + 2 \times \frac{1}{6} + -3 \times \frac{1}{6} + 4 \times \frac{1}{6} + -5 \times \frac{1}{6} + 6 \times \frac{1}{6} = 3.5 -\]
  • -
  • Again, the geometric argument makes this answer obvious without calculation.
  • -
- -
- -
- - -
-

Continuous random variables

-
-
-
    -
  • For a continuous random variable, \(X\), with density, \(f\), the expected -value is defined as follows -\[ -E[X] = \mbox{the area under the function}~~~ t f(t) -\]
  • -
  • This definition borrows from the definition of center of mass for a continuous body
  • -
- -
- -
- - -
-

Example

-
-
-
    -
  • Consider a density where \(f(x) = 1\) for \(x\) between zero and one
  • -
  • (Is this a valid density?)
  • -
  • Suppose that \(X\) follows this density; what is its expected value?
    -plot of chunk unnamed-chunk-3
  • -
- -
- -
- - -
-

Rules about expected values

-
-
-
    -
  • The expected value is a linear operator
  • -
  • If \(a\) and \(b\) are not random and \(X\) and \(Y\) are two random variables then - -
      -
    • \(E[aX + b] = a E[X] + b\)
    • -
    • \(E[X + Y] = E[X] + E[Y]\)
    • -
  • -
- -
- -
- - -
-

Example

-
-
-
    -
  • You flip a coin, \(X\) and simulate a uniform random number \(Y\), what is the expected value of their sum? -\[ -E[X + Y] = E[X] + E[Y] = .5 + .5 = 1 -\]
  • -
  • Another example, you roll a die twice. What is the expected value of the average?
  • -
  • Let \(X_1\) and \(X_2\) be the results of the two rolls -\[ -E[(X_1 + X_2) / 2] = \frac{1}{2}(E[X_1] + E[X_2]) -= \frac{1}{2}(3.5 + 3.5) = 3.5 -\]
  • -
- -
- -
- - -
-

Example

-
-
-
    -
  1. Let \(X_i\) for \(i=1,\ldots,n\) be a collection of random variables, each from a distribution with mean \(\mu\)
  2. -
  3. Calculate the expected value of the sample average of the \(X_i\) -\[ -\begin{eqnarray*} -E\left[ \frac{1}{n}\sum_{i=1}^n X_i\right] -& = & \frac{1}{n} E\left[\sum_{i=1}^n X_i\right] \\ -& = & \frac{1}{n} \sum_{i=1}^n E\left[X_i\right] \\ -& = & \frac{1}{n} \sum_{i=1}^n \mu = \mu. -\end{eqnarray*} -\]
  4. -
- -
- -
- - -
-

Remark

-
-
-
    -
  • Therefore, the expected value of the sample mean is the population mean that it's trying to estimate
  • -
  • When the expected value of an estimator is what it's trying to estimate, we say that the estimator is unbiased
  • -
- -
- -
- - -
-

The variance

-
-
-
    -
  • The variance of a random variable is a measure of spread
  • -
  • If \(X\) is a random variable with mean \(\mu\), the variance of \(X\) is defined as
  • -
- -

\[ -Var(X) = E[(X - \mu)^2] -\]

- -

the expected (squared) distance from the mean

- -
    -
  • Densities with a higher variance are more spread out than densities with a lower variance
  • -
- -
- -
- - -
- -
-
-
    -
  • Convenient computational form -\[ -Var(X) = E[X^2] - E[X]^2 -\]
  • -
  • If \(a\) is constant then \(Var(aX) = a^2 Var(X)\)
  • -
  • The square root of the variance is called the standard deviation
  • -
  • The standard deviation has the same units as \(X\)
  • -
- -
- -
- - -
-

Example

-
-
-
    -
  • What's the sample variance from the result of a toss of a die?

    - -
      -
    • \(E[X] = 3.5\)
    • -
    • \(E[X^2] = 1 ^ 2 \times \frac{1}{6} + 2 ^ 2 \times \frac{1}{6} + 3 ^ 2 \times \frac{1}{6} + 4 ^ 2 \times \frac{1}{6} + 5 ^ 2 \times \frac{1}{6} + 6 ^ 2 \times \frac{1}{6} = 15.17\)
    • -
  • -
  • \(Var(X) = E[X^2] - E[X]^2 \approx 2.92\)

  • -
- -
- -
- - -
-

Example

-
-
-
    -
  • What's the sample variance from the result of the toss of a coin with probability of heads (1) of \(p\)?

    - -
      -
    • \(E[X] = 0 \times (1 - p) + 1 \times p = p\)
    • -
    • \(E[X^2] = E[X] = p\)
    • -
  • -
  • \(Var(X) = E[X^2] - E[X]^2 = p - p^2 = p(1 - p)\)

  • -
- -
- -
- - -
-

Interpreting variances

-
-
-
    -
  • Chebyshev's inequality is useful for interpreting variances
  • -
  • This inequality states that -\[ -P(|X - \mu| \geq k\sigma) \leq \frac{1}{k^2} -\]
  • -
  • For example, the probability that a random variable lies beyond \(k\) standard deviations from its mean is less than \(1/k^2\) -\[ -\begin{eqnarray*} -2\sigma & \rightarrow & 25\% \\ -3\sigma & \rightarrow & 11\% \\ -4\sigma & \rightarrow & 6\% -\end{eqnarray*} -\]
  • -
  • Note this is only a bound; the actual probability might be quite a bit smaller
  • -
- -
- -
- - -
-

Example

-
-
-
    -
  • IQs are often said to be distributed with a mean of \(100\) and a sd of \(15\)
  • -
  • What is the probability of a randomly drawn person having an IQ higher than \(160\) or below \(40\)?
  • -
  • Thus we want to know the probability of a person being more than \(4\) standard deviations from the mean
  • -
  • Thus Chebyshev's inequality suggests that this will be no larger than 6\%
  • -
  • IQs distributions are often cited as being bell shaped, in which case this bound is very conservative
  • -
  • The probability of a random draw from a bell curve being \(4\) standard deviations from the mean is on the order of \(10^{-5}\) (one thousandth of one percent)
  • -
- -
- -
- - -
-

Example

-
-
-
    -
  • A former buzz phrase in industrial quality control is Motorola's "Six Sigma" whereby businesses are suggested to control extreme events or rare defective parts
  • -
  • Chebyshev's inequality states that the probability of a "Six Sigma" event is less than \(1/6^2 \approx 3\%\)
  • -
  • If a bell curve is assumed, the probability of a "six sigma" event is on the order of \(10^{-9}\) (one ten millionth of a percent)
  • -
- -
- -
- - -
- - - - - - - - - - - - - - - - - \ No newline at end of file + + + + Expected values + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Expected values

+

Statistical Inference

+

Brian Caffo, Jeff Leek, Roger Peng
Johns Hopkins Bloomberg School of Public Health

+
+
+
+ + + + +
+

Expected values

  • The expected value or mean of a random variable is the center of its distribution
  • For a discrete random variable \(X\) with PMF \(p(x)\), it is defined as follows
\[
E[X] = \sum_x xp(x).
\]
where the sum is taken over the possible values of \(x\)
  • \(E[X]\) represents the center of mass of a collection of locations and weights, \(\{x, p(x)\}\)

Example

+
+
+

Find the center of mass of the bars

+ +

plot of chunk unnamed-chunk-1

+ +
+ +
+ + +
+

Using manipulate

+
+
+
```
library(manipulate)
myHist <- function(mu){
  hist(galton$child, col = "blue", breaks = 100)
  lines(c(mu, mu), c(0, 150), col = "red", lwd = 5)
  mse <- mean((galton$child - mu)^2)
  text(63, 150, paste("mu = ", mu))
  text(63, 140, paste("Imbalance = ", round(mse, 2)))
}
manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))
```
+ +
+ +
+ + +
+

The center of mass is the empirical mean

+
+
+
```r
hist(galton$child, col = "blue", breaks = 100)
meanChild <- mean(galton$child)
lines(rep(meanChild, 100), seq(0, 150, length = 100), col = "red", lwd = 5)
```

plot of chunk lsm

+ +
+ +
+ + +
+

Example

+
+
+
  • Suppose a coin is flipped and \(X\) is declared \(0\) or \(1\) corresponding to a head or a tail, respectively
  • What is the expected value of \(X\)?
\[
E[X] = .5 \times 0 + .5 \times 1 = .5
\]
  • Note, if thought about geometrically, this answer is obvious; if two equal weights are spaced at 0 and 1, the center of mass will be \(.5\)

plot of chunk unnamed-chunk-2

+ +
+ +
+ + +
+

Example

+
+
+
  • Suppose that a die is rolled and \(X\) is the number face up
  • What is the expected value of \(X\)?
\[
E[X] = 1 \times \frac{1}{6} + 2 \times \frac{1}{6} +
3 \times \frac{1}{6} + 4 \times \frac{1}{6} +
5 \times \frac{1}{6} + 6 \times \frac{1}{6} = 3.5
\]
  • Again, the geometric argument makes this answer obvious without calculation.
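The same arithmetic in R (a one-line check, not part of the original slides):

```r
sum((1:6) * (1/6))  # 3.5
```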

Continuous random variables

+
+
+
  • For a continuous random variable, \(X\), with density, \(f\), the expected value is defined as follows
\[
E[X] = \mbox{the area under the function}~~~ t f(t)
\]
  • This definition borrows from the definition of center of mass for a continuous body

Example

+
+
+
  • Consider a density where \(f(x) = 1\) for \(x\) between zero and one
  • (Is this a valid density?)
  • Suppose that \(X\) follows this density; what is its expected value?
    plot of chunk unnamed-chunk-3
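Both questions can be answered numerically (a small sketch using `integrate`, not part of the original slides): the density integrates to one, and \(E[X]\) is the area under \(t\,f(t)\):

```r
f <- function(t) ifelse(t > 0 & t < 1, 1, 0)   # uniform(0, 1) density from the slide
integrate(f, 0, 1)$value                       # 1, so it is a valid density
integrate(function(t) t * f(t), 0, 1)$value    # 0.5, the expected value
```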

Rules about expected values

+
+
+
  • The expected value is a linear operator
  • If \(a\) and \(b\) are not random and \(X\) and \(Y\) are two random variables then
    • \(E[aX + b] = a E[X] + b\)
    • \(E[X + Y] = E[X] + E[Y]\)

Example

+
+
+
  • You flip a coin, \(X\), and simulate a uniform random number, \(Y\); what is the expected value of their sum?
\[
E[X + Y] = E[X] + E[Y] = .5 + .5 = 1
\]
  • Another example: you roll a die twice. What is the expected value of the average?
  • Let \(X_1\) and \(X_2\) be the results of the two rolls
\[
E[(X_1 + X_2) / 2] = \frac{1}{2}(E[X_1] + E[X_2]) = \frac{1}{2}(3.5 + 3.5) = 3.5
\]

Example

+
+
+
  1. Let \(X_i\) for \(i=1,\ldots,n\) be a collection of random variables, each from a distribution with mean \(\mu\)
  2. Calculate the expected value of the sample average of the \(X_i\)
\[
\begin{eqnarray*}
E\left[ \frac{1}{n}\sum_{i=1}^n X_i\right]
& = & \frac{1}{n} E\left[\sum_{i=1}^n X_i\right] \\
& = & \frac{1}{n} \sum_{i=1}^n E\left[X_i\right] \\
& = & \frac{1}{n} \sum_{i=1}^n \mu = \mu.
\end{eqnarray*}
\]

Remark

+
+
+
  • Therefore, the expected value of the sample mean is the population mean that it's trying to estimate
  • When the expected value of an estimator is what it's trying to estimate, we say that the estimator is unbiased
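A quick simulation of this fact (a sketch, not part of the original slides): averaging many sample means of die rolls recovers the population mean 3.5:

```r
set.seed(1)
sampleMeans <- replicate(10000, mean(sample(1:6, 10, replace = TRUE)))
mean(sampleMeans)  # close to 3.5, illustrating that the sample mean is unbiased
```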

The variance

+
+
+
  • The variance of a random variable is a measure of spread
  • If \(X\) is a random variable with mean \(\mu\), the variance of \(X\) is defined as
\[
Var(X) = E[(X - \mu)^2]
\]
the expected (squared) distance from the mean
  • Densities with a higher variance are more spread out than densities with a lower variance

  • Convenient computational form
\[
Var(X) = E[X^2] - E[X]^2
\]
  • If \(a\) is constant then \(Var(aX) = a^2 Var(X)\)
  • The square root of the variance is called the standard deviation
  • The standard deviation has the same units as \(X\)

Example

+
+
+
  • What's the variance of the result of a toss of a die?
    • \(E[X] = 3.5\)
    • \(E[X^2] = 1 ^ 2 \times \frac{1}{6} + 2 ^ 2 \times \frac{1}{6} + 3 ^ 2 \times \frac{1}{6} + 4 ^ 2 \times \frac{1}{6} + 5 ^ 2 \times \frac{1}{6} + 6 ^ 2 \times \frac{1}{6} = 15.17\)
  • \(Var(X) = E[X^2] - E[X]^2 \approx 2.92\)
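The same numbers can be checked in R (a small sketch, not part of the original slides):

```r
x <- 1:6
p <- rep(1/6, 6)
EX  <- sum(x * p)     # 3.5
EX2 <- sum(x^2 * p)   # 15.1667
EX2 - EX^2            # 2.9167
```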

Example

+
+
+
  • What's the variance of the result of the toss of a coin with probability of heads (1) of \(p\)?
    • \(E[X] = 0 \times (1 - p) + 1 \times p = p\)
    • \(E[X^2] = E[X] = p\)
  • \(Var(X) = E[X^2] - E[X]^2 = p - p^2 = p(1 - p)\)
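As a visual aside (not in the original slides), the variance \(p(1 - p)\) is largest for a fair coin and shrinks toward zero as the coin becomes deterministic:

```r
p <- seq(0, 1, by = 0.01)
plot(p, p * (1 - p), type = "l", lwd = 3, frame = FALSE,
     ylab = "Var(X) = p(1 - p)")  # maximized at p = 0.5, where it equals 0.25
```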

Interpreting variances

+
+
+
  • Chebyshev's inequality is useful for interpreting variances
  • This inequality states that
\[
P(|X - \mu| \geq k\sigma) \leq \frac{1}{k^2}
\]
  • For example, the probability that a random variable lies beyond \(k\) standard deviations from its mean is less than \(1/k^2\)
\[
\begin{eqnarray*}
2\sigma & \rightarrow & 25\% \\
3\sigma & \rightarrow & 11\% \\
4\sigma & \rightarrow & 6\%
\end{eqnarray*}
\]
  • Note this is only a bound; the actual probability might be quite a bit smaller
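A quick comparison (a sketch, not part of the original slides) of the Chebyshev bound with the exact tail probability for a normal random variable shows how loose the bound can be:

```r
k <- 2:4
cbind(k, chebyshev = 1 / k^2, normal = 2 * pnorm(-k))
# the normal tail probabilities are far smaller than the 25%, 11%, 6% bounds
```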

Example

+
+
+
  • IQs are often said to be distributed with a mean of \(100\) and a sd of \(15\)
  • What is the probability of a randomly drawn person having an IQ higher than \(160\) or below \(40\)?
  • Thus we want to know the probability of a person being more than \(4\) standard deviations from the mean
  • Chebyshev's inequality suggests that this will be no larger than 6\%
  • IQ distributions are often cited as being bell shaped, in which case this bound is very conservative
  • The probability of a random draw from a bell curve being \(4\) standard deviations from the mean is on the order of \(10^{-5}\) (one thousandth of one percent)
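Both numbers are easy to reproduce (a sketch, not part of the original slides), assuming IQs are normal with mean 100 and sd 15:

```r
pnorm(40, mean = 100, sd = 15) + pnorm(160, mean = 100, sd = 15, lower.tail = FALSE)
# about 6.3e-05, versus the Chebyshev bound of 1/4^2 = 0.0625
```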

Example

+
+
+
  • A former buzz phrase in industrial quality control is Motorola's "Six Sigma" whereby businesses are suggested to control extreme events or rare defective parts
  • Chebyshev's inequality states that the probability of a "Six Sigma" event is less than \(1/6^2 \approx 3\%\)
  • If a bell curve is assumed, the probability of a "six sigma" event is on the order of \(10^{-9}\) (one ten millionth of a percent)
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/06_StatisticalInference/01_03_Expectations/index.md b/06_StatisticalInference/01_03_Expectations/index.md index ca3a74384..8ac21861b 100644 --- a/06_StatisticalInference/01_03_Expectations/index.md +++ b/06_StatisticalInference/01_03_Expectations/index.md @@ -1,242 +1,234 @@ ---- -title : Expected values -subtitle : Statistical Inference -author : Brian Caffo, Jeff Leek, Roger Peng -job : Johns Hopkins Bloomberg School of Public Health -logo : bloomberg_shield.png -framework : io2012 # {io2012, html5slides, shower, dzslides, ...} -highlighter : highlight.js # {highlight.js, prettify, highlight} -hitheme : tomorrow # -url: - lib: ../../libraries - assets: ../../assets -widgets : [mathjax] # {mathjax, quiz, bootstrap} -mode : selfcontained # {standalone, draft} ---- -## Expected values - -- The **expected value** or **mean** of a random variable is the center of its distribution -- For discrete random variable $X$ with PMF $p(x)$, it is defined as follows - $$ - E[X] = \sum_x xp(x). - $$ - where the sum is taken over the possible values of $x$ -- $E[X]$ represents the center of mass of a collection of locations and weights, $\{x, p(x)\}$ - ---- - -## Example -### Find the center of mass of the bars - -```r -library(UsingR) -data(galton) -par(mfrow = c(1, 2)) -hist(galton$child, col = "blue", breaks = 100) -hist(galton$parent, col = "blue", breaks = 100) -``` - -![plot of chunk unnamed-chunk-1](figure/unnamed-chunk-1.png) - - ---- -## Using manipulate -``` -library(manipulate) -myHist <- function(mu){ - hist(galton$child,col="blue",breaks=100) - lines(c(mu, mu), c(0, 150),col="red",lwd=5) - mse <- mean((galton$child - mu)^2) - text(63, 150, paste("mu = ", mu)) - text(63, 140, paste("Imbalance = ", round(mse, 2))) -} -manipulate(myHist(mu), mu = slider(62, 74, step = 0.5)) -``` - ---- -## The center of mass is the empirical mean - -```r -hist(galton$child, col = "blue", breaks = 100) -meanChild <- mean(galton$child) -lines(rep(meanChild, 100), seq(0, 150, length = 100), col = "red", lwd = 5) -``` - -![plot of chunk lsm](figure/lsm.png) - - ---- -## Example - -- Suppose a coin is flipped and $X$ is declared $0$ or $1$ corresponding to a head or a tail, respectively -- What is the expected value of $X$? - $$ - E[X] = .5 \times 0 + .5 \times 1 = .5 - $$ -- Note, if thought about geometrically, this answer is obvious; if two equal weights are spaced at 0 and 1, the center of mass will be $.5$ - -![plot of chunk unnamed-chunk-2](figure/unnamed-chunk-2.png) - ---- - -## Example - -- Suppose that a die is rolled and $X$ is the number face up -- What is the expected value of $X$? - $$ - E[X] = 1 \times \frac{1}{6} + 2 \times \frac{1}{6} + - 3 \times \frac{1}{6} + 4 \times \frac{1}{6} + - 5 \times \frac{1}{6} + 6 \times \frac{1}{6} = 3.5 - $$ -- Again, the geometric argument makes this answer obvious without calculation. - ---- - -## Continuous random variables - -- For a continuous random variable, $X$, with density, $f$, the expected - value is defined as follows - $$ - E[X] = \mbox{the area under the function}~~~ t f(t) - $$ -- This definition borrows from the definition of center of mass for a continuous body - ---- - -## Example - -- Consider a density where $f(x) = 1$ for $x$ between zero and one -- (Is this a valid density?) -- Suppose that $X$ follows this density; what is its expected value? 
-![plot of chunk unnamed-chunk-3](figure/unnamed-chunk-3.png) - - ---- - -## Rules about expected values - -- The expected value is a linear operator -- If $a$ and $b$ are not random and $X$ and $Y$ are two random variables then - - $E[aX + b] = a E[X] + b$ - - $E[X + Y] = E[X] + E[Y]$ - ---- - -## Example - -- You flip a coin, $X$ and simulate a uniform random number $Y$, what is the expected value of their sum? - $$ - E[X + Y] = E[X] + E[Y] = .5 + .5 = 1 - $$ -- Another example, you roll a die twice. What is the expected value of the average? -- Let $X_1$ and $X_2$ be the results of the two rolls - $$ - E[(X_1 + X_2) / 2] = \frac{1}{2}(E[X_1] + E[X_2]) - = \frac{1}{2}(3.5 + 3.5) = 3.5 - $$ - ---- - -## Example - -1. Let $X_i$ for $i=1,\ldots,n$ be a collection of random variables, each from a distribution with mean $\mu$ -2. Calculate the expected value of the sample average of the $X_i$ -$$ - \begin{eqnarray*} - E\left[ \frac{1}{n}\sum_{i=1}^n X_i\right] - & = & \frac{1}{n} E\left[\sum_{i=1}^n X_i\right] \\ - & = & \frac{1}{n} \sum_{i=1}^n E\left[X_i\right] \\ - & = & \frac{1}{n} \sum_{i=1}^n \mu = \mu. - \end{eqnarray*} -$$ - ---- - -## Remark - -- Therefore, the expected value of the **sample mean** is the population mean that it's trying to estimate -- When the expected value of an estimator is what its trying to estimate, we say that the estimator is **unbiased** - ---- - -## The variance - -- The variance of a random variable is a measure of {\em spread} -- If $X$ is a random variable with mean $\mu$, the variance of $X$ is defined as - -$$ -Var(X) = E[(X - \mu)^2] -$$ - -the expected (squared) distance from the mean -- Densities with a higher variance are more spread out than densities with a lower variance - ---- - -- Convenient computational form -$$ -Var(X) = E[X^2] - E[X]^2 -$$ -- If $a$ is constant then $Var(aX) = a^2 Var(X)$ -- The square root of the variance is called the **standard deviation** -- The standard deviation has the same units as $X$ - ---- - -## Example - -- What's the sample variance from the result of a toss of a die? - - - $E[X] = 3.5$ - - $E[X^2] = 1 ^ 2 \times \frac{1}{6} + 2 ^ 2 \times \frac{1}{6} + 3 ^ 2 \times \frac{1}{6} + 4 ^ 2 \times \frac{1}{6} + 5 ^ 2 \times \frac{1}{6} + 6 ^ 2 \times \frac{1}{6} = 15.17$ - -- $Var(X) = E[X^2] - E[X]^2 \approx 2.92$ - ---- - -## Example - -- What's the sample variance from the result of the toss of a coin with probability of heads (1) of $p$? - - - $E[X] = 0 \times (1 - p) + 1 \times p = p$ - - $E[X^2] = E[X] = p$ - -- $Var(X) = E[X^2] - E[X]^2 = p - p^2 = p(1 - p)$ - ---- - -## Interpreting variances - -- Chebyshev's inequality is useful for interpreting variances -- This inequality states that -$$ -P(|X - \mu| \geq k\sigma) \leq \frac{1}{k^2} -$$ -- For example, the probability that a random variable lies beyond $k$ standard deviations from its mean is less than $1/k^2$ -$$ -\begin{eqnarray*} - 2\sigma & \rightarrow & 25\% \\ - 3\sigma & \rightarrow & 11\% \\ - 4\sigma & \rightarrow & 6\% -\end{eqnarray*} -$$ -- Note this is only a bound; the actual probability might be quite a bit smaller - ---- - -## Example - -- IQs are often said to be distributed with a mean of $100$ and a sd of $15$ -- What is the probability of a randomly drawn person having an IQ higher than $160$ or below $40$? 
-- Thus we want to know the probability of a person being more than $4$ standard deviations from the mean -- Thus Chebyshev's inequality suggests that this will be no larger than 6\% -- IQs distributions are often cited as being bell shaped, in which case this bound is very conservative -- The probability of a random draw from a bell curve being $4$ standard deviations from the mean is on the order of $10^{-5}$ (one thousandth of one percent) - ---- - -## Example - -- A former buzz phrase in industrial quality control is Motorola's "Six Sigma" whereby businesses are suggested to control extreme events or rare defective parts -- Chebyshev's inequality states that the probability of a "Six Sigma" event is less than $1/6^2 \approx 3\%$ -- If a bell curve is assumed, the probability of a "six sigma" event is on the order of $10^{-9}$ (one ten millionth of a percent) +--- +title : Expected values +subtitle : Statistical Inference +author : Brian Caffo, Jeff Leek, Roger Peng +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../librariesNew + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- +## Expected values + +- The **expected value** or **mean** of a random variable is the center of its distribution +- For discrete random variable $X$ with PMF $p(x)$, it is defined as follows + $$ + E[X] = \sum_x xp(x). + $$ + where the sum is taken over the possible values of $x$ +- $E[X]$ represents the center of mass of a collection of locations and weights, $\{x, p(x)\}$ + +--- + +## Example +### Find the center of mass of the bars +![plot of chunk unnamed-chunk-1](assets/fig/unnamed-chunk-1.png) + + +--- +## Using manipulate +``` +library(manipulate) +myHist <- function(mu){ + hist(galton$child,col="blue",breaks=100) + lines(c(mu, mu), c(0, 150),col="red",lwd=5) + mse <- mean((galton$child - mu)^2) + text(63, 150, paste("mu = ", mu)) + text(63, 140, paste("Imbalance = ", round(mse, 2))) +} +manipulate(myHist(mu), mu = slider(62, 74, step = 0.5)) +``` + +--- +## The center of mass is the empirical mean + +```r +hist(galton$child, col = "blue", breaks = 100) +meanChild <- mean(galton$child) +lines(rep(meanChild, 100), seq(0, 150, length = 100), col = "red", lwd = 5) +``` + +![plot of chunk lsm](assets/fig/lsm.png) + + +--- +## Example + +- Suppose a coin is flipped and $X$ is declared $0$ or $1$ corresponding to a head or a tail, respectively +- What is the expected value of $X$? + $$ + E[X] = .5 \times 0 + .5 \times 1 = .5 + $$ +- Note, if thought about geometrically, this answer is obvious; if two equal weights are spaced at 0 and 1, the center of mass will be $.5$ + +![plot of chunk unnamed-chunk-2](assets/fig/unnamed-chunk-2.png) + +--- + +## Example + +- Suppose that a die is rolled and $X$ is the number face up +- What is the expected value of $X$? + $$ + E[X] = 1 \times \frac{1}{6} + 2 \times \frac{1}{6} + + 3 \times \frac{1}{6} + 4 \times \frac{1}{6} + + 5 \times \frac{1}{6} + 6 \times \frac{1}{6} = 3.5 + $$ +- Again, the geometric argument makes this answer obvious without calculation. 
+
+---
+
+## Continuous random variables
+
+- For a continuous random variable, $X$, with density, $f$, the expected
+  value is defined as follows
+  $$
+  E[X] = \mbox{the area under the function}~~~ t f(t)
+  $$
+- This definition borrows from the definition of center of mass for a continuous body
+
+---
+
+## Example
+
+- Consider a density where $f(x) = 1$ for $x$ between zero and one
+- (Is this a valid density?)
+- Suppose that $X$ follows this density; what is its expected value?
+![plot of chunk unnamed-chunk-3](assets/fig/unnamed-chunk-3.png)
+
+---
+
+## Rules about expected values
+
+- The expected value is a linear operator
+- If $a$ and $b$ are not random and $X$ and $Y$ are two random variables then
+  - $E[aX + b] = a E[X] + b$
+  - $E[X + Y] = E[X] + E[Y]$
+
+---
+
+## Example
+
+- You flip a coin, $X$, and simulate a uniform random number $Y$; what is the expected value of their sum?
+  $$
+  E[X + Y] = E[X] + E[Y] = .5 + .5 = 1
+  $$
+- Another example: you roll a die twice. What is the expected value of the average?
+- Let $X_1$ and $X_2$ be the results of the two rolls
+  $$
+  E[(X_1 + X_2) / 2] = \frac{1}{2}(E[X_1] + E[X_2])
+  = \frac{1}{2}(3.5 + 3.5) = 3.5
+  $$
+
+---
+
+## Example
+
+1. Let $X_i$ for $i=1,\ldots,n$ be a collection of random variables, each from a distribution with mean $\mu$
+2. Calculate the expected value of the sample average of the $X_i$
+$$
+  \begin{eqnarray*}
+  E\left[ \frac{1}{n}\sum_{i=1}^n X_i\right]
+  & = & \frac{1}{n} E\left[\sum_{i=1}^n X_i\right] \\
+  & = & \frac{1}{n} \sum_{i=1}^n E\left[X_i\right] \\
+  & = & \frac{1}{n} \sum_{i=1}^n \mu = \mu.
+  \end{eqnarray*}
+$$
+
+---
+
+## Remark
+
+- Therefore, the expected value of the **sample mean** is the population mean that it's trying to estimate
+- When the expected value of an estimator is what it's trying to estimate, we say that the estimator is **unbiased**
+
+---
+
+## The variance
+
+- The variance of a random variable is a measure of *spread*
+- If $X$ is a random variable with mean $\mu$, the variance of $X$ is defined as
+
+$$
+Var(X) = E[(X - \mu)^2]
+$$
+
+the expected (squared) distance from the mean
+- Densities with a higher variance are more spread out than densities with a lower variance
+
+---
+
+- Convenient computational form
+$$
+Var(X) = E[X^2] - E[X]^2
+$$
+- If $a$ is constant then $Var(aX) = a^2 Var(X)$
+- The square root of the variance is called the **standard deviation**
+- The standard deviation has the same units as $X$
+
+---
+
+## Example
+
+- What's the variance of the result of a toss of a die?
+
+  - $E[X] = 3.5$
+  - $E[X^2] = 1 ^ 2 \times \frac{1}{6} + 2 ^ 2 \times \frac{1}{6} + 3 ^ 2 \times \frac{1}{6} + 4 ^ 2 \times \frac{1}{6} + 5 ^ 2 \times \frac{1}{6} + 6 ^ 2 \times \frac{1}{6} = 15.17$
+
+- $Var(X) = E[X^2] - E[X]^2 \approx 2.92$
+
+---
+
+## Example
+
+- What's the variance of the result of the toss of a coin with probability of heads (1) of $p$?
+ + - $E[X] = 0 \times (1 - p) + 1 \times p = p$ + - $E[X^2] = E[X] = p$ + +- $Var(X) = E[X^2] - E[X]^2 = p - p^2 = p(1 - p)$ + +--- + +## Interpreting variances + +- Chebyshev's inequality is useful for interpreting variances +- This inequality states that +$$ +P(|X - \mu| \geq k\sigma) \leq \frac{1}{k^2} +$$ +- For example, the probability that a random variable lies beyond $k$ standard deviations from its mean is less than $1/k^2$ +$$ +\begin{eqnarray*} + 2\sigma & \rightarrow & 25\% \\ + 3\sigma & \rightarrow & 11\% \\ + 4\sigma & \rightarrow & 6\% +\end{eqnarray*} +$$ +- Note this is only a bound; the actual probability might be quite a bit smaller + +--- + +## Example + +- IQs are often said to be distributed with a mean of $100$ and a sd of $15$ +- What is the probability of a randomly drawn person having an IQ higher than $160$ or below $40$? +- Thus we want to know the probability of a person being more than $4$ standard deviations from the mean +- Thus Chebyshev's inequality suggests that this will be no larger than 6\% +- IQs distributions are often cited as being bell shaped, in which case this bound is very conservative +- The probability of a random draw from a bell curve being $4$ standard deviations from the mean is on the order of $10^{-5}$ (one thousandth of one percent) + +--- + +## Example + +- A former buzz phrase in industrial quality control is Motorola's "Six Sigma" whereby businesses are suggested to control extreme events or rare defective parts +- Chebyshev's inequality states that the probability of a "Six Sigma" event is less than $1/6^2 \approx 3\%$ +- If a bell curve is assumed, the probability of a "six sigma" event is on the order of $10^{-9}$ (one ten millionth of a percent) + diff --git a/06_StatisticalInference/01_04_Independence/index.Rmd b/06_StatisticalInference/01_04_Independence/index.Rmd index f750e2fbe..e9e31d883 100644 --- a/06_StatisticalInference/01_04_Independence/index.Rmd +++ b/06_StatisticalInference/01_04_Independence/index.Rmd @@ -1,200 +1,208 @@ ---- -title : Independence -subtitle : Statistical Inference -author : Brian Caffo, Jeff Leek, Roger Peng -job : Johns Hopkins Bloomberg School of Public Health -logo : bloomberg_shield.png -framework : io2012 # {io2012, html5slides, shower, dzslides, ...} -highlighter : highlight.js # {highlight.js, prettify, highlight} -hitheme : tomorrow # -url: - lib: ../../libraries - assets: ../../assets -widgets : [mathjax] # {mathjax, quiz, bootstrap} -mode : selfcontained # {standalone, draft} ---- - -## Independent events - -- Two events $A$ and $B$ are **independent** if $$P(A \cap B) = P(A)P(B)$$ -- Two random variables, $X$ and $Y$ are independent if for any two sets $A$ and $B$ $$P([X \in A] \cap [Y \in B]) = P(X\in A)P(Y\in B)$$ -- If $A$ is independent of $B$ then - - - $A^c$ is independent of $B$ - - $A$ is independent of $B^c$ - - $A^c$ is independent of $B^c$ - - ---- - -## Example - -- What is the probability of getting two consecutive heads? 
-- $A = \{\mbox{Head on flip 1}\}$ ~ $P(A) = .5$ -- $B = \{\mbox{Head on flip 2}\}$ ~ $P(B) = .5$ -- $A \cap B = \{\mbox{Head on flips 1 and 2}\}$ -- $P(A \cap B) = P(A)P(B) = .5 \times .5 = .25$ - ---- - -## Example - -- Volume 309 of Science reports on a physician who was on trial for expert testimony in a criminal trial -- Based on an estimated prevalence of sudden infant death syndrome of $1$ out of $8,543$, Dr Meadow testified that that the probability of a mother having two children with SIDS was $\left(\frac{1}{8,543}\right)^2$ -- The mother on trial was convicted of murder - ---- - -## Example: continued - -- For the purposes of this class, the principal mistake was to *assume* that the probabilities of having SIDs within a family are independent -- That is, $P(A_1 \cap A_2)$ is not necessarily equal to $P(A_1)P(A_2)$ -- Biological processes that have a believed genetic or familiar environmental component, of course, tend to be dependent within families -- (There are many other statistical points of discussion for this case.) - ---- - -## Useful fact - -We will use the following fact extensively in this class: - -*If a collection of random variables $X_1, X_2, \ldots, X_n$ are independent, then their joint distribution is the product of their individual densities or mass functions* - -*That is, if $f_i$ is the density for random variable $X_i$ we have that* -$$ -f(x_1,\ldots, x_n) = \prod_{i=1}^n f_i(x_i) -$$ - ---- - -## IID random variables - -- Random variables are said to be iid if they are independent and identically distributed -- iid random variables are the default model for random samples -- Many of the important theories of statistics are founded on assuming that variables are iid - - ---- - -## Example - -- Suppose that we flip a biased coin with success probability $p$ $n$ times, what is the join density of the collection of outcomes? -- These random variables are iid with densities $p^{x_i} (1 - p)^{1-x_i}$ -- Therefore - $$ - f(x_1,\ldots,x_n) = \prod_{i=1}^n p^{x_i} (1 - p)^{1-x_i} = p^{\sum x_i} (1 - p)^{n - \sum x_i} - $$ - ---- - -## Correlation - -- The **covariance** between two random variables $X$ and $Y$ is defined as -$$ -Cov(X, Y) = E[(X - \mu_x)(Y - \mu_y)] = E[X Y] - E[X]E[Y] -$$ -- The following are useful facts about covariance - 1. $Cov(X, Y) = Cov(Y, X)$ - 2. $Cov(X, Y)$ can be negative or positive - 3. $|Cov(X, Y)| \leq \sqrt{Var(X) Var(y)}$ - ---- - -## Correlation - -- The **correlation** between $X$ and $Y$ is -$$ -Cor(X, Y) = Cov(X, Y) / \sqrt{Var(X) Var(y)} -$$ - - 1. $-1 \leq Cor(X, Y) \leq 1$ - 2. $Cor(X, Y) = \pm 1$ if and only if $X = a + bY$ for some constants $a$ and $b$ - 3. $Cor(X, Y)$ is unitless - 4. $X$ and $Y$ are **uncorrelated** if $Cor(X, Y) = 0$ - 5. $X$ and $Y$ are more positively correlated, the closer $Cor(X,Y)$ is to $1$ - 6. 
$X$ and $Y$ are more negatively correlated, the closer $Cor(X,Y)$ is to $-1$ - ---- - -## Some useful results - -- Let $\{X_i\}_{i=1}^n$ be a collection of random variables - - When the $\{X_i\}$ are uncorrelated $$Var\left(\sum_{i=1}^n a_i X_i + b\right) = \sum_{i=1}^n a_i^2 Var(X_i)$$ - -- A commonly used subcase from these properties is that *if a collection of random variables $\{X_i\}$ are uncorrelated*, then the variance of the sum is the sum of the variances -$$ -Var\left(\sum_{i=1}^n X_i \right) = \sum_{i=1}^n Var(X_i) -$$ -- Therefore, it is sums of variances that tend to be useful, not sums of standard deviations; that is, the standard deviation of the sum of bunch of independent random variables is the square root of the sum of the variances, not the sum of the standard deviations - ---- - -## The sample mean - -Suppose $X_i$ are iid with variance $\sigma^2$ - -$$ -\begin{eqnarray*} - Var(\bar X) & = & Var \left( \frac{1}{n}\sum_{i=1}^n X_i \right)\\ \\ - & = & \frac{1}{n^2} Var\left(\sum_{i=1}^n X_i \right)\\ \\ - & = & \frac{1}{n^2} \sum_{i=1}^n Var(X_i) \\ \\ - & = & \frac{1}{n^2} \times n\sigma^2 \\ \\ - & = & \frac{\sigma^2}{n} - \end{eqnarray*} -$$ - ---- - -## Some comments - -- When $X_i$ are independent with a common variance $Var(\bar X) = \frac{\sigma^2}{n}$ -- $\sigma/\sqrt{n}$ is called *the standard error* of the sample mean -- The standard error of the sample mean is the standard deviation of the distribution of the sample mean -- $\sigma$ is the standard deviation of the distribution of a single observation -- Easy way to remember, the sample mean has to be less variable than a single observation, therefore its standard deviation is divided by a $\sqrt{n}$ - ---- - -## The sample variance -- The **sample variance** is defined as -$$ -S^2 = \frac{\sum_{i=1}^n (X_i - \bar X)^2}{n-1} -$$ -- The sample variance is an estimator of $\sigma^2$ -- The numerator has a version that's quicker for calculation -$$ -\sum_{i=1}^n (X_i - \bar X)^2 = \sum_{i=1}^n X_i^2 - n \bar X^2 -$$ -- The sample variance is (nearly) the mean of the squared deviations from the mean - ---- - -## The sample variance is unbiased - -$$ - \begin{eqnarray*} - E\left[\sum_{i=1}^n (X_i - \bar X)^2\right] & = & \sum_{i=1}^n E\left[X_i^2\right] - n E\left[\bar X^2\right] \\ \\ - & = & \sum_{i=1}^n \left\{Var(X_i) + \mu^2\right\} - n \left\{Var(\bar X) + \mu^2\right\} \\ \\ - & = & \sum_{i=1}^n \left\{\sigma^2 + \mu^2\right\} - n \left\{\sigma^2 / n + \mu^2\right\} \\ \\ - & = & n \sigma^2 + n \mu ^ 2 - \sigma^2 - n \mu^2 \\ \\ - & = & (n - 1) \sigma^2 - \end{eqnarray*} -$$ - ---- - -## Hoping to avoid some confusion - -- Suppose $X_i$ are iid with mean $\mu$ and variance $\sigma^2$ -- $S^2$ estimates $\sigma^2$ -- The calculation of $S^2$ involves dividing by $n-1$ -- $S / \sqrt{n}$ estimates $\sigma / \sqrt{n}$ the standard error of the mean -- $S / \sqrt{n}$ is called the sample standard error (of the mean) - ---- -## Example -```{r, fig.height=3, fig.width=3} -data(father.son); hist(father.son$sheight, col="lightblue", border="black") -x <- father.son$sheight; n<-length(x) -c(sum( (x - mean(x) )^ 2) / (n-1), var(x), var(x) / n, sd(x), sd(x) / sqrt(n)) -``` \ No newline at end of file +--- +title : Independence +subtitle : Statistical Inference +author : Brian Caffo, Jeff Leek, Roger Peng +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} 
+hitheme     : tomorrow      #
+url:
+  lib: ../../librariesNew
+  assets: ../../assets
+widgets     : [mathjax]            # {mathjax, quiz, bootstrap}
+mode        : selfcontained # {standalone, draft}
+---
+
+## Independent events
+
+- Two events $A$ and $B$ are **independent** if $$P(A \cap B) = P(A)P(B)$$
+- Two random variables, $X$ and $Y$ are independent if for any two sets $A$ and $B$ $$P([X \in A] \cap [Y \in B]) = P(X\in A)P(Y\in B)$$
+- If $A$ is independent of $B$ then
+
+  - $A^c$ is independent of $B$
+  - $A$ is independent of $B^c$
+  - $A^c$ is independent of $B^c$
+
+
+---
+
+## Example
+
+- What is the probability of getting two consecutive heads?
+- $A = \{\mbox{Head on flip 1}\}$ ~ $P(A) = .5$
+- $B = \{\mbox{Head on flip 2}\}$ ~ $P(B) = .5$
+- $A \cap B = \{\mbox{Head on flips 1 and 2}\}$
+- $P(A \cap B) = P(A)P(B) = .5 \times .5 = .25$
+
+---
+
+## Example
+
+- Volume 309 of Science reports on a physician who was on trial for expert testimony in a criminal trial
+- Based on an estimated prevalence of sudden infant death syndrome of $1$ out of $8,543$, Dr Meadow testified that the probability of a mother having two children with SIDS was $\left(\frac{1}{8,543}\right)^2$
+- The mother on trial was convicted of murder
+
+---
+
+## Example: continued
+
+- For the purposes of this class, the principal mistake was to *assume* that the probabilities of having SIDS within a family are independent
+- That is, $P(A_1 \cap A_2)$ is not necessarily equal to $P(A_1)P(A_2)$
+- Biological processes that have a believed genetic or familial environmental component, of course, tend to be dependent within families
+- (There are many other statistical points of discussion for this case.)
+
+---
+
+## Useful fact
+
+We will use the following fact extensively in this class:
+
+*If a collection of random variables $X_1, X_2, \ldots, X_n$ are independent, then their joint distribution is the product of their individual densities or mass functions*
+
+*That is, if $f_i$ is the density for random variable $X_i$ we have that*
+$$
+f(x_1,\ldots, x_n) = \prod_{i=1}^n f_i(x_i)
+$$
+
+---
+
+## IID random variables
+
+- Random variables are said to be iid if they are independent and identically distributed
+- iid random variables are the default model for random samples
+- Many of the important theories of statistics are founded on assuming that variables are iid
+
+
+---
+
+## Example
+
+- Suppose that we flip a biased coin with success probability $p$ $n$ times; what is the joint density of the collection of outcomes?
+- These random variables are iid with densities $p^{x_i} (1 - p)^{1-x_i}$
+- Therefore
+  $$
+  f(x_1,\ldots,x_n) = \prod_{i=1}^n p^{x_i} (1 - p)^{1-x_i} = p^{\sum x_i} (1 - p)^{n - \sum x_i}
+  $$
+
+---
+
+## Correlation
+
+- The **covariance** between two random variables $X$ and $Y$ is defined as
+$$
+Cov(X, Y) = E[(X - \mu_x)(Y - \mu_y)] = E[X Y] - E[X]E[Y]
+$$
+- The following are useful facts about covariance
+  1. $Cov(X, Y) = Cov(Y, X)$
+  2. $Cov(X, Y)$ can be negative or positive
+  3. $|Cov(X, Y)| \leq \sqrt{Var(X) Var(Y)}$
+
+---
+
+## Correlation
+
+- The **correlation** between $X$ and $Y$ is
+$$
+Cor(X, Y) = Cov(X, Y) / \sqrt{Var(X) Var(Y)}
+$$
+
+  1. $-1 \leq Cor(X, Y) \leq 1$
+  2. $Cor(X, Y) = \pm 1$ if and only if $X = a + bY$ for some constants $a$ and $b$
+  3. $Cor(X, Y)$ is unitless
+  4. $X$ and $Y$ are **uncorrelated** if $Cor(X, Y) = 0$
+  5. $X$ and $Y$ are more positively correlated, the closer $Cor(X,Y)$ is to $1$
+  6. 
$X$ and $Y$ are more negatively correlated, the closer $Cor(X,Y)$ is to $-1$ + +--- + +## Some useful results + +- Let $\{X_i\}_{i=1}^n$ be a collection of random variables + - When the $\{X_i\}$ are uncorrelated $$Var\left(\sum_{i=1}^n a_i X_i + b\right) = \sum_{i=1}^n a_i^2 Var(X_i)$$ + +- A commonly used subcase from these properties is that *if a collection of random variables $\{X_i\}$ are uncorrelated*, then the variance of the sum is the sum of the variances +$$ +Var\left(\sum_{i=1}^n X_i \right) = \sum_{i=1}^n Var(X_i) +$$ +- Therefore, it is sums of variances that tend to be useful, not sums of standard deviations; that is, the standard deviation of the sum of bunch of independent random variables is the square root of the sum of the variances, not the sum of the standard deviations + +--- + +## The sample mean + +Suppose $X_i$ are iid with variance $\sigma^2$ + +$$ +\begin{eqnarray*} + Var(\bar X) & = & Var \left( \frac{1}{n}\sum_{i=1}^n X_i \right)\\ \\ + & = & \frac{1}{n^2} Var\left(\sum_{i=1}^n X_i \right)\\ \\ + & = & \frac{1}{n^2} \sum_{i=1}^n Var(X_i) \\ \\ + & = & \frac{1}{n^2} \times n\sigma^2 \\ \\ + & = & \frac{\sigma^2}{n} + \end{eqnarray*} +$$ + +--- + +## Some comments + +- When $X_i$ are independent with a common variance $Var(\bar X) = \frac{\sigma^2}{n}$ +- $\sigma/\sqrt{n}$ is called *the standard error* of the sample mean +- The standard error of the sample mean is the standard deviation of the distribution of the sample mean +- $\sigma$ is the standard deviation of the distribution of a single observation +- Easy way to remember, the sample mean has to be less variable than a single observation, therefore its standard deviation is divided by a $\sqrt{n}$ + +--- + +## The sample variance +- The **sample variance** is defined as +$$ +S^2 = \frac{\sum_{i=1}^n (X_i - \bar X)^2}{n-1} +$$ +- The sample variance is an estimator of $\sigma^2$ +- The numerator has a version that's quicker for calculation +$$ +\sum_{i=1}^n (X_i - \bar X)^2 = \sum_{i=1}^n X_i^2 - n \bar X^2 +$$ +- The sample variance is (nearly) the mean of the squared deviations from the mean + +--- + +## The sample variance is unbiased + +$$ + \begin{eqnarray*} + E\left[\sum_{i=1}^n (X_i - \bar X)^2\right] & = & \sum_{i=1}^n E\left[X_i^2\right] - n E\left[\bar X^2\right] \\ \\ + & = & \sum_{i=1}^n \left\{Var(X_i) + \mu^2\right\} - n \left\{Var(\bar X) + \mu^2\right\} \\ \\ + & = & \sum_{i=1}^n \left\{\sigma^2 + \mu^2\right\} - n \left\{\sigma^2 / n + \mu^2\right\} \\ \\ + & = & n \sigma^2 + n \mu ^ 2 - \sigma^2 - n \mu^2 \\ \\ + & = & (n - 1) \sigma^2 + \end{eqnarray*} +$$ + +--- + +## Hoping to avoid some confusion + +- Suppose $X_i$ are iid with mean $\mu$ and variance $\sigma^2$ +- $S^2$ estimates $\sigma^2$ +- The calculation of $S^2$ involves dividing by $n-1$ +- $S / \sqrt{n}$ estimates $\sigma / \sqrt{n}$ the standard error of the mean +- $S / \sqrt{n}$ is called the sample standard error (of the mean) + +--- +## Example +```{r} +data(father.son); +x <- father.son$sheight +n<-length(x) +``` + +--- +```{r, fig.height=5, fig.width=5, echo=FALSE} +hist(father.son$sheight, col="lightblue", border="black") +``` +```{r} +round(c(sum( (x - mean(x) )^ 2) / (n-1), var(x), var(x) / n, sd(x), sd(x) / sqrt(n)),2) +``` diff --git a/06_StatisticalInference/01_04_Independence/index.html b/06_StatisticalInference/01_04_Independence/index.html index df11e39ac..57c94e8b1 100644 --- a/06_StatisticalInference/01_04_Independence/index.html +++ b/06_StatisticalInference/01_04_Independence/index.html @@ -1,388 +1,496 
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/06_StatisticalInference/01_04_Independence/index.md b/06_StatisticalInference/01_04_Independence/index.md index cadf22c0a..12b7e2264 100644 --- a/06_StatisticalInference/01_04_Independence/index.md +++ b/06_StatisticalInference/01_04_Independence/index.md @@ -1,208 +1,216 @@ ---- -title : Independence -subtitle : Statistical Inference -author : Brian Caffo, Jeff Leek, Roger Peng -job : Johns Hopkins Bloomberg School of Public Health -logo : bloomberg_shield.png -framework : io2012 # {io2012, html5slides, shower, dzslides, ...} -highlighter : highlight.js # {highlight.js, prettify, highlight} -hitheme : tomorrow # -url: - lib: ../../libraries - assets: ../../assets -widgets : [mathjax] # {mathjax, quiz, bootstrap} -mode : selfcontained # {standalone, draft} ---- - -## Independent events - -- Two events $A$ and $B$ are **independent** if $$P(A \cap B) = P(A)P(B)$$ -- Two random variables, $X$ and $Y$ are independent if for any two sets $A$ and $B$ $$P([X \in A] \cap [Y \in B]) = P(X\in A)P(Y\in B)$$ -- If $A$ is independent of $B$ then - - - $A^c$ is independent of $B$ - - $A$ is independent of $B^c$ - - $A^c$ is independent of $B^c$ - - ---- - -## Example - -- What is the probability of getting two consecutive heads? -- $A = \{\mbox{Head on flip 1}\}$ ~ $P(A) = .5$ -- $B = \{\mbox{Head on flip 2}\}$ ~ $P(B) = .5$ -- $A \cap B = \{\mbox{Head on flips 1 and 2}\}$ -- $P(A \cap B) = P(A)P(B) = .5 \times .5 = .25$ - ---- - -## Example - -- Volume 309 of Science reports on a physician who was on trial for expert testimony in a criminal trial -- Based on an estimated prevalence of sudden infant death syndrome of $1$ out of $8,543$, Dr Meadow testified that that the probability of a mother having two children with SIDS was $\left(\frac{1}{8,543}\right)^2$ -- The mother on trial was convicted of murder - ---- - -## Example: continued - -- For the purposes of this class, the principal mistake was to *assume* that the probabilities of having SIDs within a family are independent -- That is, $P(A_1 \cap A_2)$ is not necessarily equal to $P(A_1)P(A_2)$ -- Biological processes that have a believed genetic or familiar environmental component, of course, tend to be dependent within families -- (There are many other statistical points of discussion for this case.) - ---- - -## Useful fact - -We will use the following fact extensively in this class: - -*If a collection of random variables $X_1, X_2, \ldots, X_n$ are independent, then their joint distribution is the product of their individual densities or mass functions* - -*That is, if $f_i$ is the density for random variable $X_i$ we have that* -$$ -f(x_1,\ldots, x_n) = \prod_{i=1}^n f_i(x_i) -$$ - ---- - -## IID random variables - -- Random variables are said to be iid if they are independent and identically distributed -- iid random variables are the default model for random samples -- Many of the important theories of statistics are founded on assuming that variables are iid - - ---- - -## Example - -- Suppose that we flip a biased coin with success probability $p$ $n$ times, what is the join density of the collection of outcomes? 
-- These random variables are iid with densities $p^{x_i} (1 - p)^{1-x_i}$ -- Therefore - $$ - f(x_1,\ldots,x_n) = \prod_{i=1}^n p^{x_i} (1 - p)^{1-x_i} = p^{\sum x_i} (1 - p)^{n - \sum x_i} - $$ - ---- - -## Correlation - -- The **covariance** between two random variables $X$ and $Y$ is defined as -$$ -Cov(X, Y) = E[(X - \mu_x)(Y - \mu_y)] = E[X Y] - E[X]E[Y] -$$ -- The following are useful facts about covariance - 1. $Cov(X, Y) = Cov(Y, X)$ - 2. $Cov(X, Y)$ can be negative or positive - 3. $|Cov(X, Y)| \leq \sqrt{Var(X) Var(y)}$ - ---- - -## Correlation - -- The **correlation** between $X$ and $Y$ is -$$ -Cor(X, Y) = Cov(X, Y) / \sqrt{Var(X) Var(y)} -$$ - - 1. $-1 \leq Cor(X, Y) \leq 1$ - 2. $Cor(X, Y) = \pm 1$ if and only if $X = a + bY$ for some constants $a$ and $b$ - 3. $Cor(X, Y)$ is unitless - 4. $X$ and $Y$ are **uncorrelated** if $Cor(X, Y) = 0$ - 5. $X$ and $Y$ are more positively correlated, the closer $Cor(X,Y)$ is to $1$ - 6. $X$ and $Y$ are more negatively correlated, the closer $Cor(X,Y)$ is to $-1$ - ---- - -## Some useful results - -- Let $\{X_i\}_{i=1}^n$ be a collection of random variables - - When the $\{X_i\}$ are uncorrelated $$Var\left(\sum_{i=1}^n a_i X_i + b\right) = \sum_{i=1}^n a_i^2 Var(X_i)$$ - -- A commonly used subcase from these properties is that *if a collection of random variables $\{X_i\}$ are uncorrelated*, then the variance of the sum is the sum of the variances -$$ -Var\left(\sum_{i=1}^n X_i \right) = \sum_{i=1}^n Var(X_i) -$$ -- Therefore, it is sums of variances that tend to be useful, not sums of standard deviations; that is, the standard deviation of the sum of bunch of independent random variables is the square root of the sum of the variances, not the sum of the standard deviations - ---- - -## The sample mean - -Suppose $X_i$ are iid with variance $\sigma^2$ - -$$ -\begin{eqnarray*} - Var(\bar X) & = & Var \left( \frac{1}{n}\sum_{i=1}^n X_i \right)\\ \\ - & = & \frac{1}{n^2} Var\left(\sum_{i=1}^n X_i \right)\\ \\ - & = & \frac{1}{n^2} \sum_{i=1}^n Var(X_i) \\ \\ - & = & \frac{1}{n^2} \times n\sigma^2 \\ \\ - & = & \frac{\sigma^2}{n} - \end{eqnarray*} -$$ - ---- - -## Some comments - -- When $X_i$ are independent with a common variance $Var(\bar X) = \frac{\sigma^2}{n}$ -- $\sigma/\sqrt{n}$ is called *the standard error* of the sample mean -- The standard error of the sample mean is the standard deviation of the distribution of the sample mean -- $\sigma$ is the standard deviation of the distribution of a single observation -- Easy way to remember, the sample mean has to be less variable than a single observation, therefore its standard deviation is divided by a $\sqrt{n}$ - ---- - -## The sample variance -- The **sample variance** is defined as -$$ -S^2 = \frac{\sum_{i=1}^n (X_i - \bar X)^2}{n-1} -$$ -- The sample variance is an estimator of $\sigma^2$ -- The numerator has a version that's quicker for calculation -$$ -\sum_{i=1}^n (X_i - \bar X)^2 = \sum_{i=1}^n X_i^2 - n \bar X^2 -$$ -- The sample variance is (nearly) the mean of the squared deviations from the mean - ---- - -## The sample variance is unbiased - -$$ - \begin{eqnarray*} - E\left[\sum_{i=1}^n (X_i - \bar X)^2\right] & = & \sum_{i=1}^n E\left[X_i^2\right] - n E\left[\bar X^2\right] \\ \\ - & = & \sum_{i=1}^n \left\{Var(X_i) + \mu^2\right\} - n \left\{Var(\bar X) + \mu^2\right\} \\ \\ - & = & \sum_{i=1}^n \left\{\sigma^2 + \mu^2\right\} - n \left\{\sigma^2 / n + \mu^2\right\} \\ \\ - & = & n \sigma^2 + n \mu ^ 2 - \sigma^2 - n \mu^2 \\ \\ - & = & (n - 1) \sigma^2 - 
\end{eqnarray*} -$$ - ---- - -## Hoping to avoid some confusion - -- Suppose $X_i$ are iid with mean $\mu$ and variance $\sigma^2$ -- $S^2$ estimates $\sigma^2$ -- The calculation of $S^2$ involves dividing by $n-1$ -- $S / \sqrt{n}$ estimates $\sigma / \sqrt{n}$ the standard error of the mean -- $S / \sqrt{n}$ is called the sample standard error (of the mean) - ---- -## Example - -```r -data(father.son) -hist(father.son$sheight, col = "lightblue", border = "black") -x <- father.son$sheight -n <- length(x) -c(sum((x - mean(x))^2)/(n - 1), var(x), var(x)/n, sd(x), sd(x)/sqrt(n)) -``` - -``` -## [1] 7.922545 7.922545 0.007349 2.814702 0.085728 -``` - +--- +title : Independence +subtitle : Statistical Inference +author : Brian Caffo, Jeff Leek, Roger Peng +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../librariesNew + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- + +## Independent events + +- Two events $A$ and $B$ are **independent** if $$P(A \cap B) = P(A)P(B)$$ +- Two random variables, $X$ and $Y$ are independent if for any two sets $A$ and $B$ $$P([X \in A] \cap [Y \in B]) = P(X\in A)P(Y\in B)$$ +- If $A$ is independent of $B$ then + + - $A^c$ is independent of $B$ + - $A$ is independent of $B^c$ + - $A^c$ is independent of $B^c$ + + +--- + +## Example + +- What is the probability of getting two consecutive heads? +- $A = \{\mbox{Head on flip 1}\}$ ~ $P(A) = .5$ +- $B = \{\mbox{Head on flip 2}\}$ ~ $P(B) = .5$ +- $A \cap B = \{\mbox{Head on flips 1 and 2}\}$ +- $P(A \cap B) = P(A)P(B) = .5 \times .5 = .25$ + +--- + +## Example + +- Volume 309 of Science reports on a physician who was on trial for expert testimony in a criminal trial +- Based on an estimated prevalence of sudden infant death syndrome of $1$ out of $8,543$, Dr Meadow testified that that the probability of a mother having two children with SIDS was $\left(\frac{1}{8,543}\right)^2$ +- The mother on trial was convicted of murder + +--- + +## Example: continued + +- For the purposes of this class, the principal mistake was to *assume* that the probabilities of having SIDs within a family are independent +- That is, $P(A_1 \cap A_2)$ is not necessarily equal to $P(A_1)P(A_2)$ +- Biological processes that have a believed genetic or familiar environmental component, of course, tend to be dependent within families +- (There are many other statistical points of discussion for this case.) + +--- + +## Useful fact + +We will use the following fact extensively in this class: + +*If a collection of random variables $X_1, X_2, \ldots, X_n$ are independent, then their joint distribution is the product of their individual densities or mass functions* + +*That is, if $f_i$ is the density for random variable $X_i$ we have that* +$$ +f(x_1,\ldots, x_n) = \prod_{i=1}^n f_i(x_i) +$$ + +--- + +## IID random variables + +- Random variables are said to be iid if they are independent and identically distributed +- iid random variables are the default model for random samples +- Many of the important theories of statistics are founded on assuming that variables are iid + + +--- + +## Example + +- Suppose that we flip a biased coin with success probability $p$ $n$ times, what is the join density of the collection of outcomes? 
+- These random variables are iid with densities $p^{x_i} (1 - p)^{1-x_i}$ +- Therefore + $$ + f(x_1,\ldots,x_n) = \prod_{i=1}^n p^{x_i} (1 - p)^{1-x_i} = p^{\sum x_i} (1 - p)^{n - \sum x_i} + $$ + +--- + +## Correlation + +- The **covariance** between two random variables $X$ and $Y$ is defined as +$$ +Cov(X, Y) = E[(X - \mu_x)(Y - \mu_y)] = E[X Y] - E[X]E[Y] +$$ +- The following are useful facts about covariance + 1. $Cov(X, Y) = Cov(Y, X)$ + 2. $Cov(X, Y)$ can be negative or positive + 3. $|Cov(X, Y)| \leq \sqrt{Var(X) Var(y)}$ + +--- + +## Correlation + +- The **correlation** between $X$ and $Y$ is +$$ +Cor(X, Y) = Cov(X, Y) / \sqrt{Var(X) Var(y)} +$$ + + 1. $-1 \leq Cor(X, Y) \leq 1$ + 2. $Cor(X, Y) = \pm 1$ if and only if $X = a + bY$ for some constants $a$ and $b$ + 3. $Cor(X, Y)$ is unitless + 4. $X$ and $Y$ are **uncorrelated** if $Cor(X, Y) = 0$ + 5. $X$ and $Y$ are more positively correlated, the closer $Cor(X,Y)$ is to $1$ + 6. $X$ and $Y$ are more negatively correlated, the closer $Cor(X,Y)$ is to $-1$ + +--- + +## Some useful results + +- Let $\{X_i\}_{i=1}^n$ be a collection of random variables + - When the $\{X_i\}$ are uncorrelated $$Var\left(\sum_{i=1}^n a_i X_i + b\right) = \sum_{i=1}^n a_i^2 Var(X_i)$$ + +- A commonly used subcase from these properties is that *if a collection of random variables $\{X_i\}$ are uncorrelated*, then the variance of the sum is the sum of the variances +$$ +Var\left(\sum_{i=1}^n X_i \right) = \sum_{i=1}^n Var(X_i) +$$ +- Therefore, it is sums of variances that tend to be useful, not sums of standard deviations; that is, the standard deviation of the sum of bunch of independent random variables is the square root of the sum of the variances, not the sum of the standard deviations + +--- + +## The sample mean + +Suppose $X_i$ are iid with variance $\sigma^2$ + +$$ +\begin{eqnarray*} + Var(\bar X) & = & Var \left( \frac{1}{n}\sum_{i=1}^n X_i \right)\\ \\ + & = & \frac{1}{n^2} Var\left(\sum_{i=1}^n X_i \right)\\ \\ + & = & \frac{1}{n^2} \sum_{i=1}^n Var(X_i) \\ \\ + & = & \frac{1}{n^2} \times n\sigma^2 \\ \\ + & = & \frac{\sigma^2}{n} + \end{eqnarray*} +$$ + +--- + +## Some comments + +- When $X_i$ are independent with a common variance $Var(\bar X) = \frac{\sigma^2}{n}$ +- $\sigma/\sqrt{n}$ is called *the standard error* of the sample mean +- The standard error of the sample mean is the standard deviation of the distribution of the sample mean +- $\sigma$ is the standard deviation of the distribution of a single observation +- Easy way to remember, the sample mean has to be less variable than a single observation, therefore its standard deviation is divided by a $\sqrt{n}$ + +--- + +## The sample variance +- The **sample variance** is defined as +$$ +S^2 = \frac{\sum_{i=1}^n (X_i - \bar X)^2}{n-1} +$$ +- The sample variance is an estimator of $\sigma^2$ +- The numerator has a version that's quicker for calculation +$$ +\sum_{i=1}^n (X_i - \bar X)^2 = \sum_{i=1}^n X_i^2 - n \bar X^2 +$$ +- The sample variance is (nearly) the mean of the squared deviations from the mean + +--- + +## The sample variance is unbiased + +$$ + \begin{eqnarray*} + E\left[\sum_{i=1}^n (X_i - \bar X)^2\right] & = & \sum_{i=1}^n E\left[X_i^2\right] - n E\left[\bar X^2\right] \\ \\ + & = & \sum_{i=1}^n \left\{Var(X_i) + \mu^2\right\} - n \left\{Var(\bar X) + \mu^2\right\} \\ \\ + & = & \sum_{i=1}^n \left\{\sigma^2 + \mu^2\right\} - n \left\{\sigma^2 / n + \mu^2\right\} \\ \\ + & = & n \sigma^2 + n \mu ^ 2 - \sigma^2 - n \mu^2 \\ \\ + & = & (n - 1) \sigma^2 + 
\end{eqnarray*} +$$ + +--- + +## Hoping to avoid some confusion + +- Suppose $X_i$ are iid with mean $\mu$ and variance $\sigma^2$ +- $S^2$ estimates $\sigma^2$ +- The calculation of $S^2$ involves dividing by $n-1$ +- $S / \sqrt{n}$ estimates $\sigma / \sqrt{n}$ the standard error of the mean +- $S / \sqrt{n}$ is called the sample standard error (of the mean) + +--- +## Example + +```r +data(father.son) +x <- father.son$sheight +n <- length(x) +``` + + +--- +![plot of chunk unnamed-chunk-2](assets/fig/unnamed-chunk-2.png) + + +```r +round(c(sum((x - mean(x))^2)/(n - 1), var(x), var(x)/n, sd(x), sd(x)/sqrt(n)), + 2) +``` + +``` +## [1] 7.92 7.92 0.01 2.81 0.09 +``` + diff --git a/06_StatisticalInference/01_05_ConditionalProbability/index.Rmd b/06_StatisticalInference/01_05_ConditionalProbability/index.Rmd index f70377cde..db2151e51 100644 --- a/06_StatisticalInference/01_05_ConditionalProbability/index.Rmd +++ b/06_StatisticalInference/01_05_ConditionalProbability/index.Rmd @@ -1,168 +1,169 @@ ---- -title : Conditional Probability -subtitle : Statistical Inference -author : Brian Caffo, Jeff Leek, Roger Peng -job : Johns Hopkins Bloomberg School of Public Health -logo : bloomberg_shield.png -framework : io2012 # {io2012, html5slides, shower, dzslides, ...} -highlighter : highlight.js # {highlight.js, prettify, highlight} -hitheme : tomorrow # -url: - lib: ../../libraries - assets: ../../assets -widgets : [mathjax] # {mathjax, quiz, bootstrap} -mode : selfcontained # {standalone, draft} ---- - -## Conditional probability, motivation - -- The probability of getting a one when rolling a (standard) die - is usually assumed to be one sixth -- Suppose you were given the extra information that the die roll - was an odd number (hence 1, 3 or 5) -- *conditional on this new information*, the probability of a - one is now one third - ---- - -## Conditional probability, definition - -- Let $B$ be an event so that $P(B) > 0$ -- Then the conditional probability of an event $A$ given that $B$ has occurred is - $$ - P(A ~|~ B) = \frac{P(A \cap B)}{P(B)} - $$ -- Notice that if $A$ and $B$ are independent, then - $$ - P(A ~|~ B) = \frac{P(A) P(B)}{P(B)} = P(A) - $$ - ---- - -## Example - -- Consider our die roll example -- $B = \{1, 3, 5\}$ -- $A = \{1\}$ -$$ - \begin{eqnarray*} -P(\mbox{one given that roll is odd}) & = & P(A ~|~ B) \\ \\ - & = & \frac{P(A \cap B)}{P(B)} \\ \\ - & = & \frac{P(A)}{P(B)} \\ \\ - & = & \frac{1/6}{3/6} = \frac{1}{3} - \end{eqnarray*} -$$ - - - ---- - -## Bayes' rule - -$$ -P(B ~|~ A) = \frac{P(A ~|~ B) P(B)}{P(A ~|~ B) P(B) + P(A ~|~ B^c)P(B^c)}. 
-$$ - - ---- - -## Diagnostic tests - -- Let $+$ and $-$ be the events that the result of a diagnostic test is positive or negative respectively -- Let $D$ and $D^c$ be the event that the subject of the test has or does not have the disease respectively -- The **sensitivity** is the probability that the test is positive given that the subject actually has the disease, $P(+ ~|~ D)$ -- The **specificity** is the probability that the test is negative given that the subject does not have the disease, $P(- ~|~ D^c)$ - ---- - -## More definitions - -- The **positive predictive value** is the probability that the subject has the disease given that the test is positive, $P(D ~|~ +)$ -- The **negative predictive value** is the probability that the subject does not have the disease given that the test is negative, $P(D^c ~|~ -)$ -- The **prevalence of the disease** is the marginal probability of disease, $P(D)$ - ---- - -## More definitions - -- The **diagnostic likelihood ratio of a positive test**, labeled $DLR_+$, is $P(+ ~|~ D) / P(+ ~|~ D^c)$, which is the $$sensitivity / (1 - specificity)$$ -- The **diagnostic likelihood ratio of a negative test**, labeled $DLR_-$, is $P(- ~|~ D) / P(- ~|~ D^c)$, which is the $$(1 - sensitivity) / specificity$$ - ---- - -## Example - -- A study comparing the efficacy of HIV tests, reports on an experiment which concluded that HIV antibody tests have a sensitivity of 99.7% and a specificity of 98.5% -- Suppose that a subject, from a population with a .1% prevalence of HIV, receives a positive test result. What is the probability that this subject has HIV? -- Mathematically, we want $P(D ~|~ +)$ given the sensitivity, $P(+ ~|~ D) = .997$, the specificity, $P(- ~|~ D^c) =.985$, and the prevalence $P(D) = .001$ - ---- - -## Using Bayes' formula - -$$ -\begin{eqnarray*} - P(D ~|~ +) & = &\frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)}\\ \\ - & = & \frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + \{1-P(-~|~D^c)\}\{1 - P(D)\}} \\ \\ - & = & \frac{.997\times .001}{.997 \times .001 + .015 \times .999}\\ \\ - & = & .062 -\end{eqnarray*} -$$ - -- In this population a positive test result only suggests a 6% probability that the subject has the disease -- (The positive predictive value is 6% for this test) - ---- - -## More on this example - -- The low positive predictive value is due to low prevalence of disease and the somewhat modest specificity -- Suppose it was known that the subject was an intravenous drug user and routinely had intercourse with an HIV infected partner -- Notice that the evidence implied by a positive test result does not change because of the prevalence of disease in the subject's population, only our interpretation of that evidence changes - ---- - -## Likelihood ratios - -- Using Bayes rule, we have - $$ - P(D ~|~ +) = \frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)} - $$ - and - $$ - P(D^c ~|~ +) = \frac{P(+~|~D^c)P(D^c)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)}. - $$ - ---- - -## Likelihood ratios - -- Therefore -$$ -\frac{P(D ~|~ +)}{P(D^c ~|~ +)} = \frac{P(+~|~D)}{P(+~|~D^c)}\times \frac{P(D)}{P(D^c)} -$$ -ie -$$ -\mbox{post-test odds of }D = DLR_+\times\mbox{pre-test odds of }D -$$ -- Similarly, $DLR_-$ relates the decrease in the odds of the - disease after a negative test result to the odds of disease prior to - the test. 
- ---- - -## HIV example revisited - -- Suppose a subject has a positive HIV test -- $DLR_+ = .997 / (1 - .985) \approx 66$ -- The result of the positive test is that the odds of disease is now 66 times the pretest odds -- Or, equivalently, the hypothesis of disease is 66 times more supported by the data than the hypothesis of no disease - ---- - -## HIV example revisited - -- Suppose that a subject has a negative test result -- $DLR_- = (1 - .997) / .985 \approx .003$ -- Therefore, the post-test odds of disease is now $.3\%$ of the pretest odds given the negative test. -- Or, the hypothesis of disease is supported $.003$ times that of the hypothesis of absence of disease given the negative test result \ No newline at end of file +--- +title : Conditional Probability +subtitle : Statistical Inference +author : Brian Caffo, Jeff Leek, Roger Peng +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../librariesNew + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- + +## Conditional probability, motivation + +- The probability of getting a one when rolling a (standard) die + is usually assumed to be one sixth +- Suppose you were given the extra information that the die roll + was an odd number (hence 1, 3 or 5) +- *conditional on this new information*, the probability of a + one is now one third + +--- + +## Conditional probability, definition + +- Let $B$ be an event so that $P(B) > 0$ +- Then the conditional probability of an event $A$ given that $B$ has occurred is + $$ + P(A ~|~ B) = \frac{P(A \cap B)}{P(B)} + $$ +- Notice that if $A$ and $B$ are independent, then + $$ + P(A ~|~ B) = \frac{P(A) P(B)}{P(B)} = P(A) + $$ + +--- + +## Example + +- Consider our die roll example +- $B = \{1, 3, 5\}$ +- $A = \{1\}$ +$$ + \begin{eqnarray*} +P(\mbox{one given that roll is odd}) & = & P(A ~|~ B) \\ \\ + & = & \frac{P(A \cap B)}{P(B)} \\ \\ + & = & \frac{P(A)}{P(B)} \\ \\ + & = & \frac{1/6}{3/6} = \frac{1}{3} + \end{eqnarray*} +$$ + + + +--- + +## Bayes' rule + +$$ +P(B ~|~ A) = \frac{P(A ~|~ B) P(B)}{P(A ~|~ B) P(B) + P(A ~|~ B^c)P(B^c)}. 
+$$ + + +--- + +## Diagnostic tests + +- Let $+$ and $-$ be the events that the result of a diagnostic test is positive or negative respectively +- Let $D$ and $D^c$ be the event that the subject of the test has or does not have the disease respectively +- The **sensitivity** is the probability that the test is positive given that the subject actually has the disease, $P(+ ~|~ D)$ +- The **specificity** is the probability that the test is negative given that the subject does not have the disease, $P(- ~|~ D^c)$ + +--- + +## More definitions + +- The **positive predictive value** is the probability that the subject has the disease given that the test is positive, $P(D ~|~ +)$ +- The **negative predictive value** is the probability that the subject does not have the disease given that the test is negative, $P(D^c ~|~ -)$ +- The **prevalence of the disease** is the marginal probability of disease, $P(D)$ + +--- + +## More definitions + +- The **diagnostic likelihood ratio of a positive test**, labeled $DLR_+$, is $P(+ ~|~ D) / P(+ ~|~ D^c)$, which is the $$sensitivity / (1 - specificity)$$ +- The **diagnostic likelihood ratio of a negative test**, labeled $DLR_-$, is $P(- ~|~ D) / P(- ~|~ D^c)$, which is the $$(1 - sensitivity) / specificity$$ + +--- + +## Example + +- A study comparing the efficacy of HIV tests, reports on an experiment which concluded that HIV antibody tests have a sensitivity of 99.7% and a specificity of 98.5% +- Suppose that a subject, from a population with a .1% prevalence of HIV, receives a positive test result. What is the probability that this subject has HIV? +- Mathematically, we want $P(D ~|~ +)$ given the sensitivity, $P(+ ~|~ D) = .997$, the specificity, $P(- ~|~ D^c) =.985$, and the prevalence $P(D) = .001$ + +--- + +## Using Bayes' formula + +$$ +\begin{eqnarray*} + P(D ~|~ +) & = &\frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)}\\ \\ + & = & \frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + \{1-P(-~|~D^c)\}\{1 - P(D)\}} \\ \\ + & = & \frac{.997\times .001}{.997 \times .001 + .015 \times .999}\\ \\ + & = & .062 +\end{eqnarray*} +$$ + +- In this population a positive test result only suggests a 6% probability that the subject has the disease +- (The positive predictive value is 6% for this test) + +--- + +## More on this example + +- The low positive predictive value is due to low prevalence of disease and the somewhat modest specificity +- Suppose it was known that the subject was an intravenous drug user and routinely had intercourse with an HIV infected partner +- Notice that the evidence implied by a positive test result does not change because of the prevalence of disease in the subject's population, only our interpretation of that evidence changes + +--- + +## Likelihood ratios + +- Using Bayes rule, we have + $$ + P(D ~|~ +) = \frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)} + $$ + and + $$ + P(D^c ~|~ +) = \frac{P(+~|~D^c)P(D^c)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)}. + $$ + +--- + +## Likelihood ratios + +- Therefore +$$ +\frac{P(D ~|~ +)}{P(D^c ~|~ +)} = \frac{P(+~|~D)}{P(+~|~D^c)}\times \frac{P(D)}{P(D^c)} +$$ +ie +$$ +\mbox{post-test odds of }D = DLR_+\times\mbox{pre-test odds of }D +$$ +- Similarly, $DLR_-$ relates the decrease in the odds of the + disease after a negative test result to the odds of disease prior to + the test. 
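+
+The arithmetic above is easy to verify numerically; a minimal base-R sketch (using the stated sensitivity, specificity, and prevalence) reproduces the positive predictive value and $DLR_+$:
+
+```r
+## sensitivity, specificity and prevalence from the HIV testing example
+sens <- 0.997
+spec <- 0.985
+prev <- 0.001
+## positive predictive value via Bayes' rule
+(sens * prev) / (sens * prev + (1 - spec) * (1 - prev))   ## about 0.062
+## diagnostic likelihood ratio of a positive test
+sens / (1 - spec)                                         ## about 66
+```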
+ +--- + +## HIV example revisited + +- Suppose a subject has a positive HIV test +- $DLR_+ = .997 / (1 - .985) \approx 66$ +- The result of the positive test is that the odds of disease is now 66 times the pretest odds +- Or, equivalently, the hypothesis of disease is 66 times more supported by the data than the hypothesis of no disease + +--- + +## HIV example revisited + +- Suppose that a subject has a negative test result +- $DLR_- = (1 - .997) / .985 \approx .003$ +- Therefore, the post-test odds of disease is now $.3\%$ of the pretest odds given the negative test. +- Or, the hypothesis of disease is supported $.003$ times that of the hypothesis of absence of disease given the negative test result + diff --git a/06_StatisticalInference/01_05_ConditionalProbability/index.html b/06_StatisticalInference/01_05_ConditionalProbability/index.html index 45832d65d..dba5dc912 100644 --- a/06_StatisticalInference/01_05_ConditionalProbability/index.html +++ b/06_StatisticalInference/01_05_ConditionalProbability/index.html @@ -1,331 +1,411 @@ - - - - Conditional Probability - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-

Conditional Probability

-

Statistical Inference

-

Brian Caffo, Jeff Leek, Roger Peng
Johns Hopkins Bloomberg School of Public Health

-
-
- - - -
-

Conditional probability, motivation

-
-
-
    -
  • The probability of getting a one when rolling a (standard) die -is usually assumed to be one sixth
  • -
  • Suppose you were given the extra information that the die roll -was an odd number (hence 1, 3 or 5)
  • -
  • conditional on this new information, the probability of a -one is now one third
  • -
- -
- -
- - -
-

Conditional probability, definition

-
-
-
    -
  • Let \(B\) be an event so that \(P(B) > 0\)
  • -
  • Then the conditional probability of an event \(A\) given that \(B\) has occurred is -\[ -P(A ~|~ B) = \frac{P(A \cap B)}{P(B)} -\]
  • -
  • Notice that if \(A\) and \(B\) are independent, then -\[ -P(A ~|~ B) = \frac{P(A) P(B)}{P(B)} = P(A) -\]
  • -
- -
- -
- - -
-

Example

-
-
-
    -
  • Consider our die roll example
  • -
  • \(B = \{1, 3, 5\}\)
  • -
  • \(A = \{1\}\) -\[ -\begin{eqnarray*} -P(\mbox{one given that roll is odd}) & = & P(A ~|~ B) \\ \\ -& = & \frac{P(A \cap B)}{P(B)} \\ \\ -& = & \frac{P(A)}{P(B)} \\ \\ -& = & \frac{1/6}{3/6} = \frac{1}{3} -\end{eqnarray*} -\]
  • -
- -
- -
- - -
-

Bayes' rule

-
-
-

\[ -P(B ~|~ A) = \frac{P(A ~|~ B) P(B)}{P(A ~|~ B) P(B) + P(A ~|~ B^c)P(B^c)}. -\]

- -
- -
- - -
-

Diagnostic tests

-
-
-
    -
  • Let \(+\) and \(-\) be the events that the result of a diagnostic test is positive or negative respectively
  • -
  • Let \(D\) and \(D^c\) be the event that the subject of the test has or does not have the disease respectively
  • -
  • The sensitivity is the probability that the test is positive given that the subject actually has the disease, \(P(+ ~|~ D)\)
  • -
  • The specificity is the probability that the test is negative given that the subject does not have the disease, \(P(- ~|~ D^c)\)
  • -
- -
- -
- - -
-

More definitions

-
-
-
    -
  • The positive predictive value is the probability that the subject has the disease given that the test is positive, \(P(D ~|~ +)\)
  • -
  • The negative predictive value is the probability that the subject does not have the disease given that the test is negative, \(P(D^c ~|~ -)\)
  • -
  • The prevalence of the disease is the marginal probability of disease, \(P(D)\)
  • -
- -
- -
- - -
-

More definitions

-
-
-
    -
  • The diagnostic likelihood ratio of a positive test, labeled \(DLR_+\), is \(P(+ ~|~ D) / P(+ ~|~ D^c)\), which is the \[sensitivity / (1 - specificity)\]
  • -
  • The diagnostic likelihood ratio of a negative test, labeled \(DLR_-\), is \(P(- ~|~ D) / P(- ~|~ D^c)\), which is the \[(1 - sensitivity) / specificity\]
  • -
- -
- -
- - -
-

Example

-
-
-
    -
  • A study comparing the efficacy of HIV tests, reports on an experiment which concluded that HIV antibody tests have a sensitivity of 99.7% and a specificity of 98.5%
  • -
  • Suppose that a subject, from a population with a .1% prevalence of HIV, receives a positive test result. What is the probability that this subject has HIV?
  • -
  • Mathematically, we want \(P(D ~|~ +)\) given the sensitivity, \(P(+ ~|~ D) = .997\), the specificity, \(P(- ~|~ D^c) =.985\), and the prevalence \(P(D) = .001\)
  • -
- -
- -
- - -
-

Using Bayes' formula

-
-
-

\[ -\begin{eqnarray*} - P(D ~|~ +) & = &\frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)}\\ \\ - & = & \frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + \{1-P(-~|~D^c)\}\{1 - P(D)\}} \\ \\ - & = & \frac{.997\times .001}{.997 \times .001 + .015 \times .999}\\ \\ - & = & .062 -\end{eqnarray*} -\]

- -
    -
  • In this population a positive test result only suggests a 6% probability that the subject has the disease
  • -
  • (The positive predictive value is 6% for this test)
  • -
- -
- -
- - -
-

More on this example

-
-
-
    -
  • The low positive predictive value is due to low prevalence of disease and the somewhat modest specificity
  • -
  • Suppose it was known that the subject was an intravenous drug user and routinely had intercourse with an HIV infected partner
  • -
  • Notice that the evidence implied by a positive test result does not change because of the prevalence of disease in the subject's population, only our interpretation of that evidence changes
  • -
- -
- -
- - -
-

Likelihood ratios

-
-
-
    -
  • Using Bayes rule, we have -\[ -P(D ~|~ +) = \frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)} -\] -and -\[ -P(D^c ~|~ +) = \frac{P(+~|~D^c)P(D^c)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)}. -\]
  • -
- -
- -
- - -
-

Likelihood ratios

-
-
-
    -
  • Therefore -\[ -\frac{P(D ~|~ +)}{P(D^c ~|~ +)} = \frac{P(+~|~D)}{P(+~|~D^c)}\times \frac{P(D)}{P(D^c)} -\] -ie -\[ -\mbox{post-test odds of }D = DLR_+\times\mbox{pre-test odds of }D -\]
  • -
  • Similarly, \(DLR_-\) relates the decrease in the odds of the -disease after a negative test result to the odds of disease prior to -the test.
  • -
- -
- -
- - -
-

HIV example revisited

-
-
-
    -
  • Suppose a subject has a positive HIV test
  • -
  • \(DLR_+ = .997 / (1 - .985) \approx 66\)
  • -
  • The result of the positive test is that the odds of disease is now 66 times the pretest odds
  • -
  • Or, equivalently, the hypothesis of disease is 66 times more supported by the data than the hypothesis of no disease
  • -
- -
- -
- - -
-

HIV example revisited

-
-
-
    -
  • Suppose that a subject has a negative test result
  • -
  • \(DLR_- = (1 - .997) / .985 \approx .003\)
  • -
  • Therefore, the post-test odds of disease is now \(.3\%\) of the pretest odds given the negative test.
  • -
  • Or, the hypothesis of disease is supported \(.003\) times that of the hypothesis of absence of disease given the negative test result
  • -
- -
- -
- - -
- - - - - - - - - - - - - - - - - \ No newline at end of file + + + + Conditional Probability + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Conditional Probability

+

Statistical Inference

+

Brian Caffo, Jeff Leek, Roger Peng
Johns Hopkins Bloomberg School of Public Health

+
+
+
+ + + + +
+

Conditional probability, motivation

+
+
+
    +
  • The probability of getting a one when rolling a (standard) die +is usually assumed to be one sixth
  • +
  • Suppose you were given the extra information that the die roll +was an odd number (hence 1, 3 or 5)
  • +
  • conditional on this new information, the probability of a +one is now one third
  • +
+ +
+ +
+ + +
+

Conditional probability, definition

+
+
+
    +
  • Let \(B\) be an event so that \(P(B) > 0\)
  • +
  • Then the conditional probability of an event \(A\) given that \(B\) has occurred is +\[ +P(A ~|~ B) = \frac{P(A \cap B)}{P(B)} +\]
  • +
  • Notice that if \(A\) and \(B\) are independent, then +\[ +P(A ~|~ B) = \frac{P(A) P(B)}{P(B)} = P(A) +\]
  • +
+ +
+ +
+ + +
+

Example

+
+
+
    +
  • Consider our die roll example
  • +
  • \(B = \{1, 3, 5\}\)
  • +
  • \(A = \{1\}\) +\[ +\begin{eqnarray*} +P(\mbox{one given that roll is odd}) & = & P(A ~|~ B) \\ \\ +& = & \frac{P(A \cap B)}{P(B)} \\ \\ +& = & \frac{P(A)}{P(B)} \\ \\ +& = & \frac{1/6}{3/6} = \frac{1}{3} +\end{eqnarray*} +\]
  • +
+ +
+ +
+ + +
+

Bayes' rule

+
+
+

\[ +P(B ~|~ A) = \frac{P(A ~|~ B) P(B)}{P(A ~|~ B) P(B) + P(A ~|~ B^c)P(B^c)}. +\]

+ +
+ +
+ + +
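Bayes' rule follows directly from the definition of conditional probability above, together with \(P(A) = P(A ~|~ B)P(B) + P(A ~|~ B^c)P(B^c)\):

\[
P(B ~|~ A) = \frac{P(A \cap B)}{P(A)} = \frac{P(A ~|~ B) P(B)}{P(A ~|~ B) P(B) + P(A ~|~ B^c)P(B^c)}
\]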
+

Diagnostic tests

+
+
+
    +
  • Let \(+\) and \(-\) be the events that the result of a diagnostic test is positive or negative respectively
  • +
  • Let \(D\) and \(D^c\) be the events that the subject of the test has or does not have the disease respectively
  • +
  • The sensitivity is the probability that the test is positive given that the subject actually has the disease, \(P(+ ~|~ D)\)
  • +
  • The specificity is the probability that the test is negative given that the subject does not have the disease, \(P(- ~|~ D^c)\)
  • +
+ +
+ +
+ + +
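As a concrete illustration (hypothetical counts, not taken from any study mentioned here), sensitivity and specificity are column-conditional proportions of a 2x2 table of test result by true disease status:

```
# hypothetical counts: rows = test result, columns = true disease status
tab <- matrix(c(95,  20,    # test +: diseased, not diseased
                 5, 880),   # test -: diseased, not diseased
              nrow = 2, byrow = TRUE,
              dimnames = list(test = c("+", "-"), disease = c("D", "Dc")))
sens <- tab["+", "D"]  / sum(tab[, "D"])    # P(+ | D)  = 95/100  = 0.95
spec <- tab["-", "Dc"] / sum(tab[, "Dc"])   # P(- | Dc) = 880/900 ~ 0.98
c(sensitivity = sens, specificity = spec)
```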
+

More definitions

+
+
+
    +
  • The positive predictive value is the probability that the subject has the disease given that the test is positive, \(P(D ~|~ +)\)
  • +
  • The negative predictive value is the probability that the subject does not have the disease given that the test is negative, \(P(D^c ~|~ -)\)
  • +
  • The prevalence of the disease is the marginal probability of disease, \(P(D)\)
  • +
+ +
+ +
+ + +
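Continuing the hypothetical 2x2 table from the previous sketch, the predictive values condition on the test result (rows) instead of the true status (columns):

```
ppv <- tab["+", "D"]  / sum(tab["+", ])   # P(D  | +) = 95/115  ~ 0.83
npv <- tab["-", "Dc"] / sum(tab["-", ])   # P(Dc | -) = 880/885 ~ 0.99
c(PPV = ppv, NPV = npv)
```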
+

More definitions

+
+
+
    +
  • The diagnostic likelihood ratio of a positive test, labeled \(DLR_+\), is \(P(+ ~|~ D) / P(+ ~|~ D^c)\), which is the \[sensitivity / (1 - specificity)\]
  • +
  • The diagnostic likelihood ratio of a negative test, labeled \(DLR_-\), is \(P(- ~|~ D) / P(- ~|~ D^c)\), which is the \[(1 - sensitivity) / specificity\]
  • +
+ +
+ +
+ + +
+

Example

+
+
+
    +
  • A study comparing the efficacy of HIV tests reports on an experiment which concluded that HIV antibody tests have a sensitivity of 99.7% and a specificity of 98.5%
  • +
  • Suppose that a subject, from a population with a .1% prevalence of HIV, receives a positive test result. What is the probability that this subject has HIV?
  • +
  • Mathematically, we want \(P(D ~|~ +)\) given the sensitivity, \(P(+ ~|~ D) = .997\), the specificity, \(P(- ~|~ D^c) =.985\), and the prevalence \(P(D) = .001\)
  • +
+ +
+ +
+ + +
+

Using Bayes' formula

+
+
+

\[ +\begin{eqnarray*} + P(D ~|~ +) & = &\frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)}\\ \\ + & = & \frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + \{1-P(-~|~D^c)\}\{1 - P(D)\}} \\ \\ + & = & \frac{.997\times .001}{.997 \times .001 + .015 \times .999}\\ \\ + & = & .062 +\end{eqnarray*} +\]

+ +
    +
  • In this population a positive test result only suggests a 6% probability that the subject has the disease
  • +
  • (The positive predictive value is 6% for this test)
  • +
+ +
+ +
+ + +
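The arithmetic on this slide is easy to verify in R; this is just a restatement of the formula using the quoted sensitivity, specificity, and prevalence:

```
sens <- 0.997; spec <- 0.985; prev <- 0.001
ppv  <- (sens * prev) / (sens * prev + (1 - spec) * (1 - prev))
round(ppv, 3)   # 0.062, matching the slide
```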
+

More on this example

+
+
+
    +
  • The low positive predictive value is due to low prevalence of disease and the somewhat modest specificity
  • +
  • Suppose it was known that the subject was an intravenous drug user and routinely had intercourse with an HIV infected partner
  • +
  • Notice that the evidence implied by a positive test result does not change because of the prevalence of disease in the subject's population, only our interpretation of that evidence changes
  • +
+ +
+ +
+ + +
+

Likelihood ratios

+
+
+
    +
  • Using Bayes' rule, we have +\[ +P(D ~|~ +) = \frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)} +\] +and +\[ +P(D^c ~|~ +) = \frac{P(+~|~D^c)P(D^c)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)}. +\]
  • +
+ +
+ +
+ + +
+

Likelihood ratios

+
+
+
    +
  • Therefore +\[ +\frac{P(D ~|~ +)}{P(D^c ~|~ +)} = \frac{P(+~|~D)}{P(+~|~D^c)}\times \frac{P(D)}{P(D^c)} +\] +i.e. +\[ +\mbox{post-test odds of }D = DLR_+\times\mbox{pre-test odds of }D +\]
  • +
  • Similarly, \(DLR_-\) relates the decrease in the odds of the +disease after a negative test result to the odds of disease prior to +the test.
  • +
+ +
+ +
+ + +
+

HIV example revisited

+
+
+
    +
  • Suppose a subject has a positive HIV test
  • +
  • \(DLR_+ = .997 / (1 - .985) \approx 66\)
  • +
  • The result of the positive test is that the odds of disease are now 66 times the pretest odds
  • +
  • Or, equivalently, the hypothesis of disease is 66 times more supported by the data than the hypothesis of no disease
  • +
+ +
+ +
+ + +
+

HIV example revisited

+
+
+
    +
  • Suppose that a subject has a negative test result
  • +
  • \(DLR_- = (1 - .997) / .985 \approx .003\)
  • +
  • Therefore, the post-test odds of disease are now \(.3\%\) of the pretest odds given the negative test.
  • +
  • Or, the hypothesis of disease is supported \(.003\) times that of the hypothesis of absence of disease given the negative test result
  • +
+ +
+ +
+ + +
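Both likelihood-ratio results above can be checked the same way; this small sketch also converts the post-test odds back into the positive predictive value from the earlier slide:

```
sens <- 0.997; spec <- 0.985; prev <- 0.001
dlr_pos <- sens / (1 - spec)              # about 66
dlr_neg <- (1 - sens) / spec              # about 0.003
pretest_odds  <- prev / (1 - prev)
posttest_odds <- pretest_odds * dlr_pos
c(dlr_pos, dlr_neg, posttest_odds / (1 + posttest_odds))   # last value ~ 0.062
```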
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/06_StatisticalInference/01_05_ConditionalProbability/index.md b/06_StatisticalInference/01_05_ConditionalProbability/index.md index 7bac9be1d..db2151e51 100644 --- a/06_StatisticalInference/01_05_ConditionalProbability/index.md +++ b/06_StatisticalInference/01_05_ConditionalProbability/index.md @@ -1,168 +1,169 @@ ---- -title : Conditional Probability -subtitle : Statistical Inference -author : Brian Caffo, Jeff Leek, Roger Peng -job : Johns Hopkins Bloomberg School of Public Health -logo : bloomberg_shield.png -framework : io2012 # {io2012, html5slides, shower, dzslides, ...} -highlighter : highlight.js # {highlight.js, prettify, highlight} -hitheme : tomorrow # -url: - lib: ../../libraries - assets: ../../assets -widgets : [mathjax] # {mathjax, quiz, bootstrap} -mode : selfcontained # {standalone, draft} ---- - -## Conditional probability, motivation - -- The probability of getting a one when rolling a (standard) die - is usually assumed to be one sixth -- Suppose you were given the extra information that the die roll - was an odd number (hence 1, 3 or 5) -- *conditional on this new information*, the probability of a - one is now one third - ---- - -## Conditional probability, definition - -- Let $B$ be an event so that $P(B) > 0$ -- Then the conditional probability of an event $A$ given that $B$ has occurred is - $$ - P(A ~|~ B) = \frac{P(A \cap B)}{P(B)} - $$ -- Notice that if $A$ and $B$ are independent, then - $$ - P(A ~|~ B) = \frac{P(A) P(B)}{P(B)} = P(A) - $$ - ---- - -## Example - -- Consider our die roll example -- $B = \{1, 3, 5\}$ -- $A = \{1\}$ -$$ - \begin{eqnarray*} -P(\mbox{one given that roll is odd}) & = & P(A ~|~ B) \\ \\ - & = & \frac{P(A \cap B)}{P(B)} \\ \\ - & = & \frac{P(A)}{P(B)} \\ \\ - & = & \frac{1/6}{3/6} = \frac{1}{3} - \end{eqnarray*} -$$ - - - ---- - -## Bayes' rule - -$$ -P(B ~|~ A) = \frac{P(A ~|~ B) P(B)}{P(A ~|~ B) P(B) + P(A ~|~ B^c)P(B^c)}. -$$ - - ---- - -## Diagnostic tests - -- Let $+$ and $-$ be the events that the result of a diagnostic test is positive or negative respectively -- Let $D$ and $D^c$ be the event that the subject of the test has or does not have the disease respectively -- The **sensitivity** is the probability that the test is positive given that the subject actually has the disease, $P(+ ~|~ D)$ -- The **specificity** is the probability that the test is negative given that the subject does not have the disease, $P(- ~|~ D^c)$ - ---- - -## More definitions - -- The **positive predictive value** is the probability that the subject has the disease given that the test is positive, $P(D ~|~ +)$ -- The **negative predictive value** is the probability that the subject does not have the disease given that the test is negative, $P(D^c ~|~ -)$ -- The **prevalence of the disease** is the marginal probability of disease, $P(D)$ - ---- - -## More definitions - -- The **diagnostic likelihood ratio of a positive test**, labeled $DLR_+$, is $P(+ ~|~ D) / P(+ ~|~ D^c)$, which is the $$sensitivity / (1 - specificity)$$ -- The **diagnostic likelihood ratio of a negative test**, labeled $DLR_-$, is $P(- ~|~ D) / P(- ~|~ D^c)$, which is the $$(1 - sensitivity) / specificity$$ - ---- - -## Example - -- A study comparing the efficacy of HIV tests, reports on an experiment which concluded that HIV antibody tests have a sensitivity of 99.7% and a specificity of 98.5% -- Suppose that a subject, from a population with a .1% prevalence of HIV, receives a positive test result. 
What is the probability that this subject has HIV? -- Mathematically, we want $P(D ~|~ +)$ given the sensitivity, $P(+ ~|~ D) = .997$, the specificity, $P(- ~|~ D^c) =.985$, and the prevalence $P(D) = .001$ - ---- - -## Using Bayes' formula - -$$ -\begin{eqnarray*} - P(D ~|~ +) & = &\frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)}\\ \\ - & = & \frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + \{1-P(-~|~D^c)\}\{1 - P(D)\}} \\ \\ - & = & \frac{.997\times .001}{.997 \times .001 + .015 \times .999}\\ \\ - & = & .062 -\end{eqnarray*} -$$ - -- In this population a positive test result only suggests a 6% probability that the subject has the disease -- (The positive predictive value is 6% for this test) - ---- - -## More on this example - -- The low positive predictive value is due to low prevalence of disease and the somewhat modest specificity -- Suppose it was known that the subject was an intravenous drug user and routinely had intercourse with an HIV infected partner -- Notice that the evidence implied by a positive test result does not change because of the prevalence of disease in the subject's population, only our interpretation of that evidence changes - ---- - -## Likelihood ratios - -- Using Bayes rule, we have - $$ - P(D ~|~ +) = \frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)} - $$ - and - $$ - P(D^c ~|~ +) = \frac{P(+~|~D^c)P(D^c)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)}. - $$ - ---- - -## Likelihood ratios - -- Therefore -$$ -\frac{P(D ~|~ +)}{P(D^c ~|~ +)} = \frac{P(+~|~D)}{P(+~|~D^c)}\times \frac{P(D)}{P(D^c)} -$$ -ie -$$ -\mbox{post-test odds of }D = DLR_+\times\mbox{pre-test odds of }D -$$ -- Similarly, $DLR_-$ relates the decrease in the odds of the - disease after a negative test result to the odds of disease prior to - the test. - ---- - -## HIV example revisited - -- Suppose a subject has a positive HIV test -- $DLR_+ = .997 / (1 - .985) \approx 66$ -- The result of the positive test is that the odds of disease is now 66 times the pretest odds -- Or, equivalently, the hypothesis of disease is 66 times more supported by the data than the hypothesis of no disease - ---- - -## HIV example revisited - -- Suppose that a subject has a negative test result -- $DLR_- = (1 - .997) / .985 \approx .003$ -- Therefore, the post-test odds of disease is now $.3\%$ of the pretest odds given the negative test. 
-- Or, the hypothesis of disease is supported $.003$ times that of the hypothesis of absence of disease given the negative test result +--- +title : Conditional Probability +subtitle : Statistical Inference +author : Brian Caffo, Jeff Leek, Roger Peng +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../librariesNew + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- + +## Conditional probability, motivation + +- The probability of getting a one when rolling a (standard) die + is usually assumed to be one sixth +- Suppose you were given the extra information that the die roll + was an odd number (hence 1, 3 or 5) +- *conditional on this new information*, the probability of a + one is now one third + +--- + +## Conditional probability, definition + +- Let $B$ be an event so that $P(B) > 0$ +- Then the conditional probability of an event $A$ given that $B$ has occurred is + $$ + P(A ~|~ B) = \frac{P(A \cap B)}{P(B)} + $$ +- Notice that if $A$ and $B$ are independent, then + $$ + P(A ~|~ B) = \frac{P(A) P(B)}{P(B)} = P(A) + $$ + +--- + +## Example + +- Consider our die roll example +- $B = \{1, 3, 5\}$ +- $A = \{1\}$ +$$ + \begin{eqnarray*} +P(\mbox{one given that roll is odd}) & = & P(A ~|~ B) \\ \\ + & = & \frac{P(A \cap B)}{P(B)} \\ \\ + & = & \frac{P(A)}{P(B)} \\ \\ + & = & \frac{1/6}{3/6} = \frac{1}{3} + \end{eqnarray*} +$$ + + + +--- + +## Bayes' rule + +$$ +P(B ~|~ A) = \frac{P(A ~|~ B) P(B)}{P(A ~|~ B) P(B) + P(A ~|~ B^c)P(B^c)}. +$$ + + +--- + +## Diagnostic tests + +- Let $+$ and $-$ be the events that the result of a diagnostic test is positive or negative respectively +- Let $D$ and $D^c$ be the event that the subject of the test has or does not have the disease respectively +- The **sensitivity** is the probability that the test is positive given that the subject actually has the disease, $P(+ ~|~ D)$ +- The **specificity** is the probability that the test is negative given that the subject does not have the disease, $P(- ~|~ D^c)$ + +--- + +## More definitions + +- The **positive predictive value** is the probability that the subject has the disease given that the test is positive, $P(D ~|~ +)$ +- The **negative predictive value** is the probability that the subject does not have the disease given that the test is negative, $P(D^c ~|~ -)$ +- The **prevalence of the disease** is the marginal probability of disease, $P(D)$ + +--- + +## More definitions + +- The **diagnostic likelihood ratio of a positive test**, labeled $DLR_+$, is $P(+ ~|~ D) / P(+ ~|~ D^c)$, which is the $$sensitivity / (1 - specificity)$$ +- The **diagnostic likelihood ratio of a negative test**, labeled $DLR_-$, is $P(- ~|~ D) / P(- ~|~ D^c)$, which is the $$(1 - sensitivity) / specificity$$ + +--- + +## Example + +- A study comparing the efficacy of HIV tests, reports on an experiment which concluded that HIV antibody tests have a sensitivity of 99.7% and a specificity of 98.5% +- Suppose that a subject, from a population with a .1% prevalence of HIV, receives a positive test result. What is the probability that this subject has HIV? 
+- Mathematically, we want $P(D ~|~ +)$ given the sensitivity, $P(+ ~|~ D) = .997$, the specificity, $P(- ~|~ D^c) =.985$, and the prevalence $P(D) = .001$ + +--- + +## Using Bayes' formula + +$$ +\begin{eqnarray*} + P(D ~|~ +) & = &\frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)}\\ \\ + & = & \frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + \{1-P(-~|~D^c)\}\{1 - P(D)\}} \\ \\ + & = & \frac{.997\times .001}{.997 \times .001 + .015 \times .999}\\ \\ + & = & .062 +\end{eqnarray*} +$$ + +- In this population a positive test result only suggests a 6% probability that the subject has the disease +- (The positive predictive value is 6% for this test) + +--- + +## More on this example + +- The low positive predictive value is due to low prevalence of disease and the somewhat modest specificity +- Suppose it was known that the subject was an intravenous drug user and routinely had intercourse with an HIV infected partner +- Notice that the evidence implied by a positive test result does not change because of the prevalence of disease in the subject's population, only our interpretation of that evidence changes + +--- + +## Likelihood ratios + +- Using Bayes rule, we have + $$ + P(D ~|~ +) = \frac{P(+~|~D)P(D)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)} + $$ + and + $$ + P(D^c ~|~ +) = \frac{P(+~|~D^c)P(D^c)}{P(+~|~D)P(D) + P(+~|~D^c)P(D^c)}. + $$ + +--- + +## Likelihood ratios + +- Therefore +$$ +\frac{P(D ~|~ +)}{P(D^c ~|~ +)} = \frac{P(+~|~D)}{P(+~|~D^c)}\times \frac{P(D)}{P(D^c)} +$$ +ie +$$ +\mbox{post-test odds of }D = DLR_+\times\mbox{pre-test odds of }D +$$ +- Similarly, $DLR_-$ relates the decrease in the odds of the + disease after a negative test result to the odds of disease prior to + the test. + +--- + +## HIV example revisited + +- Suppose a subject has a positive HIV test +- $DLR_+ = .997 / (1 - .985) \approx 66$ +- The result of the positive test is that the odds of disease is now 66 times the pretest odds +- Or, equivalently, the hypothesis of disease is 66 times more supported by the data than the hypothesis of no disease + +--- + +## HIV example revisited + +- Suppose that a subject has a negative test result +- $DLR_- = (1 - .997) / .985 \approx .003$ +- Therefore, the post-test odds of disease is now $.3\%$ of the pretest odds given the negative test. +- Or, the hypothesis of disease is supported $.003$ times that of the hypothesis of absence of disease given the negative test result + diff --git a/07_RegressionModels/03_02_binaryOutcomes/index.Rmd b/07_RegressionModels/03_02_binaryOutcomes/index.Rmd index fafaa05a7..6945c3e54 100644 --- a/07_RegressionModels/03_02_binaryOutcomes/index.Rmd +++ b/07_RegressionModels/03_02_binaryOutcomes/index.Rmd @@ -143,16 +143,17 @@ $\exp(b_1)$ - Odds ratio of win probability for each point scored (compared to z - (If $p < 0.5$ you have to pay less if you lose than you get if you win.) 
--- - ---- -```{r, echo = TRUE, fig.height=5, fig.width=5} -x <- seq(-10, 10, length = 1000); p <- exp(beta0 + beta1 * x) / (1 + exp(beta0 + beta1 * x)) +## Visualizing fitting logistic regression curves +``` +x <- seq(-10, 10, length = 1000) manipulate( plot(x, exp(beta0 + beta1 * x) / (1 + exp(beta0 + beta1 * x)), type = "l", lwd = 3, frame = FALSE), beta1 = slider(-2, 2, step = .1, initial = 2), beta0 = slider(-2, 2, step = .1, initial = 0) ) +``` + --- ## Ravens logistic regression diff --git a/07_RegressionModels/03_02_binaryOutcomes/index.html b/07_RegressionModels/03_02_binaryOutcomes/index.html index 05a0fb619..d1cffc5c9 100644 --- a/07_RegressionModels/03_02_binaryOutcomes/index.html +++ b/07_RegressionModels/03_02_binaryOutcomes/index.html @@ -222,6 +222,24 @@

Odds

+
+

Visualizing fitting logistic regression curves

+
+
+
library(manipulate)  # needed for manipulate() and slider(); run interactively in RStudio
+x <- seq(-10, 10, length = 1000)
+manipulate(
+    plot(x, exp(beta0 + beta1 * x) / (1 + exp(beta0 + beta1 * x)), 
+         type = "l", lwd = 3, frame = FALSE),
+    beta1 = slider(-2, 2, step = .1, initial = 2),
+    beta0 = slider(-2, 2, step = .1, initial = 0)
+    )
+
+ +
+ +
+ +
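If the `manipulate` package is not available (it only runs inside RStudio), a static version of the same curve can be drawn with base graphics; the `beta0` and `beta1` values below are just illustrative choices:

```
x <- seq(-10, 10, length = 1000)
beta0 <- 0; beta1 <- 2                      # example intercept and slope
plot(x, exp(beta0 + beta1 * x) / (1 + exp(beta0 + beta1 * x)),
     type = "l", lwd = 3, frame = FALSE, ylab = "probability")
```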

Ravens logistic regression

@@ -257,7 +275,7 @@

Ravens logistic regression

- +

Ravens fitted values

@@ -271,7 +289,7 @@

Ravens fitted values

- +

Odds ratios and confidence intervals

@@ -295,7 +313,7 @@

Odds ratios and confidence intervals

- +

ANOVA for logistic regression

@@ -322,19 +340,6 @@

ANOVA for logistic regression

- -
-

Simpson's paradox

-
- - -
-

Interpreting Odds Ratios

@@ -425,31 +430,31 @@

Further resources

  • + data-slide=9 title='Visualizing fitting logistic regression curves'> 9
  • + data-slide=10 title='Ravens logistic regression'> 10
  • + data-slide=11 title='Ravens fitted values'> 11
  • + data-slide=12 title='Odds ratios and confidence intervals'> 12
  • + data-slide=13 title='ANOVA for logistic regression'> 13
  • diff --git a/07_RegressionModels/03_02_binaryOutcomes/index.md b/07_RegressionModels/03_02_binaryOutcomes/index.md index b5f6901c3..47c955edd 100644 --- a/07_RegressionModels/03_02_binaryOutcomes/index.md +++ b/07_RegressionModels/03_02_binaryOutcomes/index.md @@ -149,6 +149,18 @@ $\exp(b_1)$ - Odds ratio of win probability for each point scored (compared to z - (If $p > 0.5$ you have to pay more if you lose than you get if you win.) - (If $p < 0.5$ you have to pay less if you lose than you get if you win.) +--- +## Visualizing fitting logistic regression curves +``` +x <- seq(-10, 10, length = 1000) +manipulate( + plot(x, exp(beta0 + beta1 * x) / (1 + exp(beta0 + beta1 * x)), + type = "l", lwd = 3, frame = FALSE), + beta1 = slider(-2, 2, step = .1, initial = 2), + beta0 = slider(-2, 2, step = .1, initial = 0) + ) +``` + --- ## Ravens logistic regression @@ -252,14 +264,6 @@ Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ---- - -## Simpson's paradox - - - -[http://en.wikipedia.org/wiki/Simpson's_paradox](http://en.wikipedia.org/wiki/Simpson's_paradox) - --- ## Interpreting Odds Ratios diff --git a/07_RegressionModels/03_03_countOutcomes/fig/linReg.png b/07_RegressionModels/03_03_countOutcomes/fig/linReg.png index c7f4c9c0f..88847e3ab 100644 Binary files a/07_RegressionModels/03_03_countOutcomes/fig/linReg.png and b/07_RegressionModels/03_03_countOutcomes/fig/linReg.png differ diff --git a/07_RegressionModels/03_03_countOutcomes/fig/poisReg.png b/07_RegressionModels/03_03_countOutcomes/fig/poisReg.png index ee3e1a3ae..12ed8ac0a 100644 Binary files a/07_RegressionModels/03_03_countOutcomes/fig/poisReg.png and b/07_RegressionModels/03_03_countOutcomes/fig/poisReg.png differ diff --git a/07_RegressionModels/03_03_countOutcomes/fig/ratesFit.png b/07_RegressionModels/03_03_countOutcomes/fig/ratesFit.png index ea78f63df..6bfd8f674 100644 Binary files a/07_RegressionModels/03_03_countOutcomes/fig/ratesFit.png and b/07_RegressionModels/03_03_countOutcomes/fig/ratesFit.png differ diff --git a/07_RegressionModels/03_03_countOutcomes/fig/simPois.png b/07_RegressionModels/03_03_countOutcomes/fig/simPois.png index ae05d3616..530485e01 100644 Binary files a/07_RegressionModels/03_03_countOutcomes/fig/simPois.png and b/07_RegressionModels/03_03_countOutcomes/fig/simPois.png differ diff --git a/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-2.png b/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-2.png index 67bb3d3ed..6f3c19700 100644 Binary files a/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-2.png and b/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-2.png differ diff --git a/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-3.png b/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-3.png index 119f0049d..6f2ff8d43 100644 Binary files a/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-3.png and b/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-3.png differ diff --git a/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-4.png b/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-4.png index 119f0049d..2e4ae2ae3 100644 Binary files a/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-4.png and b/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-4.png differ diff --git a/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-5.png b/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-5.png index 275bc0b39..f5237fcc3 100644 Binary files 
a/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-5.png and b/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-5.png differ diff --git a/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-6.png b/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-6.png index 5e9051d62..f5237fcc3 100644 Binary files a/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-6.png and b/07_RegressionModels/03_03_countOutcomes/fig/unnamed-chunk-6.png differ diff --git a/07_RegressionModels/03_03_countOutcomes/index.Rmd b/07_RegressionModels/03_03_countOutcomes/index.Rmd index f1fad64c1..15c5bf301 100644 --- a/07_RegressionModels/03_03_countOutcomes/index.Rmd +++ b/07_RegressionModels/03_03_countOutcomes/index.Rmd @@ -1,262 +1,316 @@ ---- -title : Count outcomes -subtitle : -author : Jeffrey Leek, Assistant Professor of Biostatistics -job : Johns Hopkins Bloomberg School of Public Health -logo : bloomberg_shield.png -framework : io2012 # {io2012, html5slides, shower, dzslides, ...} -highlighter : highlight.js # {highlight.js, prettify, highlight} -hitheme : tomorrow # -url: - lib: ../../libraries - assets: ../../assets -widgets : [mathjax] # {mathjax, quiz, bootstrap} -mode : selfcontained # {standalone, draft} ---- - - -```{r setup, cache = F, echo = F, message = F, warning = F, tidy = F} -# make this an external chunk that can be included in any file -options(width = 100) -opts_chunk$set(message = F, error = F, warning = F, comment = NA, fig.align = 'center', dpi = 100, tidy = F, cache.path = '.cache/', fig.path = 'fig/') - -options(xtable.type = 'html') -knit_hooks$set(inline = function(x) { - if(is.numeric(x)) { - round(x, getOption('digits')) - } else { - paste(as.character(x), collapse = ', ') - } -}) -knit_hooks$set(plot = knitr:::hook_plot_html) -``` - -## Key ideas - -* Many data take the form of counts - * Calls to a call center - * Number of flu cases in an area - * Number of cars that cross a bridge -* Data may also be in the form of rates - * Percent of children passing a test - * Percent of hits to a website from a country -* Linear regression with transformation is an option - ---- - -## Poisson distribution - -```{r simPois,fig.height=4,fig.width=8, cache=TRUE} -set.seed(3433); par(mfrow=c(1,2)) -poisData2 <- rpois(100,lambda=100); poisData1 <- rpois(100,lambda=50) -hist(poisData1,col="blue",xlim=c(0,150)); hist(poisData2,col="blue",xlim=c(0,150)) -``` - ---- - -## Poisson distribution - - -```{r dependson="simPois"} -c(mean(poisData1),var(poisData1)) -c(mean(poisData2),var(poisData2)) -``` - ---- - -## Example: Leek Group Website Traffic - - - -[http://biostat.jhsph.edu/~jleek/](http://biostat.jhsph.edu/~jleek/) - ---- - -## Website data - -```{r leekLoad,cache=TRUE} -download.file("https://dl.dropboxusercontent.com/u/7710864/data/gaData.rda",destfile="./data/gaData.rda",method="curl") -load("./data/gaData.rda") -gaData$julian <- julian(gaData$date) -head(gaData) -``` - -[http://skardhamar.github.com/rga/](http://skardhamar.github.com/rga/) - - ---- - -## Plot data - -```{r, dependson="leekLoad",fig.height=4.5,fig.width=4.5} -plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits") -``` - - ---- - -## Linear regression - -$$ NH_i = b_0 + b_1 JD_i + e_i $$ - -$NH_i$ - number of hits to the website - -$JD_i$ - day of the year (Julian day) - -$b_0$ - number of hits on Julian day 0 (1970-01-01) - -$b_1$ - increase in number of hits per unit day - -$e_i$ - variation due to everything we didn't measure - - ---- - -## Linear regression line 
- -```{r linReg, dependson="leekLoad",fig.height=4,fig.width=4, cache=TRUE} -plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits") -lm1 <- lm(gaData$visits ~ gaData$julian) -abline(lm1,col="red",lwd=3) -``` - - ---- - -## Linear vs. Poisson regression - -__Linear__ - -$$ NH_i = b_0 + b_1 JD_i + e_i $$ - -or - -$$ E[NH_i | JD_i, b_0, b_1] = b_0 + b_1 JD_i$$ - -__Poisson/log-linear__ - -$$ \log\left(E[NH_i | JD_i, b_0, b_1]\right) = b_0 + b_1 JD_i $$ - -or - -$$ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 + b_1 JD_i\right) $$ - - ---- - -## Multiplicative differences - -

    -$$ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 + b_1 JD_i\right) $$ - -

    - -$$ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 \right)\exp\left(b_1 JD_i\right) $$ - -

    - -If $JD_i$ is increased by one unit, $E[NH_i | JD_i, b_0, b_1]$ is multiplied by $\exp\left(b_1\right)$ - ---- - -## Poisson regression in R - -```{r poisReg, dependson="linReg",fig.height=4.5,fig.width=4.5, cache=TRUE} -plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits") -glm1 <- glm(gaData$visits ~ gaData$julian,family="poisson") -abline(lm1,col="red",lwd=3); lines(gaData$julian,glm1$fitted,col="blue",lwd=3) -``` - - ---- - -## Mean-variance relationship? - -```{r, dependson="poisReg",fig.height=4.5,fig.width=4.5} -plot(glm1$fitted,glm1$residuals,pch=19,col="grey",ylab="Residuals",xlab="Date") -``` - ---- - -## Model agnostic standard errors - -```{r agnostic, dependson="poisReg",fig.height=4.5,fig.width=4.5,cache=TRUE} -library(sandwich) -confint.agnostic <- function (object, parm, level = 0.95, ...) -{ - cf <- coef(object); pnames <- names(cf) - if (missing(parm)) - parm <- pnames - else if (is.numeric(parm)) - parm <- pnames[parm] - a <- (1 - level)/2; a <- c(a, 1 - a) - pct <- stats:::format.perc(a, 3) - fac <- qnorm(a) - ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, - pct)) - ses <- sqrt(diag(sandwich::vcovHC(object)))[parm] - ci[] <- cf[parm] + ses %o% fac - ci -} -``` -[http://stackoverflow.com/questions/3817182/vcovhc-and-confidence-interval](http://stackoverflow.com/questions/3817182/vcovhc-and-confidence-interval) - ---- - -## Estimating confidence intervals - -```{r, dependson="agnostic",fig.height=4.5,fig.width=4.5} -confint(glm1) -confint.agnostic(glm1) -``` - - ---- - -## Rates - - -

    - - -$$ E[NHSS_i | JD_i, b_0, b_1]/NH_i = \exp\left(b_0 + b_1 JD_i\right) $$ - -

    - -$$ \log\left(E[NHSS_i | JD_i, b_0, b_1]\right) - \log(NH_i) = b_0 + b_1 JD_i $$ - -

    - -$$ \log\left(E[NHSS_i | JD_i, b_0, b_1]\right) = \log(NH_i) + b_0 + b_1 JD_i $$ - ---- - -## Fitting rates in R - -```{r ratesFit,dependson="agnostic", cache=TRUE,fig.height=4,fig.width=4} -glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1), - family="poisson",data=gaData) -plot(julian(gaData$date),glm2$fitted,col="blue",pch=19,xlab="Date",ylab="Fitted Counts") -points(julian(gaData$date),glm1$fitted,col="red",pch=19) -``` - ---- - -## Fitting rates in R - -```{r,dependson="ratesFit",fig.height=4,fig.width=4} -glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1), - family="poisson",data=gaData) -plot(julian(gaData$date),gaData$simplystats/(gaData$visits+1),col="grey",xlab="Date", - ylab="Fitted Rates",pch=19) -lines(julian(gaData$date),glm2$fitted/(gaData$visits+1),col="blue",lwd=3) -``` - ---- - -## More information - -* [Log-linear models and multiway tables](http://ww2.coastal.edu/kingw/statistics/R-tutorials/loglin.html) -* [Wikipedia on Poisson regression](http://en.wikipedia.org/wiki/Poisson_regression), [Wikipedia on overdispersion](http://en.wikipedia.org/wiki/Overdispersion) -* [Regression models for count data in R](http://cran.r-project.org/web/packages/pscl/vignettes/countreg.pdf) -* [pscl package](http://cran.r-project.org/web/packages/pscl/index.html) - the function _zeroinfl_ fits zero inflated models. +--- +title : Count outcomes, Poisson GLMs +subtitle : Regression Models +author : Brian Caffo, Jeffrey Leek, Roger Peng +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../librariesNew + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- + + +```{r setup, cache = F, echo = F, message = F, warning = F, tidy = F} +# make this an external chunk that can be included in any file +options(width = 100) +opts_chunk$set(message = F, error = F, warning = F, comment = NA, fig.align = 'center', dpi = 100, tidy = F, cache.path = '.cache/', fig.path = 'fig/') + +options(xtable.type = 'html') +knit_hooks$set(inline = function(x) { + if(is.numeric(x)) { + round(x, getOption('digits')) + } else { + paste(as.character(x), collapse = ', ') + } +}) +knit_hooks$set(plot = knitr:::hook_plot_html) +``` + +## Key ideas + +* Many data take the form of counts + * Calls to a call center + * Number of flu cases in an area + * Number of cars that cross a bridge +* Data may also be in the form of rates + * Percent of children passing a test + * Percent of hits to a website from a country +* Linear regression with transformation is an option + +--- + +## Poisson distribution +- The Poisson distribution is a useful model for counts and rates +- Here a rate is count per some monitoring time +- Some examples uses of the Poisson distribution + - Modeling web traffic hits + - Incidence rates + - Approximating binomial probabilities with small $p$ and large $n$ + - Analyzing contigency table data + +--- +## The Poisson mass function +- $X \sim Poisson(t\lambda)$ if +$$ +P(X = x) = \frac{(t\lambda)^x e^{-t\lambda}}{x!} +$$ +For $x = 0, 1, \ldots$. +- The mean of the Poisson is $E[X] = t\lambda$, thus $E[X / t] = \lambda$ +- The variance of the Poisson is $Var(X) = t\lambda$. +- The Poisson tends to a normal as $t\lambda$ gets large. 
+ +--- + +```{r simPois,fig.height=4,fig.width=8, cache=TRUE} +par(mfrow = c(1, 3)) +plot(0 : 10, dpois(0 : 10, lambda = 2), type = "h", frame = FALSE) +plot(0 : 20, dpois(0 : 20, lambda = 10), type = "h", frame = FALSE) +plot(0 : 200, dpois(0 : 200, lambda = 100), type = "h", frame = FALSE) +``` + +--- + +## Poisson distribution +### Sort of, showing that the mean and variance are equal +```{r} +x <- 0 : 10000; lambda = 3 +mu <- sum(x * dpois(x, lambda = lambda)) +sigmasq <- sum((x - mu)^2 * dpois(x, lambda = lambda)) +c(mu, sigmasq) +``` + +--- + +## Example: Leek Group Website Traffic +* Consider the daily counts to Jeff Leek's web site + +[http://biostat.jhsph.edu/~jleek/](http://biostat.jhsph.edu/~jleek/) + +* Since the unit of time is always one day, set $t = 1$ and then +the Poisson mean is interpretted as web hits per day. (If we set $t = 24$, it would +be web hits per hour). + +--- + +## Website data + +```{r leekLoad,cache=TRUE} +download.file("https://dl.dropboxusercontent.com/u/7710864/data/gaData.rda",destfile="./data/gaData.rda",method="curl") +load("./data/gaData.rda") +gaData$julian <- julian(gaData$date) +head(gaData) +``` + +[http://skardhamar.github.com/rga/](http://skardhamar.github.com/rga/) + + +--- + +## Plot data + +```{r, dependson="leekLoad",fig.height=4.5,fig.width=4.5} +plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits") +``` + + +--- + +## Linear regression + +$$ NH_i = b_0 + b_1 JD_i + e_i $$ + +$NH_i$ - number of hits to the website + +$JD_i$ - day of the year (Julian day) + +$b_0$ - number of hits on Julian day 0 (1970-01-01) + +$b_1$ - increase in number of hits per unit day + +$e_i$ - variation due to everything we didn't measure + + +--- + +## Linear regression line + +```{r linReg, dependson="leekLoad",fig.height=4,fig.width=4, cache=TRUE} +plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits") +lm1 <- lm(gaData$visits ~ gaData$julian) +abline(lm1,col="red",lwd=3) +``` +--- + +## Aside, taking the log of the outcome +- Taking the natural log of the outcome has a specific interpretation. +- Consider the model + +$$ \log(NH_i) = b_0 + b_1 JD_i + e_i $$ + +$NH_i$ - number of hits to the website + +$JD_i$ - day of the year (Julian day) + +$b_0$ - log number of hits on Julian day 0 (1970-01-01) + +$b_1$ - increase in log number of hits per unit day + +$e_i$ - variation due to everything we didn't measure + +--- +## Exponentiating coefficients +- $e^{E[\log(Y)]}$ geometric mean of $Y$. + - With no covariates, this is estimated by $e^{\frac{1}{n}\sum_{i=1}^n \log(y_i)} = (\prod_{i=1}^n y_i)^{1/n}$ +- When you take the natural log of outcomes and fit a regression model, your exponentiated coefficients +estimate things about geometric means. +- $e^{\beta_0}$ estimated geometric mean hits on day 0 +- $e^{\beta_1}$ estimated relative increase or decrease in geometric mean hits per day +- There's a problem with logs with you have zero counts, adding a constant works +```{r} +round(exp(coef(lm(I(log(gaData$visits + 1)) ~ gaData$julian))), 5) +``` + +--- + +## Linear vs. Poisson regression + +__Linear__ + +$$ NH_i = b_0 + b_1 JD_i + e_i $$ + +or + +$$ E[NH_i | JD_i, b_0, b_1] = b_0 + b_1 JD_i$$ + +__Poisson/log-linear__ + +$$ \log\left(E[NH_i | JD_i, b_0, b_1]\right) = b_0 + b_1 JD_i $$ + +or + +$$ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 + b_1 JD_i\right) $$ + + +--- + +## Multiplicative differences + +

    +$$ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 + b_1 JD_i\right) $$ + +

    + +$$ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 \right)\exp\left(b_1 JD_i\right) $$ + +

    + +If $JD_i$ is increased by one unit, $E[NH_i | JD_i, b_0, b_1]$ is multiplied by $\exp\left(b_1\right)$ + +--- + +## Poisson regression in R + +```{r poisReg, dependson="linReg",fig.height=4.5,fig.width=4.5, cache=TRUE} +plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits") +glm1 <- glm(gaData$visits ~ gaData$julian,family="poisson") +abline(lm1,col="red",lwd=3); lines(gaData$julian,glm1$fitted,col="blue",lwd=3) +``` + + +--- + +## Mean-variance relationship? + +```{r, dependson="poisReg",fig.height=4.5,fig.width=4.5} +plot(glm1$fitted,glm1$residuals,pch=19,col="grey",ylab="Residuals",xlab="Fitted") +``` + +--- + +## Model agnostic standard errors + +```{r agnostic} +library(sandwich) +confint.agnostic <- function (object, parm, level = 0.95, ...) +{ + cf <- coef(object); pnames <- names(cf) + if (missing(parm)) + parm <- pnames + else if (is.numeric(parm)) + parm <- pnames[parm] + a <- (1 - level)/2; a <- c(a, 1 - a) + pct <- stats:::format.perc(a, 3) + fac <- qnorm(a) + ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, + pct)) + ses <- sqrt(diag(sandwich::vcovHC(object)))[parm] + ci[] <- cf[parm] + ses %o% fac + ci +} +``` +[http://stackoverflow.com/questions/3817182/vcovhc-and-confidence-interval](http://stackoverflow.com/questions/3817182/vcovhc-and-confidence-interval) + +--- + +## Estimating confidence intervals + +```{r} +confint(glm1) +confint.agnostic(glm1) +``` + + +--- + +## Rates + + +

    + + +$$ E[NHSS_i | JD_i, b_0, b_1]/NH_i = \exp\left(b_0 + b_1 JD_i\right) $$ + +

    + +$$ \log\left(E[NHSS_i | JD_i, b_0, b_1]\right) - \log(NH_i) = b_0 + b_1 JD_i $$ + +

    + +$$ \log\left(E[NHSS_i | JD_i, b_0, b_1]\right) = \log(NH_i) + b_0 + b_1 JD_i $$ + +--- + +## Fitting rates in R + +```{r ratesFit,dependson="agnostic", cache=TRUE,fig.height=4,fig.width=4} +glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1), + family="poisson",data=gaData) +plot(julian(gaData$date),glm2$fitted,col="blue",pch=19,xlab="Date",ylab="Fitted Counts") +points(julian(gaData$date),glm1$fitted,col="red",pch=19) +``` + +--- + +## Fitting rates in R + +```{r,dependson="ratesFit",fig.height=4,fig.width=4} +glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1), + family="poisson",data=gaData) +plot(julian(gaData$date),gaData$simplystats/(gaData$visits+1),col="grey",xlab="Date", + ylab="Fitted Rates",pch=19) +lines(julian(gaData$date),glm2$fitted/(gaData$visits+1),col="blue",lwd=3) +``` + +--- + +## More information + +* [Log-linear models and multiway tables](http://ww2.coastal.edu/kingw/statistics/R-tutorials/loglin.html) +* [Wikipedia on Poisson regression](http://en.wikipedia.org/wiki/Poisson_regression), [Wikipedia on overdispersion](http://en.wikipedia.org/wiki/Overdispersion) +* [Regression models for count data in R](http://cran.r-project.org/web/packages/pscl/vignettes/countreg.pdf) +* [pscl package](http://cran.r-project.org/web/packages/pscl/index.html) - the function _zeroinfl_ fits zero inflated models. diff --git a/07_RegressionModels/03_03_countOutcomes/index.html b/07_RegressionModels/03_03_countOutcomes/index.html index a2ddb498a..f01f2f899 100644 --- a/07_RegressionModels/03_03_countOutcomes/index.html +++ b/07_RegressionModels/03_03_countOutcomes/index.html @@ -1,428 +1,668 @@ - - - - Count outcomes - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -

    Count outcomes

    -

    -

    Jeffrey Leek, Assistant Professor of Biostatistics
    Johns Hopkins Bloomberg School of Public Health

    -
    -
    - - - -
    -

    Key ideas

    -
    -
    -
      -
    • Many data take the form of counts - -
        -
      • Calls to a call center
      • -
      • Number of flu cases in an area
      • -
      • Number of cars that cross a bridge
      • -
    • -
    • Data may also be in the form of rates - -
        -
      • Percent of children passing a test
      • -
      • Percent of hits to a website from a country
      • -
    • -
    • Linear regression with transformation is an option
    • -
    - -
    - -
    - - -
    -

    Poisson distribution

    -
    -
    -
    set.seed(3433); par(mfrow=c(1,2))
    -poisData2 <- rpois(100,lambda=100); poisData1 <- rpois(100,lambda=50)
    -hist(poisData1,col="blue",xlim=c(0,150)); hist(poisData2,col="blue",xlim=c(0,150))
    -
    - -
    plot of chunk simPois
    - -
    - -
    - - -
    -

    Poisson distribution

    -
    -
    -
    c(mean(poisData1),var(poisData1))
    -
    - -
    [1] 49.85 49.38
    -
    - -
    c(mean(poisData2),var(poisData2))
    -
    - -
    [1] 100.12  95.26
    -
    - -
    - -
    - - -
    -

    Example: Leek Group Website Traffic

    -
    - - -
    - - -
    -

    Website data

    -
    -
    -
    download.file("https://dl.dropboxusercontent.com/u/7710864/data/gaData.rda",destfile="./data/gaData.rda",method="curl")
    -load("./data/gaData.rda")
    -gaData$julian <- julian(gaData$date)
    -head(gaData)
    -
    - -
            date visits simplystats julian
    -1 2011-01-01      0           0  14975
    -2 2011-01-02      0           0  14976
    -3 2011-01-03      0           0  14977
    -4 2011-01-04      0           0  14978
    -5 2011-01-05      0           0  14979
    -6 2011-01-06      0           0  14980
    -
    - -

    http://skardhamar.github.com/rga/

    - -
    - -
    - - -
    -

    Plot data

    -
    -
    -
    plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits")
    -
    - -
    plot of chunk unnamed-chunk-2
    - -
    - -
    - - -
    -

    Linear regression

    -
    -
    -

    \[ NH_i = b_0 + b_1 JD_i + e_i \]

    - -

    \(NH_i\) - number of hits to the website

    - -

    \(JD_i\) - day of the year (Julian day)

    - -

    \(b_0\) - number of hits on Julian day 0 (1970-01-01)

    - -

    \(b_1\) - increase in number of hits per unit day

    - -

    \(e_i\) - variation due to everything we didn't measure

    - -
    - -
    - - -
    -

    Linear regression line

    -
    -
    -
    plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits")
    -lm1 <- lm(gaData$visits ~ gaData$julian)
    -abline(lm1,col="red",lwd=3)
    -
    - -
    plot of chunk linReg
    - -
    - -
    - - -
    -

    Linear vs. Poisson regression

    -
    -
    -

    Linear

    - -

    \[ NH_i = b_0 + b_1 JD_i + e_i \]

    - -

    or

    - -

    \[ E[NH_i | JD_i, b_0, b_1] = b_0 + b_1 JD_i\]

    - -

    Poisson/log-linear

    - -

    \[ \log\left(E[NH_i | JD_i, b_0, b_1]\right) = b_0 + b_1 JD_i \]

    - -

    or

    - -

    \[ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 + b_1 JD_i\right) \]

    - -
    - -
    - - -
    -

    Multiplicative differences

    -
    -
    -



    -\[ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 + b_1 JD_i\right) \]

    - -



    - -

    \[ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 \right)\exp\left(b_1 JD_i\right) \]

    - -



    - -

    If \(JD_i\) is increased by one unit, \(E[NH_i | JD_i, b_0, b_1]\) is multiplied by \(\exp\left(b_1\right)\)

    - -
    - -
    - - -
    -

    Poisson regression in R

    -
    -
    -
    plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits")
    -glm1 <- glm(gaData$visits ~ gaData$julian,family="poisson")
    -abline(lm1,col="red",lwd=3); lines(gaData$julian,glm1$fitted,col="blue",lwd=3)
    -
    - -
    plot of chunk poisReg
    - -
    - -
    - - -
    -

    Mean-variance relationship?

    -
    -
    -
    plot(glm1$fitted,glm1$residuals,pch=19,col="grey",ylab="Residuals",xlab="Date")
    -
    - -
    plot of chunk unnamed-chunk-3
    - -
    - -
    - - -
    -

    Model agnostic standard errors

    -
    -
    -
    library(sandwich)
    -confint.agnostic <- function (object, parm, level = 0.95, ...)
    -{
    -    cf <- coef(object); pnames <- names(cf)
    -    if (missing(parm))
    -        parm <- pnames
    -    else if (is.numeric(parm))
    -        parm <- pnames[parm]
    -    a <- (1 - level)/2; a <- c(a, 1 - a)
    -    pct <- stats:::format.perc(a, 3)
    -    fac <- qnorm(a)
    -    ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm,
    -                                                               pct))
    -    ses <- sqrt(diag(sandwich::vcovHC(object)))[parm]
    -    ci[] <- cf[parm] + ses %o% fac
    -    ci
    -}
    -
    - -

    http://stackoverflow.com/questions/3817182/vcovhc-and-confidence-interval

    - -
    - -
    - - -
    -

    Estimating confidence intervals

    -
    -
    -
    confint(glm1)
    -
    - -
                      2.5 %     97.5 %
    -(Intercept)   -34.34658 -31.159716
    -gaData$julian   0.00219   0.002396
    -
    - -
    confint.agnostic(glm1)
    -
    - -
    - -
    - - -
    -

    Rates

    -
    -
    -



    - -

    \[ E[NHSS_i | JD_i, b_0, b_1]/NH_i = \exp\left(b_0 + b_1 JD_i\right) \]

    - -



    - -

    \[ \log\left(E[NHSS_i | JD_i, b_0, b_1]\right) - \log(NH_i) = b_0 + b_1 JD_i \]

    - -



    - -

    \[ \log\left(E[NHSS_i | JD_i, b_0, b_1]\right) = \log(NH_i) + b_0 + b_1 JD_i \]

    - -
    - -
    - - -
    -

    Fitting rates in R

    -
    -
    -
    glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1),
    -            family="poisson",data=gaData)
    -plot(julian(gaData$date),glm2$fitted,col="blue",pch=19,xlab="Date",ylab="Fitted Counts")
    -points(julian(gaData$date),glm1$fitted,col="red",pch=19)
    -
    - -
    plot of chunk ratesFit
    - -
    - -
    - - -
    -

    Fitting rates in R

    -
    -
    -
    glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1),
    -            family="poisson",data=gaData)
    -plot(julian(gaData$date),gaData$simplystats/(gaData$visits+1),col="grey",xlab="Date",
    -     ylab="Fitted Rates",pch=19)
    -lines(julian(gaData$date),glm2$fitted/(gaData$visits+1),col="blue",lwd=3)
    -
    - -
    plot of chunk unnamed-chunk-5
    - -
    - -
    - - -
    -

    More information

    -
    - - -
    - - -
    - - - - - - - - - - - - - - - - - \ No newline at end of file + + + + Count outcomes, Poisson GLMs + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    Count outcomes, Poisson GLMs

    +

    Regression Models

    +

    Brian Caffo, Jeffrey Leek, Roger Peng
    Johns Hopkins Bloomberg School of Public Health

    +
    +
    +
    + + + + +
    +

    Key ideas

    +
    +
    +
      +
    • Many data take the form of counts + +
        +
      • Calls to a call center
      • +
      • Number of flu cases in an area
      • +
      • Number of cars that cross a bridge
      • +
    • +
    • Data may also be in the form of rates + +
        +
      • Percent of children passing a test
      • +
      • Percent of hits to a website from a country
      • +
    • +
    • Linear regression with transformation is an option
    • +
    + +
    + +
    + + +
    +

    Poisson distribution

    +
    +
    +
      +
    • The Poisson distribution is a useful model for counts and rates
    • +
    • Here a rate is a count per some amount of monitoring time
    • +
    • Some example uses of the Poisson distribution + +
        +
      • Modeling web traffic hits
      • +
      • Incidence rates
      • +
      • Approximating binomial probabilities with small \(p\) and large \(n\)
      • +
      • Analyzing contingency table data
      • +
    • +
    + +
    + +
    + + +
    +

    The Poisson mass function

    +
    +
    +
      +
    • \(X \sim Poisson(t\lambda)\) if +\[ +P(X = x) = \frac{(t\lambda)^x e^{-t\lambda}}{x!} +\] +For \(x = 0, 1, \ldots\).
    • +
    • The mean of the Poisson is \(E[X] = t\lambda\), thus \(E[X / t] = \lambda\)
    • +
    • The variance of the Poisson is \(Var(X) = t\lambda\).
    • +
    • The Poisson tends to a normal as \(t\lambda\) gets large.
    • +
    + +
    + +
    + + +
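The last bullet, that the Poisson approaches a normal as \(t\lambda\) grows, is easy to see numerically; this sketch compares the two CDFs at a large mean (the choice of \(\lambda = 100\) is arbitrary):

```
lambda <- 100
q <- c(80, 90, 100, 110, 120)
round(cbind(poisson = ppois(q, lambda),
            normal  = pnorm(q + 0.5, mean = lambda, sd = sqrt(lambda))), 3)
```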
    +
    par(mfrow = c(1, 3))
    +plot(0 : 10, dpois(0 : 10, lambda = 2), type = "h", frame = FALSE)
    +plot(0 : 20, dpois(0 : 20, lambda = 10), type = "h", frame = FALSE)
    +plot(0 : 200, dpois(0 : 200, lambda = 100), type = "h", frame = FALSE) 
    +
    + +
    plot of chunk simPois
    + +
    + +
    + + +
    +

    Poisson distribution

    +
    +
    +

    Sort of, showing that the mean and variance are equal

    + +
    x <- 0 : 10000; lambda = 3
    +mu <- sum(x * dpois(x, lambda = lambda))
    +sigmasq <- sum((x - mu)^2 * dpois(x, lambda = lambda))
    +c(mu, sigmasq)
    +
    + +
    [1] 3 3
    +
    + +
    + +
    + + +
    +

    Example: Leek Group Website Traffic

    +
    +
    +
      +
    • Consider the daily counts of visits to Jeff Leek's web site
    • +
    + +

    http://biostat.jhsph.edu/~jleek/

    + +
      +
    • Since the unit of time is always one day, set \(t = 1\) and then +the Poisson mean is interpreted as web hits per day. (If we set \(t = 24\), it would +be web hits per hour).
    • +
    + +
    + +
    + + +
    +

    Website data

    +
    +
    +
    download.file("https://dl.dropboxusercontent.com/u/7710864/data/gaData.rda",destfile="./data/gaData.rda",method="curl")
    +load("./data/gaData.rda")
    +gaData$julian <- julian(gaData$date)
    +head(gaData)
    +
    + +
            date visits simplystats julian
    +1 2011-01-01      0           0  14975
    +2 2011-01-02      0           0  14976
    +3 2011-01-03      0           0  14977
    +4 2011-01-04      0           0  14978
    +5 2011-01-05      0           0  14979
    +6 2011-01-06      0           0  14980
    +
    + +

    http://skardhamar.github.com/rga/

    + +
    + +
    + + +
    +

    Plot data

    +
    +
    +
    plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits")
    +
    + +
    plot of chunk unnamed-chunk-2
    + +
    + +
    + + +
    +

    Linear regression

    +
    +
    +

    \[ NH_i = b_0 + b_1 JD_i + e_i \]

    + +

    \(NH_i\) - number of hits to the website

    + +

    \(JD_i\) - day of the year (Julian day)

    + +

    \(b_0\) - number of hits on Julian day 0 (1970-01-01)

    + +

    \(b_1\) - increase in number of hits per unit day

    + +

    \(e_i\) - variation due to everything we didn't measure

    + +
    + +
    + + +
    +

    Linear regression line

    +
    +
    +
    plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits")
    +lm1 <- lm(gaData$visits ~ gaData$julian)
    +abline(lm1,col="red",lwd=3)
    +
    + +
    plot of chunk linReg
    + +
    + +
    + + +
    +

    Aside, taking the log of the outcome

    +
    +
    +
      +
    • Taking the natural log of the outcome has a specific interpretation.
    • +
    • Consider the model
    • +
    + +

    \[ \log(NH_i) = b_0 + b_1 JD_i + e_i \]

    + +

    \(NH_i\) - number of hits to the website

    + +

    \(JD_i\) - day of the year (Julian day)

    + +

    \(b_0\) - log number of hits on Julian day 0 (1970-01-01)

    + +

    \(b_1\) - increase in log number of hits per unit day

    + +

    \(e_i\) - variation due to everything we didn't measure

    + +
    + +
    + + +
    +

    Exponentiating coefficients

    +
    +
    +
      +
• \(e^{E[\log(Y)]}\) is the geometric mean of \(Y\) (see the sketch after this slide). + +
        +
      • With no covariates, this is estimated by \(e^{\frac{1}{n}\sum_{i=1}^n \log(y_i)} = (\prod_{i=1}^n y_i)^{1/n}\)
      • +
    • +
    • When you take the natural log of outcomes and fit a regression model, your exponentiated coefficients +estimate things about geometric means.
    • +
    • \(e^{\beta_0}\) estimated geometric mean hits on day 0
    • +
    • \(e^{\beta_1}\) estimated relative increase or decrease in geometric mean hits per day
    • +
• There's a problem with logs when you have zero counts; adding a constant works
    • +
    + +
    round(exp(coef(lm(I(log(gaData$visits + 1)) ~ gaData$julian))), 5)
    +
    + +
      (Intercept) gaData$julian 
    +        0.000         1.002 
    +
    + +
    + +
    + + +
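As a quick check of the geometric-mean identity above, the following sketch (toy positive values, chosen only for illustration) shows that exponentiating the average log equals \((\prod_{i} y_i)^{1/n}\):

```r
# Geometric mean two ways: exp of the mean log vs. the n-th root of the product
y <- c(3, 7, 12, 5, 9)           # hypothetical positive counts
exp(mean(log(y)))
prod(y)^(1 / length(y))          # both give the same geometric mean
```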
    +

    Linear vs. Poisson regression

    +
    +
    +

    Linear

    + +

    \[ NH_i = b_0 + b_1 JD_i + e_i \]

    + +

    or

    + +

    \[ E[NH_i | JD_i, b_0, b_1] = b_0 + b_1 JD_i\]

    + +

    Poisson/log-linear

    + +

    \[ \log\left(E[NH_i | JD_i, b_0, b_1]\right) = b_0 + b_1 JD_i \]

    + +

    or

    + +

    \[ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 + b_1 JD_i\right) \]

    + +
    + +
    + + +
    +

    Multiplicative differences

    +
    +
    +



    +\[ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 + b_1 JD_i\right) \]

    + +



    + +

    \[ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 \right)\exp\left(b_1 JD_i\right) \]

    + +



    + +

    If \(JD_i\) is increased by one unit, \(E[NH_i | JD_i, b_0, b_1]\) is multiplied by \(\exp\left(b_1\right)\)

    + +
    + +
    + + +
    +

    Poisson regression in R

    +
    +
    +
    plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits")
    +glm1 <- glm(gaData$visits ~ gaData$julian,family="poisson")
    +abline(lm1,col="red",lwd=3); lines(gaData$julian,glm1$fitted,col="blue",lwd=3)
    +
    + +
    plot of chunk poisReg
    + +
    + +
    + + +
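To tie the fit back to the multiplicative-differences slide, a short sketch (assuming `glm1` from the chunk above) exponentiates the slope to get the estimated relative change in expected hits per day:

```r
# Estimated multiplicative change in expected visits per one-day increase in Julian day
exp(coef(glm1))["gaData$julian"]   # about 1.002, i.e. roughly a 0.2% increase per day
```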
    +

    Mean-variance relationship?

    +
    +
    +
    plot(glm1$fitted,glm1$residuals,pch=19,col="grey",ylab="Residuals",xlab="Fitted")
    +
    + +
    plot of chunk unnamed-chunk-4
    + +
    + +
    + + +
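A common follow-up to this residual plot is to check for overdispersion. One simple option, sketched below, is to refit the same mean model with a quasi-Poisson family, which estimates a dispersion parameter (values well above 1 suggest the variance exceeds the Poisson mean):

```r
# Quasi-Poisson refit: same mean model, dispersion estimated from the data
glm1q <- glm(gaData$visits ~ gaData$julian, family = "quasipoisson")
summary(glm1q)$dispersion   # > 1 indicates overdispersion
```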
    +

    Model agnostic standard errors

    +
    +
    +
    library(sandwich)
    +confint.agnostic <- function (object, parm, level = 0.95, ...)
    +{
    +    cf <- coef(object); pnames <- names(cf)
    +    if (missing(parm))
    +        parm <- pnames
    +    else if (is.numeric(parm))
    +        parm <- pnames[parm]
    +    a <- (1 - level)/2; a <- c(a, 1 - a)
    +    pct <- stats:::format.perc(a, 3)
    +    fac <- qnorm(a)
    +    ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm,
    +                                                               pct))
    +    ses <- sqrt(diag(sandwich::vcovHC(object)))[parm]
    +    ci[] <- cf[parm] + ses %o% fac
    +    ci
    +}
    +
    + +

    http://stackoverflow.com/questions/3817182/vcovhc-and-confidence-interval

    + +
    + +
    + + +
    +

    Estimating confidence intervals

    +
    +
    +
    confint(glm1)
    +
    + +
                      2.5 %     97.5 %
    +(Intercept)   -34.34658 -31.159716
    +gaData$julian   0.00219   0.002396
    +
    + +
    confint.agnostic(glm1)
    +
    + +
                       2.5 %     97.5 %
    +(Intercept)   -36.362675 -29.136997
    +gaData$julian   0.002058   0.002528
    +
    + +
    + +
    + + +
    +

    Rates

    +
    +
    +



    + +

    \[ E[NHSS_i | JD_i, b_0, b_1]/NH_i = \exp\left(b_0 + b_1 JD_i\right) \]

    + +



    + +

    \[ \log\left(E[NHSS_i | JD_i, b_0, b_1]\right) - \log(NH_i) = b_0 + b_1 JD_i \]

    + +



    + +

    \[ \log\left(E[NHSS_i | JD_i, b_0, b_1]\right) = \log(NH_i) + b_0 + b_1 JD_i \]

    + +
    + +
    + + +
    +

    Fitting rates in R

    +
    +
    +
    glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1),
    +            family="poisson",data=gaData)
    +plot(julian(gaData$date),glm2$fitted,col="blue",pch=19,xlab="Date",ylab="Fitted Counts")
    +points(julian(gaData$date),glm1$fitted,col="red",pch=19)
    +
    + +
    plot of chunk ratesFit
    + +
    + +
    + + +
    +

    Fitting rates in R

    +
    +
    +
    glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1),
    +            family="poisson",data=gaData)
    +plot(julian(gaData$date),gaData$simplystats/(gaData$visits+1),col="grey",xlab="Date",
    +     ylab="Fitted Rates",pch=19)
    +lines(julian(gaData$date),glm2$fitted/(gaData$visits+1),col="blue",lwd=3)
    +
    + +
    plot of chunk unnamed-chunk-6
    + +
    + +
    + + +
    +

    More information

    +
    + + +
    + + +
    + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/07_RegressionModels/03_03_countOutcomes/index.md b/07_RegressionModels/03_03_countOutcomes/index.md index 3d0b7b897..7ff158f82 100644 --- a/07_RegressionModels/03_03_countOutcomes/index.md +++ b/07_RegressionModels/03_03_countOutcomes/index.md @@ -1,315 +1,375 @@ ---- -title : Count outcomes -subtitle : -author : Jeffrey Leek, Assistant Professor of Biostatistics -job : Johns Hopkins Bloomberg School of Public Health -logo : bloomberg_shield.png -framework : io2012 # {io2012, html5slides, shower, dzslides, ...} -highlighter : highlight.js # {highlight.js, prettify, highlight} -hitheme : tomorrow # -url: - lib: ../../libraries - assets: ../../assets -widgets : [mathjax] # {mathjax, quiz, bootstrap} -mode : selfcontained # {standalone, draft} ---- - - - - - -## Key ideas - -* Many data take the form of counts - * Calls to a call center - * Number of flu cases in an area - * Number of cars that cross a bridge -* Data may also be in the form of rates - * Percent of children passing a test - * Percent of hits to a website from a country -* Linear regression with transformation is an option - ---- - -## Poisson distribution - - -```r -set.seed(3433); par(mfrow=c(1,2)) -poisData2 <- rpois(100,lambda=100); poisData1 <- rpois(100,lambda=50) -hist(poisData1,col="blue",xlim=c(0,150)); hist(poisData2,col="blue",xlim=c(0,150)) -``` - -
    plot of chunk simPois
    - - ---- - -## Poisson distribution - - - -```r -c(mean(poisData1),var(poisData1)) -``` - -``` -[1] 49.85 49.38 -``` - -```r -c(mean(poisData2),var(poisData2)) -``` - -``` -[1] 100.12 95.26 -``` - - ---- - -## Example: Leek Group Website Traffic - - - -[http://biostat.jhsph.edu/~jleek/](http://biostat.jhsph.edu/~jleek/) - ---- - -## Website data - - -```r -download.file("https://dl.dropboxusercontent.com/u/7710864/data/gaData.rda",destfile="./data/gaData.rda",method="curl") -load("./data/gaData.rda") -gaData$julian <- julian(gaData$date) -head(gaData) -``` - -``` - date visits simplystats julian -1 2011-01-01 0 0 14975 -2 2011-01-02 0 0 14976 -3 2011-01-03 0 0 14977 -4 2011-01-04 0 0 14978 -5 2011-01-05 0 0 14979 -6 2011-01-06 0 0 14980 -``` - - -[http://skardhamar.github.com/rga/](http://skardhamar.github.com/rga/) - - ---- - -## Plot data - - -```r -plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits") -``` - -
    plot of chunk unnamed-chunk-2
    - - - ---- - -## Linear regression - -$$ NH_i = b_0 + b_1 JD_i + e_i $$ - -$NH_i$ - number of hits to the website - -$JD_i$ - day of the year (Julian day) - -$b_0$ - number of hits on Julian day 0 (1970-01-01) - -$b_1$ - increase in number of hits per unit day - -$e_i$ - variation due to everything we didn't measure - - ---- - -## Linear regression line - - -```r -plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits") -lm1 <- lm(gaData$visits ~ gaData$julian) -abline(lm1,col="red",lwd=3) -``` - -
    plot of chunk linReg
    - - - ---- - -## Linear vs. Poisson regression - -__Linear__ - -$$ NH_i = b_0 + b_1 JD_i + e_i $$ - -or - -$$ E[NH_i | JD_i, b_0, b_1] = b_0 + b_1 JD_i$$ - -__Poisson/log-linear__ - -$$ \log\left(E[NH_i | JD_i, b_0, b_1]\right) = b_0 + b_1 JD_i $$ - -or - -$$ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 + b_1 JD_i\right) $$ - - ---- - -## Multiplicative differences - -

    -$$ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 + b_1 JD_i\right) $$ - -

    - -$$ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 \right)\exp\left(b_1 JD_i\right) $$ - -

    - -If $JD_i$ is increased by one unit, $E[NH_i | JD_i, b_0, b_1]$ is multiplied by $\exp\left(b_1\right)$ - ---- - -## Poisson regression in R - - -```r -plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits") -glm1 <- glm(gaData$visits ~ gaData$julian,family="poisson") -abline(lm1,col="red",lwd=3); lines(gaData$julian,glm1$fitted,col="blue",lwd=3) -``` - -
    plot of chunk poisReg
    - - - ---- - -## Mean-variance relationship? - - -```r -plot(glm1$fitted,glm1$residuals,pch=19,col="grey",ylab="Residuals",xlab="Date") -``` - -
    plot of chunk unnamed-chunk-3
    - - ---- - -## Model agnostic standard errors - - -```r -library(sandwich) -confint.agnostic <- function (object, parm, level = 0.95, ...) -{ - cf <- coef(object); pnames <- names(cf) - if (missing(parm)) - parm <- pnames - else if (is.numeric(parm)) - parm <- pnames[parm] - a <- (1 - level)/2; a <- c(a, 1 - a) - pct <- stats:::format.perc(a, 3) - fac <- qnorm(a) - ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, - pct)) - ses <- sqrt(diag(sandwich::vcovHC(object)))[parm] - ci[] <- cf[parm] + ses %o% fac - ci -} -``` - -[http://stackoverflow.com/questions/3817182/vcovhc-and-confidence-interval](http://stackoverflow.com/questions/3817182/vcovhc-and-confidence-interval) - ---- - -## Estimating confidence intervals - - -```r -confint(glm1) -``` - -``` - 2.5 % 97.5 % -(Intercept) -34.34658 -31.159716 -gaData$julian 0.00219 0.002396 -``` - -```r -confint.agnostic(glm1) -``` - - - ---- - -## Rates - - -

    - - -$$ E[NHSS_i | JD_i, b_0, b_1]/NH_i = \exp\left(b_0 + b_1 JD_i\right) $$ - -

    - -$$ \log\left(E[NHSS_i | JD_i, b_0, b_1]\right) - \log(NH_i) = b_0 + b_1 JD_i $$ - -

    - -$$ \log\left(E[NHSS_i | JD_i, b_0, b_1]\right) = \log(NH_i) + b_0 + b_1 JD_i $$ - ---- - -## Fitting rates in R - - -```r -glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1), - family="poisson",data=gaData) -plot(julian(gaData$date),glm2$fitted,col="blue",pch=19,xlab="Date",ylab="Fitted Counts") -points(julian(gaData$date),glm1$fitted,col="red",pch=19) -``` - -
    plot of chunk ratesFit
    - - ---- - -## Fitting rates in R - - -```r -glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1), - family="poisson",data=gaData) -plot(julian(gaData$date),gaData$simplystats/(gaData$visits+1),col="grey",xlab="Date", - ylab="Fitted Rates",pch=19) -lines(julian(gaData$date),glm2$fitted/(gaData$visits+1),col="blue",lwd=3) -``` - -
    plot of chunk unnamed-chunk-5
    - - ---- - -## More information - -* [Log-linear models and multiway tables](http://ww2.coastal.edu/kingw/statistics/R-tutorials/loglin.html) -* [Wikipedia on Poisson regression](http://en.wikipedia.org/wiki/Poisson_regression), [Wikipedia on overdispersion](http://en.wikipedia.org/wiki/Overdispersion) -* [Regression models for count data in R](http://cran.r-project.org/web/packages/pscl/vignettes/countreg.pdf) -* [pscl package](http://cran.r-project.org/web/packages/pscl/index.html) - the function _zeroinfl_ fits zero inflated models. +--- +title : Count outcomes, Poisson GLMs +subtitle : Regression Models +author : Brian Caffo, Jeffrey Leek, Roger Peng +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../librariesNew + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- + + + + + +## Key ideas + +* Many data take the form of counts + * Calls to a call center + * Number of flu cases in an area + * Number of cars that cross a bridge +* Data may also be in the form of rates + * Percent of children passing a test + * Percent of hits to a website from a country +* Linear regression with transformation is an option + +--- + +## Poisson distribution +- The Poisson distribution is a useful model for counts and rates +- Here a rate is count per some monitoring time +- Some examples uses of the Poisson distribution + - Modeling web traffic hits + - Incidence rates + - Approximating binomial probabilities with small $p$ and large $n$ + - Analyzing contigency table data + +--- +## The Poisson mass function +- $X \sim Poisson(t\lambda)$ if +$$ +P(X = x) = \frac{(t\lambda)^x e^{-t\lambda}}{x!} +$$ +For $x = 0, 1, \ldots$. +- The mean of the Poisson is $E[X] = t\lambda$, thus $E[X / t] = \lambda$ +- The variance of the Poisson is $Var(X) = t\lambda$. +- The Poisson tends to a normal as $t\lambda$ gets large. + +--- + + +```r +par(mfrow = c(1, 3)) +plot(0 : 10, dpois(0 : 10, lambda = 2), type = "h", frame = FALSE) +plot(0 : 20, dpois(0 : 20, lambda = 10), type = "h", frame = FALSE) +plot(0 : 200, dpois(0 : 200, lambda = 100), type = "h", frame = FALSE) +``` + +
    plot of chunk simPois
    + + +--- + +## Poisson distribution +### Sort of, showing that the mean and variance are equal + +```r +x <- 0 : 10000; lambda = 3 +mu <- sum(x * dpois(x, lambda = lambda)) +sigmasq <- sum((x - mu)^2 * dpois(x, lambda = lambda)) +c(mu, sigmasq) +``` + +``` +[1] 3 3 +``` + + +--- + +## Example: Leek Group Website Traffic +* Consider the daily counts to Jeff Leek's web site + +[http://biostat.jhsph.edu/~jleek/](http://biostat.jhsph.edu/~jleek/) + +* Since the unit of time is always one day, set $t = 1$ and then +the Poisson mean is interpretted as web hits per day. (If we set $t = 24$, it would +be web hits per hour). + +--- + +## Website data + + +```r +download.file("https://dl.dropboxusercontent.com/u/7710864/data/gaData.rda",destfile="./data/gaData.rda",method="curl") +load("./data/gaData.rda") +gaData$julian <- julian(gaData$date) +head(gaData) +``` + +``` + date visits simplystats julian +1 2011-01-01 0 0 14975 +2 2011-01-02 0 0 14976 +3 2011-01-03 0 0 14977 +4 2011-01-04 0 0 14978 +5 2011-01-05 0 0 14979 +6 2011-01-06 0 0 14980 +``` + + +[http://skardhamar.github.com/rga/](http://skardhamar.github.com/rga/) + + +--- + +## Plot data + + +```r +plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits") +``` + +
    plot of chunk unnamed-chunk-2
    + + + +--- + +## Linear regression + +$$ NH_i = b_0 + b_1 JD_i + e_i $$ + +$NH_i$ - number of hits to the website + +$JD_i$ - day of the year (Julian day) + +$b_0$ - number of hits on Julian day 0 (1970-01-01) + +$b_1$ - increase in number of hits per unit day + +$e_i$ - variation due to everything we didn't measure + + +--- + +## Linear regression line + + +```r +plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits") +lm1 <- lm(gaData$visits ~ gaData$julian) +abline(lm1,col="red",lwd=3) +``` + +
    plot of chunk linReg
    + +--- + +## Aside, taking the log of the outcome +- Taking the natural log of the outcome has a specific interpretation. +- Consider the model + +$$ \log(NH_i) = b_0 + b_1 JD_i + e_i $$ + +$NH_i$ - number of hits to the website + +$JD_i$ - day of the year (Julian day) + +$b_0$ - log number of hits on Julian day 0 (1970-01-01) + +$b_1$ - increase in log number of hits per unit day + +$e_i$ - variation due to everything we didn't measure + +--- +## Exponentiating coefficients +- $e^{E[\log(Y)]}$ geometric mean of $Y$. + - With no covariates, this is estimated by $e^{\frac{1}{n}\sum_{i=1}^n \log(y_i)} = (\prod_{i=1}^n y_i)^{1/n}$ +- When you take the natural log of outcomes and fit a regression model, your exponentiated coefficients +estimate things about geometric means. +- $e^{\beta_0}$ estimated geometric mean hits on day 0 +- $e^{\beta_1}$ estimated relative increase or decrease in geometric mean hits per day +- There's a problem with logs with you have zero counts, adding a constant works + +```r +round(exp(coef(lm(I(log(gaData$visits + 1)) ~ gaData$julian))), 5) +``` + +``` + (Intercept) gaData$julian + 0.000 1.002 +``` + + +--- + +## Linear vs. Poisson regression + +__Linear__ + +$$ NH_i = b_0 + b_1 JD_i + e_i $$ + +or + +$$ E[NH_i | JD_i, b_0, b_1] = b_0 + b_1 JD_i$$ + +__Poisson/log-linear__ + +$$ \log\left(E[NH_i | JD_i, b_0, b_1]\right) = b_0 + b_1 JD_i $$ + +or + +$$ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 + b_1 JD_i\right) $$ + + +--- + +## Multiplicative differences + +

    +$$ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 + b_1 JD_i\right) $$ + +

    + +$$ E[NH_i | JD_i, b_0, b_1] = \exp\left(b_0 \right)\exp\left(b_1 JD_i\right) $$ + +

    + +If $JD_i$ is increased by one unit, $E[NH_i | JD_i, b_0, b_1]$ is multiplied by $\exp\left(b_1\right)$ + +--- + +## Poisson regression in R + + +```r +plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits") +glm1 <- glm(gaData$visits ~ gaData$julian,family="poisson") +abline(lm1,col="red",lwd=3); lines(gaData$julian,glm1$fitted,col="blue",lwd=3) +``` + +
    plot of chunk poisReg
    + + + +--- + +## Mean-variance relationship? + + +```r +plot(glm1$fitted,glm1$residuals,pch=19,col="grey",ylab="Residuals",xlab="Fitted") +``` + +
    plot of chunk unnamed-chunk-4
    + + +--- + +## Model agnostic standard errors + + +```r +library(sandwich) +confint.agnostic <- function (object, parm, level = 0.95, ...) +{ + cf <- coef(object); pnames <- names(cf) + if (missing(parm)) + parm <- pnames + else if (is.numeric(parm)) + parm <- pnames[parm] + a <- (1 - level)/2; a <- c(a, 1 - a) + pct <- stats:::format.perc(a, 3) + fac <- qnorm(a) + ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, + pct)) + ses <- sqrt(diag(sandwich::vcovHC(object)))[parm] + ci[] <- cf[parm] + ses %o% fac + ci +} +``` + +[http://stackoverflow.com/questions/3817182/vcovhc-and-confidence-interval](http://stackoverflow.com/questions/3817182/vcovhc-and-confidence-interval) + +--- + +## Estimating confidence intervals + + +```r +confint(glm1) +``` + +``` + 2.5 % 97.5 % +(Intercept) -34.34658 -31.159716 +gaData$julian 0.00219 0.002396 +``` + +```r +confint.agnostic(glm1) +``` + +``` + 2.5 % 97.5 % +(Intercept) -36.362675 -29.136997 +gaData$julian 0.002058 0.002528 +``` + + + +--- + +## Rates + + +

    + + +$$ E[NHSS_i | JD_i, b_0, b_1]/NH_i = \exp\left(b_0 + b_1 JD_i\right) $$ + +

    + +$$ \log\left(E[NHSS_i | JD_i, b_0, b_1]\right) - \log(NH_i) = b_0 + b_1 JD_i $$ + +

    + +$$ \log\left(E[NHSS_i | JD_i, b_0, b_1]\right) = \log(NH_i) + b_0 + b_1 JD_i $$ + +--- + +## Fitting rates in R + + +```r +glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1), + family="poisson",data=gaData) +plot(julian(gaData$date),glm2$fitted,col="blue",pch=19,xlab="Date",ylab="Fitted Counts") +points(julian(gaData$date),glm1$fitted,col="red",pch=19) +``` + +
    plot of chunk ratesFit
    + + +--- + +## Fitting rates in R + + +```r +glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1), + family="poisson",data=gaData) +plot(julian(gaData$date),gaData$simplystats/(gaData$visits+1),col="grey",xlab="Date", + ylab="Fitted Rates",pch=19) +lines(julian(gaData$date),glm2$fitted/(gaData$visits+1),col="blue",lwd=3) +``` + +
    plot of chunk unnamed-chunk-6
    + + +--- + +## More information + +* [Log-linear models and multiway tables](http://ww2.coastal.edu/kingw/statistics/R-tutorials/loglin.html) +* [Wikipedia on Poisson regression](http://en.wikipedia.org/wiki/Poisson_regression), [Wikipedia on overdispersion](http://en.wikipedia.org/wiki/Overdispersion) +* [Regression models for count data in R](http://cran.r-project.org/web/packages/pscl/vignettes/countreg.pdf) +* [pscl package](http://cran.r-project.org/web/packages/pscl/index.html) - the function _zeroinfl_ fits zero inflated models. diff --git a/07_RegressionModels/03_04_bonus/fig/unnamed-chunk-1.png b/07_RegressionModels/03_04_bonus/fig/unnamed-chunk-1.png index aaac00d96..c00c59551 100644 Binary files a/07_RegressionModels/03_04_bonus/fig/unnamed-chunk-1.png and b/07_RegressionModels/03_04_bonus/fig/unnamed-chunk-1.png differ diff --git a/07_RegressionModels/03_04_bonus/fig/unnamed-chunk-2.png b/07_RegressionModels/03_04_bonus/fig/unnamed-chunk-2.png index 5ad8b0108..3480f2884 100644 Binary files a/07_RegressionModels/03_04_bonus/fig/unnamed-chunk-2.png and b/07_RegressionModels/03_04_bonus/fig/unnamed-chunk-2.png differ diff --git a/07_RegressionModels/03_04_bonus/index.Rmd b/07_RegressionModels/03_04_bonus/index.Rmd index 7dc8849f8..ac9afbed2 100644 --- a/07_RegressionModels/03_04_bonus/index.Rmd +++ b/07_RegressionModels/03_04_bonus/index.Rmd @@ -1,116 +1,116 @@ ---- -title : Hodgepodge -subtitle : Regression models -author : Brian Caffo, PhD -job : Johns Hopkins Bloomberg School of Public Health -logo : bloomberg_shield.png -framework : io2012 # {io2012, html5slides, shower, dzslides, ...} -highlighter : highlight.js # {highlight.js, prettify, highlight} -hitheme : tomorrow # -url: - lib: ../../librariesNew - assets: ../../assets -widgets : [mathjax] # {mathjax, quiz, bootstrap} -mode : selfcontained # {standalone, draft} ---- -```{r setup, cache = F, echo = F, message = F, warning = F, tidy = F, results='hide'} -# make this an external chunk that can be included in any file -options(width = 100) -opts_chunk$set(message = F, error = F, warning = F, comment = NA, fig.align = 'center', dpi = 100, tidy = F, cache.path = '.cache/', fig.path = 'fig/') - -options(xtable.type = 'html') -knit_hooks$set(inline = function(x) { - if(is.numeric(x)) { - round(x, getOption('digits')) - } else { - paste(as.character(x), collapse = ', ') - } -}) -knit_hooks$set(plot = knitr:::hook_plot_html) -runif(1) -``` - -## How to fit functions using linear models -* Consider a model $Y_i = f(X_i) + \epsilon$. -* How can we fit such a model using linear models (called scatterplot smoothing) -* Consider the model - $$ - Y_i = \beta_0 + \beta_1 X_i + \sum_{k=1}^d (x_i - \xi_k)_+ \gamma_k + \epsilon_{i} - $$ -where $(a)_+ = a$ if $a > 0$ and $0$ otherwise and $\xi_1 \leq ... \leq \xi_d$ are known knot points. -* Prove to yourelf that the mean function -$$ -\beta_0 + \beta_1 X_i + \sum_{k=1}^d (x_i - \xi_k)_+ \gamma_k -$$ -is continuous at the knot points. - ---- -## Simulated example -```{r, fig.height=4, fig.width=4} -n <- 500; x <- seq(0, 4 * pi, length = n); y <- sin(x) + rnorm(n, sd = .3) -knots <- seq(0, 8 * pi, length = 20); -splineTerms <- sapply(knots, function(knot) (x > knot) * (x - knot)) -xMat <- cbind(1, x, splineTerms) -yhat <- predict(lm(y ~ xMat - 1)) -plot(x, y, frame = FALSE, pch = 21, bg = "lightblue", cex = 2) -lines(x, yhat, col = "red", lwd = 2) -``` - ---- -## Adding squared terms -* Adding squared terms makes it continuously differentiable at the knot points. 
-* Adding cubic terms makes it twice continuously differentiable at the knot points; etcetera. -$$ - Y_i = \beta_0 + \beta_1 X_i + \beta_2 X_i^2 + \sum_{k=1}^d (x_i - \xi_k)_+^2 \gamma_k + \epsilon_{i} -$$ - ---- -```{r, fig.height=4, fig.width=4} -splineTerms <- sapply(knots, function(knot) (x > knot) * (x - knot)^2) -xMat <- cbind(1, x, x^2, splineTerms) -yhat <- predict(lm(y ~ xMat - 1)) -plot(x, y, frame = FALSE, pch = 21, bg = "lightblue", cex = 2) -lines(x, yhat, col = "red", lwd = 2) -``` - ---- -## Notes -* The collection of regressors is called a basis. - * People have spent **a lot** of time thinking about bases for this kind of problem. So, consider this as just a teaser. -* Single knot point terms can fit hockey stick like processes. -* These bases can be used in GLMs as well. -* An issue with these approaches is the large number of parameters introduced. - * Requires some method of so called regularization. - ---- -## Harmonics using linear models -```{r} -##Chord finder, playing the white keys on a piano from octave c4 - c5 -notes4 <- c(261.63, 293.66, 329.63, 349.23, 392.00, 440.00, 493.88, 523.25) -t <- seq(0, 2, by = .001); n <- length(t) -c4 <- sin(2 * pi * notes4[1] * t); e4 <- sin(2 * pi * notes4[3] * t); -g4 <- sin(2 * pi * notes4[5] * t) -chord <- c4 + e4 + g4 + rnorm(n, 0, 0.3) -x <- sapply(notes4, function(freq) sin(2 * pi * freq * t)) -fit <- lm(chord ~ x - 1) -``` - ---- -```{r, fig.height=5,fig.width=5, echo=FALSE} -plot(c(0, 9), c(0, 1.5), xlab = "Note", ylab = "Coef^2", axes = FALSE, frame = TRUE, type = "n") -axis(2) -axis(1, at = 1 : 8, labels = c("c4", "d4", "e4", "f4", "g4", "a4", "b4", "c5")) -for (i in 1 : 8) abline(v = i, lwd = 3, col = grey(.8)) -lines(c(0, 1 : 8, 9), c(0, coef(fit)^2, 0), type = "l", lwd = 3, col = "red") -``` - ---- -```{r, fig.height=5, fig.wdith=5} -##(How you would really do it) -a <- fft(chord); plot(Re(a)^2, type = "l") -``` - - - - - +--- +title : Hodgepodge +subtitle : Regression models +author : Brian Caffo, Jeff Leek, Roger Peng +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../librariesNew + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- +```{r setup, cache = F, echo = F, message = F, warning = F, tidy = F, results='hide'} +# make this an external chunk that can be included in any file +options(width = 100) +opts_chunk$set(message = F, error = F, warning = F, comment = NA, fig.align = 'center', dpi = 100, tidy = F, cache.path = '.cache/', fig.path = 'fig/') + +options(xtable.type = 'html') +knit_hooks$set(inline = function(x) { + if(is.numeric(x)) { + round(x, getOption('digits')) + } else { + paste(as.character(x), collapse = ', ') + } +}) +knit_hooks$set(plot = knitr:::hook_plot_html) +runif(1) +``` + +## How to fit functions using linear models +* Consider a model $Y_i = f(X_i) + \epsilon$. +* How can we fit such a model using linear models (called scatterplot smoothing) +* Consider the model + $$ + Y_i = \beta_0 + \beta_1 X_i + \sum_{k=1}^d (x_i - \xi_k)_+ \gamma_k + \epsilon_{i} + $$ +where $(a)_+ = a$ if $a > 0$ and $0$ otherwise and $\xi_1 \leq ... \leq \xi_d$ are known knot points. +* Prove to yourelf that the mean function +$$ +\beta_0 + \beta_1 X_i + \sum_{k=1}^d (x_i - \xi_k)_+ \gamma_k +$$ +is continuous at the knot points. 
+ +--- +## Simulated example +```{r, fig.height=4, fig.width=4} +n <- 500; x <- seq(0, 4 * pi, length = n); y <- sin(x) + rnorm(n, sd = .3) +knots <- seq(0, 8 * pi, length = 20); +splineTerms <- sapply(knots, function(knot) (x > knot) * (x - knot)) +xMat <- cbind(1, x, splineTerms) +yhat <- predict(lm(y ~ xMat - 1)) +plot(x, y, frame = FALSE, pch = 21, bg = "lightblue", cex = 2) +lines(x, yhat, col = "red", lwd = 2) +``` + +--- +## Adding squared terms +* Adding squared terms makes it continuously differentiable at the knot points. +* Adding cubic terms makes it twice continuously differentiable at the knot points; etcetera. +$$ + Y_i = \beta_0 + \beta_1 X_i + \beta_2 X_i^2 + \sum_{k=1}^d (x_i - \xi_k)_+^2 \gamma_k + \epsilon_{i} +$$ + +--- +```{r, fig.height=4, fig.width=4} +splineTerms <- sapply(knots, function(knot) (x > knot) * (x - knot)^2) +xMat <- cbind(1, x, x^2, splineTerms) +yhat <- predict(lm(y ~ xMat - 1)) +plot(x, y, frame = FALSE, pch = 21, bg = "lightblue", cex = 2) +lines(x, yhat, col = "red", lwd = 2) +``` + +--- +## Notes +* The collection of regressors is called a basis. + * People have spent **a lot** of time thinking about bases for this kind of problem. So, consider this as just a teaser. +* Single knot point terms can fit hockey stick like processes. +* These bases can be used in GLMs as well. +* An issue with these approaches is the large number of parameters introduced. + * Requires some method of so called regularization. + +--- +## Harmonics using linear models +```{r} +##Chord finder, playing the white keys on a piano from octave c4 - c5 +notes4 <- c(261.63, 293.66, 329.63, 349.23, 392.00, 440.00, 493.88, 523.25) +t <- seq(0, 2, by = .001); n <- length(t) +c4 <- sin(2 * pi * notes4[1] * t); e4 <- sin(2 * pi * notes4[3] * t); +g4 <- sin(2 * pi * notes4[5] * t) +chord <- c4 + e4 + g4 + rnorm(n, 0, 0.3) +x <- sapply(notes4, function(freq) sin(2 * pi * freq * t)) +fit <- lm(chord ~ x - 1) +``` + +--- +```{r, fig.height=5,fig.width=5, echo=FALSE} +plot(c(0, 9), c(0, 1.5), xlab = "Note", ylab = "Coef^2", axes = FALSE, frame = TRUE, type = "n") +axis(2) +axis(1, at = 1 : 8, labels = c("c4", "d4", "e4", "f4", "g4", "a4", "b4", "c5")) +for (i in 1 : 8) abline(v = i, lwd = 3, col = grey(.8)) +lines(c(0, 1 : 8, 9), c(0, coef(fit)^2, 0), type = "l", lwd = 3, col = "red") +``` + +--- +```{r, fig.height=5, fig.wdith=5} +##(How you would really do it) +a <- fft(chord); plot(Re(a)^2, type = "l") +``` + + + + + diff --git a/07_RegressionModels/03_04_bonus/index.html b/07_RegressionModels/03_04_bonus/index.html index 4f5ad846c..6d28a79ad 100644 --- a/07_RegressionModels/03_04_bonus/index.html +++ b/07_RegressionModels/03_04_bonus/index.html @@ -1,266 +1,266 @@ - - - - Hodgepodge - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -

    Hodgepodge

    -

    Regression models

    -

    Brian Caffo, PhD
    Johns Hopkins Bloomberg School of Public Health

    -
    -
    -
    - - - - -
    -

    How to fit functions using linear models

    -
    -
    -
      -
    • Consider a model \(Y_i = f(X_i) + \epsilon\).
    • -
    • How can we fit such a model using linear models (called scatterplot smoothing)
    • -
    • Consider the model -\[ -Y_i = \beta_0 + \beta_1 X_i + \sum_{k=1}^d (x_i - \xi_k)_+ \gamma_k + \epsilon_{i} -\] -where \((a)_+ = a\) if \(a > 0\) and \(0\) otherwise and \(\xi_1 \leq ... \leq \xi_d\) are known knot points.
    • -
    • Prove to yourelf that the mean function -\[ -\beta_0 + \beta_1 X_i + \sum_{k=1}^d (x_i - \xi_k)_+ \gamma_k -\] -is continuous at the knot points.
    • -
    - -
    - -
    - - -
    -

    Simulated example

    -
    -
    -
    n <- 500; x <- seq(0, 4 * pi, length = n); y <- sin(x) + rnorm(n, sd = .3)
    -knots <- seq(0, 8 * pi, length = 20); 
    -splineTerms <- sapply(knots, function(knot) (x > knot) * (x - knot))
    -xMat <- cbind(1, x, splineTerms)
    -yhat <- predict(lm(y ~ xMat - 1))
    -plot(x, y, frame = FALSE, pch = 21, bg = "lightblue", cex = 2)
    -lines(x, yhat, col = "red", lwd = 2)
    -
    - -
    plot of chunk unnamed-chunk-1
    - -
    - -
    - - -
    -

    Adding squared terms

    -
    -
    -
      -
    • Adding squared terms makes it continuously differentiable at the knot points.
    • -
    • Adding cubic terms makes it twice continuously differentiable at the knot points; etcetera. -\[ -Y_i = \beta_0 + \beta_1 X_i + \beta_2 X_i^2 + \sum_{k=1}^d (x_i - \xi_k)_+^2 \gamma_k + \epsilon_{i} -\]
    • -
    - -
    - -
    - - -
    -
    splineTerms <- sapply(knots, function(knot) (x > knot) * (x - knot)^2)
    -xMat <- cbind(1, x, x^2, splineTerms)
    -yhat <- predict(lm(y ~ xMat - 1))
    -plot(x, y, frame = FALSE, pch = 21, bg = "lightblue", cex = 2)
    -lines(x, yhat, col = "red", lwd = 2)
    -
    - -
    plot of chunk unnamed-chunk-2
    - -
    - -
    - - -
    -

    Notes

    -
    -
    -
      -
    • The collection of regressors is called a basis. - -
        -
      • People have spent a lot of time thinking about bases for this kind of problem. So, consider this as just a teaser.
      • -
    • -
    • Single knot point terms can fit hockey stick like processes.
    • -
    • These bases can be used in GLMs as well.
    • -
    • An issue with these approaches is the large number of parameters introduced. - -
        -
      • Requires some method of so called regularization.
      • -
    • -
    - -
    - -
    - - -
    -

    Harmonics using linear models

    -
    -
    -
    ##Chord finder, playing the white keys on a piano from octave c4 - c5
    -notes4 <- c(261.63, 293.66, 329.63, 349.23, 392.00, 440.00, 493.88, 523.25)
    -t <- seq(0, 2, by = .001); n <- length(t)
    -c4 <- sin(2 * pi * notes4[1] * t); e4 <- sin(2 * pi * notes4[3] * t); 
    -g4 <- sin(2 * pi * notes4[5] * t)
    -chord <- c4 + e4 + g4 + rnorm(n, 0, 0.3)
    -x <- sapply(notes4, function(freq) sin(2 * pi * freq * t))
    -fit <- lm(chord ~ x - 1)
    -
    - -
    - -
    - - -
    -
    plot of chunk unnamed-chunk-4
    - -
    - -
    - - -
    -
    ##(How you would really do it)
    -a <- fft(chord); plot(Re(a)^2, type = "l")
    -
    - -
    plot of chunk unnamed-chunk-5
    - -
    - -
    - - -
    - - - - - - - - - - - - - - + + + + Hodgepodge + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    Hodgepodge

    +

    Regression models

    +

    Brian Caffo, Jeff Leek, Roger Peng
    Johns Hopkins Bloomberg School of Public Health

    +
    +
    +
    + + + + +
    +

    How to fit functions using linear models

    +
    +
    +
      +
    • Consider a model \(Y_i = f(X_i) + \epsilon\).
    • +
• How can we fit such a model using linear models (called scatterplot smoothing)?
    • +
    • Consider the model +\[ +Y_i = \beta_0 + \beta_1 X_i + \sum_{k=1}^d (x_i - \xi_k)_+ \gamma_k + \epsilon_{i} +\] +where \((a)_+ = a\) if \(a > 0\) and \(0\) otherwise and \(\xi_1 \leq ... \leq \xi_d\) are known knot points.
    • +
• Prove to yourself that the mean function +\[ +\beta_0 + \beta_1 X_i + \sum_{k=1}^d (x_i - \xi_k)_+ \gamma_k +\] +is continuous at the knot points (a small numerical check follows this slide).
    • +
    + +
    + +
    + + +
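A tiny numerical check of the continuity claim above: each hinge term \((x - \xi_k)_+\) equals 0 up to its knot and then grows linearly, with no jump, so the fitted mean is continuous there. (The knot value below is an arbitrary illustrative choice.)

```r
# Evaluate one hinge basis term just below, at, and just above its knot
xi <- 2
hinge <- function(x) (x > xi) * (x - xi)
c(hinge(xi - 1e-8), hinge(xi), hinge(xi + 1e-8))   # essentially 0, 0, 0: no jump at the knot
```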
    +

    Simulated example

    +
    +
    +
    n <- 500; x <- seq(0, 4 * pi, length = n); y <- sin(x) + rnorm(n, sd = .3)
    +knots <- seq(0, 8 * pi, length = 20); 
    +splineTerms <- sapply(knots, function(knot) (x > knot) * (x - knot))
    +xMat <- cbind(1, x, splineTerms)
    +yhat <- predict(lm(y ~ xMat - 1))
    +plot(x, y, frame = FALSE, pch = 21, bg = "lightblue", cex = 2)
    +lines(x, yhat, col = "red", lwd = 2)
    +
    + +
    plot of chunk unnamed-chunk-1
    + +
    + +
    + + +
    +

    Adding squared terms

    +
    +
    +
      +
    • Adding squared terms makes it continuously differentiable at the knot points.
    • +
    • Adding cubic terms makes it twice continuously differentiable at the knot points; etcetera. +\[ +Y_i = \beta_0 + \beta_1 X_i + \beta_2 X_i^2 + \sum_{k=1}^d (x_i - \xi_k)_+^2 \gamma_k + \epsilon_{i} +\]
    • +
    + +
    + +
    + + +
    +
    splineTerms <- sapply(knots, function(knot) (x > knot) * (x - knot)^2)
    +xMat <- cbind(1, x, x^2, splineTerms)
    +yhat <- predict(lm(y ~ xMat - 1))
    +plot(x, y, frame = FALSE, pch = 21, bg = "lightblue", cex = 2)
    +lines(x, yhat, col = "red", lwd = 2)
    +
    + +
    plot of chunk unnamed-chunk-2
    + +
    + +
    + + +
    +

    Notes

    +
    +
    +
      +
    • The collection of regressors is called a basis. + +
        +
      • People have spent a lot of time thinking about bases for this kind of problem. So, consider this as just a teaser.
      • +
    • +
    • Single knot point terms can fit hockey stick like processes.
    • +
    • These bases can be used in GLMs as well.
    • +
    • An issue with these approaches is the large number of parameters introduced. + +
        +
• Requires some method of so-called regularization (see the sketch after this slide).
      • +
    • +
    + +
    + +
    + + +
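As a minimal illustration of the regularization point above, the sketch below fits the same spline basis with a ridge penalty via the normal equations (it assumes `x`, `y`, and `xMat` from the preceding chunks; the penalty weight is an arbitrary choice, and in practice the intercept and linear terms would usually be left unpenalized):

```r
# Ridge-penalized least squares on the spline basis: solve (X'X + lambda*I) beta = X'y
lambda <- 1
p <- ncol(xMat)
betaRidge <- solve(t(xMat) %*% xMat + lambda * diag(p), t(xMat) %*% y)
yhatRidge <- xMat %*% betaRidge
plot(x, y, frame = FALSE, pch = 21, bg = "lightblue", cex = 2)
lines(x, yhatRidge, col = "blue", lwd = 2)   # shrunken-coefficient fit
```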
    +

    Harmonics using linear models

    +
    +
    +
    ##Chord finder, playing the white keys on a piano from octave c4 - c5
    +notes4 <- c(261.63, 293.66, 329.63, 349.23, 392.00, 440.00, 493.88, 523.25)
    +t <- seq(0, 2, by = .001); n <- length(t)
    +c4 <- sin(2 * pi * notes4[1] * t); e4 <- sin(2 * pi * notes4[3] * t); 
    +g4 <- sin(2 * pi * notes4[5] * t)
    +chord <- c4 + e4 + g4 + rnorm(n, 0, 0.3)
    +x <- sapply(notes4, function(freq) sin(2 * pi * freq * t))
    +fit <- lm(chord ~ x - 1)
    +
    + +
    + +
    + + +
    +
    plot of chunk unnamed-chunk-4
    + +
    + +
    + + +
    +
    ##(How you would really do it)
    +a <- fft(chord); plot(Re(a)^2, type = "l")
    +
    + +
    plot of chunk unnamed-chunk-5
    + +
    + +
    + + +
    + + + + + + + + + + + + + + \ No newline at end of file diff --git a/07_RegressionModels/03_04_bonus/index.md b/07_RegressionModels/03_04_bonus/index.md index 25a71066c..860e5d650 100644 --- a/07_RegressionModels/03_04_bonus/index.md +++ b/07_RegressionModels/03_04_bonus/index.md @@ -1,111 +1,111 @@ ---- -title : Hodgepodge -subtitle : Regression models -author : Brian Caffo, PhD -job : Johns Hopkins Bloomberg School of Public Health -logo : bloomberg_shield.png -framework : io2012 # {io2012, html5slides, shower, dzslides, ...} -highlighter : highlight.js # {highlight.js, prettify, highlight} -hitheme : tomorrow # -url: - lib: ../../librariesNew - assets: ../../assets -widgets : [mathjax] # {mathjax, quiz, bootstrap} -mode : selfcontained # {standalone, draft} ---- - - - -## How to fit functions using linear models -* Consider a model $Y_i = f(X_i) + \epsilon$. -* How can we fit such a model using linear models (called scatterplot smoothing) -* Consider the model - $$ - Y_i = \beta_0 + \beta_1 X_i + \sum_{k=1}^d (x_i - \xi_k)_+ \gamma_k + \epsilon_{i} - $$ -where $(a)_+ = a$ if $a > 0$ and $0$ otherwise and $\xi_1 \leq ... \leq \xi_d$ are known knot points. -* Prove to yourelf that the mean function -$$ -\beta_0 + \beta_1 X_i + \sum_{k=1}^d (x_i - \xi_k)_+ \gamma_k -$$ -is continuous at the knot points. - ---- -## Simulated example - -```r -n <- 500; x <- seq(0, 4 * pi, length = n); y <- sin(x) + rnorm(n, sd = .3) -knots <- seq(0, 8 * pi, length = 20); -splineTerms <- sapply(knots, function(knot) (x > knot) * (x - knot)) -xMat <- cbind(1, x, splineTerms) -yhat <- predict(lm(y ~ xMat - 1)) -plot(x, y, frame = FALSE, pch = 21, bg = "lightblue", cex = 2) -lines(x, yhat, col = "red", lwd = 2) -``` - -
    plot of chunk unnamed-chunk-1
    - - ---- -## Adding squared terms -* Adding squared terms makes it continuously differentiable at the knot points. -* Adding cubic terms makes it twice continuously differentiable at the knot points; etcetera. -$$ - Y_i = \beta_0 + \beta_1 X_i + \beta_2 X_i^2 + \sum_{k=1}^d (x_i - \xi_k)_+^2 \gamma_k + \epsilon_{i} -$$ - ---- - -```r -splineTerms <- sapply(knots, function(knot) (x > knot) * (x - knot)^2) -xMat <- cbind(1, x, x^2, splineTerms) -yhat <- predict(lm(y ~ xMat - 1)) -plot(x, y, frame = FALSE, pch = 21, bg = "lightblue", cex = 2) -lines(x, yhat, col = "red", lwd = 2) -``` - -
    plot of chunk unnamed-chunk-2
    - - ---- -## Notes -* The collection of regressors is called a basis. - * People have spent **a lot** of time thinking about bases for this kind of problem. So, consider this as just a teaser. -* Single knot point terms can fit hockey stick like processes. -* These bases can be used in GLMs as well. -* An issue with these approaches is the large number of parameters introduced. - * Requires some method of so called regularization. - ---- -## Harmonics using linear models - -```r -##Chord finder, playing the white keys on a piano from octave c4 - c5 -notes4 <- c(261.63, 293.66, 329.63, 349.23, 392.00, 440.00, 493.88, 523.25) -t <- seq(0, 2, by = .001); n <- length(t) -c4 <- sin(2 * pi * notes4[1] * t); e4 <- sin(2 * pi * notes4[3] * t); -g4 <- sin(2 * pi * notes4[5] * t) -chord <- c4 + e4 + g4 + rnorm(n, 0, 0.3) -x <- sapply(notes4, function(freq) sin(2 * pi * freq * t)) -fit <- lm(chord ~ x - 1) -``` - - ---- -
    plot of chunk unnamed-chunk-4
    - - ---- - -```r -##(How you would really do it) -a <- fft(chord); plot(Re(a)^2, type = "l") -``` - -
    plot of chunk unnamed-chunk-5
    - - - - - - +--- +title : Hodgepodge +subtitle : Regression models +author : Brian Caffo, Jeff Leek, Roger Peng +job : Johns Hopkins Bloomberg School of Public Health +logo : bloomberg_shield.png +framework : io2012 # {io2012, html5slides, shower, dzslides, ...} +highlighter : highlight.js # {highlight.js, prettify, highlight} +hitheme : tomorrow # +url: + lib: ../../librariesNew + assets: ../../assets +widgets : [mathjax] # {mathjax, quiz, bootstrap} +mode : selfcontained # {standalone, draft} +--- + + + +## How to fit functions using linear models +* Consider a model $Y_i = f(X_i) + \epsilon$. +* How can we fit such a model using linear models (called scatterplot smoothing) +* Consider the model + $$ + Y_i = \beta_0 + \beta_1 X_i + \sum_{k=1}^d (x_i - \xi_k)_+ \gamma_k + \epsilon_{i} + $$ +where $(a)_+ = a$ if $a > 0$ and $0$ otherwise and $\xi_1 \leq ... \leq \xi_d$ are known knot points. +* Prove to yourelf that the mean function +$$ +\beta_0 + \beta_1 X_i + \sum_{k=1}^d (x_i - \xi_k)_+ \gamma_k +$$ +is continuous at the knot points. + +--- +## Simulated example + +```r +n <- 500; x <- seq(0, 4 * pi, length = n); y <- sin(x) + rnorm(n, sd = .3) +knots <- seq(0, 8 * pi, length = 20); +splineTerms <- sapply(knots, function(knot) (x > knot) * (x - knot)) +xMat <- cbind(1, x, splineTerms) +yhat <- predict(lm(y ~ xMat - 1)) +plot(x, y, frame = FALSE, pch = 21, bg = "lightblue", cex = 2) +lines(x, yhat, col = "red", lwd = 2) +``` + +
    plot of chunk unnamed-chunk-1
    + + +--- +## Adding squared terms +* Adding squared terms makes it continuously differentiable at the knot points. +* Adding cubic terms makes it twice continuously differentiable at the knot points; etcetera. +$$ + Y_i = \beta_0 + \beta_1 X_i + \beta_2 X_i^2 + \sum_{k=1}^d (x_i - \xi_k)_+^2 \gamma_k + \epsilon_{i} +$$ + +--- + +```r +splineTerms <- sapply(knots, function(knot) (x > knot) * (x - knot)^2) +xMat <- cbind(1, x, x^2, splineTerms) +yhat <- predict(lm(y ~ xMat - 1)) +plot(x, y, frame = FALSE, pch = 21, bg = "lightblue", cex = 2) +lines(x, yhat, col = "red", lwd = 2) +``` + +
    plot of chunk unnamed-chunk-2
    + + +--- +## Notes +* The collection of regressors is called a basis. + * People have spent **a lot** of time thinking about bases for this kind of problem. So, consider this as just a teaser. +* Single knot point terms can fit hockey stick like processes. +* These bases can be used in GLMs as well. +* An issue with these approaches is the large number of parameters introduced. + * Requires some method of so called regularization. + +--- +## Harmonics using linear models + +```r +##Chord finder, playing the white keys on a piano from octave c4 - c5 +notes4 <- c(261.63, 293.66, 329.63, 349.23, 392.00, 440.00, 493.88, 523.25) +t <- seq(0, 2, by = .001); n <- length(t) +c4 <- sin(2 * pi * notes4[1] * t); e4 <- sin(2 * pi * notes4[3] * t); +g4 <- sin(2 * pi * notes4[5] * t) +chord <- c4 + e4 + g4 + rnorm(n, 0, 0.3) +x <- sapply(notes4, function(freq) sin(2 * pi * freq * t)) +fit <- lm(chord ~ x - 1) +``` + + +--- +
    plot of chunk unnamed-chunk-4
    + + +--- + +```r +##(How you would really do it) +a <- fft(chord); plot(Re(a)^2, type = "l") +``` + +
    plot of chunk unnamed-chunk-5
    + + + + + +