7 years ago · 0cebbd3847
--- a/statistical_significance/bonferroni.Rmd
+++ b/statistical_significance/bonferroni.Rmd
@@ -0,0 +1,57 @@
 
				+---
			
 
				+title: "Multiple Comparisons and the Bonferroni Correction"
			
 
				+author: "Will Koehrsen"
			
 
				+date: "January 25, 2018"
			
 
				+output: html_document
			
 
				+---
			
 
				+
			
 
				+```{r setup, include=FALSE}
			
 
				+knitr::opts_chunk$set(echo = TRUE)
			
 
				+```
			
 
				+# Multiple Comparisons
			
 
				+
			
 
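				+We have a feature vector `x` of 100 random data points, and we want to test it against 
				+100 other random vectors that are unrelated to it. We will use a significance level of $\alpha = 0.05$ for all of our comparisons, so even though no real relationship exists, we expect about $100 \times 0.05 = 5$ of the tests to come out "significant" purely by chance. 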
			
 
				+
			
 
				+```{r}
			
 
				+# Response: 100 absolute standard-normal draws, scaled by 5
				+x <- abs(rnorm(100)) * 5
				+
				+# Candidate predictors: 1e4 Uniform(0, 5) draws arranged as 100 columns of
				+# 100 values each; as.data.frame() names the columns V1 ... V100
				+y <- matrix(runif(1e4, 0, 5), nrow = 100, ncol = 100)
				+df <- as.data.frame(y)
				+df$x <- x
				+
				+# Regress x on every predictor column in turn and keep the p-value of the
				+# slope, i.e. row 2 (the predictor) and column 4 (Pr(>|t|)) of the
				+# coefficient table returned by summary()
				+predictors <- setdiff(names(df), "x")
				+slope_p <- vapply(
				+  predictors,
				+  function(term) summary(lm(df$x ~ df[[term]]))$coefficients[2, 4],
				+  numeric(1),
				+  USE.NAMES = FALSE
				+)
				+
				+# One row per comparison: predictor name and its uncorrected p-value
				+p_values <- data.frame(
				+  variable = predictors,
				+  p_value = slope_p,
				+  stringsAsFactors = FALSE
				+)
			
 
				+
			
 
				+
			
 
				+```
			
 
				+
			
 
				+```{r}
			
 
				+library(tidyverse)
			
 
				+library(ggthemes)
			
 
				+# Order the comparisons from smallest to largest p-value
				+p_values <- arrange(p_values, p_value)
				+
				+# Variables that pass the uncorrected threshold. The data are unseeded
				+# random noise, so how many pass changes from one knit to the next.
				+alpha_level <- 0.05
				+sig_vars <- dplyr::filter(p_values, p_value < alpha_level)$variable
				+if (length(sig_vars) == 0) {
				+  stop(
				+    "No variable reached p < ", alpha_level,
				+    "; re-knit to draw new random data.",
				+    call. = FALSE
				+  )
				+}
				+
				+# Reshape to long form with x carried along as a column, so each value
				+# stays paired with its x regardless of how many variables were selected
				+# (previously x was repeated a hard-coded 4 times).
				+sig_df <- gather(df[, c("x", sig_vars)], key = "key", value = "value", -x)
				+
				+# Scatter plus per-variable least-squares line; aesthetics refer to columns
				+# of sig_df by name instead of sig_df$... so ggplot2 keeps them aligned
				+ggplot(data = sig_df, aes(x = x, y = value, color = key)) +
				+  geom_point() +
				+  geom_smooth(method = "lm", se = FALSE) +
				+  theme_fivethirtyeight(14) + xlab('x') + ylab('Y') +
				+  ggtitle('Y vs X for Random Variables')
			
 
				+
			
 
				+```
			
--- a/statistical_significance/images/no_correction.png
+++ b/statistical_significance/images/no_correction.png
--- a/statistical_significance/images/spurious.png
+++ b/statistical_significance/images/spurious.png
--- a/statistical_significance/images/with_correction.png
+++ b/statistical_significance/images/with_correction.png
--- a/statistical_significance/multiple_comparisons.ipynb
+++ b/statistical_significance/multiple_comparisons.ipynb
--- a/statistical_significance/style/bmh_matplotlibrc.json
+++ b/statistical_significance/style/bmh_matplotlibrc.json
@@ -0,0 +1,22 @@
 
				+{
			
 
				+  "lines.linewidth": 2.0,
			
 
				+  "axes.edgecolor": "#bcbcbc",
			
 
				+  "patch.linewidth": 0.5,
			
 
				+  "legend.fancybox": true,
			
 
				+  "axes.color_cycle": [
			
 
				+    "#348ABD",
			
 
				+    "#A60628",
			
 
				+    "#7A68A6",
			
 
				+    "#467821",
			
 
				+    "#CF4457",
			
 
				+    "#188487",
			
 
				+    "#E24A33"
			
 
				+  ],
			
 
				+  "axes.facecolor": "#eeeeee",
			
 
				+  "axes.labelsize": "large",
			
 
				+  "axes.grid": true,
			
 
				+  "patch.edgecolor": "#eeeeee",
			
 
				+  "axes.titlesize": "x-large",
			
 
				+  "svg.fonttype": "path",
			
 
				+  "examples.directory": ""
			
 
				+}