---
title: "Multiple Comparisons and the Bonferroni Correction"
author: "Will Koehrsen"
date: "January 25, 2018"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
# Multiple Comparison

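We have a feature vector `x` of 100 random data points, and we want to test it against 
100 other random vectors by regressing `x` on each one in turn. We will use a 
significance level of $\alpha = 0.05$ for every comparison, so by chance alone we 
expect roughly 5 of the 100 tests to look "significant". 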

```{r}
# Response: 100 half-normal draws scaled by 5
x <- 5 * abs(rnorm(100))

# Predictors: 10,000 uniform draws on [0, 5], laid out as 100 columns of
# 100 values each (columns are named V1 ... V100 by as.data.frame)
y <- matrix(runif(1e4, 0, 5), nrow = 100, ncol = 100)
df <- as.data.frame(y)
df$x <- x

# Every column except the response is a candidate predictor
predictors <- setdiff(names(df), "x")

# Regress x on each predictor separately and keep the p-value of the slope
# term (row 2, column 4 of the coefficient table) for each fit
p_values <- data.frame(
  variable = predictors,
  p_value = vapply(
    predictors,
    function(col) {
      fit <- lm(df$x ~ df[[col]])
      summary(fit)$coefficients[2, 4]
    },
    numeric(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)


```

```{r}
library(tidyverse)
library(ggthemes)

# Rank the tests from smallest to largest p-value
p_values <- arrange(p_values, p_value)

# Predictors whose slope is "significant" at alpha = 0.05. The data are
# random, so the number of such predictors changes from run to run.
sig_vars <- dplyr::filter(p_values, p_value < 0.05)$variable

# Keep x next to the significant columns and reshape to long format, so each
# value stays paired with its own x however many predictors were selected.
# drop = FALSE keeps a data frame even when only one predictor is significant.
sig_df <- df[, c("x", sig_vars), drop = FALSE]
sig_df <- pivot_longer(
  sig_df,
  cols = -x,
  names_to = "key",
  values_to = "value"
)

# Map aesthetics by column name (not sig_df$...) and share them across layers.
# theme_fivethirtyeight() blanks the axis titles, so they are re-enabled for
# xlab()/ylab() to be visible.
ggplot(data = sig_df, aes(x = x, y = value, color = key)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_fivethirtyeight(14) +
  theme(axis.title = element_text()) +
  xlab("x") +
  ylab("Y") +
  ggtitle("Y vs X for Random Variables")

```