
Working on Improving the Random Forest

Will Koehrsen, 7 years ago
parent commit f17e4e0b16
31 changed files with 2197 additions and 17 deletions
  1. 1548 0
      random_forest_explained/Improving Random Forest Part 1.ipynb
  2. 251 0
      random_forest_explained/Improving Random Forest Part 2.ipynb
  3. 15 17
      random_forest_explained/Random Forest Explained.ipynb
  4. 3 0
      random_forest_explained/data/.RData
  5. 283 0
      random_forest_explained/data/.Rhistory
  6. 3 0
      random_forest_explained/data/hist_averages.csv
  7. 61 0
      random_forest_explained/data/munge_temps.R
  8. 3 0
      random_forest_explained/data/raw_temps.csv
  9. 27 0
      random_forest_explained/data/temp_format.R
  10. 0 0
      random_forest_explained/data/temps.csv
  11. 3 0
      random_forest_explained/data/temps_extended.csv
  12. 0 0
      random_forest_explained/images/Temperature Prediction Decision Tree - Page 1.png
  13. 0 0
      random_forest_explained/images/actual_and_variables.png
  14. 0 0
      random_forest_explained/images/basic_plots.png
  15. 0 0
      random_forest_explained/images/data_summary.PNG
  16. BIN
      random_forest_explained/images/exp_additional_plots.png
  17. BIN
      random_forest_explained/images/exp_cumulative_importances.png
  18. BIN
      random_forest_explained/images/exp_data.PNG
  19. BIN
      random_forest_explained/images/exp_data_summary.PNG
  20. BIN
      random_forest_explained/images/exp_temp_plots.png
  21. BIN
      random_forest_explained/images/exp_variable_importances.png
  22. 0 0
      random_forest_explained/images/feature_importances.png
  23. BIN
      random_forest_explained/images/feature_tradeoffs.PNG
  24. 0 0
      random_forest_explained/images/human_decision_process.PNG
  25. BIN
      random_forest_explained/images/model_comparison.png
  26. BIN
      random_forest_explained/images/pair_plots.png
  27. BIN
      random_forest_explained/images/small_tree.png
  28. 0 0
      random_forest_explained/images/small_tree_annotated.PNG
  29. 0 0
      random_forest_explained/images/temp_actual_predicted.png
  30. 0 0
      random_forest_explained/images/temperature_prediction_decision_tree.PNG
  31. BIN
      random_forest_explained/images/tree.png

File diff suppressed because it is too large
+ 1548 - 0
random_forest_explained/Improving Random Forest Part 1.ipynb


+ 251 - 0
random_forest_explained/Improving Random Forest Part 2.ipynb

@@ -0,0 +1,251 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Pandas is used for data manipulation\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Read in data as a dataframe\n",
+    "features = pd.read_csv('data/temps_extended.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# One Hot Encoding\n",
+    "features = pd.get_dummies(features)\n",
+    "\n",
+    "# Extract features and labels\n",
+    "labels = features['actual']\n",
+    "features = features.drop('actual', axis = 1)\n",
+    "\n",
+    "# List of features for later use\n",
+    "feature_list = list(features.columns)\n",
+    "\n",
+    "# Convert to numpy arrays\n",
+    "import numpy as np\n",
+    "\n",
+    "features = np.array(features)\n",
+    "labels = np.array(labels)\n",
+    "\n",
+    "# Training and Testing Sets\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "train_features, test_features, train_labels, test_labels = train_test_split(features, labels, \n",
+    "                                                                            test_size = 0.25, random_state = 42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Features Shape: (1643, 17)\n",
+      "Training Labels Shape: (1643,)\n",
+      "Testing Features Shape: (548, 17)\n",
+      "Testing Labels Shape: (548,)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('Training Features Shape:', train_features.shape)\n",
+    "print('Training Labels Shape:', train_labels.shape)\n",
+    "print('Testing Features Shape:', test_features.shape)\n",
+    "print('Testing Labels Shape:', test_labels.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Important train features shape: (1643, 5)\n",
+      "Important test features shape: (548, 5)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Names of five importances accounting for 95% of total importance\n",
+    "important_feature_names = ['temp_1', 'average', 'ws_1', 'temp_2', 'friend']\n",
+    "\n",
+    "# Find the columns of the most important features\n",
+    "important_indices = [feature_list.index(feature) for feature in important_feature_names]\n",
+    "\n",
+    "# Create training and testing sets with only the important features\n",
+    "important_train_features = train_features[:, important_indices]\n",
+    "important_test_features = test_features[:, important_indices]\n",
+    "\n",
+    "# Sanity check on operations\n",
+    "print('Important train features shape:', important_train_features.shape)\n",
+    "print('Important test features shape:', important_test_features.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parameters currently in use:\n",
+      "{'bootstrap': True,\n",
+      " 'criterion': 'mse',\n",
+      " 'max_depth': None,\n",
+      " 'max_features': 'auto',\n",
+      " 'max_leaf_nodes': None,\n",
+      " 'min_impurity_decrease': 0.0,\n",
+      " 'min_impurity_split': None,\n",
+      " 'min_samples_leaf': 1,\n",
+      " 'min_samples_split': 2,\n",
+      " 'min_weight_fraction_leaf': 0.0,\n",
+      " 'n_estimators': 1000,\n",
+      " 'n_jobs': 1,\n",
+      " 'oob_score': False,\n",
+      " 'random_state': 42,\n",
+      " 'verbose': 0,\n",
+      " 'warm_start': False}\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.ensemble import RandomForestRegressor\n",
+    "\n",
+    "rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)\n",
+    "\n",
+    "from pprint import pprint\n",
+    "\n",
+    "# Look at parameters used by our current forest\n",
+    "print('Parameters currently in use:')\n",
+    "pprint(rf.get_params())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Use only the most important features\n",
+    "train_features = important_train_features[:]\n",
+    "test_features = important_test_features[:]\n",
+    "\n",
+    "# Update feature list for visualizations\n",
+    "feature_list = important_feature_names[:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import RandomizedSearchCV\n",
+    "\n",
+    "# Number of trees in random forest\n",
+    "n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 5)]\n",
+    "# Number of features to consider at every split\n",
+    "max_features = ['auto', 'sqrt']\n",
+    "# Maximum number of levels in tree\n",
+    "max_depth = [int(x) for x in np.linspace(10, 100, num = 5)]\n",
+    "max_depth.append(None)\n",
+    "# Minimum number of samples required to split a node\n",
+    "min_samples_split = [2, 5, 10]\n",
+    "# Minimum number of samples required at each leaf node\n",
+    "min_samples_leaf = [1, 2, 4]\n",
+    "# Method of selecting samples for training each tree\n",
+    "bootstrap = [True, False]\n",
+    "\n",
+    "# Create the random grid\n",
+    "random_grid = {'n_estimators': n_estimators,\n",
+    "               'max_features': max_features,\n",
+    "               'max_depth': max_depth,\n",
+    "               'min_samples_split': min_samples_split,\n",
+    "               'min_samples_leaf': min_samples_leaf,\n",
+    "               'bootstrap': bootstrap}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Use the random grid to search for best hyperparameters\n",
+    "# First create the base model to tune\n",
+    "rf = RandomForestRegressor(random_state=42)\n",
+    "# Random search of parameters\n",
+    "rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,\n",
+    "                              n_iter = 20, scoring='neg_mean_absolute_error', \n",
+    "                              cv = 3, verbose=2, random_state=42, n_jobs=-1)\n",
+    "\n",
+    "# Fit the random search model\n",
+    "rf_random.fit(train_features, train_labels)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "rf_random.best_params_"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
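
The notebook ends at `rf_random.best_params_`. A natural next step, not included in this commit, is to score the refit best estimator on the held-out test set. A minimal sketch, assuming the variables defined in the cells above (the train/test split and the fitted `rf_random`):

    # Sketch only -- not part of the commit.
    # Assumes test_features, test_labels, and the fitted rf_random from above.
    import numpy as np

    # RandomizedSearchCV refits the best model on the full training set by default
    best_rf = rf_random.best_estimator_

    # Mean absolute error of the tuned model on the held-out test set
    predictions = best_rf.predict(test_features)
    mae = np.mean(np.abs(predictions - test_labels))
    print('Tuned model MAE: {:.2f} degrees.'.format(mae))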

File diff suppressed because it is too large
+ 15 - 17
random_forest_explained/Random Forest Explained.ipynb


+ 3 - 0
random_forest_explained/data/.RData

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26e03604f6895e6547964bc5cef6432001be4e3f37fa9275b94b70a14c1c1c09
+size 180158

+ 283 - 0
random_forest_explained/data/.Rhistory

@@ -0,0 +1,283 @@
+library(gganimate)
+library(ggthemes)
+df <- read_csv('data_vis_challenge.csv')
+library(tidyverse)
+library(gganimate)
+library(ggthemes)
+df <- read_csv('data_vis_challenge.csv')
+df <- gather(df, key = 'species', value = 'rate', -`Light Intensity`, -Temperature)
+df <- dplyr::rename(df, intensity = `Light Intensity`, temp = Temperature)
+ggplot(data = df, aes(x = temp, y = intensity, color = species, size = intensity)) + geom_point() +
+xlab('Temperature') + ylab('Rate')
+ggplot(df, aes(x = intensity, y = rate, color = species, frame = temp)) +
+geom_point() + theme_classic(12)
+library(tidyverse)
+library(gganimate)
+library(ggthemes)
+df <- read_csv('data_vis_challenge.csv')
+setwd("C:/Users/Will Koehrsen/Desktop")
+load("C:/Users/Will Koehrsen/Desktop/.RData")
+ggplot(data = df, aes(x = intensity, y = rate, color = species, frame = temp)) +
+geom_point() + theme_classic(12)
+table(df$intensity)
+table(df$temp)
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(fill = 'navy') +
+facet_grid(temp, intensity) + coord_flip()
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(fill = 'navy') +
+facet_grid(temp ~ intensity) + coord_flip()
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(stat = 'identity', fill = 'navy') +
+facet_grid(temp ~ intensity) + coord_flip()
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(stat = 'identity', fill = 'navy') +
+facet_grid(intensity ~ temp) + coord_flip() + theme_hc(12)
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(stat = 'identity', fill = 'navy') +
+facet_grid(intensity ~ temp) + coord_flip() + theme_stata(12)
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(stat = 'identity', fill = 'navy') +
+facet_grid(intensity ~ temp) + coord_flip() + theme_economist(12)
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(stat = 'identity', fill = 'navy') +
+facet_grid(intensity ~ temp) + coord_flip() + theme_hc(12)
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(stat = 'identity', fill = 'navy') +
+facet_grid(intensity ~ temp) + coord_flip() + theme_classic(12)
+setwd("~/Data-Analysis/random_forest_explained")
+library(tidyverse)
+library(lubridate)
+setwd("~/Data-Analysis/random_forest_explained")
+df <- read_csv('1169857.csv')
+df <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT WA US')
+df <- read_csv('1169857.csv')
+table(df$NAME)
+df <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE))
+temps$temp_1 <- c(45, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(45, 44, temps$TMAX[1:{nrow(temps) - 2}])
+View(temps)
+averages <- read_csv('averages.csv')
+View(averages)
+df <- read_csv('raw_temps.csv')
+temps <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE))
+temps$temp_1 <- c(45, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(45, 44, temps$TMAX[1:{nrow(temps) - 2}])
+averages <- read_csv('averages.csv')
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')], by = c('month', 'day'),
+all.x = TRUE)
+View(temps)
+df <- read_csv('raw_temps.csv')
+temps <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE))
+temps$temp_1 <- c(45, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(45, 44, temps$TMAX[1:{nrow(temps) - 2}])
+averages <- read_csv('averages.csv')
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')], by = c('month', 'day'),
+all.x = TRUE)
+temps <- arrange(temps, DATE)
+library(tidyverse)
+library(lubridate)
+df <- read_csv('raw_temps.csv')
+setwd("~/Data-Analysis/random_forest_explained/data")
+df <- read_csv('raw_temps.csv')
+temps <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE))
+temps$temp_1 <- c(temps$TMAX[1:{nrow(temps) - 1}])
+# Read in the data as a dataframe
+df <- read_csv('raw_temps.csv')
+# Make sure all readings are from same station
+temps <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+# Create month, day, and week columns
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE))
+# Create the past max temperature columns
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+# Read in the averages as a dataframe
+averages <- read_csv('hist_averages.csv')
+# Create columns for the month and day
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+# Join the averages to the temperature measurements
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE)
+View(temps)
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE)
+# Read in the data as a dataframe
+df <- read_csv('raw_temps.csv')
+# Make sure all readings are from same station
+temps <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+# Create month, day, and week columns
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE))
+# Create the past max temperature columns
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+# Read in the averages as a dataframe
+averages <- read_csv('hist_averages.csv')
+# Create columns for the month and day
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+# Join the averages to the temperature measurements
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE)
+df <- read_csv('raw_temps.csv')
+temps <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE)) %>%
+arrange(DATE)
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+averages <- read_csv('hist_averages.csv')
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE)
+temps <- dplyr::select(temps, month, day, AWND, PRCP, TMAX, week, temp_1, temp_2, DLY-TMAX-NORMAL)
+temps <- dplyr::select(temps, month, day, AWND, PRCP, TMAX, week, temp_1, temp_2, `DLY-TMAX-NORMAL`)
+df <- read_csv('raw_temps.csv')
+temps <- read_csv('raw_temps.csv')
+temps <- dplyr::filter(temps, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE)) %>%
+arrange(DATE)
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+averages <- read_csv('hist_averages.csv')
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE)
+library(tidyverse)
+library(lubridate)
+temps <- read_csv('raw_temps.csv')
+temps <- dplyr::filter(temps, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, year = lubridate::year(DATE), month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE)) %>%
+arrange(DATE)
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+averages <- read_csv('hist_averages.csv')
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE) %>% mutate(year = )
+temps <- dplyr::select(temps, year, month, day, week, AWND, PRCP, TMAX, temp_1, temp_2, `DLY-TMAX-NORMAL`, TMAX)
+temps <- dplyr::select(temps, year, month, day, week, AWND, PRCP, temp_1, temp_2, `DLY-TMAX-NORMAL`, TMAX)
+temps <- dplyr::select(temps, year, month, day, week, AWND, PRCP, temp_2, temp_1, `DLY-TMAX-NORMAL`, TMAX)
+names(temps) <- c('year', 'month', 'day', 'weekday', 'ws', 'prcp', 'temp_2',
+'temp_1', 'average', 'actual')
+sapply(temps$average, function(x) (runif(1, min = x - 20, max = x + 20)))
+sapply(temps$average, function(x) round(runif(1, min = x - 20, max = x + 20)))
+temps$friend <- sapply(temps$average, function(x) round(runif(1, min = x - 20, max = x + 20)))
+View(temps)
+temps <- temps[-c(1,2), ]
+ggplot(temps, aes(x = seq(1, nrow(temps)), y = ws)) + geom_point()
+for (column in c('ws', 'prcp', 'temp_2', 'temp_1', 'average', 'actual', 'friend')) {
+ggplot(temps, aes(x = seq(1, nrow(temps)), y = column)) + geom_point()
+}
+print(ggplot(temps, aes(x = seq(1, nrow(temps)), y = column)) + geom_point())
+for (column in c('ws', 'prcp', 'temp_2', 'temp_1', 'average', 'actual', 'friend')) {
+print(ggplot(temps, aes(x = seq(1, nrow(temps)), y = temps[[column]])) + geom_point())
+}
+temps <- temps[complete.cases(temps), ]
+summary(temps)
+write_csv(temps, 'temps_extended.csv')
+table(temps$year)
+library(tidyverse)
+library(lubridate)
+temps <- read_csv('raw_temps.csv')
+temps <- dplyr::filter(temps, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, year = lubridate::year(DATE),
+month = lubridate::month(DATE),
+day = lubridate::day(DATE),
+week = lubridate::wday(DATE, label = TRUE)) %>%
+arrange(DATE)
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+library(tidyverse)
+library(lubridate)
+# Read in the data as a dataframe
+temps <- read_csv('raw_temps.csv')
+# Make sure all readings are from same station
+temps <- dplyr::filter(temps, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+# Create month, day, and week columns
+temps <- mutate(temps, year = lubridate::year(DATE),
+month = lubridate::month(DATE),
+day = lubridate::day(DATE),
+week = lubridate::wday(DATE, label = TRUE)) %>%
+arrange(DATE)
+# Create the past max temperature columns
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+# Shift the average wind speed, precipitation, and snow depth
+temps$AWND <- c(NA, temps$AWND[1:{nrow(temps) - 1}])
+temps$PRCP <- c(NA, temps$PRCP[1:{nrow(temps) - 1}])
+temps$SNWD <- c(NA, temps$SNWD[1:{nrow(temps) - 1}])
+averages <- read_csv('hist_averages.csv')
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE)
+temps <- dplyr::select(temps, year, month, day, week, AWND, PRCP, SNWD,
+temp_2, temp_1, `DLY-TMAX-NORMAL`, TMAX)
+names(temps) <- c('year', 'month', 'day', 'weekday', 'ws', 'prcp', 'snwd',
+'temp_2', 'temp_1', 'average', 'actual')
+temps$friend <- sapply(temps$average, function(x)
+round(runif(1, min = x - 20, max = x + 20)))
+temps <- temps[-c(1,2), ]
+temps <- temps[complete.cases(temps), ]
+summary(temps)
+View(temps)
+# RF temperature modeling
+#
+# Read in data
+library(tidyverse)
+library(lubridate)
+# Read in the data as a dataframe
+temps <- read_csv('raw_temps.csv')
+# Make sure all readings are from same station
+temps <- dplyr::filter(temps, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+# Create month, day, and week columns
+temps <- mutate(temps, year = lubridate::year(DATE),
+month = lubridate::month(DATE),
+day = lubridate::day(DATE),
+week = lubridate::wday(DATE, label = TRUE)) %>%
+arrange(DATE)
+# Create the past max temperature columns
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+# Shift the average wind speed, precipitation, and snow depth
+temps$AWND <- c(NA, temps$AWND[1:{nrow(temps) - 1}])
+temps$PRCP <- c(NA, temps$PRCP[1:{nrow(temps) - 1}])
+temps$SNWD <- c(NA, temps$SNWD[1:{nrow(temps) - 1}])
+# Read in the averages as a dataframe
+averages <- read_csv('hist_averages.csv')
+# Create columns for the month and day
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+# Join the averages to the temperature measurements
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE)
+# Select and order relevant columns
+temps <- dplyr::select(temps, year, month, day, week, AWND, PRCP, SNWD,
+temp_2, temp_1, `DLY-TMAX-NORMAL`, TMAX)
+# Rename columns
+names(temps) <- c('year', 'month', 'day', 'weekday', 'ws_1', 'prcp_1', 'snwd_1',
+'temp_2', 'temp_1', 'average', 'actual')
+# Friend predictions
+temps$friend <- sapply(temps$average, function(x)
+round(runif(1, min = x - 20, max = x + 20)))
+# Remove first two rows
+temps <- temps[-c(1,2), ]
+# Remove na
+temps <- temps[complete.cases(temps), ]
+# Summary of data
+summary(temps)
+# Write to csv file
+write_csv(temps, 'temps_extended.csv')
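
The history above records a classic ggplot2 pitfall: inside the plotting loop, `aes(y = column)` maps the literal string rather than the column's values, which is why the working retry switches to `temps[[column]]`. For the Python side of the project, a loose matplotlib analogue of that quick per-column inspection (column names taken from the final munge_temps.R, so treat them as assumptions):

    # Sketch only: one scatter plot per column, mirroring the ggplot loop above
    import pandas as pd
    import matplotlib.pyplot as plt

    temps = pd.read_csv('temps_extended.csv')

    for column in ['ws_1', 'prcp_1', 'snwd_1', 'temp_2', 'temp_1',
                   'average', 'actual', 'friend']:
        plt.figure()
        plt.scatter(range(len(temps)), temps[column], s=5)
        plt.title(column)
    plt.show()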

+ 3 - 0
random_forest_explained/data/hist_averages.csv

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02e5acdf9b9d5d0ee67e0f55b4db5759f68cda00b580c281d54f390323072c20
+size 38819

+ 61 - 0
random_forest_explained/data/munge_temps.R

@@ -0,0 +1,61 @@
+# RF temperature modeling
+# 
+# Read in data
+library(tidyverse)
+library(lubridate)
+
+# Read in the data as a dataframe
+temps <- read_csv('raw_temps.csv')
+
+# Make sure all readings are from same station
+temps <- dplyr::filter(temps, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+
+# Create month, day, and week columns
+temps <- mutate(temps, year = lubridate::year(DATE), 
+                month = lubridate::month(DATE), 
+                day = lubridate::day(DATE), 
+                week = lubridate::wday(DATE, label = TRUE)) %>% 
+  arrange(DATE)
+
+# Create the past max temperature columns
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+# Shift the average wind speed, precipitation, and snow depth
+temps$AWND <- c(NA, temps$AWND[1:{nrow(temps) - 1}])
+temps$PRCP <- c(NA, temps$PRCP[1:{nrow(temps) - 1}])
+temps$SNWD <- c(NA, temps$SNWD[1:{nrow(temps) - 1}])
+
+# Read in the averages as a dataframe
+averages <- read_csv('hist_averages.csv')
+
+# Create columns for the month and day
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+
+# Join the averages to the temperature measurements
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')], 
+               by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE)
+
+# Select and order relevant columns
+temps <- dplyr::select(temps, year, month, day, week, AWND, PRCP, SNWD,
+                       temp_2, temp_1, `DLY-TMAX-NORMAL`, TMAX)
+
+# Rename columns
+names(temps) <- c('year', 'month', 'day', 'weekday', 'ws_1', 'prcp_1', 'snwd_1', 
+                  'temp_2', 'temp_1', 'average', 'actual')
+
+# Friend predictions
+temps$friend <- sapply(temps$average, function(x) 
+  round(runif(1, min = x - 20, max = x + 20)))
+
+# Remove first two rows
+temps <- temps[-c(1,2), ]
+
+# Remove na
+temps <- temps[complete.cases(temps), ]
+
+# Summary of data
+summary(temps)
+
+# Write to csv file
+write_csv(temps, 'temps_extended.csv')
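
For readers following along in the Python notebooks, a rough pandas equivalent of the core steps in munge_temps.R (the lagged weather columns, the month/day join against historical normals, and the random 'friend' guess) might look like the sketch below; file layouts and column names are assumed from the R script, not verified against the CSVs.

    # Sketch only: approximate pandas translation of munge_temps.R
    import numpy as np
    import pandas as pd

    temps = pd.read_csv('raw_temps.csv', parse_dates=['DATE'])
    temps = temps[temps['NAME'] == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US']
    temps = temps.sort_values('DATE').reset_index(drop=True)

    # Calendar features
    temps['year'] = temps['DATE'].dt.year
    temps['month'] = temps['DATE'].dt.month
    temps['day'] = temps['DATE'].dt.day
    temps['weekday'] = temps['DATE'].dt.day_name()

    # Lagged columns: shift() puts NaN at the top, like c(NA, ...) in R
    temps['temp_1'] = temps['TMAX'].shift(1)
    temps['temp_2'] = temps['TMAX'].shift(2)
    for col in ['AWND', 'PRCP', 'SNWD']:
        temps[col] = temps[col].shift(1)

    # Historical normals keyed by month/day (chars 5-6 and 7-8 of the DATE code)
    averages = pd.read_csv('hist_averages.csv', dtype={'DATE': str})
    averages['month'] = averages['DATE'].str[4:6].astype(int)
    averages['day'] = averages['DATE'].str[6:8].astype(int)
    temps = temps.merge(averages[['month', 'day', 'DLY-TMAX-NORMAL']],
                        on=['month', 'day'], how='left')

    # Select, order, and rename the relevant columns
    temps = temps[['year', 'month', 'day', 'weekday', 'AWND', 'PRCP', 'SNWD',
                   'temp_2', 'temp_1', 'DLY-TMAX-NORMAL', 'TMAX']]
    temps.columns = ['year', 'month', 'day', 'weekday', 'ws_1', 'prcp_1', 'snwd_1',
                     'temp_2', 'temp_1', 'average', 'actual']

    # Random 'friend' prediction within +/- 20 degrees of the historical average
    temps['friend'] = temps['average'].apply(
        lambda x: round(np.random.uniform(x - 20, x + 20)))

    # Drop incomplete rows (the first two lack lagged values) and save
    temps = temps.dropna().reset_index(drop=True)
    temps.to_csv('temps_extended.csv', index=False)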

+ 3 - 0
random_forest_explained/data/raw_temps.csv

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f786379e28bafe8cb4a4c60f3dfc888aa1bee3d71ee9fa5da5226a81066573b
+size 362408

+ 27 - 0
random_forest_explained/data/temp_format.R

@@ -0,0 +1,27 @@
+# RF temperature modeling
+# 
+# Read in data
+library(tidyverse)
+library(lubridate)
+
+temps <- read_csv('1159640.csv')
+temps <- mutate(temps, month = lubridate::month(DATE), 
+                day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE))
+temps$temp_1 <- c(45, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(45, 44, temps$TMAX[1:{nrow(temps) - 2}])
+
+averages <- read_csv('1159653.csv')
+averages <- dplyr::filter(averages, STATION_NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT WA US')
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')], by = c('month', 'day'), 
+               all.x = TRUE)
+
+temps <- temps[, c('month', 'week', 'day', 'DATE', 'TMAX', 'temp_1', 'temp_2', 'DLY-TMAX-NORMAL')]
+temps <- dplyr::rename(temps, date = DATE, actual = TMAX)
+temps <- dplyr::rename(temps, average = 'DLY-TMAX-NORMAL')
+temps$year <- 2016
+temps <- temps[, -which(names(temps) == 'date')]
+
+write_csv(temps, 'mod_temps.csv')

random_forest_explained/temps.csv → random_forest_explained/data/temps.csv


+ 3 - 0
random_forest_explained/data/temps_extended.csv

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45c030480125ea24f9cea7e87346858b9864f8e165a5fa6d8deae2856d15afef
+size 91141

random_forest_explained/Temperature Prediction Decision Tree - Page 1.png → random_forest_explained/images/Temperature Prediction Decision Tree - Page 1.png


random_forest_explained/actual_and_variables.png → random_forest_explained/images/actual_and_variables.png


random_forest_explained/basic_plots.png → random_forest_explained/images/basic_plots.png


random_forest_explained/data_summary.PNG → random_forest_explained/images/data_summary.PNG


BIN
random_forest_explained/images/exp_additional_plots.png


BIN
random_forest_explained/images/exp_cumulative_importances.png


BIN
random_forest_explained/images/exp_data.PNG


BIN
random_forest_explained/images/exp_data_summary.PNG


BIN
random_forest_explained/images/exp_temp_plots.png


BIN
random_forest_explained/images/exp_variable_importances.png


random_forest_explained/feature_importances.png → random_forest_explained/images/feature_importances.png


BIN
random_forest_explained/images/feature_tradeoffs.PNG


random_forest_explained/human_decision_process.PNG → random_forest_explained/images/human_decision_process.PNG


BIN
random_forest_explained/images/model_comparison.png


BIN
random_forest_explained/images/pair_plots.png


BIN
random_forest_explained/images/small_tree.png


random_forest_explained/small_tree_annotated.PNG → random_forest_explained/images/small_tree_annotated.PNG


random_forest_explained/temp_actual_predicted.png → random_forest_explained/images/temp_actual_predicted.png


random_forest_explained/temperature_prediction_decision_tree.PNG → random_forest_explained/images/temperature_prediction_decision_tree.PNG


BIN
random_forest_explained/images/tree.png