
Working on Improving the Random Forest

Will Koehrsen, 7 years ago
parent commit f17e4e0b16
31 changed files with 2197 additions and 17 deletions
  1. 1548 0
      random_forest_explained/Improving Random Forest Part 1.ipynb
  2. 251 0
      random_forest_explained/Improving Random Forest Part 2.ipynb
  3. 15 17
      random_forest_explained/Random Forest Explained.ipynb
  4. 3 0
      random_forest_explained/data/.RData
  5. 283 0
      random_forest_explained/data/.Rhistory
  6. 3 0
      random_forest_explained/data/hist_averages.csv
  7. 61 0
      random_forest_explained/data/munge_temps.R
  8. 3 0
      random_forest_explained/data/raw_temps.csv
  9. 27 0
      random_forest_explained/data/temp_format.R
  10. 0 0
      random_forest_explained/data/temps.csv
  11. 3 0
      random_forest_explained/data/temps_extended.csv
  12. 0 0
      random_forest_explained/images/Temperature Prediction Decision Tree - Page 1.png
  13. 0 0
      random_forest_explained/images/actual_and_variables.png
  14. 0 0
      random_forest_explained/images/basic_plots.png
  15. 0 0
      random_forest_explained/images/data_summary.PNG
  16. BIN
      random_forest_explained/images/exp_additional_plots.png
  17. BIN
      random_forest_explained/images/exp_cumulative_importances.png
  18. BIN
      random_forest_explained/images/exp_data.PNG
  19. BIN
      random_forest_explained/images/exp_data_summary.PNG
  20. BIN
      random_forest_explained/images/exp_temp_plots.png
  21. BIN
      random_forest_explained/images/exp_variable_importances.png
  22. 0 0
      random_forest_explained/images/feature_importances.png
  23. BIN
      random_forest_explained/images/feature_tradeoffs.PNG
  24. 0 0
      random_forest_explained/images/human_decision_process.PNG
  25. BIN
      random_forest_explained/images/model_comparison.png
  26. BIN
      random_forest_explained/images/pair_plots.png
  27. BIN
      random_forest_explained/images/small_tree.png
  28. 0 0
      random_forest_explained/images/small_tree_annotated.PNG
  29. 0 0
      random_forest_explained/images/temp_actual_predicted.png
  30. 0 0
      random_forest_explained/images/temperature_prediction_decision_tree.PNG
  31. BIN
      random_forest_explained/images/tree.png

File diff suppressed because it is too large
+ 1548 - 0
random_forest_explained/Improving Random Forest Part 1.ipynb


+ 251 - 0
random_forest_explained/Improving Random Forest Part 2.ipynb

@@ -0,0 +1,251 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Pandas is used for data manipulation\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Read in data as a dataframe\n",
+    "features = pd.read_csv('data/temps_extended.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# One Hot Encoding\n",
+    "features = pd.get_dummies(features)\n",
+    "\n",
+    "# Extract features and labels\n",
+    "labels = features['actual']\n",
+    "features = features.drop('actual', axis = 1)\n",
+    "\n",
+    "# List of features for later use\n",
+    "feature_list = list(features.columns)\n",
+    "\n",
+    "# Convert to numpy arrays\n",
+    "import numpy as np\n",
+    "\n",
+    "features = np.array(features)\n",
+    "labels = np.array(labels)\n",
+    "\n",
+    "# Training and Testing Sets\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "train_features, test_features, train_labels, test_labels = train_test_split(features, labels, \n",
+    "                                                                            test_size = 0.25, random_state = 42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Features Shape: (1643, 17)\n",
+      "Training Labels Shape: (1643,)\n",
+      "Testing Features Shape: (548, 17)\n",
+      "Testing Labels Shape: (548,)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('Training Features Shape:', train_features.shape)\n",
+    "print('Training Labels Shape:', train_labels.shape)\n",
+    "print('Testing Features Shape:', test_features.shape)\n",
+    "print('Testing Labels Shape:', test_labels.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Important train features shape: (1643, 5)\n",
+      "Important test features shape: (548, 5)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Names of five importances accounting for 95% of total importance\n",
+    "important_feature_names = ['temp_1', 'average', 'ws_1', 'temp_2', 'friend']\n",
+    "\n",
+    "# Find the columns of the most important features\n",
+    "important_indices = [feature_list.index(feature) for feature in important_feature_names]\n",
+    "\n",
+    "# Create training and testing sets with only the important features\n",
+    "important_train_features = train_features[:, important_indices]\n",
+    "important_test_features = test_features[:, important_indices]\n",
+    "\n",
+    "# Sanity check on operations\n",
+    "print('Important train features shape:', important_train_features.shape)\n",
+    "print('Important test features shape:', important_test_features.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parameters currently in use:\n",
+      "{'bootstrap': True,\n",
+      " 'criterion': 'mse',\n",
+      " 'max_depth': None,\n",
+      " 'max_features': 'auto',\n",
+      " 'max_leaf_nodes': None,\n",
+      " 'min_impurity_decrease': 0.0,\n",
+      " 'min_impurity_split': None,\n",
+      " 'min_samples_leaf': 1,\n",
+      " 'min_samples_split': 2,\n",
+      " 'min_weight_fraction_leaf': 0.0,\n",
+      " 'n_estimators': 1000,\n",
+      " 'n_jobs': 1,\n",
+      " 'oob_score': False,\n",
+      " 'random_state': 42,\n",
+      " 'verbose': 0,\n",
+      " 'warm_start': False}\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.ensemble import RandomForestRegressor\n",
+    "\n",
+    "rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)\n",
+    "\n",
+    "from pprint import pprint\n",
+    "\n",
+    "# Look at parameters used by our current forest\n",
+    "print('Parameters currently in use:')\n",
+    "pprint(rf.get_params())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Use only the most important features\n",
+    "train_features = important_train_features[:]\n",
+    "test_features = important_test_features[:]\n",
+    "\n",
+    "# Update feature list for visualizations\n",
+    "feature_list = important_feature_names[:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import RandomizedSearchCV\n",
+    "\n",
+    "# Number of trees in random forest\n",
+    "n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 5)]\n",
+    "# Number of features to consider at every split\n",
+    "max_features = ['auto', 'sqrt']\n",
+    "# Maximum number of levels in tree\n",
+    "max_depth = [int(x) for x in np.linspace(10, 100, num = 5)]\n",
+    "max_depth.append(None)\n",
+    "# Minimum number of samples required to split a node\n",
+    "min_samples_split = [2, 5, 10]\n",
+    "# Minimum number of samples required at each leaf node\n",
+    "min_samples_leaf = [1, 2, 4]\n",
+    "# Method of selecting samples for training each tree\n",
+    "bootstrap = [True, False]\n",
+    "\n",
+    "# Create the random grid\n",
+    "random_grid = {'n_estimators': n_estimators,\n",
+    "               'max_features': max_features,\n",
+    "               'max_depth': max_depth,\n",
+    "               'min_samples_split': min_samples_split,\n",
+    "               'min_samples_leaf': min_samples_leaf,\n",
+    "               'bootstrap': bootstrap}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Use the random grid to search for best hyperparameters\n",
+    "# First create the base model to tune\n",
+    "rf = RandomForestRegressor(random_state=42)\n",
+    "# Random search of parameters\n",
+    "rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,\n",
+    "                              n_iter = 20, scoring='neg_mean_absolute_error', \n",
+    "                              cv = 3, verbose=2, random_state=42, n_jobs=-1)\n",
+    "\n",
+    "# Fit the random search model\n",
+    "rf_random.fit(train_features, train_labels)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "rf_random.best_params_"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
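
The notebook ends at `rf_random.best_params_`. A natural next step, not included in this commit, is to score the refit best estimator on the held-out test set. A minimal sketch, assuming the variables defined in the cells above (the train/test split and the fitted `rf_random`):

    # Sketch only -- not part of the commit.
    # Assumes test_features, test_labels, and the fitted rf_random from above.
    import numpy as np

    # RandomizedSearchCV refits the best model on the full training set by default
    best_rf = rf_random.best_estimator_

    # Mean absolute error of the tuned model on the held-out test set
    predictions = best_rf.predict(test_features)
    mae = np.mean(np.abs(predictions - test_labels))
    print('Tuned model MAE: {:.2f} degrees.'.format(mae))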

File diff suppressed because it is too large
+ 15 - 17
random_forest_explained/Random Forest Explained.ipynb


+ 3 - 0
random_forest_explained/data/.RData

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26e03604f6895e6547964bc5cef6432001be4e3f37fa9275b94b70a14c1c1c09
+size 180158

+ 283 - 0
random_forest_explained/data/.Rhistory

@@ -0,0 +1,283 @@
+library(gganimate)
+library(ggthemes)
+df <- read_csv('data_vis_challenge.csv')
+library(tidyverse)
+library(gganimate)
+library(ggthemes)
+df <- read_csv('data_vis_challenge.csv')
+df <- gather(df, key = 'species', value = 'rate', -`Light Intensity`, -Temperature)
+df <- dplyr::rename(df, intensity = `Light Intensity`, temp = Temperature)
+ggplot(data = df, aes(x = temp, y = intensity, color = species, size = intensity)) + geom_point() +
+xlab('Temperature') + ylab('Rate')
+ggplot(df, aes(x = intensity, y = rate, color = species, frame = temp)) +
+geom_point() + theme_classic(12)
+library(tidyverse)
+library(gganimate)
+library(ggthemes)
+df <- read_csv('data_vis_challenge.csv')
+setwd("C:/Users/Will Koehrsen/Desktop")
+load("C:/Users/Will Koehrsen/Desktop/.RData")
+ggplot(data = df, aes(x = intensity, y = rate, color = species, frame = temp)) +
+geom_point() + theme_classic(12)
+table(df$intensity)
+table(df$temp)
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(fill = 'navy') +
+facet_grid(temp, intensity) + coord_flip()
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(fill = 'navy') +
+facet_grid(temp ~ intensity) + coord_flip()
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(stat = 'identity', fill = 'navy') +
+facet_grid(temp ~ intensity) + coord_flip()
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(stat = 'identity', fill = 'navy') +
+facet_grid(intensity ~ temp) + coord_flip() + theme_hc(12)
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(stat = 'identity', fill = 'navy') +
+facet_grid(intensity ~ temp) + coord_flip() + theme_stata(12)
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(stat = 'identity', fill = 'navy') +
+facet_grid(intensity ~ temp) + coord_flip() + theme_economist(12)
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(stat = 'identity', fill = 'navy') +
+facet_grid(intensity ~ temp) + coord_flip() + theme_hc(12)
+ggplot(data = df, aes(x = species, y = rate)) + geom_bar(stat = 'identity', fill = 'navy') +
+facet_grid(intensity ~ temp) + coord_flip() + theme_classic(12)
+setwd("~/Data-Analysis/random_forest_explained")
+library(tidyverse)
+library(lubridate)
+setwd("~/Data-Analysis/random_forest_explained")
+df <- read_csv('1169857.csv')
+df <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT WA US')
+df <- read_csv('1169857.csv')
+table(df$NAME)
+df <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE))
+temps$temp_1 <- c(45, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(45, 44, temps$TMAX[1:{nrow(temps) - 2}])
+View(temps)
+averages <- read_csv('averages.csv')
+View(averages)
+df <- read_csv('raw_temps.csv')
+temps <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE))
+temps$temp_1 <- c(45, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(45, 44, temps$TMAX[1:{nrow(temps) - 2}])
+averages <- read_csv('averages.csv')
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')], by = c('month', 'day'),
+all.x = TRUE)
+View(temps)
+df <- read_csv('raw_temps.csv')
+temps <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE))
+temps$temp_1 <- c(45, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(45, 44, temps$TMAX[1:{nrow(temps) - 2}])
+averages <- read_csv('averages.csv')
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')], by = c('month', 'day'),
+all.x = TRUE)
+temps <- arrange(temps, DATE)
+library(tidyverse)
+library(lubridate)
+df <- read_csv('raw_temps.csv')
+setwd("~/Data-Analysis/random_forest_explained/data")
+df <- read_csv('raw_temps.csv')
+temps <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE))
+temps$temp_1 <- c(temps$TMAX[1:{nrow(temps) - 1}])
+# Read in the data as a dataframe
+df <- read_csv('raw_temps.csv')
+# Make sure all readings are from same station
+temps <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+# Create month, day, and week columns
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE))
+# Create the past max temperature columns
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+# Read in the averages as a dataframe
+averages <- read_csv('hist_averages.csv')
+# Create columns for the month and day
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+# Join the averages to the temperature measurements
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE)
+View(temps)
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE)
+# Read in the data as a dataframe
+df <- read_csv('raw_temps.csv')
+# Make sure all readings are from same station
+temps <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+# Create month, day, and week columns
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE))
+# Create the past max temperature columns
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+# Read in the averages as a dataframe
+averages <- read_csv('hist_averages.csv')
+# Create columns for the month and day
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+# Join the averages to the temperature measurements
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE)
+df <- read_csv('raw_temps.csv')
+temps <- dplyr::filter(df, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE)) %>%
+arrange(DATE)
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+averages <- read_csv('hist_averages.csv')
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE)
+temps <- dplyr::select(temps, month, day, AWND, PRCP, TMAX, week, temp_1, temp_2, DLY-TMAX-NORMAL)
+temps <- dplyr::select(temps, month, day, AWND, PRCP, TMAX, week, temp_1, temp_2, `DLY-TMAX-NORMAL`)
+df <- read_csv('raw_temps.csv')
+temps <- read_csv('raw_temps.csv')
+temps <- dplyr::filter(temps, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE)) %>%
+arrange(DATE)
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+averages <- read_csv('hist_averages.csv')
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE)
+library(tidyverse)
+library(lubridate)
+temps <- read_csv('raw_temps.csv')
+temps <- dplyr::filter(temps, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, year = lubridate::year(DATE), month = lubridate::month(DATE),
+day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE)) %>%
+arrange(DATE)
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+averages <- read_csv('hist_averages.csv')
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE) %>% mutate(year = )
+temps <- dplyr::select(temps, year, month, day, week, AWND, PRCP, TMAX, temp_1, temp_2, `DLY-TMAX-NORMAL`, TMAX)
+temps <- dplyr::select(temps, year, month, day, week, AWND, PRCP, temp_1, temp_2, `DLY-TMAX-NORMAL`, TMAX)
+temps <- dplyr::select(temps, year, month, day, week, AWND, PRCP, temp_2, temp_1, `DLY-TMAX-NORMAL`, TMAX)
+names(temps) <- c('year', 'month', 'day', 'weekday', 'ws', 'prcp', 'temp_2',
+'temp_1', 'average', 'actual')
+sapply(temps$average, function(x) (runif(1, min = x - 20, max = x + 20)))
+sapply(temps$average, function(x) round(runif(1, min = x - 20, max = x + 20)))
+temps$friend <- sapply(temps$average, function(x) round(runif(1, min = x - 20, max = x + 20)))
+View(temps)
+temps <- temps[-c(1,2), ]
+ggplot(temps, aes(x = seq(1, nrow(temps)), y = ws)) + geom_point()
+for (column in c('ws', 'prcp', 'temp_2', 'temp_1', 'average', 'actual', 'friend')) {
+ggplot(temps, aes(x = seq(1, nrow(temps)), y = column)) + geom_point()
+}
+print(ggplot(temps, aes(x = seq(1, nrow(temps)), y = column)) + geom_point())
+for (column in c('ws', 'prcp', 'temp_2', 'temp_1', 'average', 'actual', 'friend')) {
+print(ggplot(temps, aes(x = seq(1, nrow(temps)), y = temps[[column]])) + geom_point())
+}
+temps <- temps[complete.cases(temps), ]
+summary(temps)
+write_csv(temps, 'temps_extended.csv')
+table(temps$year)
+library(tidyverse)
+library(lubridate)
+temps <- read_csv('raw_temps.csv')
+temps <- dplyr::filter(temps, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+temps <- mutate(temps, year = lubridate::year(DATE),
+month = lubridate::month(DATE),
+day = lubridate::day(DATE),
+week = lubridate::wday(DATE, label = TRUE)) %>%
+arrange(DATE)
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+library(tidyverse)
+library(lubridate)
+# Read in the data as a dataframe
+temps <- read_csv('raw_temps.csv')
+# Make sure all readings are from same station
+temps <- dplyr::filter(temps, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+# Create month, day, and week columns
+temps <- mutate(temps, year = lubridate::year(DATE),
+month = lubridate::month(DATE),
+day = lubridate::day(DATE),
+week = lubridate::wday(DATE, label = TRUE)) %>%
+arrange(DATE)
+# Create the past max temperature columns
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+# Shift the average wind speed, precipitation, and snow depth
+temps$AWND <- c(NA, temps$AWND[1:{nrow(temps) - 1}])
+temps$PRCP <- c(NA, temps$PRCP[1:{nrow(temps) - 1}])
+temps$SNWD <- c(NA, temps$SNWD[1:{nrow(temps) - 1}])
+averages <- read_csv('hist_averages.csv')
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE)
+temps <- dplyr::select(temps, year, month, day, week, AWND, PRCP, SNWD,
+temp_2, temp_1, `DLY-TMAX-NORMAL`, TMAX)
+names(temps) <- c('year', 'month', 'day', 'weekday', 'ws', 'prcp', 'snwd',
+'temp_2', 'temp_1', 'average', 'actual')
+temps$friend <- sapply(temps$average, function(x)
+round(runif(1, min = x - 20, max = x + 20)))
+temps <- temps[-c(1,2), ]
+temps <- temps[complete.cases(temps), ]
+summary(temps)
+View(temps)
+# RF temperature modeling
+#
+# Read in data
+library(tidyverse)
+library(lubridate)
+# Read in the data as a dataframe
+temps <- read_csv('raw_temps.csv')
+# Make sure all readings are from same station
+temps <- dplyr::filter(temps, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+# Create month, day, and week columns
+temps <- mutate(temps, year = lubridate::year(DATE),
+month = lubridate::month(DATE),
+day = lubridate::day(DATE),
+week = lubridate::wday(DATE, label = TRUE)) %>%
+arrange(DATE)
+# Create the past max temperature columns
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+# Shift the average wind speed, precipitation, and snow depth
+temps$AWND <- c(NA, temps$AWND[1:{nrow(temps) - 1}])
+temps$PRCP <- c(NA, temps$PRCP[1:{nrow(temps) - 1}])
+temps$SNWD <- c(NA, temps$SNWD[1:{nrow(temps) - 1}])
+# Read in the averages as a dataframe
+averages <- read_csv('hist_averages.csv')
+# Create columns for the month and day
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+# Join the averages to the temperature measurements
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')],
+by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE)
+# Select and order relevant columns
+temps <- dplyr::select(temps, year, month, day, week, AWND, PRCP, SNWD,
+temp_2, temp_1, `DLY-TMAX-NORMAL`, TMAX)
+# Rename columns
+names(temps) <- c('year', 'month', 'day', 'weekday', 'ws_1', 'prcp_1', 'snwd_1',
+'temp_2', 'temp_1', 'average', 'actual')
+# Friend predictions
+temps$friend <- sapply(temps$average, function(x)
+round(runif(1, min = x - 20, max = x + 20)))
+# Remove first two rows
+temps <- temps[-c(1,2), ]
+# Remove na
+temps <- temps[complete.cases(temps), ]
+# Summary of data
+summary(temps)
+# Write to csv file
+write_csv(temps, 'temps_extended.csv')
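
The history above records a classic ggplot2 pitfall: inside the plotting loop, `aes(y = column)` maps the literal string rather than the column's values, which is why the working retry switches to `temps[[column]]`. For the Python side of the project, a loose matplotlib analogue of that quick per-column inspection (column names taken from the final munge_temps.R, so treat them as assumptions):

    # Sketch only: one scatter plot per column, mirroring the ggplot loop above
    import pandas as pd
    import matplotlib.pyplot as plt

    temps = pd.read_csv('temps_extended.csv')

    for column in ['ws_1', 'prcp_1', 'snwd_1', 'temp_2', 'temp_1',
                   'average', 'actual', 'friend']:
        plt.figure()
        plt.scatter(range(len(temps)), temps[column], s=5)
        plt.title(column)
    plt.show()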

+ 3 - 0
random_forest_explained/data/hist_averages.csv

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02e5acdf9b9d5d0ee67e0f55b4db5759f68cda00b580c281d54f390323072c20
+size 38819

+ 61 - 0
random_forest_explained/data/munge_temps.R

@@ -0,0 +1,61 @@
+# RF temperature modeling
+# 
+# Read in data
+library(tidyverse)
+library(lubridate)
+
+# Read in the data as a dataframe
+temps <- read_csv('raw_temps.csv')
+
+# Make sure all readings are from same station
+temps <- dplyr::filter(temps, NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US')
+
+# Create month, day, and week columns
+temps <- mutate(temps, year = lubridate::year(DATE), 
+                month = lubridate::month(DATE), 
+                day = lubridate::day(DATE), 
+                week = lubridate::wday(DATE, label = TRUE)) %>% 
+  arrange(DATE)
+
+# Create the past max temperature columns
+temps$temp_1 <- c(NA, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(NA, NA, temps$TMAX[1:{nrow(temps) - 2}])
+# Shift the average wind speed, precipitation, and snow depth
+temps$AWND <- c(NA, temps$AWND[1:{nrow(temps) - 1}])
+temps$PRCP <- c(NA, temps$PRCP[1:{nrow(temps) - 1}])
+temps$SNWD <- c(NA, temps$SNWD[1:{nrow(temps) - 1}])
+
+# Read in the averages as a dataframe
+averages <- read_csv('hist_averages.csv')
+
+# Create columns for the month and day
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+
+# Join the averages to the temperature measurements
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')], 
+               by = c('month', 'day'), all.x = TRUE) %>% arrange(DATE)
+
+# Select and order relevant columns
+temps <- dplyr::select(temps, year, month, day, week, AWND, PRCP, SNWD,
+                       temp_2, temp_1, `DLY-TMAX-NORMAL`, TMAX)
+
+# Rename columns
+names(temps) <- c('year', 'month', 'day', 'weekday', 'ws_1', 'prcp_1', 'snwd_1', 
+                  'temp_2', 'temp_1', 'average', 'actual')
+
+# Friend predictions
+temps$friend <- sapply(temps$average, function(x) 
+  round(runif(1, min = x - 20, max = x + 20)))
+
+# Remove first two rows
+temps <- temps[-c(1,2), ]
+
+# Remove na
+temps <- temps[complete.cases(temps), ]
+
+# Summary of data
+summary(temps)
+
+# Write to csv file
+write_csv(temps, 'temps_extended.csv')
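
For readers following along in the Python notebooks, a rough pandas equivalent of the core steps in munge_temps.R (the lagged weather columns, the month/day join against historical normals, and the random 'friend' guess) might look like the sketch below; file layouts and column names are assumed from the R script, not verified against the CSVs.

    # Sketch only: approximate pandas translation of munge_temps.R
    import numpy as np
    import pandas as pd

    temps = pd.read_csv('raw_temps.csv', parse_dates=['DATE'])
    temps = temps[temps['NAME'] == 'SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US']
    temps = temps.sort_values('DATE').reset_index(drop=True)

    # Calendar features
    temps['year'] = temps['DATE'].dt.year
    temps['month'] = temps['DATE'].dt.month
    temps['day'] = temps['DATE'].dt.day
    temps['weekday'] = temps['DATE'].dt.day_name()

    # Lagged columns: shift() puts NaN at the top, like c(NA, ...) in R
    temps['temp_1'] = temps['TMAX'].shift(1)
    temps['temp_2'] = temps['TMAX'].shift(2)
    for col in ['AWND', 'PRCP', 'SNWD']:
        temps[col] = temps[col].shift(1)

    # Historical normals keyed by month/day (chars 5-6 and 7-8 of the DATE code)
    averages = pd.read_csv('hist_averages.csv', dtype={'DATE': str})
    averages['month'] = averages['DATE'].str[4:6].astype(int)
    averages['day'] = averages['DATE'].str[6:8].astype(int)
    temps = temps.merge(averages[['month', 'day', 'DLY-TMAX-NORMAL']],
                        on=['month', 'day'], how='left')

    # Select, order, and rename the relevant columns
    temps = temps[['year', 'month', 'day', 'weekday', 'AWND', 'PRCP', 'SNWD',
                   'temp_2', 'temp_1', 'DLY-TMAX-NORMAL', 'TMAX']]
    temps.columns = ['year', 'month', 'day', 'weekday', 'ws_1', 'prcp_1', 'snwd_1',
                     'temp_2', 'temp_1', 'average', 'actual']

    # Random 'friend' prediction within +/- 20 degrees of the historical average
    temps['friend'] = temps['average'].apply(
        lambda x: round(np.random.uniform(x - 20, x + 20)))

    # Drop incomplete rows (the first two lack lagged values) and save
    temps = temps.dropna().reset_index(drop=True)
    temps.to_csv('temps_extended.csv', index=False)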

+ 3 - 0
random_forest_explained/data/raw_temps.csv

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f786379e28bafe8cb4a4c60f3dfc888aa1bee3d71ee9fa5da5226a81066573b
+size 362408

+ 27 - 0
random_forest_explained/data/temp_format.R

@@ -0,0 +1,27 @@
+# RF temperature modeling
+# 
+# Read in data
+library(tidyverse)
+library(lubridate)
+
+temps <- read_csv('1159640.csv')
+temps <- mutate(temps, month = lubridate::month(DATE), 
+                day = lubridate::day(DATE), week = lubridate::wday(DATE, label = TRUE))
+temps$temp_1 <- c(45, temps$TMAX[1:{nrow(temps) - 1}])
+temps$temp_2 <- c(45, 44, temps$TMAX[1:{nrow(temps) - 2}])
+
+averages <- read_csv('1159653.csv')
+averages <- dplyr::filter(averages, STATION_NAME == 'SEATTLE TACOMA INTERNATIONAL AIRPORT WA US')
+averages$month <- as.numeric(substr(averages$DATE, 5, 6))
+averages$day <- as.numeric(substr(averages$DATE, 7, 8))
+
+temps <- merge(temps, averages[, c('month', 'day', 'DLY-TMAX-NORMAL')], by = c('month', 'day'), 
+               all.x = TRUE)
+
+temps <- temps[, c('month', 'week', 'day', 'DATE', 'TMAX', 'temp_1', 'temp_2', 'DLY-TMAX-NORMAL')]
+temps <- dplyr::rename(temps, date = DATE, actual = TMAX)
+temps <- dplyr::rename(temps, average = 'DLY-TMAX-NORMAL')
+temps$year <- 2016
+temps <- temps[, -which(names(temps) == 'date')]
+
+write_csv(temps, 'mod_temps.csv')

random_forest_explained/temps.csv → random_forest_explained/data/temps.csv


+ 3 - 0
random_forest_explained/data/temps_extended.csv

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45c030480125ea24f9cea7e87346858b9864f8e165a5fa6d8deae2856d15afef
+size 91141

random_forest_explained/Temperature Prediction Decision Tree - Page 1.png → random_forest_explained/images/Temperature Prediction Decision Tree - Page 1.png


random_forest_explained/actual_and_variables.png → random_forest_explained/images/actual_and_variables.png


random_forest_explained/basic_plots.png → random_forest_explained/images/basic_plots.png


random_forest_explained/data_summary.PNG → random_forest_explained/images/data_summary.PNG


BIN
random_forest_explained/images/exp_additional_plots.png


BIN
random_forest_explained/images/exp_cumulative_importances.png


BIN
random_forest_explained/images/exp_data.PNG


BIN
random_forest_explained/images/exp_data_summary.PNG


BIN
random_forest_explained/images/exp_temp_plots.png


BIN
random_forest_explained/images/exp_variable_importances.png


random_forest_explained/feature_importances.png → random_forest_explained/images/feature_importances.png


BIN
random_forest_explained/images/feature_tradeoffs.PNG


random_forest_explained/human_decision_process.PNG → random_forest_explained/images/human_decision_process.PNG


BIN
random_forest_explained/images/model_comparison.png


BIN
random_forest_explained/images/pair_plots.png


BIN
random_forest_explained/images/small_tree.png


random_forest_explained/small_tree_annotated.PNG → random_forest_explained/images/small_tree_annotated.PNG


random_forest_explained/temp_actual_predicted.png → random_forest_explained/images/temp_actual_predicted.png


random_forest_explained/temperature_prediction_decision_tree.PNG → random_forest_explained/images/temperature_prediction_decision_tree.PNG


BIN
random_forest_explained/images/tree.png