{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Pandas is used for data manipulation\n",
    "import pandas as pd\n",
    "\n",
    "# Read in data as a dataframe\n",
    "features = pd.read_csv('data/temps_extended.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# One Hot Encoding\n",
    "features = pd.get_dummies(features)\n",
    "\n",
    "# Extract features and labels\n",
    "labels = features['actual']\n",
    "features = features.drop('actual', axis = 1)\n",
    "\n",
    "# List of features for later use\n",
    "feature_list = list(features.columns)\n",
    "\n",
    "# Convert to numpy arrays\n",
    "import numpy as np\n",
    "\n",
    "features = np.array(features)\n",
    "labels = np.array(labels)\n",
    "\n",
    "# Training and Testing Sets\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_features, test_features, train_labels, test_labels = train_test_split(features, labels, \n",
    "                                                                            test_size = 0.25, random_state = 42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training Features Shape: (1643, 17)\n",
      "Training Labels Shape: (1643,)\n",
      "Testing Features Shape: (548, 17)\n",
      "Testing Labels Shape: (548,)\n"
     ]
    }
   ],
   "source": [
    "print('Training Features Shape:', train_features.shape)\n",
    "print('Training Labels Shape:', train_labels.shape)\n",
    "print('Testing Features Shape:', test_features.shape)\n",
    "print('Testing Labels Shape:', test_labels.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Important train features shape: (1643, 5)\n",
      "Important test features shape: (548, 5)\n"
     ]
    }
   ],
   "source": [
    "# Names of five importances accounting for 95% of total importance\n",
    "important_feature_names = ['temp_1', 'average', 'ws_1', 'temp_2', 'friend']\n",
    "\n",
    "# Find the columns of the most important features\n",
    "important_indices = [feature_list.index(feature) for feature in important_feature_names]\n",
    "\n",
    "# Create training and testing sets with only the important features\n",
    "important_train_features = train_features[:, important_indices]\n",
    "important_test_features = test_features[:, important_indices]\n",
    "\n",
    "# Sanity check on operations\n",
    "print('Important train features shape:', important_train_features.shape)\n",
    "print('Important test features shape:', important_test_features.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Parameters currently in use:\n",
      "{'bootstrap': True,\n",
      " 'criterion': 'mse',\n",
      " 'max_depth': None,\n",
      " 'max_features': 'auto',\n",
      " 'max_leaf_nodes': None,\n",
      " 'min_impurity_decrease': 0.0,\n",
      " 'min_impurity_split': None,\n",
      " 'min_samples_leaf': 1,\n",
      " 'min_samples_split': 2,\n",
      " 'min_weight_fraction_leaf': 0.0,\n",
      " 'n_estimators': 1000,\n",
      " 'n_jobs': 1,\n",
      " 'oob_score': False,\n",
      " 'random_state': 42,\n",
      " 'verbose': 0,\n",
      " 'warm_start': False}\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor\n",
    "\n",
    "rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)\n",
    "\n",
    "from pprint import pprint\n",
    "\n",
    "# Look at parameters used by our current forest\n",
    "print('Parameters currently in use:')\n",
    "pprint(rf.get_params())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Use only the most important features\n",
    "train_features = important_train_features[:]\n",
    "test_features = important_test_features[:]\n",
    "\n",
    "# Update feature list for visualizations\n",
    "feature_list = important_feature_names[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "\n",
    "# Number of trees in random forest\n",
    "n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 5)]\n",
    "# Number of features to consider at every split\n",
    "max_features = ['auto', 'sqrt']\n",
    "# Maximum number of levels in tree\n",
    "max_depth = [int(x) for x in np.linspace(10, 100, num = 5)]\n",
    "max_depth.append(None)\n",
    "# Minimum number of samples required to split a node\n",
    "min_samples_split = [2, 5, 10]\n",
    "# Minimum number of samples required at each leaf node\n",
    "min_samples_leaf = [1, 2, 4]\n",
    "# Method of selecting samples for training each tree\n",
    "bootstrap = [True, False]\n",
    "\n",
    "# Create the random grid\n",
    "random_grid = {'n_estimators': n_estimators,\n",
    "               'max_features': max_features,\n",
    "               'max_depth': max_depth,\n",
    "               'min_samples_split': min_samples_split,\n",
    "               'min_samples_leaf': min_samples_leaf,\n",
    "               'bootstrap': bootstrap}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Use the random grid to search for best hyperparameters\n",
    "# First create the base model to tune\n",
    "rf = RandomForestRegressor(random_state=42)\n",
    "# Random search of parameters\n",
    "rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,\n",
    "                              n_iter = 20, scoring='neg_mean_absolute_error', \n",
    "                              cv = 3, verbose=2, random_state=42, n_jobs=-1)\n",
    "\n",
    "# Fit the random search model\n",
    "rf_random.fit(train_features, train_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rf_random.best_params_"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}