{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Random Forest Hyperparameter Tuning\n",
    "\n",
    "Loads `data/temps_extended.csv`, keeps five selected features, and runs a randomized, cross-validated search over `RandomForestRegressor` hyperparameters to predict the `actual` column."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# All imports live here so a fresh kernel can run the notebook top to bottom\n",
    "from pprint import pprint\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.model_selection import RandomizedSearchCV, train_test_split\n",
    "\n",
    "# Configuration shared by the cells below\n",
    "DATA_PATH = 'data/temps_extended.csv'\n",
    "RANDOM_STATE = 42  # one seed for the split, the forests and the search"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and prepare the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read in data as a dataframe\n",
    "raw_features = pd.read_csv(DATA_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# One Hot Encoding\n",
    "# Each stage gets its own name (raw_features is never overwritten), so this\n",
    "# cell can be re-run without reloading the CSV.\n",
    "encoded_features = pd.get_dummies(raw_features)\n",
    "\n",
    "# Extract features and labels\n",
    "feature_frame = encoded_features.drop('actual', axis = 1)\n",
    "\n",
    "# List of features for later use\n",
    "feature_list = list(feature_frame.columns)\n",
    "\n",
    "# Convert to numpy arrays\n",
    "features = np.array(feature_frame)\n",
    "labels = np.array(encoded_features['actual'])\n",
    "\n",
    "# Training and Testing Sets\n",
    "train_features, test_features, train_labels, test_labels = train_test_split(\n",
    "    features, labels, test_size = 0.25, random_state = RANDOM_STATE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training Features Shape: (1643, 17)\n",
      "Training Labels Shape: (1643,)\n",
      "Testing Features Shape: (548, 17)\n",
      "Testing Labels Shape: (548,)\n"
     ]
    }
   ],
   "source": [
    "print('Training Features Shape:', train_features.shape)\n",
    "print('Training Labels Shape:', train_labels.shape)\n",
    "print('Testing Features Shape:', test_features.shape)\n",
    "print('Testing Labels Shape:', test_labels.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Restrict to the most important features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Important train features shape: (1643, 5)\n",
      "Important test features shape: (548, 5)\n"
     ]
    }
   ],
   "source": [
    "# Names of the five features accounting for 95% of total importance\n",
    "important_feature_names = ['temp_1', 'average', 'ws_1', 'temp_2', 'friend']\n",
    "\n",
    "# Find the columns of the most important features\n",
    "# (list.index raises ValueError if a name is missing from feature_list)\n",
    "important_indices = [feature_list.index(feature) for feature in important_feature_names]\n",
    "\n",
    "# Create training and testing sets with only the important features\n",
    "important_train_features = train_features[:, important_indices]\n",
    "important_test_features = test_features[:, important_indices]\n",
    "\n",
    "# Sanity check on operations\n",
    "print('Important train features shape:', important_train_features.shape)\n",
    "print('Important test features shape:', important_test_features.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inspect the default hyperparameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Parameters currently in use:\n",
      "{'bootstrap': True,\n",
      " 'criterion': 'mse',\n",
      " 'max_depth': None,\n",
      " 'max_features': 'auto',\n",
      " 'max_leaf_nodes': None,\n",
      " 'min_impurity_decrease': 0.0,\n",
      " 'min_impurity_split': None,\n",
      " 'min_samples_leaf': 1,\n",
      " 'min_samples_split': 2,\n",
      " 'min_weight_fraction_leaf': 0.0,\n",
      " 'n_estimators': 1000,\n",
      " 'n_jobs': 1,\n",
      " 'oob_score': False,\n",
      " 'random_state': 42,\n",
      " 'verbose': 0,\n",
      " 'warm_start': False}\n"
     ]
    }
   ],
   "source": [
    "# This forest is only built to display its parameters; it is not fitted here\n",
    "rf = RandomForestRegressor(n_estimators = 1000, random_state = RANDOM_STATE)\n",
    "\n",
    "# Look at parameters used by our current forest\n",
    "print('Parameters currently in use:')\n",
    "pprint(rf.get_params())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Use only the most important features\n",
    "# NOTE: this rebinds train_features / test_features / feature_list, so from\n",
    "# here on they refer to the reduced five-column data.\n",
    "train_features = important_train_features[:]\n",
    "test_features = important_test_features[:]\n",
    "\n",
    "# Update feature list for visualizations\n",
    "feature_list = important_feature_names[:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Randomized hyperparameter search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Number of trees in random forest\n",
    "n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 5)]\n",
    "# Number of features to consider at every split\n",
    "# None = consider all features, which is what 'auto' meant for regressors;\n",
    "# 'auto' itself is no longer accepted by newer scikit-learn releases.\n",
    "max_features = [None, 'sqrt']\n",
    "# Maximum number of levels in tree\n",
    "max_depth = [int(x) for x in np.linspace(10, 100, num = 5)]\n",
    "max_depth.append(None)\n",
    "# Minimum number of samples required to split a node\n",
    "min_samples_split = [2, 5, 10]\n",
    "# Minimum number of samples required at each leaf node\n",
    "min_samples_leaf = [1, 2, 4]\n",
    "# Method of selecting samples for training each tree\n",
    "bootstrap = [True, False]\n",
    "\n",
    "# Create the random grid\n",
    "random_grid = {'n_estimators': n_estimators,\n",
    "               'max_features': max_features,\n",
    "               'max_depth': max_depth,\n",
    "               'min_samples_split': min_samples_split,\n",
    "               'min_samples_leaf': min_samples_leaf,\n",
    "               'bootstrap': bootstrap}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Use the random grid to search for best hyperparameters\n",
    "# First create the base model to tune\n",
    "rf = RandomForestRegressor(random_state=RANDOM_STATE)\n",
    "# Random search of parameters: 20 sampled settings, 3-fold cross validation,\n",
    "# scored by negative mean absolute error, using all available cores\n",
    "rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,\n",
    "                               n_iter = 20, scoring='neg_mean_absolute_error',\n",
    "                               cv = 3, verbose=2, random_state=RANDOM_STATE, n_jobs=-1)\n",
    "\n",
    "# Fit the random search model\n",
    "rf_random.fit(train_features, train_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rf_random.best_params_"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}