浏览代码

Running analysis

Will K 6 年之前
父节点
当前提交
5bd09d23a6

+ 397 - 148
cyclical-features/Testing Cyclical Encoding.ipynb

@@ -9,7 +9,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -21,7 +21,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -30,7 +30,7 @@
        "40"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -42,94 +42,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>temperature</th>\n",
-       "      <th>energy</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>timestamp</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>2016-09-18 04:00:00</th>\n",
-       "      <td>56.240300</td>\n",
-       "      <td>1.682686</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2016-09-18 04:15:00</th>\n",
-       "      <td>56.087501</td>\n",
-       "      <td>2.086212</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2016-09-18 04:30:00</th>\n",
-       "      <td>56.213232</td>\n",
-       "      <td>1.687880</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2016-09-18 04:45:00</th>\n",
-       "      <td>56.400049</td>\n",
-       "      <td>1.926518</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2016-09-18 05:00:00</th>\n",
-       "      <td>56.592497</td>\n",
-       "      <td>1.922459</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                     temperature    energy\n",
-       "timestamp                                 \n",
-       "2016-09-18 04:00:00    56.240300  1.682686\n",
-       "2016-09-18 04:15:00    56.087501  2.086212\n",
-       "2016-09-18 04:30:00    56.213232  1.687880\n",
-       "2016-09-18 04:45:00    56.400049  1.926518\n",
-       "2016-09-18 05:00:00    56.592497  1.922459"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "<class 'pandas.core.frame.DataFrame'>\n",
-      "DatetimeIndex: 36960 entries, 2016-09-18 04:00:00 to 2017-10-08 03:45:00\n",
+      "DatetimeIndex: 100786 entries, 2014-01-01 06:15:00 to 2016-11-16 02:30:00\n",
       "Data columns (total 2 columns):\n",
-      "temperature    36960 non-null float64\n",
-      "energy         36960 non-null float64\n",
+      "temperature    100786 non-null float64\n",
+      "energy         100786 non-null float64\n",
       "dtypes: float64(2)\n",
-      "memory usage: 866.2 KB\n"
+      "memory usage: 2.3 MB\n"
      ]
     }
    ],
@@ -141,7 +67,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -173,7 +99,7 @@
     "        X[\"sin_time_of_day\"], X[\"cos_time_of_day\"] = _cyclical_encoding(\n",
     "            X[\"time_of_day\"], period=24\n",
     "        )\n",
-    "        X[\"sin_day_of_year\"], X[\"cos_day_of_year\"] = _cylical_encoding(\n",
+    "        X[\"sin_day_of_year\"], X[\"cos_day_of_year\"] = _cyclical_encoding(\n",
     "            X[\"day_of_year\"], period=366\n",
     "        )\n",
     "        return X\n",
@@ -186,7 +112,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -233,91 +159,91 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>2016-09-18 04:00:00</th>\n",
-       "      <td>56.240300</td>\n",
-       "      <td>1.682686</td>\n",
-       "      <td>4.00</td>\n",
-       "      <td>262</td>\n",
-       "      <td>0.866025</td>\n",
-       "      <td>0.500000</td>\n",
-       "      <td>-0.977064</td>\n",
-       "      <td>-0.212947</td>\n",
+       "      <th>2014-01-01 06:15:00</th>\n",
+       "      <td>40.987233</td>\n",
+       "      <td>43.012862</td>\n",
+       "      <td>6.25</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.997859</td>\n",
+       "      <td>-0.065403</td>\n",
+       "      <td>0.017166</td>\n",
+       "      <td>0.999853</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2016-09-18 04:15:00</th>\n",
-       "      <td>56.087501</td>\n",
-       "      <td>2.086212</td>\n",
-       "      <td>4.25</td>\n",
-       "      <td>262</td>\n",
-       "      <td>0.896873</td>\n",
-       "      <td>0.442289</td>\n",
-       "      <td>-0.977064</td>\n",
-       "      <td>-0.212947</td>\n",
+       "      <th>2014-01-01 06:30:00</th>\n",
+       "      <td>41.007768</td>\n",
+       "      <td>43.780204</td>\n",
+       "      <td>6.50</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.991445</td>\n",
+       "      <td>-0.130526</td>\n",
+       "      <td>0.017166</td>\n",
+       "      <td>0.999853</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2016-09-18 04:30:00</th>\n",
-       "      <td>56.213232</td>\n",
-       "      <td>1.687880</td>\n",
-       "      <td>4.50</td>\n",
-       "      <td>262</td>\n",
-       "      <td>0.923880</td>\n",
-       "      <td>0.382683</td>\n",
-       "      <td>-0.977064</td>\n",
-       "      <td>-0.212947</td>\n",
+       "      <th>2014-01-01 06:45:00</th>\n",
+       "      <td>41.002971</td>\n",
+       "      <td>43.012709</td>\n",
+       "      <td>6.75</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.980785</td>\n",
+       "      <td>-0.195090</td>\n",
+       "      <td>0.017166</td>\n",
+       "      <td>0.999853</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2016-09-18 04:45:00</th>\n",
-       "      <td>56.400049</td>\n",
-       "      <td>1.926518</td>\n",
-       "      <td>4.75</td>\n",
-       "      <td>262</td>\n",
-       "      <td>0.946930</td>\n",
-       "      <td>0.321439</td>\n",
-       "      <td>-0.977064</td>\n",
-       "      <td>-0.212947</td>\n",
+       "      <th>2014-01-01 07:00:00</th>\n",
+       "      <td>41.008100</td>\n",
+       "      <td>42.631804</td>\n",
+       "      <td>7.00</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.965926</td>\n",
+       "      <td>-0.258819</td>\n",
+       "      <td>0.017166</td>\n",
+       "      <td>0.999853</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2016-09-18 05:00:00</th>\n",
-       "      <td>56.592497</td>\n",
-       "      <td>1.922459</td>\n",
-       "      <td>5.00</td>\n",
-       "      <td>262</td>\n",
-       "      <td>0.965926</td>\n",
-       "      <td>0.258819</td>\n",
-       "      <td>-0.977064</td>\n",
-       "      <td>-0.212947</td>\n",
+       "      <th>2014-01-01 07:15:00</th>\n",
+       "      <td>41.005961</td>\n",
+       "      <td>42.627307</td>\n",
+       "      <td>7.25</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.946930</td>\n",
+       "      <td>-0.321439</td>\n",
+       "      <td>0.017166</td>\n",
+       "      <td>0.999853</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                     temperature    energy  time_of_day  day_of_year  \\\n",
-       "timestamp                                                              \n",
-       "2016-09-18 04:00:00    56.240300  1.682686         4.00          262   \n",
-       "2016-09-18 04:15:00    56.087501  2.086212         4.25          262   \n",
-       "2016-09-18 04:30:00    56.213232  1.687880         4.50          262   \n",
-       "2016-09-18 04:45:00    56.400049  1.926518         4.75          262   \n",
-       "2016-09-18 05:00:00    56.592497  1.922459         5.00          262   \n",
+       "                     temperature     energy  time_of_day  day_of_year  \\\n",
+       "timestamp                                                               \n",
+       "2014-01-01 06:15:00    40.987233  43.012862         6.25            1   \n",
+       "2014-01-01 06:30:00    41.007768  43.780204         6.50            1   \n",
+       "2014-01-01 06:45:00    41.002971  43.012709         6.75            1   \n",
+       "2014-01-01 07:00:00    41.008100  42.631804         7.00            1   \n",
+       "2014-01-01 07:15:00    41.005961  42.627307         7.25            1   \n",
        "\n",
        "                     sin_time_of_day  cos_time_of_day  sin_day_of_year  \\\n",
        "timestamp                                                                \n",
-       "2016-09-18 04:00:00         0.866025         0.500000        -0.977064   \n",
-       "2016-09-18 04:15:00         0.896873         0.442289        -0.977064   \n",
-       "2016-09-18 04:30:00         0.923880         0.382683        -0.977064   \n",
-       "2016-09-18 04:45:00         0.946930         0.321439        -0.977064   \n",
-       "2016-09-18 05:00:00         0.965926         0.258819        -0.977064   \n",
+       "2014-01-01 06:15:00         0.997859        -0.065403         0.017166   \n",
+       "2014-01-01 06:30:00         0.991445        -0.130526         0.017166   \n",
+       "2014-01-01 06:45:00         0.980785        -0.195090         0.017166   \n",
+       "2014-01-01 07:00:00         0.965926        -0.258819         0.017166   \n",
+       "2014-01-01 07:15:00         0.946930        -0.321439         0.017166   \n",
        "\n",
        "                     cos_day_of_year  \n",
        "timestamp                             \n",
-       "2016-09-18 04:00:00        -0.212947  \n",
-       "2016-09-18 04:15:00        -0.212947  \n",
-       "2016-09-18 04:30:00        -0.212947  \n",
-       "2016-09-18 04:45:00        -0.212947  \n",
-       "2016-09-18 05:00:00        -0.212947  "
+       "2014-01-01 06:15:00         0.999853  \n",
+       "2014-01-01 06:30:00         0.999853  \n",
+       "2014-01-01 06:45:00         0.999853  \n",
+       "2014-01-01 07:00:00         0.999853  \n",
+       "2014-01-01 07:15:00         0.999853  "
       ]
      },
-     "execution_count": 16,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -338,6 +264,329 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class WeeklyValidator(BaseEstimator):\n",
+    "    \"\"\"Placeholder estimator: fit and predict contain no logic yet.\"\"\"\n",
+    "    def __init__(self):\n",
+    "        pass\n",
+    "\n",
+    "    def fit(self, X, y):\n",
+    "        return self  # scikit-learn convention: fit returns the estimator\n",
+    "\n",
+    "    def predict(self, X):\n",
+    "        pass"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def run_weekly_validation(models, data):\n",
+    "    \"\"\"For each week in data, fit every model on all earlier rows and predict that week.\n",
+    "\n",
+    "    Runs once per feature set ('standard', 'cyclical') and returns a list of DataFrames\n",
+    "    with columns predicted, actual, model and features, indexed like the test week.\n",
+    "    \"\"\"\n",
+    "    all_predictions = []\n",
+    "\n",
+    "    # (label, columns) pairs; the label is stored alongside every prediction\n",
+    "    feature_sets = [('standard', ['time_of_day', 'day_of_year', 'temperature']),\n",
+    "                    ('cyclical', ['sin_time_of_day', 'cos_time_of_day', 'sin_day_of_year', 'cos_day_of_year', 'temperature'])]\n",
+    "\n",
+    "    for features, feature_set in feature_sets:\n",
+    "        print(f'Using features: {features}')\n",
+    "        # Subset to this feature set plus the target column\n",
+    "        X = data[feature_set + ['energy']]\n",
+    "        # Must group by string formatted week and year; built once and reused for every model\n",
+    "        weeks = X.groupby([X.index.strftime('%U'), X.index.strftime('%Y')])\n",
+    "\n",
+    "        # Iterate through models\n",
+    "        for model in models:\n",
+    "            model_name = model.__class__.__name__\n",
+    "            print(f'Using model: {model_name}')\n",
+    "\n",
+    "            for _, week_data in tqdm.tqdm(weeks, desc='Weeks'):\n",
+    "                # Train only on observations strictly before the first timestamp of the test week\n",
+    "                train_data = X[X.index < week_data.index.min()]\n",
+    "                # Can not train on zero observations (the week holding the earliest timestamp)\n",
+    "                if train_data.empty:\n",
+    "                    continue\n",
+    "\n",
+    "                # Split features from target by selection, leaving the source frames untouched\n",
+    "                X_train, y_train = train_data[feature_set], train_data['energy']\n",
+    "                X_test, y_test = week_data[feature_set], week_data['energy']\n",
+    "\n",
+    "                model.fit(X_train, y_train)\n",
+    "                # Record predictions along with actual values, model, and feature set in a dataframe\n",
+    "                predictions = pd.DataFrame(dict(predicted=model.predict(X_test),\n",
+    "                                                actual=y_test,\n",
+    "                                                model=model_name,\n",
+    "                                                features=features),\n",
+    "                                           index=X_test.index)\n",
+    "\n",
+    "                all_predictions.append(predictions)\n",
+    "    # Return list of dataframes\n",
+    "    return all_predictions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tqdm\n",
+    "import black"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.linear_model import LinearRegression\n",
+    "from sklearn.ensemble import RandomForestRegressor\n",
+    "\n",
+    "# Create linear model and random forest model for regression\n",
+    "models = [LinearRegression(n_jobs=-1), RandomForestRegressor(n_estimators=100, max_depth=None, n_jobs=-1, random_state=100)]\n",
+    "# validation = run_weekly_validation(models, data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "Buildings:   0%|                                                                                                                                   | 0/40 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using features: standard\n",
+      "Using model: LinearRegression\n"
+     ]
+    }
+   ],
+   "source": [
+    "def run_all_buildings(building_data_files):\n",
+    "    \"\"\"Run the weekly validation on every building file and write each result to CSV.\"\"\"\n",
+    "    for building_file_name in tqdm.tqdm(building_data_files, desc='Buildings'):\n",
+    "        raw = pd.read_csv(building_file_name, parse_dates=['timestamp'])\n",
+    "        # Create sets of features\n",
+    "        building_data = transforms.transform(raw.set_index('timestamp'))\n",
+    "        weekly_results = run_weekly_validation(models, building_data)\n",
+    "        # Stack the list of dataframes into one, ordered by model, feature set, then time\n",
+    "        combined = pd.concat(weekly_results).reset_index()\n",
+    "        combined = combined.sort_values(['model', 'features', 'timestamp']).set_index('timestamp')\n",
+    "        # Output path: swap 'energy_data', then any remaining 'data', for 'validation_results'\n",
+    "        out_path = building_file_name.replace('energy_data', 'validation_results').replace('data', 'validation_results')\n",
+    "        combined.to_csv(out_path)\n",
+    "\n",
+    "run_all_buildings(building_data_files)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>predicted</th>\n",
+       "      <th>actual</th>\n",
+       "      <th>model</th>\n",
+       "      <th>features</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>timestamp</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>2016-01-02 22:45:00</th>\n",
+       "      <td>135.545103</td>\n",
+       "      <td>71.810619</td>\n",
+       "      <td>RandomForestRegressor</td>\n",
+       "      <td>cyclical</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2016-01-02 23:00:00</th>\n",
+       "      <td>144.112047</td>\n",
+       "      <td>96.005072</td>\n",
+       "      <td>RandomForestRegressor</td>\n",
+       "      <td>cyclical</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2016-01-02 23:15:00</th>\n",
+       "      <td>144.322802</td>\n",
+       "      <td>93.704430</td>\n",
+       "      <td>RandomForestRegressor</td>\n",
+       "      <td>cyclical</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2016-01-02 23:30:00</th>\n",
+       "      <td>143.739093</td>\n",
+       "      <td>94.853490</td>\n",
+       "      <td>RandomForestRegressor</td>\n",
+       "      <td>cyclical</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2016-01-02 23:45:00</th>\n",
+       "      <td>140.805196</td>\n",
+       "      <td>92.931160</td>\n",
+       "      <td>RandomForestRegressor</td>\n",
+       "      <td>cyclical</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                      predicted     actual                  model  features\n",
+       "timestamp                                                                  \n",
+       "2016-01-02 22:45:00  135.545103  71.810619  RandomForestRegressor  cyclical\n",
+       "2016-01-02 23:00:00  144.112047  96.005072  RandomForestRegressor  cyclical\n",
+       "2016-01-02 23:15:00  144.322802  93.704430  RandomForestRegressor  cyclical\n",
+       "2016-01-02 23:30:00  143.739093  94.853490  RandomForestRegressor  cyclical\n",
+       "2016-01-02 23:45:00  140.805196  92.931160  RandomForestRegressor  cyclical"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "validation = pd.concat(validation)\n",
+    "validation.head()\n",
+    "validation.tail()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['LinearRegression', 'RandomForestRegressor'], dtype=object)"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array(['standard', 'cyclical'], dtype=object)"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "validation['model'].unique()\n",
+    "validation['features'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.core.interactiveshell import InteractiveShell\n",
+    "InteractiveShell.ast_node_interactivity = 'all'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['LinearRegression'], dtype=object)"
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "validation['model'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def calculate_results(results):\n",
+    "    \"\"\"Placeholder: not implemented yet (currently returns None).\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def graph_results(results):\n",
+    "    \"\"\"Placeholder: not implemented yet (currently returns None).\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],

+ 3 - 0
cyclical-features/validation_results/building_10_validation_results.csv

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5996dd9f8299398b2866ac586b3217e839c37adcf660624c8d3179e2b1825bf6
+size 444952

+ 3 - 0
cyclical-features/validation_results/building_11_validation_results.csv

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0bdc0c6eff231981d2e2e55c6589c8fdde2f039c2c38be75cb9078664f025f6
+size 299522