123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266 |
- import pandas as pd
- import numpy as np
- from tqdm import tqdm_notebook
def cyclical_encoding(series, period):
    """
    Encode a periodic quantity as a sine/cosine pair.

    :param series: pandas Series of values to encode; its name is used to
        label the output columns
    :param period: length of one full cycle, in the same units as ``series``
    :return features: dataframe with the columns ``sin_<name>`` and ``cos_<name>``
    """
    # Angle of each value on the unit circle
    angle = 2 * np.pi * series / period
    sine = np.sin(angle).rename(f"sin_{series.name}")
    cosine = np.cos(angle).rename(f"cos_{series.name}")
    return pd.concat([sine, cosine], axis=1)
def _week_of_year(datetimes):
    """Return the ISO week number of every datetime in the series ``datetimes``."""
    accessor = datetimes.dt
    if hasattr(accessor, "isocalendar"):
        week = accessor.isocalendar().week
        # isocalendar() yields a nullable UInt32 column; convert it back to a
        # plain numpy dtype (float when missing values are present).
        return week.astype("float64" if week.isna().any() else "int64")
    # Fallback for pandas releases that predate isocalendar()
    return accessor.week


def create_time_features(
    fld,
    keep_frac_only=False,
    include_additional=False,
    cyc_encode=False,
    timezone=None,
):
    """
    Create features out of a series of datetimes.

    Time zones are converted to local time if specified.

    :param fld: series (or index) of datetimes; its name, followed by "_",
        prefixes every generated column. An unnamed input gets no prefix.
    :param keep_frac_only: drop the basic attributes (second, minute, hour,
        year, month, week, day, dayofweek, dayofyear) from the result, keeping
        the fractional features and anything else that was added
    :param include_additional: whether to include additional attributes
        (month/quarter/year start and end flags, days in month)
    :param cyc_encode: whether to add sine/cosine encodings of hour, dayofweek,
        day, month and of the four fractional features
    :param timezone: string for the time zone. if passed, times are converted
        to local, the time zone information is removed and the local times are
        stored in a "local" column
        (NOTE(review): tz_convert presumably needs tz-aware input - confirm)
    :return df: dataframe of features indexed by the (local) datetimes, sorted
        by that index
    """
    base_attrs = [
        "second",
        "minute",
        "hour",
        "year",
        "month",
        "week",
        "day",
        "dayofweek",
        "dayofyear",
    ]
    additional_attrs = [
        "is_month_end",
        "is_month_start",
        "is_quarter_end",
        "is_quarter_start",
        "is_year_end",
        "is_year_start",
        "days_in_month",
    ]
    frac_attrs = ["fracday", "fracweek", "fracmonth", "fracyear"]

    # Convert to a series (in case of index)
    fld = pd.to_datetime(pd.Series(fld))

    # Empty dataframe sharing the index of the input
    df = pd.DataFrame(index=fld.index)

    # Used for naming the columns; an unnamed input gets no prefix
    prefix = "" if fld.name is None else f"{fld.name}_"

    # Convert to local time and then remove time zone information
    if timezone:
        fld = fld.dt.tz_convert(timezone).dt.tz_localize(None)
        df["local"] = fld

    attr = base_attrs + additional_attrs if include_additional else base_attrs

    # Iterate through each attribute and add it to the dataframe
    for n in attr:
        # ``.dt.week`` no longer exists in pandas 2.x, so week goes through a helper
        values = _week_of_year(fld) if n == "week" else getattr(fld.dt, n)
        df[prefix + n] = values

    # Fractional time of day: hours (with minutes and seconds) over 24
    df[prefix + "fracday"] = (
        df[prefix + "hour"]
        + df[prefix + "minute"] / 60
        + df[prefix + "second"] / 60 / 60
    ) / 24

    # Fractional time of week
    df[prefix + "fracweek"] = (df[prefix + "dayofweek"] + df[prefix + "fracday"]) / 7

    # Fractional time of month. days_in_month is read from fld directly because
    # it is only a dataframe column when include_additional is set.
    df[prefix + "fracmonth"] = (
        (df[prefix + "day"] - 1) + df[prefix + "fracday"]
    ) / fld.dt.days_in_month

    # Fractional time of year, accounting for leap years
    days_in_year = np.where(fld.dt.is_leap_year, 366, 365)
    df[prefix + "fracyear"] = (
        (df[prefix + "dayofyear"] - 1) + df[prefix + "fracday"]
    ) / days_in_year

    if cyc_encode:
        # (attribute, period) pairs. dayofweek takes the values 0..6, so its
        # period is 7; a period of 6 would map Monday and Sunday to one point.
        periods = [("hour", 24), ("dayofweek", 7), ("day", 31), ("month", 12)]
        # Fractional features already complete one cycle over [0, 1)
        periods += [(name, 1) for name in frac_attrs]
        encoded = [
            cyclical_encoding(df[prefix + name], period) for name, period in periods
        ]
        df = pd.concat([df] + encoded, axis=1)

    if keep_frac_only:
        df = df.drop([prefix + c for c in base_attrs], axis=1)

    df = df.set_index(fld).sort_index()
    return df
def monthly_validation(data, model, track=False):
    """
    Walk-forward validation of ``model`` with one split per month-end day.

    For every month-end day present in ``data.index`` the model is fitted on
    the rows strictly before that day and scored on the rows from that day
    onwards. Train and test never share a row. Folds with no training rows are
    skipped. After the folds the model is refitted on all rows.

    :param data: dataframe with a DatetimeIndex and an "energy" target column;
        every other column is used as a feature
    :param model: estimator exposing ``fit(X, y)`` and ``predict(X)``
    :param track: whether to print the accuracy of each fold
    :return: dict with
        ``results`` - dataframe of train_points, test_points and score per fold,
        ``importances`` - dataframe of features/importance if the fitted model
        has ``feature_importances_``, otherwise None,
        ``score`` - fold scores weighted by test size, divided by the maximum
        attainable (100 per test row)
    :raises ZeroDivisionError: if no fold could be evaluated
    """
    X = data.copy()
    y = X.pop("energy")

    # Calendar day (midnight) of every row, and the distinct month-end days
    row_days = X.index.normalize()
    train_stops = row_days[row_days.is_month_end].unique().sort_values()

    train_points = []
    test_points = []
    scores = []

    for stop in train_stops:
        # Boolean masks keep the two sides disjoint. Inclusive label slices
        # would put a row stamped at midnight of the split day in both sets.
        in_test = row_days >= stop
        X_train, X_test = X[~in_test], X[in_test]
        y_train, y_test = y[~in_test], y[in_test]

        if X_train.empty:
            # Nothing precedes this month end, so there is nothing to fit on
            continue

        model.fit(X_train, y_train)
        y_hat = model.predict(X_test)

        test_start, test_end = X_test.index.min().date(), X_test.index.max().date()
        n_days = (test_end - test_start).days
        score = 100 - mape(y_test, y_hat)

        if track:
            print(
                f"Accuracy: {score:.2f}% testing from {test_start} to {test_end} ({n_days} days)."
            )

        train_points.append(len(X_train))
        test_points.append(len(X_test))
        scores.append(score)

    # Refit on everything so the importances reflect the full data set
    model.fit(X, y)

    importance_df = None
    if hasattr(model, "feature_importances_"):
        importance_df = pd.DataFrame(
            dict(features=X.columns, importance=model.feature_importances_)
        )

    # Each fold counts in proportion to the number of rows it was tested on
    weighted_score = sum(s * n for s, n in zip(scores, test_points))
    total_possible = 100 * sum(test_points)
    final_score = weighted_score / total_possible

    results_df = pd.DataFrame(
        dict(train_points=train_points, test_points=test_points, score=scores)
    )
    return dict(results=results_df, importances=importance_df, score=final_score)
def mape(y_true, y_pred):
    """
    Mean absolute percentage error.

    :param y_true: observed values
    :param y_pred: predicted values
    :return: mean of ``|y_pred - y_true| / |y_true|``, expressed in percent
    """
    relative_error = (y_pred - y_true) / y_true
    return 100 * np.mean(np.abs(relative_error))
def data_reading(filename):
    """
    Load energy readings from a CSV file.

    Rows without an "energy" value are dropped and the remaining rows are
    indexed by "timestamp" in ascending order.

    :param filename: path of a CSV file with "timestamp" and "energy" columns
    :return: tuple ``(data, freq, dpoints)`` where ``data`` is the sorted
        dataframe, ``freq`` is the most common gap between consecutive
        timestamps rounded to whole minutes, and ``dpoints`` is the row count
    """
    data = pd.read_csv(filename, parse_dates=["timestamp"])
    data = data.dropna(subset=["energy"])
    data = data.set_index("timestamp").sort_index()

    # Measure the gaps on the sorted timestamps: in file order an unsorted CSV
    # would yield spurious (even negative) differences and a wrong interval.
    gap_counts = data.index.to_series().diff().value_counts()
    freq = round(gap_counts.idxmax().total_seconds() / 60)

    return data, freq, len(data)
def data_testing(filename, model):
    """
    Evaluate the time-feature variants on the data stored in one CSV file.

    :param filename: path string of the CSV file; the text between the last
        "_" and ".csv" is reported as the building id
    :param model: model handed on to ``test_time_features``
    :return results: dataframe returned by ``test_time_features`` with the
        extra columns "freq", "dpoints" and "building_id"
    """
    # Text after the last underscore, cut at the first ".csv"
    building_id = filename.rpartition("_")[2].partition(".csv")[0]

    data, freq, dpoints = data_reading(filename)
    results = test_time_features(data, model)

    # Attach the file-level metadata as constant columns
    for column, value in (
        ("freq", freq),
        ("dpoints", dpoints),
        ("building_id", building_id),
    ):
        results[column] = value
    return results
def test_time_features(data, model):
    """
    Score ``model`` on four alternative sets of time features.

    Time features are created from ``data.index`` (the column names used below
    assume they are prefixed with "timestamp_") and grouped into:
    "normal" (hour, dayofweek, month, dayofyear, year), "normal_cyc" (sine and
    cosine of hour, dayofweek, month), "frac" (fractional day/week/month/year)
    and "frac_cyc" (sine and cosine of the fractional features).
    Columns holding a single value, or a distinct value on every row, are
    removed before each set is passed to ``monthly_validation``.

    :param data: dataframe containing an "energy" column
    :param model: model handed on to ``monthly_validation``
    :return results: dataframe with one row per successfully validated set,
        holding its "score" and "method" name; a set whose validation raises is
        reported with ``print`` and left out
    """
    engineered = create_time_features(data.index, cyc_encode=True)
    data = pd.concat([data, engineered], axis=1)
    y = data.pop("energy")

    def with_sin_cos(columns):
        # All sine columns first, then all cosine columns
        return ["sin_" + c for c in columns] + ["cos_" + c for c in columns]

    normal_features = [
        "timestamp_" + part
        for part in ("hour", "dayofweek", "month", "dayofyear", "year")
    ]
    not_encoded = ("timestamp_dayofyear", "timestamp_year")
    normal_cyc_features = with_sin_cos(
        [c for c in normal_features if c not in not_encoded]
    )
    frac_features = [
        "timestamp_" + part
        for part in ("fracday", "fracweek", "fracmonth", "fracyear")
    ]
    frac_cyc_features = with_sin_cos(frac_features)

    # Select every feature set up front so a missing column fails before any fit
    candidates = [
        ("normal", data[normal_features].copy()),
        ("normal_cyc", data[normal_cyc_features].copy()),
        ("frac", data[frac_features].copy()),
        ("frac_cyc", data[frac_cyc_features].copy()),
    ]

    scores = []
    methods = []
    for name, dataset in candidates:
        distinct = dataset.nunique()
        # Constant columns and columns that are unique on every row
        to_drop = dataset.columns[(distinct == 1) | (distinct == len(dataset))]
        dataset = dataset.drop(columns=to_drop)
        dataset["energy"] = y.copy()
        try:
            data_results = monthly_validation(dataset, model)
            scores.append(data_results["score"])
            methods.append(name)
        except Exception as e:
            print(e, name)

    return pd.DataFrame(dict(score=scores, method=methods))
|