123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212 |
- # Copyright (c) 2017-present, Facebook, Inc.
- # All rights reserved.
- #
- # This source code is licensed under the BSD-style license found in the
- # LICENSE file in the root directory of this source tree. An additional grant
- # of patent rights can be found in the PATENTS file in the same directory.
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
- from __future__ import unicode_literals
- import logging
- logger = logging.getLogger(__name__)
- import numpy as np
- import pandas as pd
- from functools import reduce
- def _cutoffs(df, horizon, k, period):
- """Generate cutoff dates
- Parameters
- ----------
- df: pd.DataFrame with historical data
- horizon: pd.Timedelta.
- Forecast horizon
- k: Int number.
- The number of forecasts point.
- period: pd.Timedelta.
- Simulated Forecast will be done at every this period.
- Returns
- -------
- list of pd.Timestamp
- """
- # Last cutoff is 'latest date in data - horizon' date
- cutoff = df['ds'].max() - horizon
- if cutoff < df['ds'].min():
- raise ValueError('Less data than horizon.')
- result = [cutoff]
- for i in range(1, k):
- cutoff -= period
- # If data does not exist in data range (cutoff, cutoff + horizon]
- if not (((df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)).any()):
- # Next cutoff point is 'last date before cutoff in data - horizon'
- closest_date = df[df['ds'] <= cutoff].max()['ds']
- cutoff = closest_date - horizon
- if cutoff < df['ds'].min():
- logger.warning(
- 'Not enough data for requested number of cutoffs! '
- 'Using {}.'.format(i))
- break
- result.append(cutoff)
- # Sort lines in ascending order
- return reversed(result)
- def simulated_historical_forecasts(model, horizon, k, period=None):
- """Simulated Historical Forecasts.
- Make forecasts from k historical cutoff points, working backwards from
- (end - horizon) with a spacing of period between each cutoff.
- Parameters
- ----------
- model: Prophet class object.
- Fitted Prophet model
- horizon: string with pd.Timedelta compatible style, e.g., '5 days',
- '3 hours', '10 seconds'.
- k: Int number of forecasts point.
- period: Optional string with pd.Timedelta compatible style. Simulated
- forecast will be done at every this period. If not provided,
- 0.5 * horizon is used.
- Returns
- -------
- A pd.DataFrame with the forecast, actual value and cutoff.
- """
- df = model.history.copy().reset_index(drop=True)
- horizon = pd.Timedelta(horizon)
- period = 0.5 * horizon if period is None else pd.Timedelta(period)
- cutoffs = _cutoffs(df, horizon, k, period)
- predicts = []
- for cutoff in cutoffs:
- # Generate new object with copying fitting options
- m = model.copy(cutoff)
- # Train model
- m.fit(df[df['ds'] <= cutoff])
- # Calculate yhat
- index_predicted = (df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)
- # Get the columns for the future dataframe
- columns = ['ds']
- if m.growth == 'logistic':
- columns.append('cap')
- if m.logistic_floor:
- columns.append('floor')
- columns.extend(m.extra_regressors.keys())
- yhat = m.predict(df[index_predicted][columns])
- # Merge yhat(predicts), y(df, original data) and cutoff
- predicts.append(pd.concat([
- yhat[['ds', 'yhat', 'yhat_lower', 'yhat_upper']],
- df[index_predicted][['y']].reset_index(drop=True),
- pd.DataFrame({'cutoff': [cutoff] * len(yhat)})
- ], axis=1))
- # Combine all predicted pd.DataFrame into one pd.DataFrame
- return reduce(lambda x, y: x.append(y), predicts).reset_index(drop=True)
- def cross_validation(model, horizon, period=None, initial=None):
- """Cross-Validation for time series.
- Computes forecasts from historical cutoff points. Beginning from initial,
- makes cutoffs with a spacing of period up to (end - horizon).
- When period is equal to the time interval of the data, this is the
- technique described in https://robjhyndman.com/hyndsight/tscv/ .
- Parameters
- ----------
- model: Prophet class object. Fitted Prophet model
- horizon: string with pd.Timedelta compatible style, e.g., '5 days',
- '3 hours', '10 seconds'.
- period: string with pd.Timedelta compatible style. Simulated forecast will
- be done at every this period. If not provided, 0.5 * horizon is used.
- initial: string with pd.Timedelta compatible style. The first training
- period will begin here. If not provided, 3 * horizon is used.
- Returns
- -------
- A pd.DataFrame with the forecast, actual value and cutoff.
- """
- te = model.history['ds'].max()
- ts = model.history['ds'].min()
- horizon = pd.Timedelta(horizon)
- period = 0.5 * horizon if period is None else pd.Timedelta(period)
- initial = 3 * horizon if initial is None else pd.Timedelta(initial)
- k = int(np.ceil(((te - horizon) - (ts + initial)) / period))
- if k < 1:
- raise ValueError(
- 'Not enough data for specified horizon, period, and initial.')
- return simulated_historical_forecasts(model, horizon, k, period)
- def me(df):
- return((df['yhat'] - df['y']).sum()/len(df['yhat']))
- def mse(df):
- return((df['yhat'] - df['y']).pow(2).sum()/len(df))
- def rmse(df):
- return(np.sqrt((df['yhat'] - df['y']).pow(2).sum()/len(df)))
- def mae(df):
- return((df['yhat'] - df['y']).abs().sum()/len(df))
- def mpe(df):
- return((df['yhat'] - df['y']).div(df['y']).sum()*(1/len(df)))
- def mape(df):
- return((df['yhat'] - df['y']).div(df['y']).abs().sum()*(1/len(df)))
- def all_metrics(model, df_cv = None):
- """Compute model fit metrics for time series.
- Computes the following metrics about each time series that has been through
- Cross Validation;
- Mean Error (ME)
- Mean Squared Error (MSE)
- Root Mean Square Error (RMSE,
- Mean Absolute Error (MAE)
- Mean Percentage Error (MPE)
- Mean Absolute Percentage Error (MAPE)
- Parameters
- ----------
- df: A pandas dataframe. Contains y and yhat produced by cross-validation
- Returns
- -------
- A dictionary where the key = the error type, and value is the value of the error
- """
-
- df = []
- if df_cv is not None:
- df = df_cv
- else:
- # run a forecast on your own data with period = 0 so that it is in-sample data onlyl
- #df = model.predict(model.make_future_dataframe(periods=0))[['y', 'yhat']]
- df = (model
- .history[['ds', 'y']]
- .merge(
- model.predict(model.make_future_dataframe(periods=0))[['ds', 'yhat']],
- how='inner', on='ds'
- )
- )
- if 'yhat' not in df.columns:
- raise ValueError(
- 'Please run Cross-Validation first before computing quality metrics.')
- return {
- 'ME':me(df),
- 'MSE':mse(df),
- 'RMSE': rmse(df),
- 'MAE': mae(df),
- 'MPE': mpe(df),
- 'MAPE': mape(df)
- }
|