diagnostics.py 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. # Copyright (c) 2017-present, Facebook, Inc.
  2. # All rights reserved.
  3. #
  4. # This source code is licensed under the BSD-style license found in the
  5. # LICENSE file in the root directory of this source tree. An additional grant
  6. # of patent rights can be found in the PATENTS file in the same directory.
  7. from __future__ import absolute_import
  8. from __future__ import division
  9. from __future__ import print_function
  10. from __future__ import unicode_literals
  11. from copy import deepcopy
  12. from functools import reduce
  13. import logging
  14. import numpy as np
  15. import pandas as pd
  16. logger = logging.getLogger(__name__)
  17. def _cutoffs(df, horizon, k, period):
  18. """Generate cutoff dates
  19. Parameters
  20. ----------
  21. df: pd.DataFrame with historical data
  22. horizon: pd.Timedelta.
  23. Forecast horizon
  24. k: Int number.
  25. The number of forecasts point.
  26. period: pd.Timedelta.
  27. Simulated Forecast will be done at every this period.
  28. Returns
  29. -------
  30. list of pd.Timestamp
  31. """
  32. # Last cutoff is 'latest date in data - horizon' date
  33. cutoff = df['ds'].max() - horizon
  34. if cutoff < df['ds'].min():
  35. raise ValueError('Less data than horizon.')
  36. result = [cutoff]
  37. for i in range(1, k):
  38. cutoff -= period
  39. # If data does not exist in data range (cutoff, cutoff + horizon]
  40. if not (((df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)).any()):
  41. # Next cutoff point is 'last date before cutoff in data - horizon'
  42. closest_date = df[df['ds'] <= cutoff].max()['ds']
  43. cutoff = closest_date - horizon
  44. if cutoff < df['ds'].min():
  45. logger.warning(
  46. 'Not enough data for requested number of cutoffs! '
  47. 'Using {}.'.format(i))
  48. break
  49. result.append(cutoff)
  50. # Sort lines in ascending order
  51. return reversed(result)
  52. def simulated_historical_forecasts(model, horizon, k, period=None):
  53. """Simulated Historical Forecasts.
  54. Make forecasts from k historical cutoff points, working backwards from
  55. (end - horizon) with a spacing of period between each cutoff.
  56. Parameters
  57. ----------
  58. model: Prophet class object.
  59. Fitted Prophet model
  60. horizon: string with pd.Timedelta compatible style, e.g., '5 days',
  61. '3 hours', '10 seconds'.
  62. k: Int number of forecasts point.
  63. period: Optional string with pd.Timedelta compatible style. Simulated
  64. forecast will be done at every this period. If not provided,
  65. 0.5 * horizon is used.
  66. Returns
  67. -------
  68. A pd.DataFrame with the forecast, actual value and cutoff.
  69. """
  70. df = model.history.copy().reset_index(drop=True)
  71. horizon = pd.Timedelta(horizon)
  72. period = 0.5 * horizon if period is None else pd.Timedelta(period)
  73. cutoffs = _cutoffs(df, horizon, k, period)
  74. predicts = []
  75. for cutoff in cutoffs:
  76. # Generate new object with copying fitting options
  77. m = prophet_copy(model, cutoff)
  78. # Train model
  79. m.fit(df[df['ds'] <= cutoff])
  80. # Calculate yhat
  81. index_predicted = (df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)
  82. # Get the columns for the future dataframe
  83. columns = ['ds']
  84. if m.growth == 'logistic':
  85. columns.append('cap')
  86. if m.logistic_floor:
  87. columns.append('floor')
  88. columns.extend(m.extra_regressors.keys())
  89. yhat = m.predict(df[index_predicted][columns])
  90. # Merge yhat(predicts), y(df, original data) and cutoff
  91. predicts.append(pd.concat([
  92. yhat[['ds', 'yhat', 'yhat_lower', 'yhat_upper']],
  93. df[index_predicted][['y']].reset_index(drop=True),
  94. pd.DataFrame({'cutoff': [cutoff] * len(yhat)})
  95. ], axis=1))
  96. # Combine all predicted pd.DataFrame into one pd.DataFrame
  97. return reduce(lambda x, y: x.append(y), predicts).reset_index(drop=True)
  98. def cross_validation(model, horizon, period=None, initial=None):
  99. """Cross-Validation for time series.
  100. Computes forecasts from historical cutoff points. Beginning from initial,
  101. makes cutoffs with a spacing of period up to (end - horizon).
  102. When period is equal to the time interval of the data, this is the
  103. technique described in https://robjhyndman.com/hyndsight/tscv/ .
  104. Parameters
  105. ----------
  106. model: Prophet class object. Fitted Prophet model
  107. horizon: string with pd.Timedelta compatible style, e.g., '5 days',
  108. '3 hours', '10 seconds'.
  109. period: string with pd.Timedelta compatible style. Simulated forecast will
  110. be done at every this period. If not provided, 0.5 * horizon is used.
  111. initial: string with pd.Timedelta compatible style. The first training
  112. period will begin here. If not provided, 3 * horizon is used.
  113. Returns
  114. -------
  115. A pd.DataFrame with the forecast, actual value and cutoff.
  116. """
  117. te = model.history['ds'].max()
  118. ts = model.history['ds'].min()
  119. horizon = pd.Timedelta(horizon)
  120. period = 0.5 * horizon if period is None else pd.Timedelta(period)
  121. initial = 3 * horizon if initial is None else pd.Timedelta(initial)
  122. k = int(np.ceil(((te - horizon) - (ts + initial)) / period))
  123. if k < 1:
  124. raise ValueError(
  125. 'Not enough data for specified horizon, period, and initial.')
  126. return simulated_historical_forecasts(model, horizon, k, period)
  127. def prophet_copy(m, cutoff=None):
  128. """Copy Prophet object
  129. Parameters
  130. ----------
  131. m: Prophet model.
  132. cutoff: pd.Timestamp or None, default None.
  133. cuttoff Timestamp for changepoints member variable.
  134. changepoints are only retained if 'changepoints <= cutoff'
  135. Returns
  136. -------
  137. Prophet class object with the same parameter with model variable
  138. """
  139. if m.history is None:
  140. raise Exception('This is for copying a fitted Prophet object.')
  141. if m.specified_changepoints:
  142. changepoints = m.changepoints
  143. if cutoff is not None:
  144. # Filter change points '<= cutoff'
  145. changepoints = changepoints[changepoints <= cutoff]
  146. else:
  147. changepoints = None
  148. # Auto seasonalities are set to False because they are already set in
  149. # m.seasonalities.
  150. m2 = m.__class__(
  151. growth=m.growth,
  152. n_changepoints=m.n_changepoints,
  153. changepoints=changepoints,
  154. yearly_seasonality=False,
  155. weekly_seasonality=False,
  156. daily_seasonality=False,
  157. holidays=m.holidays,
  158. seasonality_prior_scale=m.seasonality_prior_scale,
  159. changepoint_prior_scale=m.changepoint_prior_scale,
  160. holidays_prior_scale=m.holidays_prior_scale,
  161. mcmc_samples=m.mcmc_samples,
  162. interval_width=m.interval_width,
  163. uncertainty_samples=m.uncertainty_samples,
  164. )
  165. m2.extra_regressors = deepcopy(m.extra_regressors)
  166. m2.seasonalities = deepcopy(m.seasonalities)
  167. return m2
  168. def me(df):
  169. return((df['yhat'] - df['y']).sum()/len(df['yhat']))
  170. def mse(df):
  171. return((df['yhat'] - df['y']).pow(2).sum()/len(df))
  172. def rmse(df):
  173. return(np.sqrt((df['yhat'] - df['y']).pow(2).sum()/len(df)))
  174. def mae(df):
  175. return((df['yhat'] - df['y']).abs().sum()/len(df))
  176. def mpe(df):
  177. return((df['yhat'] - df['y']).div(df['y']).sum()*(1/len(df)))
  178. def mape(df):
  179. return((df['yhat'] - df['y']).div(df['y']).abs().sum()*(1/len(df)))
  180. def all_metrics(model, df_cv = None):
  181. """Compute model fit metrics for time series.
  182. Computes the following metrics about each time series that has been through
  183. Cross Validation;
  184. Mean Error (ME)
  185. Mean Squared Error (MSE)
  186. Root Mean Square Error (RMSE,
  187. Mean Absolute Error (MAE)
  188. Mean Percentage Error (MPE)
  189. Mean Absolute Percentage Error (MAPE)
  190. Parameters
  191. ----------
  192. df: A pandas dataframe. Contains y and yhat produced by cross-validation
  193. Returns
  194. -------
  195. A dictionary where the key = the error type, and value is the value of the error
  196. """
  197. df = []
  198. if df_cv is not None:
  199. df = df_cv
  200. else:
  201. # run a forecast on your own data with period = 0 so that it is in-sample data onlyl
  202. #df = model.predict(model.make_future_dataframe(periods=0))[['y', 'yhat']]
  203. df = (model
  204. .history[['ds', 'y']]
  205. .merge(
  206. model.predict(model.make_future_dataframe(periods=0))[['ds', 'yhat']],
  207. how='inner', on='ds'
  208. )
  209. )
  210. if 'yhat' not in df.columns:
  211. raise ValueError(
  212. 'Please run Cross-Validation first before computing quality metrics.')
  213. return {
  214. 'ME':me(df),
  215. 'MSE':mse(df),
  216. 'RMSE': rmse(df),
  217. 'MAE': mae(df),
  218. 'MPE': mpe(df),
  219. 'MAPE': mape(df)
  220. }