diagnostics.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. # Copyright (c) 2017-present, Facebook, Inc.
  2. # All rights reserved.
  3. #
  4. # This source code is licensed under the BSD-style license found in the
  5. # LICENSE file in the root directory of this source tree. An additional grant
  6. # of patent rights can be found in the PATENTS file in the same directory.
  7. from __future__ import absolute_import
  8. from __future__ import division
  9. from __future__ import print_function
  10. from __future__ import unicode_literals
  11. import logging
  12. logger = logging.getLogger(__name__)
  13. import numpy as np
  14. import pandas as pd
  15. from functools import reduce
  16. def _cutoffs(df, horizon, k, period):
  17. """Generate cutoff dates
  18. Parameters
  19. ----------
  20. df: pd.DataFrame with historical data
  21. horizon: pd.Timedelta.
  22. Forecast horizon
  23. k: Int number.
  24. The number of forecasts point.
  25. period: pd.Timedelta.
  26. Simulated Forecast will be done at every this period.
  27. Returns
  28. -------
  29. list of pd.Timestamp
  30. """
  31. # Last cutoff is 'latest date in data - horizon' date
  32. cutoff = df['ds'].max() - horizon
  33. if cutoff < df['ds'].min():
  34. raise ValueError('Less data than horizon.')
  35. result = [cutoff]
  36. for i in range(1, k):
  37. cutoff -= period
  38. # If data does not exist in data range (cutoff, cutoff + horizon]
  39. if not (((df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)).any()):
  40. # Next cutoff point is 'last date before cutoff in data - horizon'
  41. closest_date = df[df['ds'] <= cutoff].max()['ds']
  42. cutoff = closest_date - horizon
  43. if cutoff < df['ds'].min():
  44. logger.warning(
  45. 'Not enough data for requested number of cutoffs! '
  46. 'Using {}.'.format(i))
  47. break
  48. result.append(cutoff)
  49. # Sort lines in ascending order
  50. return reversed(result)
  51. def simulated_historical_forecasts(model, horizon, k, period=None):
  52. """Simulated Historical Forecasts.
  53. Make forecasts from k historical cutoff points, working backwards from
  54. (end - horizon) with a spacing of period between each cutoff.
  55. Parameters
  56. ----------
  57. model: Prophet class object.
  58. Fitted Prophet model
  59. horizon: string with pd.Timedelta compatible style, e.g., '5 days',
  60. '3 hours', '10 seconds'.
  61. k: Int number of forecasts point.
  62. period: Optional string with pd.Timedelta compatible style. Simulated
  63. forecast will be done at every this period. If not provided,
  64. 0.5 * horizon is used.
  65. Returns
  66. -------
  67. A pd.DataFrame with the forecast, actual value and cutoff.
  68. """
  69. df = model.history.copy().reset_index(drop=True)
  70. horizon = pd.Timedelta(horizon)
  71. period = 0.5 * horizon if period is None else pd.Timedelta(period)
  72. cutoffs = _cutoffs(df, horizon, k, period)
  73. predicts = []
  74. for cutoff in cutoffs:
  75. # Generate new object with copying fitting options
  76. m = model.copy(cutoff)
  77. # Train model
  78. m.fit(df[df['ds'] <= cutoff])
  79. # Calculate yhat
  80. index_predicted = (df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)
  81. # Get the columns for the future dataframe
  82. columns = ['ds']
  83. if m.growth == 'logistic':
  84. columns.append('cap')
  85. if m.logistic_floor:
  86. columns.append('floor')
  87. columns.extend(m.extra_regressors.keys())
  88. yhat = m.predict(df[index_predicted][columns])
  89. # Merge yhat(predicts), y(df, original data) and cutoff
  90. predicts.append(pd.concat([
  91. yhat[['ds', 'yhat', 'yhat_lower', 'yhat_upper']],
  92. df[index_predicted][['y']].reset_index(drop=True),
  93. pd.DataFrame({'cutoff': [cutoff] * len(yhat)})
  94. ], axis=1))
  95. # Combine all predicted pd.DataFrame into one pd.DataFrame
  96. return reduce(lambda x, y: x.append(y), predicts).reset_index(drop=True)
  97. def cross_validation(model, horizon, period=None, initial=None):
  98. """Cross-Validation for time series.
  99. Computes forecasts from historical cutoff points. Beginning from initial,
  100. makes cutoffs with a spacing of period up to (end - horizon).
  101. When period is equal to the time interval of the data, this is the
  102. technique described in https://robjhyndman.com/hyndsight/tscv/ .
  103. Parameters
  104. ----------
  105. model: Prophet class object. Fitted Prophet model
  106. horizon: string with pd.Timedelta compatible style, e.g., '5 days',
  107. '3 hours', '10 seconds'.
  108. period: string with pd.Timedelta compatible style. Simulated forecast will
  109. be done at every this period. If not provided, 0.5 * horizon is used.
  110. initial: string with pd.Timedelta compatible style. The first training
  111. period will begin here. If not provided, 3 * horizon is used.
  112. Returns
  113. -------
  114. A pd.DataFrame with the forecast, actual value and cutoff.
  115. """
  116. te = model.history['ds'].max()
  117. ts = model.history['ds'].min()
  118. horizon = pd.Timedelta(horizon)
  119. period = 0.5 * horizon if period is None else pd.Timedelta(period)
  120. initial = 3 * horizon if initial is None else pd.Timedelta(initial)
  121. k = int(np.ceil(((te - horizon) - (ts + initial)) / period))
  122. if k < 1:
  123. raise ValueError(
  124. 'Not enough data for specified horizon, period, and initial.')
  125. return simulated_historical_forecasts(model, horizon, k, period)
  126. def me(df):
  127. return((df['yhat'] - df['y']).sum()/len(df['yhat']))
  128. def mse(df):
  129. return((df['yhat'] - df['y']).pow(2).sum()/len(df))
  130. def rmse(df):
  131. return(np.sqrt((df['yhat'] - df['y']).pow(2).sum()/len(df)))
  132. def mae(df):
  133. return((df['yhat'] - df['y']).abs().sum()/len(df))
  134. def mpe(df):
  135. return((df['yhat'] - df['y']).div(df['y']).sum()*(1/len(df)))
  136. def mape(df):
  137. return((df['yhat'] - df['y']).div(df['y']).abs().sum()*(1/len(df)))
  138. def all_metrics(model, df_cv = None):
  139. """Compute model fit metrics for time series.
  140. Computes the following metrics about each time series that has been through
  141. Cross Validation;
  142. Mean Error (ME)
  143. Mean Squared Error (MSE)
  144. Root Mean Square Error (RMSE,
  145. Mean Absolute Error (MAE)
  146. Mean Percentage Error (MPE)
  147. Mean Absolute Percentage Error (MAPE)
  148. Parameters
  149. ----------
  150. df: A pandas dataframe. Contains y and yhat produced by cross-validation
  151. Returns
  152. -------
  153. A dictionary where the key = the error type, and value is the value of the error
  154. """
  155. df = []
  156. if df_cv is not None:
  157. df = df_cv
  158. else:
  159. # run a forecast on your own data with period = 0 so that it is in-sample data onlyl
  160. #df = model.predict(model.make_future_dataframe(periods=0))[['y', 'yhat']]
  161. df = (model
  162. .history[['ds', 'y']]
  163. .merge(
  164. model.predict(model.make_future_dataframe(periods=0))[['ds', 'yhat']],
  165. how='inner', on='ds'
  166. )
  167. )
  168. if 'yhat' not in df.columns:
  169. raise ValueError(
  170. 'Please run Cross-Validation first before computing quality metrics.')
  171. return {
  172. 'ME':me(df),
  173. 'MSE':mse(df),
  174. 'RMSE': rmse(df),
  175. 'MAE': mae(df),
  176. 'MPE': mpe(df),
  177. 'MAPE': mape(df)
  178. }