diagnostics.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. # Copyright (c) 2017-present, Facebook, Inc.
  2. # All rights reserved.
  3. #
  4. # This source code is licensed under the BSD-style license found in the
  5. # LICENSE file in the root directory of this source tree. An additional grant
  6. # of patent rights can be found in the PATENTS file in the same directory.
  7. from __future__ import absolute_import
  8. from __future__ import division
  9. from __future__ import print_function
  10. from __future__ import unicode_literals
  11. from copy import deepcopy
  12. from functools import reduce
  13. import logging
  14. import numpy as np
  15. import pandas as pd
  16. logger = logging.getLogger(__name__)
  17. def _cutoffs(df, horizon, k, period):
  18. """Generate cutoff dates
  19. Parameters
  20. ----------
  21. df: pd.DataFrame with historical data
  22. horizon: pd.Timedelta.
  23. Forecast horizon
  24. k: Int number.
  25. The number of forecasts point.
  26. period: pd.Timedelta.
  27. Simulated Forecast will be done at every this period.
  28. Returns
  29. -------
  30. list of pd.Timestamp
  31. """
  32. # Last cutoff is 'latest date in data - horizon' date
  33. cutoff = df['ds'].max() - horizon
  34. if cutoff < df['ds'].min():
  35. raise ValueError('Less data than horizon.')
  36. result = [cutoff]
  37. for i in range(1, k):
  38. cutoff -= period
  39. # If data does not exist in data range (cutoff, cutoff + horizon]
  40. if not (((df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)).any()):
  41. # Next cutoff point is 'last date before cutoff in data - horizon'
  42. closest_date = df[df['ds'] <= cutoff].max()['ds']
  43. cutoff = closest_date - horizon
  44. if cutoff < df['ds'].min():
  45. logger.warning(
  46. 'Not enough data for requested number of cutoffs! '
  47. 'Using {}.'.format(i))
  48. break
  49. result.append(cutoff)
  50. # Sort lines in ascending order
  51. return reversed(result)
  52. def simulated_historical_forecasts(model, horizon, k, period=None):
  53. """Simulated Historical Forecasts.
  54. Make forecasts from k historical cutoff points, working backwards from
  55. (end - horizon) with a spacing of period between each cutoff.
  56. Parameters
  57. ----------
  58. model: Prophet class object.
  59. Fitted Prophet model
  60. horizon: string with pd.Timedelta compatible style, e.g., '5 days',
  61. '3 hours', '10 seconds'.
  62. k: Int number of forecasts point.
  63. period: Optional string with pd.Timedelta compatible style. Simulated
  64. forecast will be done at every this period. If not provided,
  65. 0.5 * horizon is used.
  66. Returns
  67. -------
  68. A pd.DataFrame with the forecast, actual value and cutoff.
  69. """
  70. df = model.history.copy().reset_index(drop=True)
  71. horizon = pd.Timedelta(horizon)
  72. period = 0.5 * horizon if period is None else pd.Timedelta(period)
  73. cutoffs = _cutoffs(df, horizon, k, period)
  74. predicts = []
  75. for cutoff in cutoffs:
  76. # Generate new object with copying fitting options
  77. m = prophet_copy(model, cutoff)
  78. # Train model
  79. m.fit(df[df['ds'] <= cutoff])
  80. # Calculate yhat
  81. index_predicted = (df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)
  82. # Get the columns for the future dataframe
  83. columns = ['ds']
  84. if m.growth == 'logistic':
  85. columns.append('cap')
  86. if m.logistic_floor:
  87. columns.append('floor')
  88. columns.extend(m.extra_regressors.keys())
  89. yhat = m.predict(df[index_predicted][columns])
  90. # Merge yhat(predicts), y(df, original data) and cutoff
  91. predicts.append(pd.concat([
  92. yhat[['ds', 'yhat', 'yhat_lower', 'yhat_upper']],
  93. df[index_predicted][['y']].reset_index(drop=True),
  94. pd.DataFrame({'cutoff': [cutoff] * len(yhat)})
  95. ], axis=1))
  96. # Combine all predicted pd.DataFrame into one pd.DataFrame
  97. return reduce(lambda x, y: x.append(y), predicts).reset_index(drop=True)
  98. def cross_validation(model, horizon, period=None, initial=None):
  99. """Cross-Validation for time series.
  100. Computes forecasts from historical cutoff points. Beginning from initial,
  101. makes cutoffs with a spacing of period up to (end - horizon).
  102. When period is equal to the time interval of the data, this is the
  103. technique described in https://robjhyndman.com/hyndsight/tscv/ .
  104. Parameters
  105. ----------
  106. model: Prophet class object. Fitted Prophet model
  107. horizon: string with pd.Timedelta compatible style, e.g., '5 days',
  108. '3 hours', '10 seconds'.
  109. period: string with pd.Timedelta compatible style. Simulated forecast will
  110. be done at every this period. If not provided, 0.5 * horizon is used.
  111. initial: string with pd.Timedelta compatible style. The first training
  112. period will begin here. If not provided, 3 * horizon is used.
  113. Returns
  114. -------
  115. A pd.DataFrame with the forecast, actual value and cutoff.
  116. """
  117. te = model.history['ds'].max()
  118. ts = model.history['ds'].min()
  119. horizon = pd.Timedelta(horizon)
  120. period = 0.5 * horizon if period is None else pd.Timedelta(period)
  121. initial = 3 * horizon if initial is None else pd.Timedelta(initial)
  122. k = int(np.ceil(((te - horizon) - (ts + initial)) / period))
  123. if k < 1:
  124. raise ValueError(
  125. 'Not enough data for specified horizon, period, and initial.')
  126. return simulated_historical_forecasts(model, horizon, k, period)
  127. def prophet_copy(m, cutoff=None):
  128. """Copy Prophet object
  129. Parameters
  130. ----------
  131. m: Prophet model.
  132. cutoff: pd.Timestamp or None, default None.
  133. cuttoff Timestamp for changepoints member variable.
  134. changepoints are only retained if 'changepoints <= cutoff'
  135. Returns
  136. -------
  137. Prophet class object with the same parameter with model variable
  138. """
  139. if m.history is None:
  140. raise Exception('This is for copying a fitted Prophet object.')
  141. if m.specified_changepoints:
  142. changepoints = m.changepoints
  143. if cutoff is not None:
  144. # Filter change points '<= cutoff'
  145. changepoints = changepoints[changepoints <= cutoff]
  146. else:
  147. changepoints = None
  148. # Auto seasonalities are set to False because they are already set in
  149. # m.seasonalities.
  150. m2 = m.__class__(
  151. growth=m.growth,
  152. n_changepoints=m.n_changepoints,
  153. changepoints=changepoints,
  154. yearly_seasonality=False,
  155. weekly_seasonality=False,
  156. daily_seasonality=False,
  157. holidays=m.holidays,
  158. seasonality_prior_scale=m.seasonality_prior_scale,
  159. changepoint_prior_scale=m.changepoint_prior_scale,
  160. holidays_prior_scale=m.holidays_prior_scale,
  161. mcmc_samples=m.mcmc_samples,
  162. interval_width=m.interval_width,
  163. uncertainty_samples=m.uncertainty_samples,
  164. )
  165. m2.extra_regressors = deepcopy(m.extra_regressors)
  166. m2.seasonalities = deepcopy(m.seasonalities)
  167. return m2
  168. def performance_metrics(df, metrics=None, aggregation='horizon'):
  169. """Compute performance metrics from cross-validation results.
  170. Computes a suite of performance metrics on the output of cross-validation.
  171. By default the following metrics are included:
  172. 'mse': mean squared error
  173. 'mae': mean absolute error
  174. 'mape': mean percent error
  175. 'coverage': coverage of the upper and lower intervals
  176. A subset of these can be specified by passing a list of names as the
  177. `metrics` argument.
  178. By default, metrics will be computed for each horizon (ds - cutoff).
  179. Alternatively, metrics can be computed at the level of individual ds/cutoff
  180. pairs (aggregation='none'), or aggregated over all ds/cutoffs
  181. (aggregation='all').
  182. The output is a dataframe containing the columns corresponding to the level
  183. of aggregation ('horizon', 'ds' and 'cutoff', or none) along with columns
  184. for each of the metrics computed.
  185. Parameters
  186. ----------
  187. df: The dataframe returned by cross_validation.
  188. metrics: A list of performance metrics to compute. If not provided, will
  189. use ['mse', 'mae', 'mape', 'coverage'].
  190. aggregation: Level of aggregation for computing performance statistics.
  191. Must be 'horizon', 'none', or 'all'.
  192. Returns
  193. -------
  194. Dataframe with a column for each metric, and a combination of columns 'ds',
  195. 'cutoff', and 'horizon', depending on the aggregation level.
  196. """
  197. # Input validation
  198. valid_aggregations = ['horizon', 'all', 'none']
  199. if aggregation not in valid_aggregations:
  200. raise ValueError(
  201. 'Aggregation {} is not valid; must be one of {}'.format(
  202. aggregation, valid_agggregations
  203. )
  204. )
  205. valid_metrics = ['mse', 'mae', 'mape', 'coverage']
  206. if metrics is None:
  207. metrics = valid_metrics
  208. if len(set(metrics)) != len(metrics):
  209. raise ValueError('Input metrics must be a list of unique values')
  210. if not set(metrics).issubset(set(valid_metrics)):
  211. raise ValueError(
  212. 'Valid values for metrics are: {}'.format(valid_metrics)
  213. )
  214. # Get function for the metrics we want
  215. metric_fns = {m: eval(m) for m in metrics}
  216. def all_metrics(df_g):
  217. return pd.Series({name: fn(df_g) for name, fn in metric_fns.items()})
  218. # Apply functions to groupby
  219. if aggregation == 'all':
  220. return all_metrics(df)
  221. # else,
  222. df_m = df.copy()
  223. df_m['horizon'] = df_m['ds'] - df_m['cutoff']
  224. if aggregation == 'horizon':
  225. return df_m.groupby('horizon').apply(all_metrics).reset_index()
  226. # else,
  227. for name, fn in metric_fns.items():
  228. df_m[name] = fn(df_m, agg=False)
  229. return df_m
  230. # The functions below specify performance metrics for cross-validation results.
  231. # Each takes as input the output of cross_validation, and has two modes of
  232. # return: if agg=True, returns a float that is the metric aggregated over the
  233. # input. If agg=False, returns results without aggregation (for
  234. # aggregation='none' in performance_metrics).
  235. def mse(df, agg=True):
  236. """Mean squared error
  237. """
  238. se = (df['y'] - df['yhat']) ** 2
  239. if agg:
  240. return np.mean(se)
  241. return se
  242. def mae(df, agg=True):
  243. """Mean absolute error
  244. """
  245. ae = np.abs(df['y'] - df['yhat'])
  246. if agg:
  247. return np.mean(ae)
  248. return ae
  249. def mape(df, agg=True):
  250. """Mean absolute percent error
  251. """
  252. ape = np.abs((df['y'] - df['yhat']) / df['y'])
  253. if agg:
  254. return np.mean(ape)
  255. return ape
  256. def coverage(df, agg=True):
  257. """Coverage
  258. """
  259. is_covered = (df['y'] >= df['yhat_lower']) & (df['y'] <= df['yhat_upper'])
  260. if agg:
  261. return np.mean(is_covered)
  262. return is_covered