diagnostics.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. # Copyright (c) 2017-present, Facebook, Inc.
  2. # All rights reserved.
  3. #
  4. # This source code is licensed under the BSD-style license found in the
  5. # LICENSE file in the root directory of this source tree. An additional grant
  6. # of patent rights can be found in the PATENTS file in the same directory.
  7. from __future__ import absolute_import
  8. from __future__ import division
  9. from __future__ import print_function
  10. from __future__ import unicode_literals
  11. from copy import deepcopy
  12. from functools import reduce
  13. import logging
  14. import numpy as np
  15. import pandas as pd
  16. logger = logging.getLogger(__name__)
  17. def generate_cutoffs(df, horizon, initial, period):
  18. """Generate cutoff dates
  19. Parameters
  20. ----------
  21. df: pd.DataFrame with historical data.
  22. horizon: pd.Timedelta forecast horizon.
  23. initial: pd.Timedelta window of the initial forecast period.
  24. period: pd.Timedelta simulated forecasts are done with this period.
  25. Returns
  26. -------
  27. list of pd.Timestamp
  28. """
  29. # Last cutoff is 'latest date in data - horizon' date
  30. cutoff = df['ds'].max() - horizon
  31. if cutoff < df['ds'].min():
  32. raise ValueError('Less data than horizon.')
  33. result = [cutoff]
  34. while result[-1] >= min(df['ds']) + initial:
  35. cutoff -= period
  36. # If data does not exist in data range (cutoff, cutoff + horizon]
  37. if not (((df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)).any()):
  38. # Next cutoff point is 'last date before cutoff in data - horizon'
  39. closest_date = df[df['ds'] <= cutoff].max()['ds']
  40. cutoff = closest_date - horizon
  41. result.append(cutoff)
  42. result = result[:-1]
  43. if len(result) == 0:
  44. raise ValueError(
  45. 'Less data than horizon after initial window. '
  46. 'Make horizon or initial shorter.'
  47. )
  48. logger.info('Making {} forecasts with cutoffs between {} and {}'.format(
  49. len(result), result[-1], result[0]
  50. ))
  51. return reversed(result)
  52. def cross_validation(model, horizon, period=None, initial=None):
  53. """Cross-Validation for time series.
  54. Computes forecasts from historical cutoff points. Beginning from
  55. (end - horizon), works backwards making cutoffs with a spacing of period
  56. until initial is reached.
  57. When period is equal to the time interval of the data, this is the
  58. technique described in https://robjhyndman.com/hyndsight/tscv/ .
  59. Parameters
  60. ----------
  61. model: Prophet class object. Fitted Prophet model
  62. horizon: string with pd.Timedelta compatible style, e.g., '5 days',
  63. '3 hours', '10 seconds'.
  64. period: string with pd.Timedelta compatible style. Simulated forecast will
  65. be done at every this period. If not provided, 0.5 * horizon is used.
  66. initial: string with pd.Timedelta compatible style. The first training
  67. period will begin here. If not provided, 3 * horizon is used.
  68. Returns
  69. -------
  70. A pd.DataFrame with the forecast, actual value and cutoff.
  71. """
  72. df = model.history.copy().reset_index(drop=True)
  73. te = df['ds'].max()
  74. ts = df['ds'].min()
  75. horizon = pd.Timedelta(horizon)
  76. period = 0.5 * horizon if period is None else pd.Timedelta(period)
  77. initial = 3 * horizon if initial is None else pd.Timedelta(initial)
  78. cutoffs = generate_cutoffs(df, horizon, initial, period)
  79. predicts = []
  80. for cutoff in cutoffs:
  81. # Generate new object with copying fitting options
  82. m = prophet_copy(model, cutoff)
  83. # Train model
  84. history_c = df[df['ds'] <= cutoff]
  85. if history_c.shape[0] < 2:
  86. raise Exception(
  87. 'Less than two datapoints before cutoff. '
  88. 'Increase initial window.'
  89. )
  90. m.fit(history_c)
  91. # Calculate yhat
  92. index_predicted = (df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)
  93. # Get the columns for the future dataframe
  94. columns = ['ds']
  95. if m.growth == 'logistic':
  96. columns.append('cap')
  97. if m.logistic_floor:
  98. columns.append('floor')
  99. columns.extend(m.extra_regressors.keys())
  100. yhat = m.predict(df[index_predicted][columns])
  101. # Merge yhat(predicts), y(df, original data) and cutoff
  102. predicts.append(pd.concat([
  103. yhat[['ds', 'yhat', 'yhat_lower', 'yhat_upper']],
  104. df[index_predicted][['y']].reset_index(drop=True),
  105. pd.DataFrame({'cutoff': [cutoff] * len(yhat)})
  106. ], axis=1))
  107. # Combine all predicted pd.DataFrame into one pd.DataFrame
  108. return reduce(lambda x, y: x.append(y), predicts).reset_index(drop=True)
  109. def prophet_copy(m, cutoff=None):
  110. """Copy Prophet object
  111. Parameters
  112. ----------
  113. m: Prophet model.
  114. cutoff: pd.Timestamp or None, default None.
  115. cuttoff Timestamp for changepoints member variable.
  116. changepoints are only retained if 'changepoints <= cutoff'
  117. Returns
  118. -------
  119. Prophet class object with the same parameter with model variable
  120. """
  121. if m.history is None:
  122. raise Exception('This is for copying a fitted Prophet object.')
  123. if m.specified_changepoints:
  124. changepoints = m.changepoints
  125. if cutoff is not None:
  126. # Filter change points '<= cutoff'
  127. changepoints = changepoints[changepoints <= cutoff]
  128. else:
  129. changepoints = None
  130. # Auto seasonalities are set to False because they are already set in
  131. # m.seasonalities.
  132. m2 = m.__class__(
  133. growth=m.growth,
  134. n_changepoints=m.n_changepoints,
  135. changepoint_range=m.changepoint_range,
  136. changepoints=changepoints,
  137. yearly_seasonality=False,
  138. weekly_seasonality=False,
  139. daily_seasonality=False,
  140. holidays=m.holidays,
  141. append_holidays=m.append_holidays,
  142. seasonality_mode=m.seasonality_mode,
  143. seasonality_prior_scale=m.seasonality_prior_scale,
  144. changepoint_prior_scale=m.changepoint_prior_scale,
  145. holidays_prior_scale=m.holidays_prior_scale,
  146. mcmc_samples=m.mcmc_samples,
  147. interval_width=m.interval_width,
  148. uncertainty_samples=m.uncertainty_samples,
  149. )
  150. m2.extra_regressors = deepcopy(m.extra_regressors)
  151. m2.seasonalities = deepcopy(m.seasonalities)
  152. return m2
  153. def performance_metrics(df, metrics=None, rolling_window=0.1):
  154. """Compute performance metrics from cross-validation results.
  155. Computes a suite of performance metrics on the output of cross-validation.
  156. By default the following metrics are included:
  157. 'mse': mean squared error
  158. 'rmse': root mean squared error
  159. 'mae': mean absolute error
  160. 'mape': mean percent error
  161. 'coverage': coverage of the upper and lower intervals
  162. A subset of these can be specified by passing a list of names as the
  163. `metrics` argument.
  164. Metrics are calculated over a rolling window of cross validation
  165. predictions, after sorting by horizon. The size of that window (number of
  166. simulated forecast points) is determined by the rolling_window argument,
  167. which specifies a proportion of simulated forecast points to include in
  168. each window. rolling_window=0 will compute it separately for each simulated
  169. forecast point (i.e., 'mse' will actually be squared error with no mean).
  170. The default of rolling_window=0.1 will use 10% of the rows in df in each
  171. window. rolling_window=1 will compute the metric across all simulated forecast
  172. points. The results are set to the right edge of the window.
  173. The output is a dataframe containing column 'horizon' along with columns
  174. for each of the metrics computed.
  175. Parameters
  176. ----------
  177. df: The dataframe returned by cross_validation.
  178. metrics: A list of performance metrics to compute. If not provided, will
  179. use ['mse', 'rmse', 'mae', 'mape', 'coverage'].
  180. rolling_window: Proportion of data to use in each rolling window for
  181. computing the metrics. Should be in [0, 1].
  182. Returns
  183. -------
  184. Dataframe with a column for each metric, and column 'horizon'
  185. """
  186. valid_metrics = ['mse', 'rmse', 'mae', 'mape', 'coverage']
  187. if metrics is None:
  188. metrics = valid_metrics
  189. if len(set(metrics)) != len(metrics):
  190. raise ValueError('Input metrics must be a list of unique values')
  191. if not set(metrics).issubset(set(valid_metrics)):
  192. raise ValueError(
  193. 'Valid values for metrics are: {}'.format(valid_metrics)
  194. )
  195. df_m = df.copy()
  196. df_m['horizon'] = df_m['ds'] - df_m['cutoff']
  197. df_m.sort_values('horizon', inplace=True)
  198. # Window size
  199. w = int(rolling_window * df_m.shape[0])
  200. w = max(w, 1)
  201. w = min(w, df_m.shape[0])
  202. cols = ['horizon']
  203. for metric in metrics:
  204. df_m[metric] = eval(metric)(df_m, w)
  205. cols.append(metric)
  206. df_m = df_m[cols]
  207. return df_m.dropna()
  208. def rolling_mean(x, w):
  209. """Compute a rolling mean of x
  210. Right-aligned. Padded with NaNs on the front so the output is the same
  211. size as x.
  212. Parameters
  213. ----------
  214. x: Array.
  215. w: Integer window size (number of elements).
  216. Returns
  217. -------
  218. Rolling mean of x with window size w.
  219. """
  220. s = np.cumsum(np.insert(x, 0, 0))
  221. prefix = np.empty(w - 1)
  222. prefix.fill(np.nan)
  223. return np.hstack((prefix, (s[w:] - s[:-w]) / float(w))) # right-aligned
  224. # The functions below specify performance metrics for cross-validation results.
  225. # Each takes as input the output of cross_validation, and returns the statistic
  226. # as an array, given a window size for rolling aggregation.
  227. def mse(df, w):
  228. """Mean squared error
  229. Parameters
  230. ----------
  231. df: Cross-validation results dataframe.
  232. w: Aggregation window size.
  233. Returns
  234. -------
  235. Array of mean squared errors.
  236. """
  237. se = (df['y'] - df['yhat']) ** 2
  238. return rolling_mean(se.values, w)
  239. def rmse(df, w):
  240. """Root mean squared error
  241. Parameters
  242. ----------
  243. df: Cross-validation results dataframe.
  244. w: Aggregation window size.
  245. Returns
  246. -------
  247. Array of root mean squared errors.
  248. """
  249. return np.sqrt(mse(df, w))
  250. def mae(df, w):
  251. """Mean absolute error
  252. Parameters
  253. ----------
  254. df: Cross-validation results dataframe.
  255. w: Aggregation window size.
  256. Returns
  257. -------
  258. Array of mean absolute errors.
  259. """
  260. ae = np.abs(df['y'] - df['yhat'])
  261. return rolling_mean(ae.values, w)
  262. def mape(df, w):
  263. """Mean absolute percent error
  264. Parameters
  265. ----------
  266. df: Cross-validation results dataframe.
  267. w: Aggregation window size.
  268. Returns
  269. -------
  270. Array of mean absolute percent errors.
  271. """
  272. ape = np.abs((df['y'] - df['yhat']) / df['y'])
  273. return rolling_mean(ape.values, w)
  274. def smape(df, w):
  275. """Symmetric mean absolute percentage error
  276. Parameters
  277. ----------
  278. df: Cross-validation results dataframe.
  279. w: Aggregation window size.
  280. Returns
  281. -------
  282. Array of symmetric mean absolute percent errors.
  283. """
  284. sape = np.abs(df['yhat']-df['y']) / ((np.abs(df['y']) + np.abs(df['yhat'])) /2)
  285. return rolling_mean(sape.values, w)
  286. def coverage(df, w):
  287. """Coverage
  288. Parameters
  289. ----------
  290. df: Cross-validation results dataframe.
  291. w: Aggregation window size.
  292. Returns
  293. -------
  294. Array of coverages.
  295. """
  296. is_covered = (df['y'] >= df['yhat_lower']) & (df['y'] <= df['yhat_upper'])
  297. return rolling_mean(is_covered.values, w)