Преглед на файлове

Simplify logic/interfaces for cross_validation, to better handle irregularly spaced data and actually respect initial window

Ben Letham преди 7 години
родител
ревизия
218283f157

+ 0 - 1
R/NAMESPACE

@@ -15,6 +15,5 @@ export(plot_forecast_component)
 export(predictive_samples)
 export(prophet)
 export(prophet_plot_components)
-export(simulated_historical_forecasts)
 import(Rcpp)
 importFrom(dplyr,"%>%")

+ 50 - 76
R/R/diagnostics.R

@@ -11,68 +11,81 @@ globalVariables(c(
 
 #' Generate cutoff dates
 #'
-#' @param df Dataframe with historical data
-#' @param horizon timediff forecast horizon
-#' @param k integer number of forecast points
+#' @param df Dataframe with historical data.
+#' @param horizon difftime forecast horizon.
+#' @param initial difftime initial window.
 #' @param period timediff Simulated forecasts are done with this period.
 #'
-#' @return Array of datetimes
+#' @return Array of datetimes.
 #'
 #' @keywords internal
-generate_cutoffs <- function(df, horizon, k, period) {
+generate_cutoffs <- function(df, horizon, initial, period) {
   # Last cutoff is (latest date in data) - (horizon).
   cutoff <- max(df$ds) - horizon
-  if (cutoff < min(df$ds)) {
-    stop('Less data than horizon.')
-  }
   tzone <- attr(cutoff, "tzone")  # Timezone is wiped by putting in array
-  result <- cutoff
-  if (k > 1) {
-    for (i in 2:k) {
-      cutoff <- cutoff - period
-      # If data does not exist in data range (cutoff, cutoff + horizon]
-      if (!any((df$ds > cutoff) & (df$ds <= cutoff + horizon))) {
+  result <- c(cutoff)
+  while (result[length(result)] >= min(df$ds) + initial) {
+    cutoff <- cutoff - period
+    # If data does not exist in data range (cutoff, cutoff + horizon]
+    if (!any((df$ds > cutoff) & (df$ds <= cutoff + horizon))) {
         # Next cutoff point is 'closest date before cutoff in data - horizon'
         closest.date <- max(df$ds[df$ds <= cutoff])
         cutoff <- closest.date - horizon
-      }
-      if (cutoff < min(df$ds)) {
-        warning('Not enough data for requested number of cutoffs! Using ', i)
-        break
-      }
-      result <- c(result, cutoff)
     }
+    result <- c(result, cutoff)
+  }
+  result <- head(result, -1)
+  if (length(result) == 0) {
+    stop(paste(
+      'Less data than horizon after initial window.',
+      'Make horizon or initial shorter.'
+    ))
   }
   # Reset timezones
   attr(result, "tzone") <- tzone
+  message(paste(
+    'Making', length(result), 'forecasts with cutoffs between',
+    result[length(result)], 'and', result[1]
+  ))
   return(rev(result))
 }
 
-#' Simulated historical forecasts.
+#' Cross-validation for time series.
+#'
+#' Computes forecasts from historical cutoff points. Beginning from
+#' (end - horizon), works backwards making cutoffs with a spacing of period
+#' until initial is reached.
 #'
-#' Make forecasts from k historical cutoff points, working backwards from
-#' (end - horizon) with a spacing of period between each cutoff.
+#' When period is equal to the time interval of the data, this is the
+#' technique described in https://robjhyndman.com/hyndsight/tscv/ .
 #'
 #' @param model Fitted Prophet model.
 #' @param horizon Integer size of the horizon
 #' @param units String unit of the horizon, e.g., "days", "secs".
-#' @param k integer number of forecast points
 #' @param period Integer amount of time between cutoff dates. Same units as
-#'  horizon. If not provided, will use 0.5 * horizon.
+#'  horizon. If not provided, 0.5 * horizon is used.
+#' @param initial Integer size of the first training period. If not provided,
+#'  3 * horizon is used. Same units as horizon.
 #'
 #' @return A dataframe with the forecast, actual value, and cutoff date.
 #'
 #' @export
-simulated_historical_forecasts <- function(model, horizon, units, k,
-                                           period = NULL) {
+cross_validation <- function(
+    model, horizon, units, period = NULL, initial = NULL) {
   df <- model$history
-  horizon <- as.difftime(horizon, units = units)
+  te <- max(df$ds)
+  ts <- min(df$ds)
   if (is.null(period)) {
-    period <- horizon / 2
-  } else {
-    period <- as.difftime(period, units = units)
+    period <- 0.5 * horizon
+  }
+  if (is.null(initial)) {
+    initial <- 3 * horizon
   }
-  cutoffs <- generate_cutoffs(df, horizon, k, period)
+  horizon.dt <- as.difftime(horizon, units = units)
+  initial.dt <- as.difftime(initial, units = units)
+  period.dt <- as.difftime(period, units = units)
+
+  cutoffs <- generate_cutoffs(df, horizon.dt, initial.dt, period.dt)
   predicts <- data.frame()
   for (i in seq_along(cutoffs)) {
     cutoff <- cutoffs[i]
@@ -80,9 +93,12 @@ simulated_historical_forecasts <- function(model, horizon, units, k,
     m <- prophet_copy(model, cutoff)
     # Train model
     history.c <- dplyr::filter(df, ds <= cutoff)
+    if (nrow(history.c) < 2) {
+      stop('Less than two datapoints before cutoff. Increase initial window.')
+    }
     m <- fit.prophet(m, history.c)
     # Calculate yhat
-    df.predict <- dplyr::filter(df, ds > cutoff, ds <= cutoff + horizon)
+    df.predict <- dplyr::filter(df, ds > cutoff, ds <= cutoff + horizon.dt)
     # Get the columns for the future dataframe
     columns <- 'ds'
     if (m$growth == 'logistic') {
@@ -92,7 +108,7 @@ simulated_historical_forecasts <- function(model, horizon, units, k,
       }
     }
     columns <- c(columns, names(m$extra_regressors))
-    future <- df[columns]
+    future <- df.predict[columns]
     yhat <- stats::predict(m, future)
     # Merge yhat, y, and cutoff.
     df.c <- dplyr::inner_join(df.predict, yhat, by = "ds")
@@ -103,48 +119,6 @@ simulated_historical_forecasts <- function(model, horizon, units, k,
   return(predicts)
 }
 
-#' Cross-validation for time series.
-#'
-#' Computes forecasts from historical cutoff points. Beginning from initial,
-#' makes cutoffs with a spacing of period up to (end - horizon).
-#'
-#' When period is equal to the time interval of the data, this is the
-#' technique described in https://robjhyndman.com/hyndsight/tscv/ .
-#'
-#' @param model Fitted Prophet model.
-#' @param horizon Integer size of the horizon
-#' @param units String unit of the horizon, e.g., "days", "secs".
-#' @param period Integer amount of time between cutoff dates. Same units as
-#'  horizon. If not provided, 0.5 * horizon is used.
-#' @param initial Integer size of the first training period. If not provided,
-#'  3 * horizon is used. Same units as horizon.
-#'
-#' @return A dataframe with the forecast, actual value, and cutoff date.
-#'
-#' @export
-cross_validation <- function(
-    model, horizon, units, period = NULL, initial = NULL) {
-  te <- max(model$history$ds)
-  ts <- min(model$history$ds)
-  if (is.null(period)) {
-    period <- 0.5 * horizon
-  }
-  if (is.null(initial)) {
-    initial <- 3 * horizon
-  }
-  horizon.dt <- as.difftime(horizon, units = units)
-  initial.dt <- as.difftime(initial, units = units)
-  period.dt <- as.difftime(period, units = units)
-  k <- ceiling(
-    as.double((te - horizon.dt) - (ts + initial.dt), units='secs') /
-    as.double(period.dt, units = 'secs')
-  )
-  if (k < 1) {
-    stop('Not enough data for specified horizon, period, and initial.')
-  }
-  return(simulated_historical_forecasts(model, horizon, units, k, period))
-}
-
 #' Copy Prophet object.
 #'
 #' @param m Prophet model object.

+ 3 - 2
R/man/cross_validation.Rd

@@ -23,8 +23,9 @@ horizon. If not provided, 0.5 * horizon is used.}
 A dataframe with the forecast, actual value, and cutoff date.
 }
 \description{
-Computes forecasts from historical cutoff points. Beginning from initial,
-makes cutoffs with a spacing of period up to (end - horizon).
+Computes forecasts from historical cutoff points. Beginning from
+(end - horizon), works backwards making cutoffs with a spacing of period
+until initial is reached.
 }
 \details{
 When period is equal to the time interval of the data, this is the

+ 3 - 2
R/man/dyplot.prophet.Rd

@@ -24,8 +24,9 @@ Plot the prophet forecast.
 }
 \examples{
 \dontrun{
-history <- data.frame(ds = seq(as.Date('2015-01-01'), as.Date('2016-01-01'), by = 'd'),
-                      y = sin(1:366/200) + rnorm(366)/10)
+history <- data.frame(
+ ds = seq(as.Date('2015-01-01'), as.Date('2016-01-01'), by = 'd'),
+ y = sin(1:366/200) + rnorm(366)/10)
 m <- prophet(history)
 future <- make_future_dataframe(m, periods = 365)
 forecast <- predict(m, future)

+ 2 - 2
R/man/generate_cutoffs.Rd

@@ -4,14 +4,14 @@
 \alias{generate_cutoffs}
 \title{Generate cutoff dates}
 \usage{
-generate_cutoffs(df, horizon, k, period)
+generate_cutoffs(df, horizon, initial, period)
 }
 \arguments{
 \item{df}{Dataframe with historical data}
 
 \item{horizon}{timediff forecast horizon}
 
-\item{k}{integer number of forecast points}
+\item{initial}{difftime initial window}
 
 \item{period}{timediff Simulated forecasts are done with this period.}
 }

+ 12 - 7
R/man/prophet.Rd

@@ -5,12 +5,13 @@
 \title{Prophet forecaster.}
 \usage{
 prophet(df = NULL, growth = "linear", changepoints = NULL,
-  n.changepoints = 25, yearly.seasonality = "auto",
-  weekly.seasonality = "auto", daily.seasonality = "auto",
-  holidays = NULL, seasonality.mode = "additive",
-  seasonality.prior.scale = 10, holidays.prior.scale = 10,
-  changepoint.prior.scale = 0.05, mcmc.samples = 0, interval.width = 0.8,
-  uncertainty.samples = 1000, fit = TRUE, ...)
+  n.changepoints = 25, changepoint.range = 0.8,
+  yearly.seasonality = "auto", weekly.seasonality = "auto",
+  daily.seasonality = "auto", holidays = NULL,
+  seasonality.mode = "additive", seasonality.prior.scale = 10,
+  holidays.prior.scale = 10, changepoint.prior.scale = 0.05,
+  mcmc.samples = 0, interval.width = 0.8, uncertainty.samples = 1000,
+  fit = TRUE, ...)
 }
 \arguments{
 \item{df}{(optional) Dataframe containing the history. Must have columns ds
@@ -29,7 +30,11 @@ automatically.}
 \item{n.changepoints}{Number of potential changepoints to include. Not used
 if input `changepoints` is supplied. If `changepoints` is not supplied,
 then n.changepoints potential changepoints are selected uniformly from the
-first 80 percent of df$ds.}
+first `changepoint.range` proportion of df$ds.}
+
+\item{changepoint.range}{Proportion of history in which trend changepoints
+will be estimated. Defaults to 0.8 for the first 80\%. Not used if
+`changepoints` is specified.}
 
 \item{yearly.seasonality}{Fit yearly seasonality. Can be 'auto', TRUE,
 FALSE, or a number of Fourier terms to generate.}

+ 0 - 27
R/man/simulated_historical_forecasts.Rd

@@ -1,27 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/diagnostics.R
-\name{simulated_historical_forecasts}
-\alias{simulated_historical_forecasts}
-\title{Simulated historical forecasts.}
-\usage{
-simulated_historical_forecasts(model, horizon, units, k, period = NULL)
-}
-\arguments{
-\item{model}{Fitted Prophet model.}
-
-\item{horizon}{Integer size of the horizon}
-
-\item{units}{String unit of the horizon, e.g., "days", "secs".}
-
-\item{k}{integer number of forecast points}
-
-\item{period}{Integer amount of time between cutoff dates. Same units as
-horizon. If not provided, will use 0.5 * horizon.}
-}
-\value{
-A dataframe with the forecast, actual value, and cutoff date.
-}
-\description{
-Make forecasts from k historical cutoff points, working backwards from
-(end - horizon) with a spacing of period between each cutoff.
-}

+ 42 - 66
R/tests/testthat/test_diagnostics.R

@@ -8,47 +8,50 @@ DATA_all <- read.csv('data.csv')
 DATA_all$ds <- as.Date(DATA_all$ds)
 DATA <- head(DATA_all, 100)
 
-test_that("simulated_historical_forecasts", {
+test_that("cross_validation", {
   skip_if_not(Sys.getenv('R_ARCH') != '/i386')
   m <- prophet(DATA)
-  k <- 2
-  for (p in c(1, 10)) {
-    for (h in c(1, 3)) {
-      df.shf <- simulated_historical_forecasts(
-        m, horizon = h, units = 'days', k = k, period = p)
-      # All cutoff dates should be less than ds dates
-      expect_true(all(df.shf$cutoff < df.shf$ds))
-      # The unique size of output cutoff should be equal to 'k'
-      expect_equal(length(unique(df.shf$cutoff)), k)
-      expect_equal(max(df.shf$ds - df.shf$cutoff),
-                   as.difftime(h, units = 'days'))
-      dc <- diff(df.shf$cutoff)
-      dc <- min(dc[dc > 0])
-      expect_true(dc >= as.difftime(p, units = 'days'))
-      # Each y in df_shf and DATA with same ds should be equal
-      df.merged <- dplyr::left_join(df.shf, m$history, by="ds")
-      expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
-    }
-  }
+  # Date range and windows used in the expectations below
+  te <- max(DATA$ds)
+  ts <- min(DATA$ds)
+  horizon <- as.difftime(4, units = "days")
+  period <- as.difftime(10, units = "days")
+  initial <- as.difftime(115, units = "days")
+  df.cv <- cross_validation(
+    m, horizon = 4, units = "days", period = 10, initial = 115)
+  expect_equal(length(unique(df.cv$cutoff)), 3)
+  expect_equal(max(df.cv$ds - df.cv$cutoff), horizon)
+  expect_true(min(df.cv$cutoff) >= ts + initial)
+  dc <- diff(df.cv$cutoff)
+  dc <- min(dc[dc > 0])
+  expect_true(dc >= period)
+  expect_true(all(df.cv$cutoff < df.cv$ds))
+  # Each y in df.cv and DATA with same ds should be equal
+  df.merged <- dplyr::left_join(df.cv, m$history, by="ds")
+  expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
+  df.cv <- cross_validation(
+    m, horizon = 4, units = "days", period = 10, initial = 135)
+  expect_equal(length(unique(df.cv$cutoff)), 1)
+  expect_error(
+    cross_validation(
+      m, horizon = 10, units = "days", period = 10, initial = 140)
+  )
 })
 
-test_that("simulated_historical_forecasts_logistic", {
+test_that("cross_validation_logistic", {
   skip_if_not(Sys.getenv('R_ARCH') != '/i386')
   df <- DATA
   df$cap <- 40
-  m <- prophet(df, growth='logistic')
-  df.shf <- simulated_historical_forecasts(
-    m, horizon = 3, units = 'days', k = 2, period = 3)
-  # All cutoff dates should be less than ds dates
-  expect_true(all(df.shf$cutoff < df.shf$ds))
-  # The unique size of output cutoff should be equal to 'k'
-  expect_equal(length(unique(df.shf$cutoff)), 2)
-  # Each y in df_shf and DATA with same ds should be equal
-  df.merged <- dplyr::left_join(df.shf, m$history, by="ds")
+  m <- prophet(df, growth = 'logistic')
+  df.cv <- cross_validation(
+    m, horizon = 1, units = "days", period = 1, initial = 140)
+  expect_equal(length(unique(df.cv$cutoff)), 2)
+  expect_true(all(df.cv$cutoff < df.cv$ds))
+  df.merged <- dplyr::left_join(df.cv, m$history, by="ds")
   expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
 })
 
-test_that("simulated_historical_forecasts_extra_regressors", {
+test_that("cross_validation_extra_regressors", {
   skip_if_not(Sys.getenv('R_ARCH') != '/i386')
   df <- DATA
   df$extra <- seq(0, nrow(df) - 1)
@@ -56,43 +59,16 @@ test_that("simulated_historical_forecasts_extra_regressors", {
   m <- add_seasonality(m, name = 'monthly', period = 30.5, fourier.order = 5)
   m <- add_regressor(m, 'extra')
   m <- fit.prophet(m, df)
-  df.shf <- simulated_historical_forecasts(
-    m, horizon = 3, units = 'days', k = 2, period = 3)
-  # All cutoff dates should be less than ds dates
-  expect_true(all(df.shf$cutoff < df.shf$ds))
-  # The unique size of output cutoff should be equal to 'k'
-  expect_equal(length(unique(df.shf$cutoff)), 2)
-  # Each y in df_shf and DATA with same ds should be equal
-  df.merged <- dplyr::left_join(df.shf, m$history, by="ds")
-  expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
-})
-
-test_that("simulated_historical_forecasts_default_value_check", {
-  skip_if_not(Sys.getenv('R_ARCH') != '/i386')
-  m <- prophet(DATA)
-  df.shf1 <- simulated_historical_forecasts(
-    m, horizon = 10, units = 'days', k = 1)
-  df.shf2 <- simulated_historical_forecasts(
-    m, horizon = 10, units = 'days', k = 1, period = 5)
-  expect_equal(sum(dplyr::select(df.shf1 - df.shf2, y, yhat)), 0)
-})
-
-test_that("cross_validation", {
-  skip_if_not(Sys.getenv('R_ARCH') != '/i386')
-  m <- prophet(DATA)
-  # Calculate the number of cutoff points
-  te <- max(DATA$ds)
-  ts <- min(DATA$ds)
-  horizon <- as.difftime(4, units = "days")
-  period <- as.difftime(10, units = "days")
-  k <- 5
   df.cv <- cross_validation(
-    m, horizon = 4, units = "days", period = 10, initial = 90)
-  expect_equal(length(unique(df.cv$cutoff)), k)
-  expect_equal(max(df.cv$ds - df.cv$cutoff), horizon)
+    m, horizon = 4, units = "days", period = 4, initial = 135)
+  expect_equal(length(unique(df.cv$cutoff)), 2)
+  period <- as.difftime(4, units = "days")
   dc <- diff(df.cv$cutoff)
   dc <- min(dc[dc > 0])
   expect_true(dc >= period)
+  expect_true(all(df.cv$cutoff < df.cv$ds))
+  df.merged <- dplyr::left_join(df.cv, m$history, by="ds")
+  expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
 })
 
 test_that("cross_validation_default_value_check", {
@@ -116,11 +92,11 @@ test_that("performance_metrics", {
     sort(colnames(df_none))
     == sort(c('horizon', 'coverage', 'mae', 'mape', 'mse', 'rmse'))
   ))
-  expect_equal(nrow(df_none), 14)
+  expect_equal(nrow(df_none), 16)
   # Aggregation level 0.2
   df_horizon <- performance_metrics(df_cv, rolling_window = 0.2)
   expect_equal(length(unique(df_horizon$horizon)), 4)
-  expect_equal(nrow(df_horizon), 13)
+  expect_equal(nrow(df_horizon), 14)
   # Aggregation level all
   df_all <- performance_metrics(df_cv, rolling_window = 1)
   expect_equal(nrow(df_all), 1)

+ 40 - 65
python/fbprophet/diagnostics.py

@@ -21,18 +21,15 @@ import pandas as pd
 logger = logging.getLogger(__name__)
 
 
-def _cutoffs(df, horizon, k, period):
+def generate_cutoffs(df, horizon, initial, period):
     """Generate cutoff dates
 
     Parameters
     ----------
-    df: pd.DataFrame with historical data
-    horizon: pd.Timedelta.
-        Forecast horizon
-    k: Int number.
-        The number of forecasts point.
-    period: pd.Timedelta.
-        Simulated Forecast will be done at every this period.
+    df: pd.DataFrame with historical data.
+    horizon: pd.Timedelta forecast horizon.
+    initial: pd.Timedelta window of the initial forecast period.
+    period: pd.Timedelta simulated forecasts are done with this period.
 
     Returns
     -------
@@ -43,56 +40,70 @@ def _cutoffs(df, horizon, k, period):
     if cutoff < df['ds'].min():
         raise ValueError('Less data than horizon.')
     result = [cutoff]
-
-    for i in range(1, k):
+    while result[-1] >= min(df['ds']) + initial:
         cutoff -= period
         # If data does not exist in data range (cutoff, cutoff + horizon]
         if not (((df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)).any()):
             # Next cutoff point is 'last date before cutoff in data - horizon'
             closest_date = df[df['ds'] <= cutoff].max()['ds']
             cutoff = closest_date - horizon
-        if cutoff < df['ds'].min():
-            logger.warning(
-                'Not enough data for requested number of cutoffs! '
-                'Using {}.'.format(i))
-            break
         result.append(cutoff)
-
-    # Sort lines in ascending order
+    result = result[:-1]
+    if len(result) == 0:
+        raise ValueError(
+            'Less data than horizon after initial window. '
+            'Make horizon or initial shorter.'
+        )
+    logger.info('Making {} forecasts with cutoffs between {} and {}'.format(
+        len(result), result[-1], result[0]
+    ))
     return reversed(result)
 
 
-def simulated_historical_forecasts(model, horizon, k, period=None):
-    """Simulated Historical Forecasts.
+def cross_validation(model, horizon, period=None, initial=None):
+    """Cross-Validation for time series.
 
-    Make forecasts from k historical cutoff points, working backwards from
-    (end - horizon) with a spacing of period between each cutoff.
+    Computes forecasts from historical cutoff points. Beginning from
+    (end - horizon), works backwards making cutoffs with a spacing of period
+    until initial is reached.
+
+    When period is equal to the time interval of the data, this is the
+    technique described in https://robjhyndman.com/hyndsight/tscv/ .
 
     Parameters
     ----------
-    model: Prophet class object.
-        Fitted Prophet model
+    model: Prophet class object. Fitted Prophet model
     horizon: string with pd.Timedelta compatible style, e.g., '5 days',
         '3 hours', '10 seconds'.
-    k: Int number of forecasts point.
-    period: Optional string with pd.Timedelta compatible style. Simulated
-        forecast will be done at every this period. If not provided,
-        0.5 * horizon is used.
+    period: string with pd.Timedelta compatible style. Spacing between
+        successive cutoff dates. If not provided, 0.5 * horizon is used.
+    initial: string with pd.Timedelta compatible style. Minimum length of the
+        first training period. If not provided, 3 * horizon is used.
 
     Returns
     -------
     A pd.DataFrame with the forecast, actual value and cutoff.
     """
     df = model.history.copy().reset_index(drop=True)
+    te = df['ds'].max()
+    ts = df['ds'].min()
     horizon = pd.Timedelta(horizon)
     period = 0.5 * horizon if period is None else pd.Timedelta(period)
-    cutoffs = _cutoffs(df, horizon, k, period)
+    initial = 3 * horizon if initial is None else pd.Timedelta(initial)
+
+    cutoffs = generate_cutoffs(df, horizon, initial, period)
     predicts = []
     for cutoff in cutoffs:
         # Generate new object with copying fitting options
         m = prophet_copy(model, cutoff)
         # Train model
-        m.fit(df[df['ds'] <= cutoff])
+        history_c = df[df['ds'] <= cutoff]
+        if history_c.shape[0] < 2:
+            raise Exception(
+                'Less than two datapoints before cutoff. '
+                'Increase initial window.'
+            )
+        m.fit(history_c)
         # Calculate yhat
         index_predicted = (df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)
         # Get the columns for the future dataframe
@@ -113,42 +124,6 @@ def simulated_historical_forecasts(model, horizon, k, period=None):
     # Combine all predicted pd.DataFrame into one pd.DataFrame
     return reduce(lambda x, y: x.append(y), predicts).reset_index(drop=True)
 
-
-def cross_validation(model, horizon, period=None, initial=None):
-    """Cross-Validation for time series.
-
-    Computes forecasts from historical cutoff points. Beginning from initial,
-    makes cutoffs with a spacing of period up to (end - horizon).
-
-    When period is equal to the time interval of the data, this is the
-    technique described in https://robjhyndman.com/hyndsight/tscv/ .
-
-    Parameters
-    ----------
-    model: Prophet class object. Fitted Prophet model
-    horizon: string with pd.Timedelta compatible style, e.g., '5 days',
-        '3 hours', '10 seconds'.
-    period: string with pd.Timedelta compatible style. Simulated forecast will
-        be done at every this period. If not provided, 0.5 * horizon is used.
-    initial: string with pd.Timedelta compatible style. The first training
-        period will begin here. If not provided, 3 * horizon is used.
-
-    Returns
-    -------
-    A pd.DataFrame with the forecast, actual value and cutoff.
-    """
-    te = model.history['ds'].max()
-    ts = model.history['ds'].min()
-    horizon = pd.Timedelta(horizon)
-    period = 0.5 * horizon if period is None else pd.Timedelta(period)
-    initial = 3 * horizon if initial is None else pd.Timedelta(initial)
-    k = int(np.ceil(((te - horizon) - (ts + initial)) / period))
-    if k < 1:
-        raise ValueError(
-            'Not enough data for specified horizon, period, and initial.')
-    return simulated_historical_forecasts(model, horizon, k, period)
-
-
 def prophet_copy(m, cutoff=None):
     """Copy Prophet object
 

+ 43 - 75
python/fbprophet/tests/test_diagnostics.py

@@ -36,95 +36,63 @@ class TestDiagnostics(TestCase):
         # Use first 100 record in data.csv
         self.__df = DATA
 
-    def test_simulated_historical_forecasts(self):
+    def test_cross_validation(self):
         m = Prophet()
         m.fit(self.__df)
-        k = 2
-        for p in [1, 10]:
-            for h in [1, 3]:
-                period = '{} days'.format(p)
-                horizon = '{} days'.format(h)
-                df_shf = diagnostics.simulated_historical_forecasts(
-                    m, horizon=horizon, k=k, period=period)
-                # All cutoff dates should be less than ds dates
-                self.assertTrue((df_shf['cutoff'] < df_shf['ds']).all())
-                # The unique size of output cutoff should be equal to 'k'
-                self.assertEqual(len(np.unique(df_shf['cutoff'])), k)
-                self.assertEqual(
-                    max(df_shf['ds'] - df_shf['cutoff']),
-                    pd.Timedelta(horizon),
-                )
-                dc = df_shf['cutoff'].diff()
-                dc = dc[dc > pd.Timedelta(0)].min()
-                self.assertTrue(dc >= pd.Timedelta(period))
-                # Each y in df_shf and self.__df with same ds should be equal
-                df_merged = pd.merge(df_shf, self.__df, 'left', on='ds')
-                self.assertAlmostEqual(
-                    np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
+        # Horizon, period, and initial window used in the assertions below
+        horizon = pd.Timedelta('4 days')
+        period = pd.Timedelta('10 days')
+        initial = pd.Timedelta('115 days')
+        df_cv = diagnostics.cross_validation(
+            m, horizon='4 days', period='10 days', initial='115 days')
+        self.assertEqual(len(np.unique(df_cv['cutoff'])), 3)
+        self.assertEqual(max(df_cv['ds'] - df_cv['cutoff']), horizon)
+        self.assertTrue(min(df_cv['cutoff']) >= min(self.__df['ds']) + initial)
+        dc = df_cv['cutoff'].diff()
+        dc = dc[dc > pd.Timedelta(0)].min()
+        self.assertTrue(dc >= period)
+        self.assertTrue((df_cv['cutoff'] < df_cv['ds']).all())
+        # Each y in df_cv and self.__df with same ds should be equal
+        df_merged = pd.merge(df_cv, self.__df, 'left', on='ds')
+        self.assertAlmostEqual(
+            np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
+        df_cv = diagnostics.cross_validation(
+            m, horizon='4 days', period='10 days', initial='135 days')
+        self.assertEqual(len(np.unique(df_cv['cutoff'])), 1)
+        with self.assertRaises(ValueError):
+            diagnostics.cross_validation(
+                m, horizon='10 days', period='10 days', initial='140 days')
 
-    def test_simulated_historical_forecasts_logistic(self):
-        m = Prophet(growth='logistic')
+    def test_cross_validation_logistic(self):
         df = self.__df.copy()
         df['cap'] = 40
-        m.fit(df)
-        df_shf = diagnostics.simulated_historical_forecasts(
-            m, horizon='3 days', k=2, period='3 days')
-        # All cutoff dates should be less than ds dates
-        self.assertTrue((df_shf['cutoff'] < df_shf['ds']).all())
-        # The unique size of output cutoff should be equal to 'k'
-        self.assertEqual(len(np.unique(df_shf['cutoff'])), 2)
-        # Each y in df_shf and self.__df with same ds should be equal
-        df_merged = pd.merge(df_shf, df, 'left', on='ds')
+        m = Prophet(growth='logistic').fit(df)
+        df_cv = diagnostics.cross_validation(
+            m, horizon='1 days', period='1 days', initial='140 days')
+        self.assertEqual(len(np.unique(df_cv['cutoff'])), 2)
+        self.assertTrue((df_cv['cutoff'] < df_cv['ds']).all())
+        df_merged = pd.merge(df_cv, self.__df, 'left', on='ds')
         self.assertAlmostEqual(
             np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
 
-    def test_simulated_historical_forecasts_extra_regressors(self):
+    def test_cross_validation_extra_regressors(self):
+        df = self.__df.copy()
+        df['extra'] = range(df.shape[0])
         m = Prophet()
         m.add_seasonality(name='monthly', period=30.5, fourier_order=5)
         m.add_regressor('extra')
-        df = self.__df.copy()
-        df['cap'] = 40
-        df['extra'] = range(df.shape[0])
         m.fit(df)
-        df_shf = diagnostics.simulated_historical_forecasts(
-            m, horizon='3 days', k=2, period='3 days')
-        # All cutoff dates should be less than ds dates
-        self.assertTrue((df_shf['cutoff'] < df_shf['ds']).all())
-        # The unique size of output cutoff should be equal to 'k'
-        self.assertEqual(len(np.unique(df_shf['cutoff'])), 2)
-        # Each y in df_shf and self.__df with same ds should be equal
-        df_merged = pd.merge(df_shf, df, 'left', on='ds')
-        self.assertAlmostEqual(
-            np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
-
-    def test_simulated_historical_forecasts_default_value_check(self):
-        m = Prophet()
-        m.fit(self.__df)
-        # Default value of period should be equal to 0.5 * horizon
-        df_shf1 = diagnostics.simulated_historical_forecasts(
-            m, horizon='10 days', k=1)
-        df_shf2 = diagnostics.simulated_historical_forecasts(
-            m, horizon='10 days', k=1, period='5 days')
-        self.assertAlmostEqual(
-            ((df_shf1['y'] - df_shf2['y']) ** 2).sum(), 0.0)
-        self.assertAlmostEqual(
-            ((df_shf1['yhat'] - df_shf2['yhat']) ** 2).sum(), 0.0)
-
-    def test_cross_validation(self):
-        m = Prophet()
-        m.fit(self.__df)
-        # Calculate the number of cutoff points(k)
-        horizon = pd.Timedelta('4 days')
-        period = pd.Timedelta('10 days')
-        k = 5
         df_cv = diagnostics.cross_validation(
-            m, horizon='4 days', period='10 days', initial='90 days')
-        # The unique size of output cutoff should be equal to 'k'
-        self.assertEqual(len(np.unique(df_cv['cutoff'])), k)
-        self.assertEqual(max(df_cv['ds'] - df_cv['cutoff']), horizon)
+            m, horizon='4 days', period='4 days', initial='135 days')
+        self.assertEqual(len(np.unique(df_cv['cutoff'])), 2)
+        period = pd.Timedelta('4 days')
         dc = df_cv['cutoff'].diff()
         dc = dc[dc > pd.Timedelta(0)].min()
         self.assertTrue(dc >= period)
+        self.assertTrue((df_cv['cutoff'] < df_cv['ds']).all())
+        df_merged = pd.merge(df_cv, self.__df, 'left', on='ds')
+        self.assertAlmostEqual(
+            np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
 
     def test_cross_validation_default_value_check(self):
         m = Prophet()
@@ -150,11 +118,11 @@ class TestDiagnostics(TestCase):
             set(df_none.columns),
             {'horizon', 'coverage', 'mae', 'mape', 'mse', 'rmse'},
         )
-        self.assertEqual(df_none.shape[0], 14)
+        self.assertEqual(df_none.shape[0], 16)
         # Aggregation level 0.2
         df_horizon = diagnostics.performance_metrics(df_cv, rolling_window=0.2)
         self.assertEqual(len(df_horizon['horizon'].unique()), 4)
-        self.assertEqual(df_horizon.shape[0], 13)
+        self.assertEqual(df_horizon.shape[0], 14)
         # Aggregation level all
         df_all = diagnostics.performance_metrics(df_cv, rolling_window=1)
         self.assertEqual(df_all.shape[0], 1)