
Compute cross validation performance metrics on a rolling window

Ben Letham 7 years ago
parent
commit
3e59bbdc84

+ 43 - 54
python/fbprophet/diagnostics.py

@@ -196,7 +196,7 @@ def prophet_copy(m, cutoff=None):
     return m2
 
 
-def performance_metrics(df, metrics=None, aggregation='horizon'):
+def performance_metrics(df, metrics=None, rolling_window=0.05):
     """Compute performance metrics from cross-validation results.
 
     Computes a suite of performance metrics on the output of cross-validation.
@@ -209,13 +209,17 @@ def performance_metrics(df, metrics=None, aggregation='horizon'):
     A subset of these can be specified by passing a list of names as the
     `metrics` argument.
 
-    By default, metrics will be computed for each horizon (ds - cutoff).
-    Alternatively, metrics can be computed at the level of individual ds/cutoff
-    pairs (aggregation='none'), or aggregated over all ds/cutoffs
-    (aggregation='all').
-
-    The output is a dataframe containing the columns corresponding to the level
-    of aggregation ('horizon', 'ds' and 'cutoff', or none) along with columns
+    Metrics are calculated over a rolling window of cross validation
+    predictions, after sorting by horizon. The size of that window (number of
+    simulated forecast points) is determined by the rolling_window argument,
+    which specifies a proportion of simulated forecast points to include in
+    each window. rolling_window=0 will compute it separately for each simulated
+    forecast point (i.e., 'mse' will actually be squared error with no mean).
+    The default of rolling_window=0.05 will use 5% of the rows in df in each
+    window. rolling_window=1 will compute the metric across all simulated forecast
+    points. The results are set to the right edge of the window.
+
+    The output is a dataframe containing column 'horizon' along with columns
     for each of the metrics computed.
 
     Parameters
@@ -223,22 +227,13 @@ def performance_metrics(df, metrics=None, aggregation='horizon'):
     df: The dataframe returned by cross_validation.
     metrics: A list of performance metrics to compute. If not provided, will
         use ['mse', 'mae', 'mape', 'coverage'].
-    aggregation: Level of aggregation for computing performance statistics.
-        Must be 'horizon', 'none', or 'all'.
+    rolling_window: Proportion of data to use in each rolling window for
+        computing the metrics.
 
     Returns
     -------
-    Dataframe with a column for each metric, and a combination of columns 'ds',
-    'cutoff', and 'horizon', depending on the aggregation level.
+    Dataframe with a column for each metric, and column 'horizon'
     """
-    # Input validation
-    valid_aggregations = ['horizon', 'all', 'none']
-    if aggregation not in valid_aggregations:
-        raise ValueError(
-            'Aggregation {} is not valid; must be one of {}'.format(
-                aggregation, valid_agggregations
-            )
-        )
     valid_metrics = ['mse', 'mae', 'mape', 'coverage']
     if metrics is None:
         metrics = valid_metrics
@@ -248,62 +243,56 @@ def performance_metrics(df, metrics=None, aggregation='horizon'):
         raise ValueError(
             'Valid values for metrics are: {}'.format(valid_metrics)
         )
-    # Get function for the metrics we want
-    metric_fns = {m: eval(m) for m in metrics}
-    def all_metrics(df_g):
-        return pd.Series({name: fn(df_g) for name, fn in metric_fns.items()})
-    # Apply functions to groupby
-    if aggregation == 'all':
-        return all_metrics(df)
-    # else,
     df_m = df.copy()
     df_m['horizon'] = df_m['ds'] - df_m['cutoff']
-    if aggregation == 'horizon':
-        return df_m.groupby('horizon').apply(all_metrics).reset_index()
-    # else,
-    for name, fn in metric_fns.items():
-        df_m[name] = fn(df_m, agg=False)
-    return df_m
+    df_m.sort_values('horizon', inplace=True)
+    # Window size
+    w = int(rolling_window * df_m.shape[0])
+    w = max(w, 1)
+    w = min(w, df_m.shape[0])
+    cols = ['horizon']
+    for metric in metrics:
+        df_m[metric] = eval(metric)(df_m, w)
+        cols.append(metric)
+    df_m = df_m[cols]
+    return df_m.dropna()
+
+
+def rolling_mean(x, w):
+    s = np.cumsum(np.insert(x, 0, 0))
+    prefix = np.empty(w - 1)
+    prefix.fill(np.nan)
+    return np.hstack((prefix, (s[w:] - s[:-w]) / float(w)))  # right-aligned
 
 
 # The functions below specify performance metrics for cross-validation results.
-# Each takes as input the output of cross_validation, and has two modes of
-# return: if agg=True, returns a float that is the metric aggregated over the
-# input. If agg=False, returns results without aggregation (for
-# aggregation='none' in performance_metrics).
+# Each takes as input the output of cross_validation, and returns the statistic
+# as an array, given a window size for rolling aggregation.
 
 
-def mse(df, agg=True):
+def mse(df, w):
     """Mean squared error
     """
     se = (df['y'] - df['yhat']) ** 2
-    if agg:
-        return np.mean(se)
-    return se
+    return rolling_mean(se.values, w)
 
 
-def mae(df, agg=True):
+def mae(df, w):
     """Mean absolute error
     """
     ae = np.abs(df['y'] - df['yhat'])
-    if agg:
-        return np.mean(ae)
-    return ae
+    return rolling_mean(ae.values, w)
 
 
-def mape(df, agg=True):
+def mape(df, w):
     """Mean absolute percent error
     """
     ape = np.abs((df['y'] - df['yhat']) / df['y'])
-    if agg:
-        return np.mean(ape)
-    return ape
+    return rolling_mean(ape.values, w)
 
 
-def coverage(df, agg=True):
+def coverage(df, w):
     """Coverage
     """
     is_covered = (df['y'] >= df['yhat_lower']) & (df['y'] <= df['yhat_upper'])
-    if agg:
-        return np.mean(is_covered)
-    return is_covered
+    return rolling_mean(is_covered.values, w)
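
A minimal standalone sketch of the cumulative-sum rolling mean introduced above; the toy input array and window size are illustrative only and not part of this commit:

import numpy as np

def rolling_mean(x, w):
    # Prefix sums: s[i] holds the sum of the first i elements, so
    # s[i + w] - s[i] is the sum of the window x[i:i + w].
    s = np.cumsum(np.insert(x, 0, 0))
    # The first w - 1 positions have no complete window; fill them with NaN
    # so each result sits at the right edge of its window.
    prefix = np.full(w - 1, np.nan)
    return np.hstack((prefix, (s[w:] - s[:-w]) / float(w)))

print(rolling_mean(np.array([1.0, 2.0, 3.0, 4.0, 5.0]), 3))
# [nan nan  2.  3.  4.]

performance_metrics writes each window's metric value at that right edge and then drops the leading NaN rows, so with n simulated forecast points and window size w the output has n - w + 1 rows.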

+ 9 - 40
python/fbprophet/tests/test_diagnostics.py

@@ -142,52 +142,21 @@ class TestDiagnostics(TestCase):
         df_cv = diagnostics.cross_validation(
             m, horizon='4 days', period='10 days', initial='90 days')
         # Aggregation level none
-        df_none = diagnostics.performance_metrics(df_cv, aggregation='none')
+        df_none = diagnostics.performance_metrics(df_cv, rolling_window=0)
         self.assertEqual(
             set(df_none.columns),
-            {
-                'y', 'yhat', 'yhat_lower', 'yhat_upper', 'ds', 'cutoff',
-                'horizon', 'coverage', 'mae', 'mape', 'mse',
-            },
+            {'horizon', 'coverage', 'mae', 'mape', 'mse'},
         )
-        # Check each metric
-        self.assertEqual(
-            np.abs(df_cv['yhat'][0] - df_cv['y'][0]),
-            df_none['mae'][0],
-        )
-        self.assertEqual(
-            np.abs((df_cv['yhat'][0] - df_cv['y'][0]) / df_cv['y'][0]),
-            df_none['mape'][0],
-        )
-        self.assertEqual(
-            (df_cv['yhat'][0] - df_cv['y'][0]) ** 2,
-            df_none['mse'][0],
-        )
-        self.assertEqual(
-            (
-                (df_cv['y'][0] >= df_cv['yhat_lower'][0])
-                and (df_cv['y'][0] <= df_cv['yhat_upper'][0])
-            ),
-            df_none['coverage'][0],
-        )
-        # Aggregation level horizon (default)
-        df_horizon = diagnostics.performance_metrics(df_cv)
+        self.assertEqual(df_none.shape[0], 14)
+        # Aggregation level 0.2
+        df_horizon = diagnostics.performance_metrics(df_cv, rolling_window=0.2)
         self.assertEqual(len(df_horizon['horizon'].unique()), 4)
-        self.assertEqual(
-            set(df_horizon.columns),
-            {'coverage', 'mse', 'mape', 'mae', 'horizon'},
-        )
-        self.assertEqual(df_horizon.shape[0], 4)
-        # Check aggregation
-        agg = df_none.groupby('horizon', as_index=False).agg('mean')
-        for metric in ['mse', 'mape', 'mae', 'horizon']:
-            self.assertTrue((agg[metric] == df_horizon[metric]).all())
+        self.assertEqual(df_horizon.shape[0], 13)
         # Aggregation level all
-        df_all = diagnostics.performance_metrics(df_cv, aggregation='all')
-        self.assertEqual(df_all.shape, (4,))
-        self.assertEqual(set(df_all.index), {'coverage', 'mse', 'mae', 'mape'})
+        df_all = diagnostics.performance_metrics(df_cv, rolling_window=1)
+        self.assertEqual(df_all.shape[0], 1)
         for metric in ['mse', 'mape', 'mae', 'coverage']:
-            self.assertEqual(df_all[metric], df_all[metric].mean())
+            self.assertEqual(df_all[metric].values[0], df_none[metric].mean())
         # Custom list of metrics
         df_horizon = diagnostics.performance_metrics(
             df_cv, metrics=['coverage', 'mse'],
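
A quick end-to-end sketch of the relationship the updated test asserts: with rolling_window=1 the single aggregated value equals the mean of the per-point values from rolling_window=0. The hand-made df_cv below merely mimics the columns produced by cross_validation and assumes fbprophet at this revision is importable; it is not taken from the test:

import numpy as np
import pandas as pd
from fbprophet import diagnostics

# Stand-in for cross_validation output: two cutoffs, two horizons each.
df_cv = pd.DataFrame({
    'ds': pd.to_datetime(['2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05']),
    'cutoff': pd.to_datetime(['2020-01-01', '2020-01-01', '2020-01-03', '2020-01-03']),
    'y': [10.0, 11.0, 12.0, 13.0],
    'yhat': [9.5, 11.5, 12.0, 14.0],
    'yhat_lower': [9.0, 10.0, 11.0, 12.0],
    'yhat_upper': [11.0, 12.0, 13.0, 13.5],
})

df_none = diagnostics.performance_metrics(df_cv, rolling_window=0)  # one row per forecast point
df_all = diagnostics.performance_metrics(df_cv, rolling_window=1)   # one aggregated row
for metric in ['mse', 'mae', 'mape', 'coverage']:
    assert np.isclose(df_all[metric].values[0], df_none[metric].mean())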