
Refactor diagnostic metrics to allow/include grouping by horizon (Py)

Ben Letham 7 years ago
parent commit 994db64942
2 changed files with 162 additions and 54 deletions
  1. +101 -54
      python/fbprophet/diagnostics.py
  2. +61 -0
      python/fbprophet/tests/test_diagnostics.py

+ 101 - 54
python/fbprophet/diagnostics.py

@@ -196,67 +196,114 @@ def prophet_copy(m, cutoff=None):
     return m2
 
 
-def me(df):
-    return((df['yhat'] - df['y']).sum()/len(df['yhat']))
-def mse(df):
-    return((df['yhat'] - df['y']).pow(2).sum()/len(df))
-def rmse(df):
-    return(np.sqrt((df['yhat'] - df['y']).pow(2).sum()/len(df)))
-def mae(df):
-    return((df['yhat'] - df['y']).abs().sum()/len(df))
-def mpe(df):
-    return((df['yhat'] - df['y']).div(df['y']).sum()*(1/len(df)))
-def mape(df):
-    return((df['yhat'] - df['y']).div(df['y']).abs().sum()*(1/len(df)))
-
-def all_metrics(model, df_cv = None):
-    """Compute model fit metrics for time series.
-
-    Computes the following metrics about each time series that has been through 
-    Cross Validation;
-
-    Mean Error (ME)
-    Mean Squared Error (MSE)
-    Root Mean Square Error (RMSE,
-    Mean Absolute Error (MAE)
-    Mean Percentage Error (MPE)
-    Mean Absolute Percentage Error (MAPE)
+def performance_metrics(df, metrics=None, aggregation='horizon'):
+    """Compute performance metrics from cross-validation results.
+
+    Computes a suite of performance metrics on the output of cross-validation.
+    By default the following metrics are included:
+    'mse': mean squared error
+    'mae': mean absolute error
+    'mape': mean absolute percent error
+    'coverage': coverage of the upper and lower intervals
+
+    A subset of these can be specified by passing a list of names as the
+    `metrics` argument.
+
+    By default, metrics will be computed for each horizon (ds - cutoff).
+    Alternatively, metrics can be computed at the level of individual ds/cutoff
+    pairs (aggregation='none'), or aggregated over all ds/cutoffs
+    (aggregation='all').
+
+    The output is a dataframe containing the columns corresponding to the level
+    of aggregation ('horizon', 'ds' and 'cutoff', or none) along with columns
+    for each of the metrics computed.
 
     Parameters
     ----------
-    df: A pandas dataframe. Contains y and yhat produced by cross-validation
+    df: The dataframe returned by cross_validation.
+    metrics: A list of performance metrics to compute. If not provided, will
+        use ['mse', 'mae', 'mape', 'coverage'].
+    aggregation: Level of aggregation for computing performance statistics.
+        Must be 'horizon', 'none', or 'all'.
 
     Returns
     -------
-    A dictionary where the key = the error type, and value is the value of the error
+    Dataframe with a column for each metric, and a combination of columns 'ds',
+    'cutoff', and 'horizon', depending on the aggregation level.
     """
+    # Input validation
+    valid_aggregations = ['horizon', 'all', 'none']
+    if aggregation not in valid_aggregations:
+        raise ValueError(
+            'Aggregation {} is not valid; must be one of {}'.format(
+                aggregation, valid_aggregations
+            )
+        )
+    valid_metrics = ['mse', 'mae', 'mape', 'coverage']
+    if metrics is None:
+        metrics = valid_metrics
+    if len(set(metrics)) != len(metrics):
+        raise ValueError('Input metrics must be a list of unique values')
+    if not set(metrics).issubset(set(valid_metrics)):
+        raise ValueError(
+            'Valid values for metrics are: {}'.format(valid_metrics)
+        )
+    # Get function for the metrics we want
+    metric_fns = {m: eval(m) for m in metrics}
+    def all_metrics(df_g):
+        return pd.Series({name: fn(df_g) for name, fn in metric_fns.items()})
+    # Apply functions to groupby
+    if aggregation == 'all':
+        return all_metrics(df)
+    # else,
+    df_m = df.copy()
+    df_m['horizon'] = df_m['ds'] - df_m['cutoff']
+    if aggregation == 'horizon':
+        return df_m.groupby('horizon').apply(all_metrics).reset_index()
+    # else,
+    for name, fn in metric_fns.items():
+        df_m[name] = fn(df_m, agg=False)
+    return df_m
+
+
+# The functions below specify performance metrics for cross-validation results.
+# Each takes as input the output of cross_validation, and has two modes of
+# return: if agg=True, returns a float that is the metric aggregated over the
+# input. If agg=False, returns results without aggregation (for
+# aggregation='none' in performance_metrics).
+
+
+def mse(df, agg=True):
+    """Mean squared error
+    """
+    se = (df['y'] - df['yhat']) ** 2
+    if agg:
+        return np.mean(se)
+    return se
 
-    
 
-    df = []
+def mae(df, agg=True):
+    """Mean absolute error
+    """
+    ae = np.abs(df['y'] - df['yhat'])
+    if agg:
+        return np.mean(ae)
+    return ae
 
-    if df_cv is not None:
-        df = df_cv
-    else:
-        # run a forecast on your own data with period = 0 so that it is in-sample data onlyl
-        #df = model.predict(model.make_future_dataframe(periods=0))[['y', 'yhat']]
-        df = (model
-                .history[['ds', 'y']]
-                .merge(
-                    model.predict(model.make_future_dataframe(periods=0))[['ds', 'yhat']], 
-                    how='inner', on='ds'
-                    )
-                )
-
-    if 'yhat' not in df.columns:
-        raise ValueError(
-            'Please run Cross-Validation first before computing quality metrics.')
-
-    return {
-            'ME':me(df),
-            'MSE':mse(df), 
-            'RMSE': rmse(df), 
-            'MAE': mae(df), 
-            'MPE': mpe(df), 
-            'MAPE': mape(df)
-            }
+
+def mape(df, agg=True):
+    """Mean absolute percent error
+    """
+    ape = np.abs((df['y'] - df['yhat']) / df['y'])
+    if agg:
+        return np.mean(ape)
+    return ape
+
+
+def coverage(df, agg=True):
+    """Coverage
+    """
+    is_covered = (df['y'] >= df['yhat_lower']) & (df['y'] <= df['yhat_upper'])
+    if agg:
+        return np.mean(is_covered)
+    return is_covered
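
A minimal usage sketch of the new API follows. It assumes a fitted Prophet model on a synthetic daily series; the dataframe, its values, and the horizon/period/initial windows are illustrative assumptions, not part of this commit.

import pandas as pd
import numpy as np
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation, performance_metrics, mse

# Illustrative daily series with the 'ds'/'y' columns Prophet expects.
df = pd.DataFrame({
    'ds': pd.date_range('2015-01-01', periods=200, freq='D'),
    'y': np.sin(np.arange(200) / 10.) + np.random.normal(0, 0.1, 200),
})
m = Prophet().fit(df)

# cross_validation returns ds, y, yhat, yhat_lower, yhat_upper, and cutoff.
df_cv = cross_validation(m, horizon='30 days', period='30 days', initial='120 days')

# Default: one row per horizon (ds - cutoff), one column per metric.
by_horizon = performance_metrics(df_cv)

# Per-row metrics with no aggregation.
by_row = performance_metrics(df_cv, aggregation='none')

# A single aggregate over all ds/cutoff pairs, returned as a Series indexed by metric name.
overall = performance_metrics(df_cv, aggregation='all')

# A subset of the metrics can be requested explicitly.
cov_and_mse = performance_metrics(df_cv, metrics=['coverage', 'mse'])

# The metric helpers can also be called directly; agg=False skips the mean.
per_row_se = mse(df_cv, agg=False)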

+ 61 - 0
python/fbprophet/tests/test_diagnostics.py

@@ -135,3 +135,64 @@ class TestDiagnostics(TestCase):
             ((df_cv1['y'] - df_cv2['y']) ** 2).sum(), 0.0)
         self.assertAlmostEqual(
             ((df_cv1['yhat'] - df_cv2['yhat']) ** 2).sum(), 0.0)
+
+    def test_performance_metrics(self):
+        m = Prophet()
+        m.fit(self.__df)
+        df_cv = diagnostics.cross_validation(
+            m, horizon='4 days', period='10 days', initial='90 days')
+        # Aggregation level none
+        df_none = diagnostics.performance_metrics(df_cv, aggregation='none')
+        self.assertEqual(
+            set(df_none.columns),
+            {
+                'y', 'yhat', 'yhat_lower', 'yhat_upper', 'ds', 'cutoff',
+                'horizon', 'coverage', 'mae', 'mape', 'mse',
+            },
+        )
+        # Check each metric
+        self.assertEqual(
+            np.abs(df_cv['yhat'][0] - df_cv['y'][0]),
+            df_none['mae'][0],
+        )
+        self.assertEqual(
+            np.abs((df_cv['yhat'][0] - df_cv['y'][0]) / df_cv['y'][0]),
+            df_none['mape'][0],
+        )
+        self.assertEqual(
+            (df_cv['yhat'][0] - df_cv['y'][0]) ** 2,
+            df_none['mse'][0],
+        )
+        self.assertEqual(
+            (
+                (df_cv['y'][0] >= df_cv['yhat_lower'][0])
+                and (df_cv['y'][0] <= df_cv['yhat_upper'][0])
+            ),
+            df_none['coverage'][0],
+        )
+        # Aggregation level horizon (default)
+        df_horizon = diagnostics.performance_metrics(df_cv)
+        self.assertEqual(len(df_horizon['horizon'].unique()), 4)
+        self.assertEqual(
+            set(df_horizon.columns),
+            {'coverage', 'mse', 'mape', 'mae', 'horizon'},
+        )
+        self.assertEqual(df_horizon.shape[0], 4)
+        # Check aggregation
+        agg = df_none.groupby('horizon', as_index=False).agg('mean')
+        for metric in ['mse', 'mape', 'mae', 'horizon']:
+            self.assertTrue((agg[metric] == df_horizon[metric]).all())
+        # Aggregation level all
+        df_all = diagnostics.performance_metrics(df_cv, aggregation='all')
+        self.assertEqual(df_all.shape, (4,))
+        self.assertEqual(set(df_all.index), {'coverage', 'mse', 'mae', 'mape'})
+        for metric in ['mse', 'mape', 'mae', 'coverage']:
+            self.assertEqual(df_all[metric], df_all[metric].mean())
+        # Custom list of metrics
+        df_horizon = diagnostics.performance_metrics(
+            df_cv, metrics=['coverage', 'mse'],
+        )
+        self.assertEqual(
+            set(df_horizon.columns),
+            {'coverage', 'mse', 'horizon'},
+        )
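
Mirroring the aggregation check in the test above, a short sketch (reusing the illustrative df_cv from the earlier example) that verifies the horizon-level output matches a manual groupby-mean of the unaggregated output:

import numpy as np
from fbprophet.diagnostics import performance_metrics

df_none = performance_metrics(df_cv, aggregation='none')
df_horizon = performance_metrics(df_cv)  # default aggregation='horizon'

# Average the per-row metrics within each horizon and compare to the built-in grouping.
manual = df_none.groupby('horizon', as_index=False)[['mse', 'mae', 'mape', 'coverage']].mean()
assert np.allclose(manual['mse'], df_horizon['mse'])
assert np.allclose(manual['coverage'], df_horizon['coverage'])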