Cluster bootstrap for accuracy

EleutherAI · norabelrose · Apr 17, 2023 · Apr 17, 2023 · Apr 17, 2023 · Apr 17, 2023
commit bc3f29c5b10183cf272d3cded771780ce9f82142
diff --git a/elk/metrics/__init__.py b/elk/metrics/__init__.py
@@ -1,8 +1,10 @@
+from .accuracy import accuracy_ci
 from .calibration import CalibrationError, CalibrationEstimate
 from .eval import EvalResult, evaluate_preds, to_one_hot
 from .roc_auc import RocAucResult, roc_auc, roc_auc_ci
 
 __all__ = [
+ "accuracy_ci",
  "CalibrationError",
  "CalibrationEstimate",
  "EvalResult",

diff --git a/elk/metrics/accuracy.py b/elk/metrics/accuracy.py
@@ -2,7 +2,6 @@
 
 import torch
 from torch import Tensor
-from torch.distributions.normal import Normal
 
 
 @dataclass(frozen=True)
@@ -18,31 +17,66 @@ class AccuracyResult:
 
 
 def accuracy_ci(
- y_true: Tensor, y_pred: Tensor, *, level: float = 0.95
+ y_true: Tensor,
+ y_pred: Tensor,
+ *,
+ num_samples: int = 1000,
+ level: float = 0.95,
+ seed: int = 42,
 ) -> AccuracyResult:
- """
- Compute the accuracy of a classifier and its confidence interval.
+ """Bootstrap confidence interval for accuracy, with optional clustering.
+
+ When the input arguments are 2D, this function performs the cluster bootstrap,
+ resampling clusters with replacement instead of individual samples. The first
+ axis is assumed to be the cluster axis.
 
  Args:
- y_true: Ground truth tensor of shape (N,).
- y_pred: Predicted class tensor of shape (N,).
+ y_true: Ground truth tensor of shape `(N,)` or `(N, cluster_size)`.
+ y_pred: Predicted class tensor of shape `(N,)` or `(N, cluster_size)`.
+ num_samples (int): Number of bootstrap samples to use.
+ level (float): Confidence level of the confidence interval.
+ seed (int): Random seed for reproducibility.
 
  Returns:
- float: Accuracy of the model.
+ AccuracyResult: Dataclass containing the point estimate, along with the
+ lower and upper bounds of the confidence interval.
  """
- # We expect the inputs to be integers
- assert not torch.is_floating_point(y_pred) and not torch.is_floating_point(y_true)
- assert y_true.shape == y_pred.shape
+ if torch.is_floating_point(y_pred) or torch.is_floating_point(y_true):
+ raise TypeError("y_true and y_pred should be integer tensors")
+ if y_true.shape != y_pred.shape:
+ raise ValueError(
+ f"y_true and y_pred should have the same shape; "
+ f"got {y_true.shape} and {y_pred.shape}"
+ )
+ if y_true.dim() not in (1, 2):
+ raise ValueError("y_true and y_pred should be 1D or 2D tensors")
 
- # Point estimate of the accuracy
- acc = y_pred.eq(y_true).float().mean()
+ # Either the number of samples (1D) or the number of clusters (2D)
+ N = y_true.shape[0]
+ device = y_true.device
 
- # Compute the CI quantiles
- alpha = (1 - level) / 2
- q = acc.new_tensor([alpha, 1 - alpha])
+ # Generate random indices for bootstrap samples (shape: [num_bootstraps, N])
+ rng = torch.Generator(device=device).manual_seed(seed)
+ indices = torch.randint(0, N, (num_samples, N), device=device, generator=rng)
 
- # Normal approximation to the binomial distribution
- stderr = (acc * (1 - acc) / len(y_true)) ** 0.5
- lower, upper = Normal(acc, stderr).icdf(q).tolist()
+ # Create bootstrap samples of true labels and predicted classes
+ y_true_bootstraps = y_true[indices]
+ y_pred_bootstraps = y_pred[indices]
+
+ # Compute accuracies for bootstrap samples. If the inputs were 2D, the
+ # bootstrapped tensors are now 3D [num_bootstraps, N, cluster_size], so we
+ # call flatten(1) to get a 2D tensor [num_bootstraps, N * cluster_size].
+ bootstrap_hits = y_true_bootstraps.flatten(1).eq(y_pred_bootstraps.flatten(1))
+ bootstrap_accs = bootstrap_hits.float().mean(1)
+
+ # Calculate the lower and upper bounds of the confidence interval. Accuracy
+ # is a mean of boolean hits, so NaNs are not expected for non-empty inputs;
+ # nanquantile is used only to mirror the implementation of roc_auc_ci.
+ alpha = (1 - level) / 2
+ q = bootstrap_accs.new_tensor([alpha, 1 - alpha])
+ lower, upper = bootstrap_accs.nanquantile(q).tolist()
 
- return AccuracyResult(acc.item(), lower, upper)
+ # Compute the point estimate. Call flatten to ensure that we get a single number
+ # computed across cluster boundaries even if the inputs were clustered.
+ estimate = y_true.flatten().eq(y_pred.flatten()).float().mean().item()
+ return AccuracyResult(estimate, lower, upper)
diff --git a/elk/metrics/eval.py b/elk/metrics/eval.py
@@ -1,7 +1,7 @@
 from dataclasses import asdict, dataclass
 
 import torch
-from einops import rearrange, repeat
+from einops import repeat
 from torch import Tensor
 
 from .accuracy import AccuracyResult, accuracy_ci
@@ -57,23 +57,22 @@ def evaluate_preds(y_true: Tensor, y_pred: Tensor) -> EvalResult:
  # Clustered bootstrap confidence intervals for AUROC
  y_true = repeat(y_true, "n -> n v", v=v)
  auroc = roc_auc_ci(to_one_hot(y_true, c).long().flatten(1), y_pred.flatten(1))
-
- y_pred = rearrange(y_pred, "n v c -> (n v) c")
- y_true = y_true.flatten()
-
  acc = accuracy_ci(y_true, y_pred.argmax(dim=-1))
+
  cal_acc = None
  cal_err = None
 
  if c == 2:
- pos_probs = y_pred[..., 1].flatten().sigmoid()
- cal_err = CalibrationError().update(y_true, pos_probs).compute()
+ pos_probs = y_pred[..., 1].sigmoid()
 
  # Calibrated accuracy
  cal_thresh = pos_probs.float().quantile(y_true.float().mean())
  cal_preds = pos_probs.gt(cal_thresh).to(torch.int)
  cal_acc = accuracy_ci(y_true, cal_preds)
 
+ cal = CalibrationError().update(y_true.flatten(), pos_probs.flatten())
+ cal_err = cal.compute()
+
  return EvalResult(acc, cal_acc, cal_err, auroc)
 
 

diff --git a/elk/metrics/roc_auc.py b/elk/metrics/roc_auc.py
@@ -123,7 +123,7 @@ def roc_auc_ci(
  # nanquantile instead of quantile because some bootstrap samples may have
  # NaN values due to the fact that they have only one class.
  alpha = (1 - level) / 2
- q = y_pred.new_tensor([alpha, 1 - alpha])
+ q = bootstrap_aucs.new_tensor([alpha, 1 - alpha])
  lower, upper = bootstrap_aucs.nanquantile(q).tolist()
 
  # Compute the point estimate. Call flatten to ensure that we get a single number

diff --git a/tests/test_roc_auc.py b/tests/test_metrics.py
@@ -1,13 +1,16 @@
+import math
+
 import numpy as np
 import torch
 from sklearn.datasets import make_classification
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import roc_auc_score
+from torch.distributions.normal import Normal
 
-from elk.metrics import roc_auc
+from elk.metrics import accuracy_ci, roc_auc
 
 
-def test_roc_auc_score():
+def test_auroc_and_acc():
  # Generate 1D binary classification dataset
  X_1d, y_true_1d = make_classification(n_samples=1000, random_state=42)
 
@@ -51,3 +54,24 @@ def test_roc_auc_score():
  # Assert that the results from the two implementations are almost equal
  np.testing.assert_almost_equal(roc_auc_1d_torch, roc_auc_1d_sklearn)
  np.testing.assert_almost_equal(roc_auc_2d_torch, roc_auc_2d_sklearn)
+
+ ### Test accuracy_ci function ###
+ # Compute accuracy confidence interval
+ level = 0.95
+ hard_preds = y_scores_1d_torch > 0.5
+ acc_ci = accuracy_ci(y_true_1d_torch, hard_preds, level=level)
+
+ # Point estimate of the accuracy
+ acc = hard_preds.eq(y_true_1d_torch).float().mean()
+
+ # Compute the CI quantiles
+ alpha = (1 - level) / 2
+ q = acc.new_tensor([alpha, 1 - alpha])
+
+ # Normal approximation to the binomial distribution
+ stderr = (acc * (1 - acc) / len(y_true_1d_torch)) ** 0.5
+ lower, upper = Normal(acc, stderr).icdf(q).tolist()
+
+ # Assert that the results from the two implementations are close
+ assert math.isclose(acc_ci.lower, lower, rel_tol=2e-3)
+ assert math.isclose(acc_ci.upper, upper, rel_tol=2e-3)