# Source code for calibration_toolbox.metrics

"""
Calibration metrics for evaluating model uncertainty.

This module provides a comprehensive collection of binning-based calibration
metrics for classification models, centered around the General Calibration
Error (GCE) framework.
"""

import numpy as np
from scipy.special import softmax
from typing import Union, Literal


def general_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = 15,
    class_conditional: bool = False,
    adaptive_bins: bool = False,
    top_k_classes: Union[int, Literal['all']] = 1,
    norm: Union[int, Literal['inf']] = 1,
    thresholding: float = 0.0,
    logits: bool = False
) -> float:
    """
    Calculate General Calibration Error (GCE).

    The GCE is a flexible calibration metric that can be configured to
    produce many popular calibration metrics including ECE, MCE, RMSCE,
    ACE, and SCE. The class-conditional GCE with L^p norm is defined as:

        GCE = (Σ_k Σ_b (n_bk / NK) |acc(b,k) - conf(b,k)|^p)^(1/p)

    Where acc(b,k) and conf(b,k) are the accuracy and confidence of bin b
    for class label k; n_bk is the number of predictions in bin b for
    class k; N is the total number of data points; and K is the number of
    classes.

    References:
        Kull et al. (2019). "Beyond temperature scaling: Obtaining
        well-calibrated multiclass probabilities with Dirichlet
        calibration." NeurIPS.
        Nixon et al. (2020). "Measuring Calibration in Deep Learning."
        CVPR Workshops.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        n_bins: Number of bins for confidence discretization. Default: 15.
        class_conditional: If True, compute class-conditional calibration.
            Default: False.
        adaptive_bins: If True, use adaptive binning based on data
            distribution. Default: False (uniform bins).
        top_k_classes: Number of top predicted classes to consider. Use
            'all' to consider all classes. Default: 1 (top prediction only).
        norm: L^p norm to use. Can be 1, 2, or 'inf'. Default: 1.
        thresholding: Ignore probabilities below this threshold.
            Default: 0.0.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: GCE value, typically between 0 and 1 (lower is better).

    Raises:
        ValueError: If `probabilities` is not 2D, `labels` is not 1D, or
            their sample counts disagree.

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> gce = general_calibration_error(probs, labels)
        >>> print(f"GCE: {gce:.4f}")
    """
    probabilities = np.asarray(probabilities)
    labels = np.asarray(labels)

    # Validate BEFORE the logits conversion: softmax(..., axis=1) on a
    # malformed (e.g. 1D) array would otherwise raise an opaque AxisError
    # instead of the intended ValueError.
    if probabilities.ndim != 2:
        raise ValueError("probabilities must be a 2D array of shape (n_samples, n_classes)")
    if labels.ndim != 1:
        raise ValueError("labels must be a 1D array of shape (n_samples,)")
    if probabilities.shape[0] != labels.shape[0]:
        raise ValueError("Number of samples in probabilities and labels must match")

    # Convert logits to probabilities if needed.
    if logits:
        probabilities = softmax(probabilities, axis=1)

    # Apply thresholding (TACE-style): zero out probabilities below the
    # threshold; zeroed entries later fall outside every (lower, upper]
    # confidence bin.
    if thresholding > 0:
        probabilities = probabilities.copy()
        probabilities[probabilities < thresholding] = 0.0

    if not class_conditional:
        # Standard calibration: compare top-1 confidence with top-1 accuracy.
        predictions = np.argmax(probabilities, axis=1)
        confidences = np.max(probabilities, axis=1)
        accuracies = (predictions == labels).astype(float)
        gce = _compute_calibration_error(
            confidences, accuracies, n_bins, adaptive_bins, norm
        )
    elif top_k_classes == 'all':
        gce = _all_classes_error(probabilities, labels, n_bins, adaptive_bins, norm)
    else:
        gce = _top_k_error(probabilities, labels, top_k_classes, n_bins,
                           adaptive_bins, norm)

    return float(gce)


def _all_classes_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int,
    adaptive_bins: bool,
    norm: Union[int, Literal['inf']],
) -> float:
    """Mean per-class calibration error over every class (SCE/ACE style)."""
    class_errors = []
    for k in range(probabilities.shape[1]):
        class_probs = probabilities[:, k]
        # Skip classes with no positive probability mass (e.g. entirely
        # removed by thresholding): they contribute no calibration evidence.
        if not np.any(class_probs > 0):
            continue
        class_correct = (labels == k).astype(float)
        class_errors.append(_compute_calibration_error(
            class_probs, class_correct, n_bins, adaptive_bins, norm
        ))
    return np.mean(class_errors) if class_errors else 0.0


def _top_k_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    top_k_classes: int,
    n_bins: int,
    adaptive_bins: bool,
    norm: Union[int, Literal['inf']],
) -> float:
    """Mean calibration error over the top-k ranked predictions per sample."""
    n_samples, n_classes = probabilities.shape
    top_k = min(top_k_classes, n_classes)
    # Column k of `ranked` holds the class index with the (k+1)-th highest
    # probability for each sample.
    ranked = np.argsort(probabilities, axis=1)[:, ::-1]
    class_errors = []
    for k in range(top_k):
        k_idx = ranked[:, k]
        k_probs = probabilities[np.arange(n_samples), k_idx]
        k_correct = (labels == k_idx).astype(float)
        class_errors.append(_compute_calibration_error(
            k_probs, k_correct, n_bins, adaptive_bins, norm
        ))
    return np.mean(class_errors) if class_errors else 0.0
def _compute_calibration_error( confidences: np.ndarray, accuracies: np.ndarray, n_bins: int, adaptive_bins: bool, norm: Union[int, Literal['inf']] ) -> float: """ Compute calibration error for a single set of confidences and accuracies. Args: confidences: Array of confidence values. accuracies: Array of binary accuracy values. n_bins: Number of bins. adaptive_bins: Whether to use adaptive binning. norm: L^p norm to use. Returns: float: Calibration error value. """ n_samples = len(confidences) if n_samples == 0: return 0.0 # Compute bin boundaries if adaptive_bins: # Adaptive binning: equal number of samples per bin bin_n = max(1, n_samples // n_bins) sorted_indices = np.argsort(confidences) bin_boundaries = [0.0] for i in range(1, n_bins): idx = min(i * bin_n, n_samples - 1) bin_boundaries.append(confidences[sorted_indices[idx]]) bin_boundaries.append(1.0) bin_lowers = np.array(bin_boundaries[:-1]) bin_uppers = np.array(bin_boundaries[1:]) else: # Uniform binning bin_boundaries = np.linspace(0, 1, n_bins + 1) bin_lowers = bin_boundaries[:-1] bin_uppers = bin_boundaries[1:] # Compute calibration error for each bin bin_errors = [] bin_weights = [] for bin_lower, bin_upper in zip(bin_lowers, bin_uppers): # Find samples in this bin in_bin = np.logical_and( confidences > bin_lower, confidences <= bin_upper ) bin_size = np.sum(in_bin) if bin_size > 0: bin_confidence = np.mean(confidences[in_bin]) bin_accuracy = np.mean(accuracies[in_bin]) bin_error = np.abs(bin_confidence - bin_accuracy) bin_errors.append(bin_error) bin_weights.append(bin_size / n_samples) if not bin_errors: return 0.0 bin_errors = np.array(bin_errors) bin_weights = np.array(bin_weights) # Compute weighted norm if norm == 'inf': return float(np.max(bin_errors)) elif norm == 1: return float(np.sum(bin_weights * bin_errors)) elif norm == 2: return float(np.sqrt(np.sum(bin_weights * (bin_errors ** 2)))) else: # General L^p norm return float((np.sum(bin_weights * (bin_errors ** norm))) ** (1.0 / norm)) # 
Wrapper functions for common metrics
def expected_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Expected Calibration Error (ECE).

    ECE is the weighted average, over uniformly spaced confidence bins, of
    the absolute gap between accuracy and confidence:

        ECE = Σ_b (n_b / N) |acc(b) - conf(b)|

    Reference:
        Naeini et al. (2015). "Obtaining Well Calibrated Probabilities
        Using Bayesian Binning." AAAI.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: ECE value between 0 and 1 (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> ece = expected_calibration_error(probs, labels)
    """
    # ECE = GCE with uniform top-1 binning and the L1 norm.
    return general_calibration_error(
        probabilities,
        labels,
        n_bins=n_bins,
        class_conditional=False,
        adaptive_bins=False,
        top_k_classes=1,
        norm=1,
        logits=logits,
    )
def maximum_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Maximum Calibration Error (MCE).

    MCE is the worst-case (maximum) calibration gap over all bins:

        MCE = max_b |acc(b) - conf(b)|

    Reference:
        Naeini et al. (2015). "Obtaining Well Calibrated Probabilities
        Using Bayesian Binning." AAAI.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: MCE value between 0 and 1 (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> mce = maximum_calibration_error(probs, labels)
    """
    # MCE = GCE with uniform top-1 binning and the L-infinity norm.
    return general_calibration_error(
        probabilities,
        labels,
        n_bins=n_bins,
        class_conditional=False,
        adaptive_bins=False,
        top_k_classes=1,
        norm='inf',
        logits=logits,
    )
def root_mean_square_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Root Mean Square Calibration Error (RMSCE).

    RMSCE is the weighted root mean square of per-bin calibration gaps:

        RMSCE = sqrt(Σ_b (n_b / N) (acc(b) - conf(b))^2)

    Reference:
        Hendrycks et al. (2019). "Deep Anomaly Detection with Outlier
        Exposure." ICLR.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: RMSCE value between 0 and 1 (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> rmsce = root_mean_square_calibration_error(probs, labels)
    """
    # RMSCE = GCE with uniform top-1 binning and the L2 norm.
    return general_calibration_error(
        probabilities,
        labels,
        n_bins=n_bins,
        class_conditional=False,
        adaptive_bins=False,
        top_k_classes=1,
        norm=2,
        logits=logits,
    )
def static_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Static Calibration Error (SCE).

    SCE is the class-conditional calibration error with uniform binning,
    averaged across all classes:

        SCE = (1/K) Σ_k Σ_b (n_bk / N) |acc(b,k) - conf(b,k)|

    Reference:
        Nixon et al. (2020). "Measuring Calibration in Deep Learning."
        CVPR Workshops.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: SCE value (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> sce = static_calibration_error(probs, labels)
    """
    # SCE = class-conditional GCE over all classes, uniform bins, L1 norm.
    return general_calibration_error(
        probabilities,
        labels,
        n_bins=n_bins,
        class_conditional=True,
        adaptive_bins=False,
        top_k_classes='all',
        norm=1,
        logits=logits,
    )
def adaptive_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Adaptive Calibration Error (ACE).

    ACE is the class-conditional calibration error with adaptive binning
    (roughly equal numbers of samples per bin), averaged across all
    classes.

    Reference:
        Nixon et al. (2020). "Measuring Calibration in Deep Learning."
        CVPR Workshops.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: ACE value (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> ace = adaptive_calibration_error(probs, labels)
    """
    # ACE = class-conditional GCE over all classes, adaptive bins, L1 norm.
    return general_calibration_error(
        probabilities,
        labels,
        n_bins=n_bins,
        class_conditional=True,
        adaptive_bins=True,
        top_k_classes='all',
        norm=1,
        logits=logits,
    )
def top_k_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    k: int = 1,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Top-k Calibration Error.

    Computes the calibration error for each of the k highest-ranked
    predicted classes and averages the k results.

    Reference:
        Gupta et al. (2021). "Calibration of Neural Networks using
        Splines." ICLR.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        k: Number of top classes to consider. Default: 1.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: Top-k calibration error (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> top2_ce = top_k_calibration_error(probs, labels, k=2)
    """
    # Top-k CE = class-conditional GCE over the k top-ranked classes,
    # uniform bins, L1 norm.
    return general_calibration_error(
        probabilities,
        labels,
        n_bins=n_bins,
        class_conditional=True,
        adaptive_bins=False,
        top_k_classes=k,
        norm=1,
        logits=logits,
    )
def thresholded_adaptive_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    threshold: float = 0.01,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Thresholded Adaptive Calibration Error (TACE).

    TACE discards probabilities below a confidence threshold before
    computing the adaptive (equal-mass), class-conditional calibration
    error.

    Reference:
        Nixon et al. (2020). "Measuring Calibration in Deep Learning."
        CVPR Workshops.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        threshold: Confidence threshold. Predictions below this are
            ignored. Default: 0.01.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: TACE value (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> tace = thresholded_adaptive_calibration_error(probs, labels, threshold=0.01)
    """
    # TACE = ACE with sub-threshold probabilities zeroed out first.
    return general_calibration_error(
        probabilities,
        labels,
        n_bins=n_bins,
        class_conditional=True,
        adaptive_bins=True,
        top_k_classes='all',
        norm=1,
        thresholding=threshold,
        logits=logits,
    )
def overconfidence_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Overconfidence Error (OE).

    OE measures the degree of overconfidence, penalizing confident but
    incorrect predictions more heavily:

        OE = Σ_b (n_b / N) * conf(b) * max(conf(b) - acc(b), 0)

    Reference:
        Thulasidasan et al. (2019). "On Mixup Training: Improved
        Calibration and Predictive Uncertainty for Deep Neural Networks."
        NeurIPS.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: OE value (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> oe = overconfidence_error(probs, labels)
    """
    if logits:
        probabilities = softmax(probabilities, axis=1)

    probabilities = np.asarray(probabilities)
    labels = np.asarray(labels)

    confidences = probabilities.max(axis=1)
    hits = (probabilities.argmax(axis=1) == labels).astype(float)
    total = len(labels)

    # Assign each sample to a half-open bin (edges[i], edges[i+1]] over
    # uniform edges on [0, 1]. With right=True, digitize returns i such
    # that edges[i-1] < x <= edges[i]; subtracting 1 gives the bin index,
    # and a confidence of exactly 0 maps to -1 (ignored below), matching
    # the (lower, upper] binning used elsewhere in this module.
    edges = np.linspace(0, 1, n_bins + 1)
    bin_ids = np.digitize(confidences, edges, right=True) - 1

    oe = 0.0
    for b in np.unique(bin_ids):
        # Skip out-of-range assignments (confidence 0 or > 1).
        if b < 0 or b >= n_bins:
            continue
        mask = bin_ids == b
        avg_conf = np.mean(confidences[mask])
        avg_acc = np.mean(hits[mask])
        # Only the overconfident direction is penalized, scaled by the
        # bin's mean confidence.
        penalty = max(avg_conf - avg_acc, 0)
        oe += (np.sum(mask) / total) * avg_conf * penalty

    return float(oe)
# Convenient aliases ECE = expected_calibration_error MCE = maximum_calibration_error RMSCE = root_mean_square_calibration_error SCE = static_calibration_error ACE = adaptive_calibration_error TACE = thresholded_adaptive_calibration_error OE = overconfidence_error GCE = general_calibration_error