# Source code for calibration_toolbox.metrics

"""
Calibration metrics for evaluating model uncertainty.

This module provides a comprehensive collection of binning-based calibration
metrics for classification models, centered around the General Calibration
Error (GCE) framework.
"""

import numpy as np
from scipy.special import softmax
from typing import Union, Literal


def general_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = 15,
    class_conditional: bool = False,
    adaptive_bins: bool = False,
    top_k_classes: Union[int, Literal['all']] = 1,
    norm: Union[int, Literal['inf']] = 1,
    thresholding: float = 0.0,
    logits: bool = False
) -> float:
    """
    Calculate General Calibration Error (GCE).

    The GCE is a flexible calibration metric that can be configured to
    produce many popular calibration metrics including ECE, MCE, RMSCE,
    ACE, and SCE. The class-conditional GCE with L^p norm is defined as:

        GCE = (Σ_k Σ_b (n_bk / NK) |acc(b,k) - conf(b,k)|^p)^(1/p)

    Where acc(b,k) and conf(b,k) are the accuracy and confidence of bin b
    for class label k; n_bk is the number of predictions in bin b for
    class k; N is the total number of data points; and K is the number of
    classes.

    References:
        Kull et al. (2019). "Beyond temperature scaling: Obtaining
        well-calibrated multiclass probabilities with Dirichlet
        calibration." NeurIPS.
        Nixon et al. (2020). "Measuring Calibration in Deep Learning."
        CVPR Workshops.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        n_bins: Number of bins for confidence discretization. Default: 15.
        class_conditional: If True, compute class-conditional calibration.
            Default: False.
        adaptive_bins: If True, use adaptive binning based on data
            distribution. Default: False (uniform bins).
        top_k_classes: Number of top predicted classes to consider. Use
            'all' to consider all classes. Default: 1 (top prediction only).
        norm: L^p norm to use. Can be 1, 2, or 'inf'. Default: 1.
        thresholding: Ignore probabilities below this threshold.
            Default: 0.0.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: GCE value, typically between 0 and 1 (lower is better).

    Raises:
        ValueError: If `probabilities` is not 2D, `labels` is not 1D, or
            their sample counts disagree.

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> gce = general_calibration_error(probs, labels)
        >>> print(f"GCE: {gce:.4f}")
    """
    probabilities = np.asarray(probabilities)
    labels = np.asarray(labels)

    # Validate BEFORE the logits conversion: softmax(..., axis=1) on a
    # malformed (e.g. 1D) array would otherwise raise an opaque AxisError
    # instead of the intended ValueError.
    if probabilities.ndim != 2:
        raise ValueError("probabilities must be a 2D array of shape (n_samples, n_classes)")
    if labels.ndim != 1:
        raise ValueError("labels must be a 1D array of shape (n_samples,)")
    if probabilities.shape[0] != labels.shape[0]:
        raise ValueError("Number of samples in probabilities and labels must match")

    # Convert logits to probabilities if needed.
    if logits:
        probabilities = softmax(probabilities, axis=1)

    # Apply thresholding (TACE-style): zero out probabilities below the
    # threshold; zeroed entries later fall outside every (lower, upper]
    # confidence bin.
    if thresholding > 0:
        probabilities = probabilities.copy()
        probabilities[probabilities < thresholding] = 0.0

    if not class_conditional:
        # Standard calibration: compare top-1 confidence with top-1 accuracy.
        predictions = np.argmax(probabilities, axis=1)
        confidences = np.max(probabilities, axis=1)
        accuracies = (predictions == labels).astype(float)
        gce = _compute_calibration_error(
            confidences, accuracies, n_bins, adaptive_bins, norm
        )
    elif top_k_classes == 'all':
        gce = _all_classes_error(probabilities, labels, n_bins, adaptive_bins, norm)
    else:
        gce = _top_k_error(probabilities, labels, top_k_classes, n_bins,
                           adaptive_bins, norm)

    return float(gce)


def _all_classes_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int,
    adaptive_bins: bool,
    norm: Union[int, Literal['inf']],
) -> float:
    """Mean per-class calibration error over every class (SCE/ACE style)."""
    class_errors = []
    for k in range(probabilities.shape[1]):
        class_probs = probabilities[:, k]
        # Skip classes with no positive probability mass (e.g. entirely
        # removed by thresholding): they contribute no calibration evidence.
        if not np.any(class_probs > 0):
            continue
        class_correct = (labels == k).astype(float)
        class_errors.append(_compute_calibration_error(
            class_probs, class_correct, n_bins, adaptive_bins, norm
        ))
    return np.mean(class_errors) if class_errors else 0.0


def _top_k_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    top_k_classes: int,
    n_bins: int,
    adaptive_bins: bool,
    norm: Union[int, Literal['inf']],
) -> float:
    """Mean calibration error over the top-k ranked predictions per sample."""
    n_samples, n_classes = probabilities.shape
    top_k = min(top_k_classes, n_classes)
    # Column k of `ranked` holds the class index with the (k+1)-th highest
    # probability for each sample.
    ranked = np.argsort(probabilities, axis=1)[:, ::-1]
    class_errors = []
    for k in range(top_k):
        k_idx = ranked[:, k]
        k_probs = probabilities[np.arange(n_samples), k_idx]
        k_correct = (labels == k_idx).astype(float)
        class_errors.append(_compute_calibration_error(
            k_probs, k_correct, n_bins, adaptive_bins, norm
        ))
    return np.mean(class_errors) if class_errors else 0.0
def _compute_calibration_error( confidences: np.ndarray, accuracies: np.ndarray, n_bins: int, adaptive_bins: bool, norm: Union[int, Literal['inf']] ) -> float: """ Compute calibration error for a single set of confidences and accuracies. Args: confidences: Array of confidence values. accuracies: Array of binary accuracy values. n_bins: Number of bins. adaptive_bins: Whether to use adaptive binning. norm: L^p norm to use. Returns: float: Calibration error value. """ n_samples = len(confidences) if n_samples == 0: return 0.0 # Compute bin boundaries if adaptive_bins: # Adaptive binning: equal number of samples per bin bin_n = max(1, n_samples // n_bins) sorted_indices = np.argsort(confidences) bin_boundaries = [0.0] for i in range(1, n_bins): idx = min(i * bin_n, n_samples - 1) bin_boundaries.append(confidences[sorted_indices[idx]]) bin_boundaries.append(1.0) bin_lowers = np.array(bin_boundaries[:-1]) bin_uppers = np.array(bin_boundaries[1:]) else: # Uniform binning bin_boundaries = np.linspace(0, 1, n_bins + 1) bin_lowers = bin_boundaries[:-1] bin_uppers = bin_boundaries[1:] # Compute calibration error for each bin bin_errors = [] bin_weights = [] for bin_lower, bin_upper in zip(bin_lowers, bin_uppers): # Find samples in this bin in_bin = np.logical_and( confidences > bin_lower, confidences <= bin_upper ) bin_size = np.sum(in_bin) if bin_size > 0: bin_confidence = np.mean(confidences[in_bin]) bin_accuracy = np.mean(accuracies[in_bin]) bin_error = np.abs(bin_confidence - bin_accuracy) bin_errors.append(bin_error) bin_weights.append(bin_size / n_samples) if not bin_errors: return 0.0 bin_errors = np.array(bin_errors) bin_weights = np.array(bin_weights) # Compute weighted norm if norm == 'inf': return float(np.max(bin_errors)) elif norm == 1: return float(np.sum(bin_weights * bin_errors)) elif norm == 2: return float(np.sqrt(np.sum(bin_weights * (bin_errors ** 2)))) else: # General L^p norm return float((np.sum(bin_weights * (bin_errors ** norm))) ** (1.0 / norm)) # 
Wrapper functions for common metrics
def expected_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Expected Calibration Error (ECE).

    ECE is the weighted average, over uniformly spaced confidence bins, of
    the absolute gap between accuracy and confidence:

        ECE = Σ_b (n_b / N) |acc(b) - conf(b)|

    Reference:
        Naeini et al. (2015). "Obtaining Well Calibrated Probabilities
        Using Bayesian Binning." AAAI.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: ECE value between 0 and 1 (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> ece = expected_calibration_error(probs, labels)
    """
    # ECE = GCE with uniform top-1 binning and the L1 norm.
    return general_calibration_error(
        probabilities,
        labels,
        n_bins=n_bins,
        class_conditional=False,
        adaptive_bins=False,
        top_k_classes=1,
        norm=1,
        logits=logits,
    )
def maximum_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Maximum Calibration Error (MCE).

    MCE is the worst-case (maximum) calibration gap over all bins:

        MCE = max_b |acc(b) - conf(b)|

    Reference:
        Naeini et al. (2015). "Obtaining Well Calibrated Probabilities
        Using Bayesian Binning." AAAI.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: MCE value between 0 and 1 (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> mce = maximum_calibration_error(probs, labels)
    """
    # MCE = GCE with uniform top-1 binning and the L-infinity norm.
    return general_calibration_error(
        probabilities,
        labels,
        n_bins=n_bins,
        class_conditional=False,
        adaptive_bins=False,
        top_k_classes=1,
        norm='inf',
        logits=logits,
    )
def root_mean_square_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Root Mean Square Calibration Error (RMSCE).

    RMSCE is the weighted root mean square of per-bin calibration gaps:

        RMSCE = sqrt(Σ_b (n_b / N) (acc(b) - conf(b))^2)

    Reference:
        Hendrycks et al. (2019). "Deep Anomaly Detection with Outlier
        Exposure." ICLR.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: RMSCE value between 0 and 1 (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> rmsce = root_mean_square_calibration_error(probs, labels)
    """
    # RMSCE = GCE with uniform top-1 binning and the L2 norm.
    return general_calibration_error(
        probabilities,
        labels,
        n_bins=n_bins,
        class_conditional=False,
        adaptive_bins=False,
        top_k_classes=1,
        norm=2,
        logits=logits,
    )
def static_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Static Calibration Error (SCE).

    SCE is the class-conditional calibration error with uniform binning,
    averaged across all classes:

        SCE = (1/K) Σ_k Σ_b (n_bk / N) |acc(b,k) - conf(b,k)|

    Reference:
        Nixon et al. (2020). "Measuring Calibration in Deep Learning."
        CVPR Workshops.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: SCE value (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> sce = static_calibration_error(probs, labels)
    """
    # SCE = class-conditional GCE over all classes, uniform bins, L1 norm.
    return general_calibration_error(
        probabilities,
        labels,
        n_bins=n_bins,
        class_conditional=True,
        adaptive_bins=False,
        top_k_classes='all',
        norm=1,
        logits=logits,
    )
def adaptive_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Adaptive Calibration Error (ACE).

    ACE is the class-conditional calibration error with adaptive binning
    (roughly equal numbers of samples per bin), averaged across all
    classes.

    Reference:
        Nixon et al. (2020). "Measuring Calibration in Deep Learning."
        CVPR Workshops.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: ACE value (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> ace = adaptive_calibration_error(probs, labels)
    """
    # ACE = class-conditional GCE over all classes, adaptive bins, L1 norm.
    return general_calibration_error(
        probabilities,
        labels,
        n_bins=n_bins,
        class_conditional=True,
        adaptive_bins=True,
        top_k_classes='all',
        norm=1,
        logits=logits,
    )
def top_k_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    k: int = 1,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Top-k Calibration Error.

    Computes the calibration error for each of the k highest-ranked
    predicted classes and averages the k results.

    Reference:
        Gupta et al. (2021). "Calibration of Neural Networks using
        Splines." ICLR.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        k: Number of top classes to consider. Default: 1.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: Top-k calibration error (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> top2_ce = top_k_calibration_error(probs, labels, k=2)
    """
    # Top-k CE = class-conditional GCE over the k top-ranked classes,
    # uniform bins, L1 norm.
    return general_calibration_error(
        probabilities,
        labels,
        n_bins=n_bins,
        class_conditional=True,
        adaptive_bins=False,
        top_k_classes=k,
        norm=1,
        logits=logits,
    )
def thresholded_adaptive_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    threshold: float = 0.01,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Thresholded Adaptive Calibration Error (TACE).

    TACE discards probabilities below a confidence threshold before
    computing the adaptive (equal-mass), class-conditional calibration
    error.

    Reference:
        Nixon et al. (2020). "Measuring Calibration in Deep Learning."
        CVPR Workshops.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        threshold: Confidence threshold. Predictions below this are
            ignored. Default: 0.01.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: TACE value (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> tace = thresholded_adaptive_calibration_error(probs, labels, threshold=0.01)
    """
    # TACE = ACE with sub-threshold probabilities zeroed out first.
    return general_calibration_error(
        probabilities,
        labels,
        n_bins=n_bins,
        class_conditional=True,
        adaptive_bins=True,
        top_k_classes='all',
        norm=1,
        thresholding=threshold,
        logits=logits,
    )
def overconfidence_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    n_bins: int = 15,
    logits: bool = False
) -> float:
    """
    Calculate Overconfidence Error (OE).

    OE measures the degree of overconfidence, penalizing confident but
    incorrect predictions more heavily:

        OE = Σ_b (n_b / N) * conf(b) * max(conf(b) - acc(b), 0)

    Reference:
        Thulasidasan et al. (2019). "On Mixup Training: Improved
        Calibration and Predictive Uncertainty for Deep Neural Networks."
        NeurIPS.

    Args:
        probabilities: Array of shape (n_samples, n_classes) containing
            predicted probabilities for each class.
        labels: Array of shape (n_samples,) containing true class labels.
        n_bins: Number of bins for confidence discretization. Default: 15.
        logits: If True, input is logits and will be converted to
            probabilities. Default: False.

    Returns:
        float: OE value (lower is better).

    Example:
        >>> probs = np.array([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3]])
        >>> labels = np.array([0, 1, 0])
        >>> oe = overconfidence_error(probs, labels)
    """
    if logits:
        probabilities = softmax(probabilities, axis=1)

    probabilities = np.asarray(probabilities)
    labels = np.asarray(labels)

    confidences = probabilities.max(axis=1)
    hits = (probabilities.argmax(axis=1) == labels).astype(float)
    total = len(labels)

    # Assign each sample to a half-open bin (edges[i], edges[i+1]] over
    # uniform edges on [0, 1]. With right=True, digitize returns i such
    # that edges[i-1] < x <= edges[i]; subtracting 1 gives the bin index,
    # and a confidence of exactly 0 maps to -1 (ignored below), matching
    # the (lower, upper] binning used elsewhere in this module.
    edges = np.linspace(0, 1, n_bins + 1)
    bin_ids = np.digitize(confidences, edges, right=True) - 1

    oe = 0.0
    for b in np.unique(bin_ids):
        # Skip out-of-range assignments (confidence 0 or > 1).
        if b < 0 or b >= n_bins:
            continue
        mask = bin_ids == b
        avg_conf = np.mean(confidences[mask])
        avg_acc = np.mean(hits[mask])
        # Only the overconfident direction is penalized, scaled by the
        # bin's mean confidence.
        penalty = max(avg_conf - avg_acc, 0)
        oe += (np.sum(mask) / total) * avg_conf * penalty

    return float(oe)
# Convenient aliases ECE = expected_calibration_error MCE = maximum_calibration_error RMSCE = root_mean_square_calibration_error SCE = static_calibration_error ACE = adaptive_calibration_error TACE = thresholded_adaptive_calibration_error OE = overconfidence_error GCE = general_calibration_error