"""
Metrics Module
Comprehensive evaluation metrics for regression and classification tasks.
"""
import numpy as np
class Metrics:
"""
Comprehensive collection of evaluation metrics for neural networks.
Provides implementations of standard metrics for both regression and
classification tasks. All metrics handle edge cases and provide
meaningful results for model evaluation.
"""
@staticmethod
def accuracy_multiclass(y_true, y_pred):
"""
Compute multi-class classification accuracy.
Calculates the fraction of correctly predicted samples for multi-class
classification problems. Handles both sparse labels and one-hot encoded inputs.
Args:
y_true (NDArray[np.float64]): True class labels of shape (N,) for sparse labels
or (N, C) for one-hot encoded.
y_pred (NDArray[np.float64]): Predicted class probabilities of shape (N, C).
Returns:
float: Classification accuracy as a fraction (0.0 to 1.0).
Example:
>>> accuracy = Metrics.accuracy_multiclass(y_true, y_pred)
>>> print(f"Accuracy: {accuracy:.2%}")
"""
# y_pred: (N, C), y_true: (N,) or (N, C)
pred_classes = np.argmax(y_pred, axis=1)
if y_true.ndim == 1:
true_classes = y_true
else:
true_classes = np.argmax(y_true, axis=1)
return float(np.mean(pred_classes == true_classes))
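# Worked example for accuracy_multiclass (illustrative values, not from the module's
# test suite): three of the four argmax predictions below match the sparse labels
# [0, 2, 1, 2], so the accuracy is 0.75.
#   >>> y_true = np.array([0, 2, 1, 2])
#   >>> y_pred = np.array([[0.8, 0.1, 0.1],
#   ...                    [0.2, 0.2, 0.6],
#   ...                    [0.1, 0.7, 0.2],
#   ...                    [0.5, 0.4, 0.1]])
#   >>> Metrics.accuracy_multiclass(y_true, y_pred)
#   0.75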
@staticmethod
def accuracy_binary(y_true, y_pred, thresh=0.5):
"""
Compute binary classification accuracy.
Calculates the fraction of correctly predicted samples for binary
classification by applying a threshold to predicted probabilities.
Args:
y_true (NDArray[np.float64]): Binary labels (0/1) of shape (N,) or (N, 1).
y_pred (NDArray[np.float64]): Predicted probabilities of shape (N,) or (N, 1).
thresh (float, optional): Classification threshold. Defaults to 0.5.
Returns:
float: Binary classification accuracy as a fraction (0.0 to 1.0).
Example:
>>> accuracy = Metrics.accuracy_binary(y_true, y_pred, thresh=0.5)
>>> print(f"Binary Accuracy: {accuracy:.2%}")
"""
# y_pred, y_true: (N,) or (N, 1); y_true is reshaped to match the thresholded predictions
preds = (y_pred >= thresh).astype(int)
y_true = y_true.reshape(preds.shape)
return float(np.mean(preds == y_true))
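# Worked example for accuracy_binary (illustrative values): with the default 0.5
# threshold the probabilities below become [1, 0, 0, 1], matching the true labels
# [1, 0, 1, 0] in two of four positions.
#   >>> y_true = np.array([1, 0, 1, 0])
#   >>> y_pred = np.array([0.9, 0.4, 0.3, 0.6])
#   >>> Metrics.accuracy_binary(y_true, y_pred)
#   0.5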
@staticmethod
def mse(y_true, y_pred):
"""
Compute mean squared error metric.
Calculates the average squared differences between predicted and true values.
Commonly used metric for regression problems.
Args:
y_true (NDArray[np.float64]): Ground truth values of shape (N,) or (N, 1).
y_pred (NDArray[np.float64]): Predicted values of shape (N,) or (N, 1).
Returns:
float: Mean squared error (scalar).
Example:
>>> mse_score = Metrics.mse(y_true, y_pred)
>>> print(f"MSE: {mse_score:.4f}")
"""
y_true = np.asarray(y_true).flatten()
y_pred = np.asarray(y_pred).flatten()
return float(np.mean((y_true - y_pred) ** 2))
@staticmethod
def rmse(y_true, y_pred):
"""
Compute root mean squared error: the square root of MSE, in the same units as the target values.
"""
return float(np.sqrt(Metrics.mse(y_true, y_pred)))
@staticmethod
def mae(y_true, y_pred):
"""
Compute mean absolute error: the average absolute difference between predicted and true values.
Less sensitive to outliers than MSE.
"""
y_true = np.asarray(y_true).flatten()
y_pred = np.asarray(y_pred).flatten()
return float(np.mean(np.abs(y_true - y_pred)))
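# Worked example shared by mse, rmse and mae (illustrative values): the residuals are
# [0.5, -0.5, 0.0, -1.0], so MSE = 0.375, RMSE = sqrt(0.375) ~= 0.612 and MAE = 0.5.
#   >>> y_true = np.array([3.0, -0.5, 2.0, 7.0])
#   >>> y_pred = np.array([2.5, 0.0, 2.0, 8.0])
#   >>> Metrics.mse(y_true, y_pred), Metrics.mae(y_true, y_pred)
#   (0.375, 0.5)
#   >>> round(Metrics.rmse(y_true, y_pred), 3)
#   0.612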
@staticmethod
def r2_score(y_true, y_pred):
"""
Compute coefficient of determination (R² score).
Measures the proportion of variance in the dependent variable that is
predictable from the independent variables. R² = 1 indicates perfect fit,
R² = 0 indicates the model performs as well as predicting the mean.
Args:
y_true (NDArray[np.float64]): Ground truth values of shape (N,) or (N, 1).
y_pred (NDArray[np.float64]): Predicted values of shape (N,) or (N, 1).
Returns:
float: R² score (can be negative for very poor fits).
Example:
>>> r2 = Metrics.r2_score(y_true, y_pred)
>>> print(f"R² Score: {r2:.3f}")
"""
y_true = np.asarray(y_true).flatten()
y_pred = np.asarray(y_pred).flatten()
ss_res = np.sum((y_true - y_pred) ** 2)
ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
if ss_tot == 0:
return 1.0 if ss_res == 0 else 0.0
return float(1.0 - (ss_res / ss_tot))
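# Worked example for r2_score (illustrative values, same data as the regression sketch
# above): ss_res = 1.5 and ss_tot = 29.1875, so R² = 1 - 1.5/29.1875 ≈ 0.949.
#   >>> y_true = np.array([3.0, -0.5, 2.0, 7.0])
#   >>> y_pred = np.array([2.5, 0.0, 2.0, 8.0])
#   >>> round(Metrics.r2_score(y_true, y_pred), 3)
#   0.949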
@staticmethod
def _get_classification_data(y_true, y_pred, threshold=0.5):
"""
Convert predictions to class arrays and compute confusion matrix elements.
Returns:
tuple: (y_true_classes, y_pred_classes, num_classes, tp, fp, fn)
"""
y_true = np.asarray(y_true)
y_pred = np.asarray(y_pred)
if y_pred.ndim == 1 or (y_pred.ndim == 2 and y_pred.shape[1] == 1):
# Binary classification
y_pred_classes = (y_pred.flatten() >= threshold).astype(int)
y_true_classes = y_true.flatten().astype(int)
num_classes = 2
else:
# Multi-class classification
y_pred_classes = np.argmax(y_pred, axis=1)
if y_true.ndim == 1:
y_true_classes = y_true.astype(int)
else:
y_true_classes = np.argmax(y_true, axis=1)
num_classes = max(np.max(y_true_classes), np.max(y_pred_classes)) + 1
# Compute per-class confusion-matrix counts (TP, FP, FN)
tp = np.zeros(num_classes)
fp = np.zeros(num_classes)
fn = np.zeros(num_classes)
for i in range(num_classes):
tp[i] = np.sum((y_true_classes == i) & (y_pred_classes == i))
fp[i] = np.sum((y_true_classes != i) & (y_pred_classes == i))
fn[i] = np.sum((y_true_classes == i) & (y_pred_classes != i))
return y_true_classes, y_pred_classes, num_classes, tp, fp, fn
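# Small binary sketch of the helper's output (illustrative values): the predictions
# [0.8, 0.6, 0.2] threshold to [1, 1, 0] against true labels [1, 0, 1], giving
# per-class counts tp = [0, 1], fp = [1, 1], fn = [1, 1].
#   >>> out = Metrics._get_classification_data(np.array([1, 0, 1]), np.array([0.8, 0.6, 0.2]))
#   >>> out[2], out[3], out[4], out[5]
#   (2, array([0., 1.]), array([1., 1.]), array([1., 1.]))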
@staticmethod
def _apply_averaging(scores, y_true_classes, num_classes, average):
"""Apply averaging strategy to per-class scores."""
if average == "macro":
return float(np.mean(scores))
elif average == "weighted":
support = np.array(
[np.sum(y_true_classes == i) for i in range(num_classes)]
)
total_support = np.sum(support)
if total_support == 0:
return 0.0
weights = support / total_support
return float(np.sum(scores * weights))
else:
return scores
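# Averaging sketch (illustrative values): per-class scores [1.0, 0.5] with class
# supports [3, 1] give a macro average of 0.75 and a weighted average of 0.875.
#   >>> scores = np.array([1.0, 0.5])
#   >>> y_true_classes = np.array([0, 0, 0, 1])
#   >>> Metrics._apply_averaging(scores, y_true_classes, 2, "macro")
#   0.75
#   >>> Metrics._apply_averaging(scores, y_true_classes, 2, "weighted")
#   0.875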
@staticmethod
def precision(y_true, y_pred, average="weighted", threshold=0.5):
"""
Compute precision score: TP / (TP + FP)
Args:
y_true: True labels
y_pred: Predicted probabilities or labels
average: 'macro', 'weighted', or None for per-class scores
threshold: Decision threshold for binary classification
"""
y_true_classes, y_pred_classes, num_classes, tp, fp, fn = (
Metrics._get_classification_data(y_true, y_pred, threshold)
)
precision_scores = np.zeros(num_classes)
for i in range(num_classes):
precision_scores[i] = (
tp[i] / (tp[i] + fp[i]) if (tp[i] + fp[i]) > 0 else 0.0
)
return Metrics._apply_averaging(
precision_scores, y_true_classes, num_classes, average
)
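# Worked binary example for precision (illustrative values): thresholded predictions
# [1, 1, 0, 1, 0] against true labels [1, 0, 1, 1, 0] give per-class precision
# [0.5, 2/3]; with class supports [2, 3] the default weighted average is 0.6.
#   >>> y_true = np.array([1, 0, 1, 1, 0])
#   >>> y_pred = np.array([0.9, 0.8, 0.4, 0.7, 0.2])
#   >>> round(Metrics.precision(y_true, y_pred), 3)
#   0.6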
@staticmethod
def recall(y_true, y_pred, average="weighted", threshold=0.5):
"""
Compute recall score: TP / (TP + FN)
Args:
y_true: True labels
y_pred: Predicted probabilities or labels
average: 'macro', 'weighted', or None for per-class scores
threshold: Decision threshold for binary classification
"""
y_true_classes, y_pred_classes, num_classes, tp, fp, fn = (
Metrics._get_classification_data(y_true, y_pred, threshold)
)
recall_scores = np.zeros(num_classes)
for i in range(num_classes):
recall_scores[i] = tp[i] / (tp[i] + fn[i]) if (tp[i] + fn[i]) > 0 else 0.0
return Metrics._apply_averaging(
recall_scores, y_true_classes, num_classes, average
)
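# Worked binary example for recall (illustrative values): thresholded predictions
# [0, 1, 0, 1] against true labels [0, 1, 1, 1] give per-class recall [1.0, 2/3];
# with class supports [1, 3] the default weighted average is 0.75.
#   >>> y_true = np.array([0, 1, 1, 1])
#   >>> y_pred = np.array([0.4, 0.8, 0.3, 0.6])
#   >>> round(Metrics.recall(y_true, y_pred), 3)
#   0.75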
@staticmethod
def f1_score(y_true, y_pred, average="weighted", threshold=0.5):
"""
Compute F1 score: 2 * (Precision * Recall) / (Precision + Recall)
Args:
y_true: True labels
y_pred: Predicted probabilities or labels
average: 'macro', 'weighted', or None for per-class scores
threshold: Decision threshold for binary classification
"""
y_true_classes, y_pred_classes, num_classes, tp, fp, fn = (
Metrics._get_classification_data(y_true, y_pred, threshold)
)
f1_scores = np.zeros(num_classes)
for i in range(num_classes):
precision_i = tp[i] / (tp[i] + fp[i]) if (tp[i] + fp[i]) > 0 else 0.0
recall_i = tp[i] / (tp[i] + fn[i]) if (tp[i] + fn[i]) > 0 else 0.0
f1_scores[i] = (
2 * precision_i * recall_i / (precision_i + recall_i)
if (precision_i + recall_i) > 0
else 0.0
)
return Metrics._apply_averaging(f1_scores, y_true_classes, num_classes, average)
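# Worked multi-class example for f1_score (illustrative values): argmax predictions
# [0, 1, 1, 2] against true labels [0, 1, 2, 2] give per-class F1 [1.0, 2/3, 2/3];
# with class supports [1, 1, 2] the default weighted average is 0.75.
#   >>> y_true = np.array([0, 1, 2, 2])
#   >>> y_pred = np.array([[0.7, 0.2, 0.1],
#   ...                    [0.1, 0.8, 0.1],
#   ...                    [0.3, 0.4, 0.3],
#   ...                    [0.1, 0.2, 0.7]])
#   >>> round(Metrics.f1_score(y_true, y_pred), 3)
#   0.75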