"""
MLP Neural Network
Main neural network class integrating all framework components.
"""
import warnings
import numpy as np
from .core import _BackwardPass, _ForwardPass
from .initializers import WeightInits
from .losses import LossFunctions
from .metrics import Metrics
from .utils import Utils
class MLP:
"""
Multi-layer perceptron for quick prototyping and experimentation.
This MLP supports arbitrary layer sizes, multiple activation functions,
and modern optimization techniques. Use `compile` to set hyperparameters
and `fit` to train the model. Includes comprehensive training monitoring
and diagnostic capabilities.
Args:
layer_dims (Sequence[int]): Sizes of layers including input & output, e.g. [784, 128, 10].
hidden_activation (str, optional): Activation function name for hidden layers.
Options: "relu", "leaky_relu", "tanh", "sigmoid", "selu". Defaults to "leaky_relu".
out_activation (str, optional): Output activation function.
Options: "sigmoid" (binary), "softmax" (multiclass), None (regression). Defaults to None.
init_method (str, optional): Weight initialization strategy.
Options: "smart", "he", "xavier", "random", "selu_init". Defaults to "smart".
init_seed (int, optional): Random seed for reproducible weight initialization. Defaults to 42.
dropout_rate (float, optional): Dropout probability for hidden layers (0.0-1.0). Defaults to 0.0.
dropout_type (str, optional): Dropout variant ("normal", "alpha"). Defaults to "normal".
Attributes:
weights (list[NDArray[np.float64]]): Internal weight matrices for each layer.
biases (list[NDArray[np.float64]]): Internal bias vectors for each layer.
compiled (bool): Whether the model has been compiled for training.
Example:
>>> from neuroscope.mlp import MLP
>>> model = MLP([784, 128, 64, 10], hidden_activation="relu", out_activation="softmax")
>>> model.compile(optimizer="adam", lr=1e-3)
>>> history = model.fit(X_train, y_train, epochs=100)
>>> predictions = model.predict(X_test)
"""
def __init__(
self,
layer_dims,
hidden_activation="leaky_relu",
out_activation=None,
init_method="smart",
init_seed=42,
dropout_rate=0.0,
dropout_type="normal",
):
self.layer_dims = layer_dims
self.hidden_activation = hidden_activation
self.out_activation = out_activation
self.init_method = init_method
self.init_seed = init_seed
self.dropout_rate = dropout_rate
self.dropout_type = dropout_type
self._initialize_weights()
# Training configuration (set by compile)
self.optimizer = None
self.lr = None
self.reg = None
self.lamda = None
self.gradient_clip = None
self.compiled = False
self.adam_state = None
def _initialize_weights(self):
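"""Dispatch weight/bias initialization to the configured WeightInits strategy."""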
if self.init_method == "he":
self.weights, self.biases = WeightInits.he_init(
self.layer_dims, self.init_seed
)
elif self.init_method == "xavier":
self.weights, self.biases = WeightInits.xavier_init(
self.layer_dims, self.init_seed
)
elif self.init_method == "random":
self.weights, self.biases = WeightInits.random_init(
self.layer_dims, seed=self.init_seed
)
elif self.init_method == "selu_init":
self.weights, self.biases = WeightInits.selu_init(
self.layer_dims, self.init_seed
)
elif self.init_method == "smart":
self.weights, self.biases = WeightInits.smart_init(
self.layer_dims, self.hidden_activation, self.init_seed
)
else:
raise ValueError(f"Unknown init_method: {self.init_method}")
def reset_weights(self):
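"""Re-initialize weights and biases using the configured init_method and init_seed."""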
self._initialize_weights()
return self
def reset_optimizer(self):
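"""Reset Adam moment estimates and the timestep counter (no-op for SGD)."""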
if self.optimizer == "adam" and self.adam_state is not None:
self.adam_state = {
"m_weights": [np.zeros_like(W) for W in self.weights],
"v_weights": [np.zeros_like(W) for W in self.weights],
"m_biases": [np.zeros_like(b) for b in self.biases],
"v_biases": [np.zeros_like(b) for b in self.biases],
"beta1": 0.9,
"beta2": 0.999,
"eps": 1e-8,
"t": 0,
}
return self
def reset_all(self):
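"""Reset both weights and optimizer state; returns self for chaining."""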
self.reset_weights()
self.reset_optimizer()
return self
def compile(
self, optimizer="adam", lr=0.001, reg=None, lamda=0.01, gradient_clip=None
):
"""
Configure the model for training.
Sets up the optimizer, learning rate, regularization, and other training
hyperparameters. Must be called before training the model.
Args:
optimizer (str, optional): Optimization algorithm ("sgd", "adam"). Defaults to "adam".
lr (float, optional): Learning rate for parameter updates. Defaults to 0.001.
reg (str, optional): Regularization type ("l2", None). Defaults to None.
lamda (float, optional): Regularization strength (lambda parameter). Defaults to 0.01.
gradient_clip (float, optional): Maximum gradient norm for clipping. Defaults to None.
Raises:
ValueError: If invalid optimizer is specified.
Example:
>>> model.compile(optimizer="adam", lr=1e-3, reg="l2", lamda=0.01)
"""
if optimizer not in ("sgd", "adam"):
raise ValueError(f"Unknown optimizer: {optimizer}. Expected 'sgd' or 'adam'.")
self.optimizer = optimizer
self.lr = lr
self.reg = reg
self.lamda = lamda
self.gradient_clip = gradient_clip
self.compiled = True
# Initialize Adam state if needed
if optimizer == "adam":
self.adam_state = {
"m_weights": [np.zeros_like(W) for W in self.weights],
"v_weights": [np.zeros_like(W) for W in self.weights],
"m_biases": [np.zeros_like(b) for b in self.biases],
"v_biases": [np.zeros_like(b) for b in self.biases],
"beta1": 0.9,
"beta2": 0.999,
"eps": 1e-8,
"t": 0,
}
# Print model summary
self._print_summary()
def _print_summary(self):
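"""Print a per-layer architecture table plus the compiled training configuration."""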
print("=" * 63)
print(" MLP ARCHITECTURE SUMMARY")
print("=" * 63)
total_params = 0
print(f"{'Layer':<12} {'Type':<18} {'Output Shape':<15} {'Params':<10}")
print("-" * 63)
for i, (W, b) in enumerate(zip(self.weights, self.biases)):
if i == 0:
layer_type = "Input → Hidden"
elif i == len(self.weights) - 1:
layer_type = "Hidden → Output"
else:
layer_type = "Hidden → Hidden"
layer_params = W.size + b.size
total_params += layer_params
output_shape = f"({W.shape[1]},)"
print(
f"{f'Layer {i + 1}':<12} {layer_type:<18} {output_shape:<15} {layer_params:<10}"
)
print("-" * 63)
print(f"{'TOTAL':<47} {total_params:<10}")
print("=" * 63)
print(f"{'Hidden Activation':<47} {self.hidden_activation}")
print(f"{'Output Activation':<47} {self.out_activation or 'Linear'}")
print(f"{'Optimizer':<47} {self.optimizer.title()}") # type: ignore
print(f"{'Learning Rate':<47} {self.lr}")
if self.dropout_rate > 0:
print(f"{'Dropout':<47} {self.dropout_rate:.1%} ({self.dropout_type})")
if self.reg:
print(f"{'L2 Regularization':<47} λ = {self.lamda}")
if self.gradient_clip:
print(f"{'Gradient Clipping':<47} max_norm = {self.gradient_clip}")
print("=" * 63)
def predict(self, X):
"""
Generate predictions for input samples.
Performs forward propagation through the network without dropout
to generate predictions on new data.
Args:
X (NDArray[np.float64]): Input data of shape (N, input_dim).
Returns:
NDArray[np.float64]: Model predictions of shape (N, output_dim).
For regression: continuous values.
For binary classification: probabilities (0-1).
For multiclass: class probabilities.
Example:
>>> predictions = model.predict(X_test)
>>> binary_preds = (predictions > 0.5).astype(int) # For binary classification
"""
activations, _ = _ForwardPass.forward_mlp(
X,
self.weights,
self.biases,
self.hidden_activation,
self.out_activation,
dropout_rate=0.0,
training=False,
)
return activations[-1]
def evaluate(self, X, y, metric="smart", binary_thresh=0.5):
"""
Evaluate model performance on given data.
Computes loss and evaluation metric on the provided dataset.
Automatically selects appropriate loss function based on output activation.
Args:
X (NDArray[np.float64]): Input data of shape (N, input_dim).
y (NDArray[np.float64]): Target values of shape (N,) or (N, output_dim).
metric (str, optional): Evaluation metric ("smart", "accuracy", "mse", "rmse",
"mae", "r2", "f1", "precision", "recall"). Defaults to "smart".
binary_thresh (float, optional): Threshold for binary classification. Defaults to 0.5.
Returns:
tuple[float, float]: (loss, metric_score) where metric_score depends on the metric type.
Example:
>>> loss, accuracy = model.evaluate(X_test, y_test, metric="accuracy")
>>> print(f"Test Loss: {loss:.4f}, Accuracy: {accuracy:.2%}")
"""
return self._evaluate_mlp(
X,
y,
self.weights,
self.biases,
self.lamda,
self.reg,
self.hidden_activation,
self.out_activation,
binary_thresh,
metric,
)
def _evaluate_mlp(
self,
X,
y,
weights,
biases,
lamda,
reg=None,
hidden_activation=None,
out_activation=None,
binary_thresh=0.5,
metric="smart",
):
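"""Run a single inference pass, select the loss by output activation
(optionally with L2 regularization), and compute the requested metric."""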
activations, z_values = _ForwardPass.forward_mlp(
X,
weights,
biases,
hidden_activation,
out_activation,
dropout_rate=0.0,
training=False,
)
y_pred = activations[-1]
if reg is None:
if out_activation is None:
loss = LossFunctions.mse(y, y_pred)
elif out_activation == "sigmoid":
loss = LossFunctions.bce(y, y_pred)
elif out_activation == "softmax":
loss = LossFunctions.cce(y, y_pred)
else:
raise ValueError(f"Unknown output activation: {out_activation}")
else:
if out_activation is None:
loss = LossFunctions.mse_with_reg(y, y_pred, weights, lamda=lamda)
elif out_activation == "sigmoid":
loss = LossFunctions.bce_with_reg(y, y_pred, weights, lamda=lamda)
elif out_activation == "softmax":
loss = LossFunctions.cce_with_reg(y, y_pred, weights, lamda=lamda)
else:
raise ValueError(f"Unknown output activation: {out_activation}")
if metric == "smart":
if out_activation is None:
eval_score = LossFunctions.mse(y, y_pred)
elif out_activation == "sigmoid":
eval_score = Metrics.accuracy_binary(y, y_pred, thresh=binary_thresh)
elif out_activation == "softmax":
eval_score = Metrics.accuracy_multiclass(y, y_pred)
else:
raise ValueError(f"Unknown output activation: {out_activation}")
elif metric == "mse":
eval_score = Metrics.mse(y, y_pred)
elif metric == "accuracy":
if out_activation == "sigmoid":
eval_score = Metrics.accuracy_binary(y, y_pred, thresh=binary_thresh)
elif out_activation == "softmax":
eval_score = Metrics.accuracy_multiclass(y, y_pred)
else:
raise ValueError("Accuracy metric only valid for classification tasks")
elif metric == "rmse":
eval_score = Metrics.rmse(y, y_pred)
elif metric == "mae":
eval_score = Metrics.mae(y, y_pred)
elif metric == "r2":
eval_score = Metrics.r2_score(y, y_pred)
elif metric == "f1":
eval_score = Metrics.f1_score(y, y_pred)
elif metric == "precision":
eval_score = Metrics.precision(y, y_pred)
elif metric == "recall":
eval_score = Metrics.recall(y, y_pred)
else:
raise ValueError(f"Unknown metric: {metric}")
return loss, eval_score
def _update_parameters_sgd(self, dW, db, lr):
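"""Vanilla SGD step: subtract lr-scaled gradients from weights and biases."""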
for i in range(len(self.weights)):
self.weights[i] -= lr * dW[i]
self.biases[i] -= lr * db[i]
def _update_parameters_adam(self, dW, db, lr):
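"""Adam step with bias correction:
m_hat = m / (1 - beta1**t), v_hat = v / (1 - beta2**t),
param -= lr * m_hat / (sqrt(v_hat) + eps).
"""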
state = self.adam_state
state["t"] += 1
for i in range(len(self.weights)):
# Weight updates
state["m_weights"][i] = (
state["beta1"] * state["m_weights"][i] + (1 - state["beta1"]) * dW[i]
)
state["v_weights"][i] = state["beta2"] * state["v_weights"][i] + (
1 - state["beta2"]
) * (dW[i] ** 2)
m_hat = state["m_weights"][i] / (1 - state["beta1"] ** state["t"])
v_hat = state["v_weights"][i] / (1 - state["beta2"] ** state["t"])
self.weights[i] -= lr * m_hat / (np.sqrt(v_hat) + state["eps"])
# Bias updates
state["m_biases"][i] = (
state["beta1"] * state["m_biases"][i] + (1 - state["beta1"]) * db[i]
)
state["v_biases"][i] = state["beta2"] * state["v_biases"][i] + (
1 - state["beta2"]
) * (db[i] ** 2)
m_hat_b = state["m_biases"][i] / (1 - state["beta1"] ** state["t"])
v_hat_b = state["v_biases"][i] / (1 - state["beta2"] ** state["t"])
self.biases[i] -= lr * m_hat_b / (np.sqrt(v_hat_b) + state["eps"])
def _get_metric_display_name(self, metric):
"""Get the display name for the metric based on task type and metric parameter"""
if metric == "smart":
if self.out_activation is None:
return "MSE"
elif self.out_activation in ["sigmoid", "softmax"]:
return "Accuracy"
else:
return "Score"
elif metric == "accuracy":
return "Accuracy"
elif metric == "mse":
return "MSE"
elif metric == "rmse":
return "RMSE"
elif metric == "mae":
return "MAE"
elif metric == "r2":
return "R²"
elif metric == "f1":
return "F1"
elif metric == "precision":
return "Precision"
elif metric == "recall":
return "Recall"
else:
return metric.upper()
def fit(
self,
X_train,
y_train,
X_val=None,
y_val=None,
epochs=10,
batch_size=32,
verbose=True,
log_every=1,
early_stopping_patience=50,
lr_decay=None,
numerical_check_freq=100,
metric="smart",
reset_before_training=True,
monitor=None,
monitor_freq=1,
):
"""
Train the neural network on provided data.
Implements full training loop with support for validation, early stopping,
learning rate decay, and comprehensive monitoring. Returns detailed training
history and statistics for analysis.
Args:
X_train (NDArray[np.float64]): Training input data of shape (N, input_dim).
y_train (NDArray[np.float64]): Training targets of shape (N,) or (N, output_dim).
X_val (NDArray[np.float64], optional): Validation input data. Defaults to None.
y_val (NDArray[np.float64], optional): Validation targets. Defaults to None.
epochs (int, optional): Number of training epochs. Defaults to 10.
batch_size (int, optional): Mini-batch size. If None, uses the full training set. Defaults to 32.
verbose (bool, optional): Whether to print training progress. Defaults to True.
log_every (int, optional): Frequency of progress logging in epochs. Defaults to 1.
early_stopping_patience (int, optional): Epochs to wait for improvement before stopping.
Defaults to 50.
lr_decay (float, optional): Multiplicative learning rate decay factor; the effective rate at
epoch e is lr * lr_decay ** (e - 1). Defaults to None (no decay).
numerical_check_freq (int, optional): Frequency of numerical stability checks. Defaults to 100.
metric (str, optional): Evaluation metric for monitoring. Defaults to "smart".
reset_before_training (bool, optional): Whether to reset weights before training. Defaults to True.
monitor (TrainingMonitor, optional): Real-time training monitor. Defaults to None.
monitor_freq (int, optional): Monitoring frequency in epochs. Defaults to 1.
Returns:
dict: Comprehensive training results containing:
- weights: Final trained weight matrices
- biases: Final trained bias vectors
- history: Training/validation loss and metrics per epoch
- activations: Sample activations captured from a representative batch
- gradients: Sample gradients captured from a representative batch
- weight_stats_over_epochs: Weight statistics evolution
- activation_stats_over_epochs: Activation statistics evolution
- gradient_stats_over_epochs: Gradient statistics evolution
Raises:
ValueError: If model is not compiled or if input dimensions are incompatible.
Example:
>>> history = model.fit(X_train, y_train, X_val, y_val,
... epochs=100, batch_size=32,
... early_stopping_patience=10)
>>> print(f"Final training loss: {history['history']['train_loss'][-1]:.4f}")
"""
if not isinstance(log_every, int) or log_every < 1:
raise ValueError("log_every must be an integer >= 1")
if not isinstance(monitor_freq, int) or monitor_freq < 1:
raise ValueError("monitor_freq must be an integer >= 1")
if not isinstance(numerical_check_freq, int) or numerical_check_freq < 1:
raise ValueError("numerical_check_freq must be an integer >= 1")
if not isinstance(early_stopping_patience, int) or early_stopping_patience < 1:
raise ValueError("early_stopping_patience must be an integer >= 1")
if not isinstance(epochs, int) or epochs < 1:
raise ValueError("epochs must be an integer >= 1")
if batch_size is not None and (not isinstance(batch_size, int) or batch_size < 1):
raise ValueError("batch_size must be None or an integer >= 1")
if reset_before_training:
self.reset_all()
if not self.compiled:
raise ValueError(
"Model must be compiled before training. Call model.compile() first."
)
# Input validation
X_train = Utils.validate_array_input(X_train, "X_train", min_dims=2, max_dims=2)
y_train = Utils.validate_array_input(y_train, "y_train", min_dims=1, max_dims=2)
if X_val is not None:
X_val = Utils.validate_array_input(X_val, "X_val", min_dims=2, max_dims=2)
y_val = Utils.validate_array_input(y_val, "y_val", min_dims=1, max_dims=2)
# Set defaults
if batch_size is None:
batch_size = X_train.shape[0]
# Training history
history = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}
# Representative batch data for distribution plots (captured once, from the middle batch of the first epoch)
activations_ = None
gradients_ = None
# Initialize tracking for statistics over epochs
num_layers = len(self.weights)
# Time series statistics: dict with mean/std for each layer
weight_stats_over_epochs = {
f"layer_{i}": {"mean": [], "std": []} for i in range(num_layers)
}
activation_stats_over_epochs = {
f"layer_{i}": {"mean": [], "std": []} for i in range(num_layers)
}
gradient_stats_over_epochs = {
f"layer_{i}": {"mean": [], "std": []} for i in range(num_layers)
}
gradient_norms_over_epochs = {f"layer_{i}": [] for i in range(num_layers)}
weight_update_ratios_over_epochs = {f"layer_{i}": [] for i in range(num_layers)}
# Store representative samples from each epoch
max_samples_per_epoch = 1000
epoch_distribution_data = {
"activations": {f"layer_{i}": [] for i in range(num_layers)},
"gradients": {f"layer_{i}": [] for i in range(num_layers)},
"weights": {f"layer_{i}": [] for i in range(num_layers)},
}
# Temporary storage for batch-level data within each epoch
epoch_weights_batch_data = {f"layer_{i}": [] for i in range(num_layers)}
epoch_activations_batch_data = {f"layer_{i}": [] for i in range(num_layers)}
epoch_gradients_batch_data = {f"layer_{i}": [] for i in range(num_layers)}
epoch_gradient_norms_batch_data = {f"layer_{i}": [] for i in range(num_layers)}
epoch_weight_update_ratios_batch_data = {
f"layer_{i}": [] for i in range(num_layers)
}
best_val_loss = np.inf
patience_counter = 0
current_lr = self.lr
_BackwardPass.reset_warning_throttling()
for epoch in range(1, epochs + 1):
# Learning rate decay
if lr_decay is not None:
current_lr = self.lr * (lr_decay ** (epoch - 1))
epoch_errors = 0
numerical_issues = 0
# Variables for monitoring
monitor_activations = None
monitor_gradients = None
monitor_weight_updates = None
# Precompute monitoring stride (about 10 samples per epoch)
num_batches = int(np.ceil(X_train.shape[0] / batch_size))
monitor_stride = max(1, num_batches // 10)
# Reset epoch-level batch data collectors at start of each epoch
for layer_idx in range(num_layers):
epoch_weights_batch_data[f"layer_{layer_idx}"].clear()
epoch_activations_batch_data[f"layer_{layer_idx}"].clear()
epoch_gradients_batch_data[f"layer_{layer_idx}"].clear()
epoch_gradient_norms_batch_data[f"layer_{layer_idx}"].clear()
epoch_weight_update_ratios_batch_data[f"layer_{layer_idx}"].clear()
# Training loop
for batch_idx, (Xb, yb) in enumerate(
Utils.get_batches(X_train, y_train, batch_size, shuffle=True)
):
try:
yb = yb.reshape(-1, 1) if yb.ndim == 1 else yb
capture_monitor = bool(
monitor and (batch_idx % monitor_stride == 0)
)
# Forward pass
activations, z_values = _ForwardPass.forward_mlp(
Xb,
self.weights,
self.biases,
self.hidden_activation,
self.out_activation,
dropout_rate=self.dropout_rate,
dropout_type=self.dropout_type,
training=True,
)
# Capture a representative (middle) batch once for distribution plots
if batch_idx == num_batches // 2 and activations_ is None:
activations_ = [act.copy() for act in activations]
# For monitoring: a clean snapshot without dropout to avoid false positives
if capture_monitor:
monitor_activations, _ = _ForwardPass.forward_mlp(
Xb,
self.weights,
self.biases,
self.hidden_activation,
self.out_activation,
dropout_rate=0.0,
training=False,
)
# Numerical stability check
if batch_idx % numerical_check_freq == 0:
issues = Utils.check_numerical_stability(
activations, f"epoch_{epoch}_batch_{batch_idx}"
)
if issues:
numerical_issues += len(issues)
if numerical_issues <= 3:
warnings.warn(f"Numerical issues: {issues[0]}")
# Backward pass
dW, db = _BackwardPass.backward_mlp(
yb,
activations,
z_values,
self.weights,
self.biases,
Xb,
self.hidden_activation,
self.out_activation,
)
# Store gradients for monitoring
if capture_monitor and monitor_activations is not None:
monitor_gradients = (
dW.copy() if isinstance(dW, list) else [dW.copy()]
)
# Gradient clipping
if self.gradient_clip is not None:
all_grads = dW + db
clipped_grads = Utils.gradient_clipping(
all_grads, self.gradient_clip
)
dW = clipped_grads[: len(dW)]
db = clipped_grads[len(dW) :]
# Add L2 regularization
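# L2 penalty gradient: dW_i += (lamda / m) * W_i, consistent with a (lamda / 2m) * ||W||^2 penalty term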
if self.reg:
m = Xb.shape[0]
for i in range(len(self.weights)):
dW[i] += (self.lamda / m) * self.weights[i]
# Capture middle batch gradients
if batch_idx == num_batches // 2 and gradients_ is None:
gradients_ = [grad.copy() for grad in dW]
# Collect batch-level statistics for each layer
for layer_idx in range(num_layers):
# Weight statistics
layer_weights = self.weights[layer_idx].flatten()
epoch_weights_batch_data[f"layer_{layer_idx}"].extend(
layer_weights
)
# Activation statistics
layer_activations = activations[layer_idx].flatten()
epoch_activations_batch_data[f"layer_{layer_idx}"].extend(
layer_activations
)
# Gradient statistics
layer_gradients = dW[layer_idx].flatten()
epoch_gradients_batch_data[f"layer_{layer_idx}"].extend(
layer_gradients
)
# Gradient norm
layer_gradient_norm = np.linalg.norm(layer_gradients)
epoch_gradient_norms_batch_data[f"layer_{layer_idx}"].append(
layer_gradient_norm
)
# Parameter updates
prev_weights = None
if capture_monitor and monitor_activations is not None:
prev_weights = [W.copy() for W in self.weights]
prev_weights_ = [W.copy() for W in self.weights]
if self.optimizer == "sgd":
self._update_parameters_sgd(dW, db, current_lr)
elif self.optimizer == "adam":
self._update_parameters_adam(dW, db, current_lr)
# Compute weight update ratios (||ΔW|| / ||W||) for each layer
for layer_idx in range(num_layers):
weight_norm = np.linalg.norm(prev_weights_[layer_idx])
update_norm = np.linalg.norm(
prev_weights_[layer_idx] - self.weights[layer_idx]
)
if weight_norm > 1e-12:
update_ratio = update_norm / weight_norm
else:
update_ratio = 0.0
epoch_weight_update_ratios_batch_data[
f"layer_{layer_idx}"
].append(update_ratio)
# Compute actual weight updates for monitoring
if (
capture_monitor
and monitor_activations is not None
and prev_weights is not None
):
monitor_weight_updates = [
prev - curr
for prev, curr in zip(prev_weights, self.weights)
]
except Exception as batch_error:
epoch_errors += 1
if epoch_errors <= 3:
warnings.warn(
f"Batch {batch_idx} error: {str(batch_error)[:100]}"
)
continue
# Compute epoch-level statistics (mean,std) across all batch data
for layer_idx in range(num_layers):
layer_key = f"layer_{layer_idx}"
# Compute weight statistics (mean and std) for this layer across all batches in this epoch
if epoch_weights_batch_data[layer_key]:
weight_data = np.abs(np.array(epoch_weights_batch_data[layer_key]))
weight_stats_over_epochs[layer_key]["mean"].append(
np.mean(weight_data)
)
weight_stats_over_epochs[layer_key]["std"].append(
np.std(weight_data)
)
else:
weight_stats_over_epochs[layer_key]["mean"].append(0.0)
weight_stats_over_epochs[layer_key]["std"].append(0.0)
# Compute activation statistics (mean and std) for this layer across all batches in this epoch
if epoch_activations_batch_data[layer_key]:
activation_data = np.abs(
np.array(epoch_activations_batch_data[layer_key])
)
activation_stats_over_epochs[layer_key]["mean"].append(
np.mean(activation_data)
)
activation_stats_over_epochs[layer_key]["std"].append(
np.std(activation_data)
)
else:
activation_stats_over_epochs[layer_key]["mean"].append(0.0)
activation_stats_over_epochs[layer_key]["std"].append(0.0)
# Compute gradient statistics (mean and std) for this layer across all batches in this epoch
if epoch_gradients_batch_data[layer_key]:
gradient_data = np.abs(
np.array(epoch_gradients_batch_data[layer_key])
)
gradient_stats_over_epochs[layer_key]["mean"].append(
np.mean(gradient_data)
)
gradient_stats_over_epochs[layer_key]["std"].append(
np.std(gradient_data)
)
else:
gradient_stats_over_epochs[layer_key]["mean"].append(0.0)
gradient_stats_over_epochs[layer_key]["std"].append(0.0)
# Compute gradient norm statistics from batch-level norms for this layer
if epoch_gradient_norms_batch_data[layer_key]:
batch_norms = np.array(epoch_gradient_norms_batch_data[layer_key])
# Store mean of batch norms as the representative norm for this epoch
gradient_norms_over_epochs[layer_key].append(np.mean(batch_norms))
else:
gradient_norms_over_epochs[layer_key].append(0.0)
# Compute weight update ratio statistics from batch-level ratios for this layer
if epoch_weight_update_ratios_batch_data[layer_key]:
batch_ratios = np.array(
epoch_weight_update_ratios_batch_data[layer_key]
)
# Store mean of batch ratios as the representative ratio for this epoch
weight_update_ratios_over_epochs[layer_key].append(
np.mean(batch_ratios)
)
else:
weight_update_ratios_over_epochs[layer_key].append(0.0)
# Collect representative samples for distribution plots
if epoch_activations_batch_data[layer_key]:
activation_samples = np.array(
epoch_activations_batch_data[layer_key]
)
if len(activation_samples) > max_samples_per_epoch:
indices = np.random.choice(
len(activation_samples),
max_samples_per_epoch,
replace=False,
)
activation_samples = activation_samples[indices]
epoch_distribution_data["activations"][layer_key].append(
activation_samples
)
if epoch_gradients_batch_data[layer_key]:
gradient_samples = np.array(epoch_gradients_batch_data[layer_key])
if len(gradient_samples) > max_samples_per_epoch:
indices = np.random.choice(
len(gradient_samples), max_samples_per_epoch, replace=False
)
gradient_samples = gradient_samples[indices]
epoch_distribution_data["gradients"][layer_key].append(
gradient_samples
)
if epoch_weights_batch_data[layer_key]:
weight_samples = np.array(epoch_weights_batch_data[layer_key])
if len(weight_samples) > max_samples_per_epoch:
indices = np.random.choice(
len(weight_samples), max_samples_per_epoch, replace=False
)
weight_samples = weight_samples[indices]
epoch_distribution_data["weights"][layer_key].append(weight_samples)
# Evaluate
train_loss, train_acc = self.evaluate(X_train, y_train, metric=metric)
if X_val is not None:
val_loss, val_acc = self.evaluate(X_val, y_val, metric=metric)
else:
val_loss, val_acc = None, None
# Store history
history["train_loss"].append(train_loss)
history["train_acc"].append(train_acc)
history["val_loss"].append(val_loss)
history["val_acc"].append(val_acc)
# Real-time monitoring
if monitor and epoch % monitor_freq == 0:
try:
activ_fns = None
if monitor_activations is not None:
L = len(self.weights)
last_act = (
self.out_activation
if self.out_activation is not None
else "linear"
)
activ_fns = [self.hidden_activation] * (L - 1) + [last_act]
monitor_results = monitor.monitor_step(
epoch=epoch,
train_loss=train_loss,
val_loss=val_loss,
activations=monitor_activations,
gradients=monitor_gradients,
weights=self.weights,
weight_updates=monitor_weight_updates,
activation_functions=activ_fns,
)
monitor_output = monitor.format_monitoring_output(monitor_results)
print(f"{monitor_output}")
except Exception as monitor_error:
if verbose:
print(f"Monitor error: {str(monitor_error)[:100]}")
# Verbose
if verbose and (epoch % log_every == 0 or epoch == 1 or epoch == epochs):
metric_name = self._get_metric_display_name(metric)
lr_info = f", lr: {current_lr:.6f}" if lr_decay else ""
if X_val is not None:
print(
f"Epoch {epoch:3d} Train loss: {train_loss:.6f}, Train {metric_name}: {train_acc:.4f} "
f"Val loss: {val_loss:.7f}, Val {metric_name}: {val_acc:.5f}{lr_info}"
)
else:
print(
f"Epoch {epoch:3d} Train loss: {train_loss:.6f}, Train {metric_name}: {train_acc:.4f}{lr_info}"
)
# Early stopping
if X_val is not None and early_stopping_patience is not None:
if val_loss < best_val_loss - 1e-12:
best_val_loss = val_loss
patience_counter = 0
else:
patience_counter += 1
if patience_counter >= early_stopping_patience:
if verbose:
print(
f"Early stopping at epoch {epoch} (no improvement in {early_stopping_patience} epochs)"
)
break
results = {
"weights": self.weights,
"biases": self.biases,
"history": history,
"final_lr": current_lr,
"activations": activations_,
"gradients": gradients_,
"weight_stats_over_epochs": weight_stats_over_epochs,
"activation_stats_over_epochs": activation_stats_over_epochs,
"gradient_stats_over_epochs": gradient_stats_over_epochs,
"gradient_norms_over_epochs": gradient_norms_over_epochs,
"weight_update_ratios_over_epochs": weight_update_ratios_over_epochs,
"epoch_distributions": epoch_distribution_data,
"method": "fit",
"metric": metric,
"metric_display_name": self._get_metric_display_name(metric),
}
return results
def fit_fast(
self,
X_train,
y_train,
X_val=None,
y_val=None,
epochs=10,
batch_size=32,
verbose=True,
log_every=1,
early_stopping_patience=50,
lr_decay=None,
numerical_check_freq=100,
metric="smart",
reset_before_training=True,
eval_freq=5,
):
"""
High-performance training method for fast experimentation.
Streamlined training loop that skips the per-batch statistics collection
and monitoring that dominate the cost of fit(), while keeping the same core
training behavior and a nearly identical API.
Key Performance Optimizations:
- Eliminates expensive statistics collection (main bottleneck)
- Uses optimized batch processing with array views
- Streamlined training loop with only essential operations
- Configurable evaluation frequency to reduce overhead
Expected Performance:
- 10-100x faster than fit() method
- 60-80% less memory usage
Args:
X_train (NDArray[np.float64]): Training input data of shape (N, input_dim).
y_train (NDArray[np.float64]): Training targets of shape (N,) or (N, output_dim).
X_val (NDArray[np.float64], optional): Validation input data. Defaults to None.
y_val (NDArray[np.float64], optional): Validation targets. Defaults to None.
epochs (int, optional): Number of training epochs. Defaults to 10.
batch_size (int, optional): Mini-batch size. If None, uses the full training set. Defaults to 32.
verbose (bool, optional): Whether to print training progress. Defaults to True.
log_every (int, optional): Frequency of progress logging in epochs. Defaults to 1.
early_stopping_patience (int, optional): Epochs to wait for improvement before stopping.
Defaults to 50.
lr_decay (float, optional): Multiplicative learning rate decay factor per epoch. Defaults to None (no decay).
numerical_check_freq (int, optional): Frequency of numerical stability checks. Defaults to 100.
metric (str, optional): Evaluation metric for monitoring. Defaults to "smart".
reset_before_training (bool, optional): Whether to reset weights before training. Defaults to True.
eval_freq (int, optional): Evaluation frequency in epochs for performance. Defaults to 5.
Returns:
dict: Streamlined training results containing:
- weights: Final trained weight matrices
- biases: Final trained bias vectors
- history: Training/validation loss and metrics per epoch
- performance_stats: Training time and speed metrics
Raises:
ValueError: If model is not compiled or if input dimensions are incompatible.
Example:
>>> # Ultra-fast training
>>> history = model.fit_fast(X_train, y_train, X_val, y_val,
... epochs=100, batch_size=256, eval_freq=5)
Note:
For research and debugging with full diagnostics, use the standard fit() method.
This method prioritizes speed over detailed monitoring capabilities.
"""
if not isinstance(log_every, int) or log_every < 1:
raise ValueError("log_every must be an integer >= 1")
if not isinstance(numerical_check_freq, int) or numerical_check_freq < 1:
raise ValueError("numerical_check_freq must be an integer >= 1")
if not isinstance(early_stopping_patience, int) or early_stopping_patience < 1:
raise ValueError("early_stopping_patience must be an integer >= 1")
if not isinstance(epochs, int) or epochs < 1:
raise ValueError("epochs must be an integer >= 1")
if batch_size is not None and (not isinstance(batch_size, int) or batch_size < 1):
raise ValueError("batch_size must be None or an integer >= 1")
if not isinstance(eval_freq, int) or eval_freq < 1:
raise ValueError("eval_freq must be an integer >= 1")
if reset_before_training:
self.reset_all()
if not self.compiled:
raise ValueError(
"Model must be compiled before training. Call model.compile() first."
)
# Fast input validation (skips expensive NaN/inf checks)
X_train = Utils.validate_array_input(
X_train, "X_train", min_dims=2, max_dims=2, fast_mode=True
)
y_train = Utils.validate_array_input(
y_train, "y_train", min_dims=1, max_dims=2, fast_mode=True
)
if X_val is not None:
X_val = Utils.validate_array_input(
X_val, "X_val", min_dims=2, max_dims=2, fast_mode=True
)
y_val = Utils.validate_array_input(
y_val, "y_val", min_dims=1, max_dims=2, fast_mode=True
)
# Set defaults
if batch_size is None:
batch_size = X_train.shape[0]
# Streamlined training history (no heavy statistics)
history = {
"train_loss": [],
"train_acc": [],
"epochs": [],
}
if X_val is not None and y_val is not None:
history["val_loss"] = []
history["val_acc"] = []
best_val_loss = np.inf
patience_counter = 0
current_lr = self.lr
_BackwardPass.reset_warning_throttling()
for epoch in range(1, epochs + 1):
# Learning rate decay
if lr_decay is not None:
current_lr = self.lr * (lr_decay ** (epoch - 1))
numerical_issues = 0
is_eval = ((epoch - 1) % eval_freq == 0) or (epoch == epochs)
# OPTIMIZED TRAINING LOOP - No statistics collection overhead
for batch_idx, (Xb, yb) in enumerate(
Utils.get_batches_fast(X_train, y_train, batch_size, shuffle=True)
):
try:
# Forward pass
activations, z_values = _ForwardPass.forward_mlp(
Xb,
self.weights,
self.biases,
self.hidden_activation,
self.out_activation,
dropout_rate=self.dropout_rate,
dropout_type=self.dropout_type,
training=True,
)
# Minimal numerical stability check (only critical issues)
if batch_idx % numerical_check_freq == 0:
if np.any(np.isnan(activations[-1])) or np.any(
np.isinf(activations[-1])
):
numerical_issues += 1
if numerical_issues <= 3:
warnings.warn(
f"Numerical instability detected at epoch {epoch}, batch {batch_idx}"
)
# Backward pass
dW, db = _BackwardPass.backward_mlp(
yb,
activations,
z_values,
self.weights,
self.biases,
Xb,
self.hidden_activation,
self.out_activation,
)
# Gradient clipping
if self.gradient_clip is not None:
all_grads = dW + db
clipped_grads = Utils.gradient_clipping(
all_grads, self.gradient_clip
)
dW = clipped_grads[: len(dW)]
db = clipped_grads[len(dW) :]
# Add L2 regularization
if self.reg:
m = Xb.shape[0]
for i in range(len(self.weights)):
dW[i] += (self.lamda / m) * self.weights[i]
# Weight updates
if self.optimizer == "sgd":
self._update_parameters_sgd(dW, db, current_lr)
elif self.optimizer == "adam":
self._update_parameters_adam(dW, db, current_lr)
except Exception as e:
warnings.warn(f"Error in batch {batch_idx}: {str(e)}")
continue
# OPTIMIZED EVALUATION
if is_eval:
history["epochs"].append(epoch)
train_loss, train_acc = self.evaluate(X_train, y_train, metric=metric)
history["train_loss"].append(train_loss)
history["train_acc"].append(train_acc)
if X_val is not None and y_val is not None:
val_loss, val_acc = self.evaluate(X_val, y_val, metric=metric)
history["val_loss"].append(val_loss)
history["val_acc"].append(val_acc)
if val_loss < best_val_loss:
best_val_loss = val_loss
patience_counter = 0
else:
patience_counter += 1
if patience_counter >= early_stopping_patience:
if verbose:
print(f"Early stopping at epoch {epoch}")
break
# Verbose logging
if verbose and epoch % log_every == 0 and is_eval:
log_msg = f"Epoch {epoch:3d}- Loss: {history['train_loss'][-1]:.6f}"
log_msg += f" - Train {self._get_metric_display_name(metric)}: {history['train_acc'][-1]:.4f}"
if X_val is not None and history.get("val_acc"):
log_msg += f" - Val {self._get_metric_display_name(metric)}: {history['val_acc'][-1]:.4f}"
print(log_msg)
return {
"weights": [w.copy() for w in self.weights],
"biases": [b.copy() for b in self.biases],
"history": history,
"final_lr": current_lr,
"method": "fit_fast",
"metric": metric,
"metric_display_name": self._get_metric_display_name(metric),
}
def fit_batch(self, X_batch, y_batch, epochs=10, verbose=True, metric="smart"):
"""
Overfit a single small batch for the given number of epochs as a sanity check.
Selects 2-8 samples from the provided batch, resets weights and optimizer state,
temporarily sets the learning rate to 0.01, and reports whether the model can
drive the loss near zero on those samples. Weights and optimizer state are reset
again afterwards and the original learning rate is restored.
Note:
The 2-8 sample range follows common "overfit a single batch" sanity checks,
e.g. Karpathy's "A Recipe for Training Neural Networks", and classic results such as
the Universal Approximation Theorem (Hornik et al., 1989) and
Empirical Risk Minimization (Vapnik, 1998).
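Example (illustrative sketch; X_small and y_small are placeholder arrays
for a binary-classification task):
>>> model = MLP([20, 16, 1], out_activation="sigmoid")
>>> model.compile(optimizer="adam", lr=1e-3)
>>> model.fit_batch(X_small, y_small, epochs=200)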
"""
if not self.compiled:
raise ValueError(
"Model must be compiled before training. Call model.compile() first."
)
original_lr = self.lr
self.reset_all()
self.lr = 0.01
# Input validation
X_batch = Utils.validate_array_input(X_batch, "X_batch", min_dims=2, max_dims=2)
y_batch = Utils.validate_array_input(y_batch, "y_batch", min_dims=1, max_dims=2)
# Smart batch selection: 2-8 samples for overfitting test
n = (
min(8, max(2, X_batch.shape[0] // 10))
if X_batch.shape[0] >= 10
else X_batch.shape[0]
)
X_batch = X_batch[:n]
y_batch = y_batch[:n]
if verbose:
initial_loss, initial_acc = self.evaluate(X_batch, y_batch)
metric_name = self._get_metric_display_name("smart")
print(f"Initial: Loss={initial_loss:.4f}, {metric_name}={initial_acc:.2%}")
# Reset warning throttling for new training session
_BackwardPass.reset_warning_throttling()
# Training loop
for epoch in range(epochs):
# Forward pass
activations, z_values = _ForwardPass.forward_mlp(
X_batch,
self.weights,
self.biases,
self.hidden_activation,
self.out_activation,
training=True,
)
# Backward pass
dW, db = _BackwardPass.backward_mlp(
y_batch,
activations,
z_values,
self.weights,
self.biases,
X_batch,
self.hidden_activation,
self.out_activation,
)
# Parameter updates using current learning rate
if self.optimizer == "adam":
self._update_parameters_adam(dW, db, self.lr)
else:
self._update_parameters_sgd(dW, db, self.lr)
# Evaluation
final_loss, final_acc = self.evaluate(X_batch, y_batch, metric=metric)
success = final_acc >= 0.99 or final_loss < 0.01
if verbose:
metric_name = self._get_metric_display_name(metric)
print(f"Final : Loss={final_loss:.4f}, {metric_name}={final_acc:.2%}")
print(f"{'OVERFITTING SUCCESS!' if success else 'OVERFITTING FAILED!'}")
self.lr = original_lr
self.reset_all()