"""
Optimizers for NeuroScope MLP.
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, List
import numpy as np
class Optimizer(ABC):
"""
Base class for all optimizers.
Provides a common interface for parameter updates and state management.
"""
def __init__(self, learning_rate: float = 0.01):
"""
Initialize optimizer.
Args:
learning_rate: Step size for parameter updates
"""
if learning_rate <= 0:
raise ValueError(f"Learning rate must be positive, got {learning_rate}")
self.learning_rate = learning_rate
self._state: Dict[str, Any] = {}
@abstractmethod
def update(
self,
weights: List[np.ndarray],
biases: List[np.ndarray],
weight_grads: List[np.ndarray],
bias_grads: List[np.ndarray],
) -> None:
"""
Update parameters in place using the supplied gradients.
Args:
weights: List of weight matrices (one per layer)
biases: List of bias vectors (one per layer)
weight_grads: Gradients for weights
bias_grads: Gradients for biases
"""
pass
def state_dict(self) -> Dict[str, Any]:
"""
Get optimizer state for checkpointing.
Returns:
Dictionary containing optimizer configuration and state.
Mutable buffers are returned by reference, not copied.
"""
return {
"type": self.__class__.__name__,
"learning_rate": self.learning_rate,
"state": self._state,
}
def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
"""
Load optimizer state from checkpoint.
Args:
state_dict: State dictionary from state_dict()
"""
self.learning_rate = state_dict["learning_rate"]
self._state = state_dict["state"]
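# Checkpointing sketch (illustrative; uses the SGD subclass defined below):
#
#   >>> opt = SGD(learning_rate=0.01)
#   >>> ckpt = opt.state_dict()  # serialize however you like, e.g. pickle
#   >>> restored = SGD()
#   >>> restored.load_state_dict(ckpt)
#   >>> restored.learning_rate
#   0.01
#
# Because state_dict() hands back the live buffers, deep-copy the result if
# the optimizer keeps training after the snapshot is taken.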
class SGD(Optimizer):
"""
Stochastic Gradient Descent optimizer.
Implements basic gradient descent with a fixed learning rate:
θ_t = θ_{t-1} - α * ∇L(θ_{t-1})
Args:
learning_rate: Learning rate (step size), default: 0.01
References:
Robbins & Monro (1951). "A Stochastic Approximation Method."
Annals of Mathematical Statistics.
Example:
>>> from neuroscope import MLP
>>> model = MLP([10, 20, 5])
>>> model.compile(optimizer="sgd", lr=0.01)
>>> history = model.fit(X_train, y_train, epochs=100)
"""
def __init__(self, learning_rate: float = 0.01):
super().__init__(learning_rate)
def update(
self,
weights: List[np.ndarray],
biases: List[np.ndarray],
weight_grads: List[np.ndarray],
bias_grads: List[np.ndarray],
) -> None:
"""Apply gradient descent update."""
for i in range(len(weights)):
weights[i] -= self.learning_rate * weight_grads[i]
biases[i] -= self.learning_rate * bias_grads[i]
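# Worked example (sketch, toy shapes): with lr=0.1 and a gradient of 0.5
# everywhere, every weight moves from 1.0 to 1.0 - 0.1 * 0.5 = 0.95.
#
#   >>> opt = SGD(learning_rate=0.1)
#   >>> w, b = [np.ones((2, 2))], [np.zeros(2)]
#   >>> opt.update(w, b, [np.full((2, 2), 0.5)], [np.full(2, 0.5)])
#   >>> w[0]
#   array([[0.95, 0.95],
#          [0.95, 0.95]])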
class SGDMomentum(Optimizer):
"""
SGD with Momentum optimizer.
Implements momentum-accelerated gradient descent. Momentum accumulates an
exponentially decaying sum of past gradients, which speeds convergence along
consistent descent directions and damps oscillation across them.
Standard Momentum (Polyak, 1964):
v_t = μ * v_{t-1} + ∇L(θ_{t-1})
θ_t = θ_{t-1} - α * v_t
Nesterov Momentum (Nesterov, 1983):
v_t = μ * v_{t-1} + ∇L(θ_{t-1})
θ_t = θ_{t-1} - α * (μ * v_t + ∇L(θ_{t-1}))
Args:
learning_rate: Learning rate (step size), default: 0.01
momentum: Momentum coefficient μ ∈ [0, 1), default: 0.9
nesterov: Enable Nesterov accelerated gradient, default: False
References:
- Polyak, B. T. (1964). "Some methods of speeding up the convergence
of iteration methods." USSR Computational Mathematics and
Mathematical Physics, 4(5), 1-17.
- Sutskever, I., Martens, J., Dahl, G., & Hinton, G. (2013).
"On the importance of initialization and momentum in deep learning."
ICML 2013.
- Nesterov, Y. (1983). "A method for unconstrained convex minimization
problem with the rate of convergence O(1/k^2)." Doklady AN SSSR, 269, 543-547.
Example:
>>> from neuroscope import MLP
>>> model = MLP([784, 128, 64, 10])
>>> # Standard momentum
>>> model.compile(optimizer="sgdm", lr=0.01)
>>> history = model.fit(X_train, y_train, epochs=100)
>>> # Nesterov momentum (recommended for deep networks)
>>> model.compile(optimizer="sgdnm", lr=0.01)
>>> history = model.fit(X_train, y_train, epochs=100)
Notes:
- Typical momentum values: 0.9 (default) or 0.95 (aggressive)
- Nesterov momentum often converges faster than standard momentum
- Momentum can help escape shallow local minima and traverse flat regions
"""
def __init__(
self, learning_rate: float = 0.01, momentum: float = 0.9, nesterov: bool = False
):
super().__init__(learning_rate)
# Validate momentum
if not 0 <= momentum < 1:
raise ValueError(f"Momentum must be in [0, 1), got {momentum}")
self.momentum = momentum
self.nesterov = nesterov
# Initialize velocity buffers (created on first update)
self._state = {"velocity_w": [], "velocity_b": [], "initialized": False}
def update(
self,
weights: List[np.ndarray],
biases: List[np.ndarray],
weight_grads: List[np.ndarray],
bias_grads: List[np.ndarray],
) -> None:
"""
Apply momentum-accelerated gradient update.
Implements the momentum update rule from Polyak (1964) with
optional Nesterov acceleration from Nesterov (1983).
"""
# Initialize velocity buffers on first call
if not self._state["initialized"]:
self._state["velocity_w"] = [np.zeros_like(w) for w in weights]
self._state["velocity_b"] = [np.zeros_like(b) for b in biases]
self._state["initialized"] = True
velocity_w = self._state["velocity_w"]
velocity_b = self._state["velocity_b"]
# Update each layer
for i in range(len(weights)):
# --- Weight updates ---
# Momentum update: v_t = μ * v_{t-1} + g_t
velocity_w[i] = self.momentum * velocity_w[i] + weight_grads[i]
if self.nesterov:
# Nesterov: θ_t = θ_{t-1} - α * (μ * v_t + g_t)
update_w = self.momentum * velocity_w[i] + weight_grads[i]
else:
# Standard: θ_t = θ_{t-1} - α * v_t
update_w = velocity_w[i]
weights[i] -= self.learning_rate * update_w
# --- Bias updates (same logic) ---
velocity_b[i] = self.momentum * velocity_b[i] + bias_grads[i]
if self.nesterov:
update_b = self.momentum * velocity_b[i] + bias_grads[i]
else:
update_b = velocity_b[i]
biases[i] -= self.learning_rate * update_b
def state_dict(self) -> Dict[str, Any]:
"""Get optimizer state including velocity buffers."""
return {
"type": self.__class__.__name__,
"learning_rate": self.learning_rate,
"momentum": self.momentum,
"nesterov": self.nesterov,
"state": self._state,
}
def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
"""Load optimizer state including velocity buffers."""
self.learning_rate = state_dict["learning_rate"]
self.momentum = state_dict["momentum"]
self.nesterov = state_dict["nesterov"]
self._state = state_dict["state"]
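# Worked example (sketch): with μ=0.9 and a constant unit gradient, the
# velocity compounds across steps: v_1 = 1.0, v_2 = 0.9 * 1.0 + 1.0 = 1.9,
# so the second step is nearly twice as large as the first.
#
#   >>> opt = SGDMomentum(learning_rate=0.1, momentum=0.9)
#   >>> w, b = [np.zeros((1, 1))], [np.zeros(1)]
#   >>> gw, gb = [np.ones((1, 1))], [np.ones(1)]
#   >>> opt.update(w, b, gw, gb)
#   >>> bool(np.isclose(w[0][0, 0], -0.1))   # -lr * v_1
#   True
#   >>> opt.update(w, b, gw, gb)
#   >>> bool(np.isclose(w[0][0, 0], -0.29))  # -0.1 - lr * v_2
#   True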
class Adam(Optimizer):
"""
Adam (Adaptive Moment Estimation) optimizer.
Combines momentum with adaptive learning rates. Maintains exponential moving
averages of gradients (first moment) and squared gradients (second moment).
Includes bias correction to compensate for the moment estimates' initialization at zero.
Algorithm (Kingma & Ba, 2014):
m_t = β₁ * m_{t-1} + (1 - β₁) * g_t [First moment estimate]
v_t = β₂ * v_{t-1} + (1 - β₂) * g_t² [Second moment estimate]
m̂_t = m_t / (1 - β₁ᵗ) [Bias-corrected first moment]
v̂_t = v_t / (1 - β₂ᵗ) [Bias-corrected second moment]
θ_t = θ_{t-1} - α * m̂_t / (√v̂_t + ε) [Parameter update]
Args:
learning_rate: Learning rate α, default: 0.001
beta1: First moment decay rate β₁ ∈ [0, 1), default: 0.9
beta2: Second moment decay rate β₂ ∈ [0, 1), default: 0.999
eps: Numerical stability constant ε, default: 1e-8
References:
Kingma, D. P., & Ba, J. (2014). "Adam: A Method for Stochastic
Optimization." arXiv preprint arXiv:1412.6980.
Example:
>>> from neuroscope import MLP
>>> model = MLP([784, 128, 64, 10])
>>> # Standard Adam (recommended default)
>>> model.compile(optimizer="adam", lr=0.001)
>>> history = model.fit(X_train, y_train, epochs=100)
>>> # Higher learning rate for faster convergence
>>> model.compile(optimizer="adam", lr=0.01)
>>> history = model.fit(X_train, y_train, epochs=100)
Notes:
- Default hyperparameters work well for most problems
- Adam is particularly effective for sparse gradients and noisy data
- Less sensitive to the choice of learning rate than SGD
- Memory overhead: 2x parameters (stores m and v)
"""
def __init__(
self,
learning_rate: float = 0.001,
beta1: float = 0.9,
beta2: float = 0.999,
eps: float = 1e-8,
):
super().__init__(learning_rate)
# Validate hyperparameters
if not 0 <= beta1 < 1:
raise ValueError(f"beta1 must be in [0, 1), got {beta1}")
if not 0 <= beta2 < 1:
raise ValueError(f"beta2 must be in [0, 1), got {beta2}")
if eps <= 0:
raise ValueError(f"eps must be positive, got {eps}")
self.beta1 = beta1
self.beta2 = beta2
self.eps = eps
# Initialize state (created on first update)
self._state = {
"m_weights": [],
"v_weights": [],
"m_biases": [],
"v_biases": [],
"t": 0,
"initialized": False,
}
def update(
self,
weights: List[np.ndarray],
biases: List[np.ndarray],
weight_grads: List[np.ndarray],
bias_grads: List[np.ndarray],
) -> None:
"""
Apply Adam adaptive gradient update.
Implements the Adam algorithm from Kingma & Ba (2014) with
exponential moving averages and bias correction.
"""
# Initialize moment buffers on first call
if not self._state["initialized"]:
self._state["m_weights"] = [np.zeros_like(w) for w in weights]
self._state["v_weights"] = [np.zeros_like(w) for w in weights]
self._state["m_biases"] = [np.zeros_like(b) for b in biases]
self._state["v_biases"] = [np.zeros_like(b) for b in biases]
self._state["initialized"] = True
# Increment timestep
self._state["t"] += 1
t = self._state["t"]
# Retrieve moment buffers
m_w = self._state["m_weights"]
v_w = self._state["v_weights"]
m_b = self._state["m_biases"]
v_b = self._state["v_biases"]
# Update each layer
for i in range(len(weights)):
# --- Weight updates ---
# Update biased first moment: m_t = β₁ * m_{t-1} + (1 - β₁) * g_t
m_w[i] = self.beta1 * m_w[i] + (1 - self.beta1) * weight_grads[i]
# Update biased second moment: v_t = β₂ * v_{t-1} + (1 - β₂) * g_t²
v_w[i] = self.beta2 * v_w[i] + (1 - self.beta2) * (weight_grads[i] ** 2)
# Bias correction
m_hat_w = m_w[i] / (1 - self.beta1**t)
v_hat_w = v_w[i] / (1 - self.beta2**t)
# Parameter update: θ_t = θ_{t-1} - α * m̂_t / (√v̂_t + ε)
weights[i] -= self.learning_rate * m_hat_w / (np.sqrt(v_hat_w) + self.eps)
# --- Bias updates (same logic) ---
m_b[i] = self.beta1 * m_b[i] + (1 - self.beta1) * bias_grads[i]
v_b[i] = self.beta2 * v_b[i] + (1 - self.beta2) * (bias_grads[i] ** 2)
m_hat_b = m_b[i] / (1 - self.beta1**t)
v_hat_b = v_b[i] / (1 - self.beta2**t)
biases[i] -= self.learning_rate * m_hat_b / (np.sqrt(v_hat_b) + self.eps)
def state_dict(self) -> Dict[str, Any]:
"""Get optimizer state including moment estimates and timestep."""
return {
"type": self.__class__.__name__,
"learning_rate": self.learning_rate,
"beta1": self.beta1,
"beta2": self.beta2,
"eps": self.eps,
"state": self._state,
}
def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
"""Load optimizer state including moment estimates and timestep."""
self.learning_rate = state_dict["learning_rate"]
self.beta1 = state_dict["beta1"]
self.beta2 = state_dict["beta2"]
self.eps = state_dict["eps"]
self._state = state_dict["state"]
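# Worked example (sketch): thanks to bias correction, Adam's first step has
# magnitude ≈ lr regardless of the gradient's scale. At t=1, m̂ = g and
# v̂ = g², so the update is lr * g / (|g| + ε) ≈ lr * sign(g).
#
#   >>> opt = Adam(learning_rate=0.001)
#   >>> w, b = [np.zeros((1, 1))], [np.zeros(1)]
#   >>> opt.update(w, b, [np.full((1, 1), 100.0)], [np.full(1, 100.0)])
#   >>> bool(np.isclose(w[0][0, 0], -0.001))
#   True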
class RMSprop(Optimizer):
"""
RMSprop (Root Mean Square Propagation) optimizer.
Maintains a moving average of squared gradients to normalize per-parameter step sizes.
Particularly effective for non-stationary objectives and recurrent networks.
Can be seen as a precursor to Adam that uses only second-moment adaptation.
Algorithm (Hinton, 2012; Tieleman & Hinton, 2012):
E[g²]_t = ρ * E[g²]_{t-1} + (1 - ρ) * g_t² [Moving average of squared gradients]
θ_t = θ_{t-1} - α * g_t / (√E[g²]_t + ε) [Parameter update]
With momentum (optional):
v_t = μ * v_{t-1} + α * g_t / (√E[g²]_t + ε) [Momentum accumulation]
θ_t = θ_{t-1} - v_t [Parameter update]
Args:
learning_rate: Learning rate α, default: 0.001
rho: Decay rate for moving average ρ ∈ [0, 1), default: 0.9
eps: Numerical stability constant ε, default: 1e-8
momentum: Momentum coefficient μ ∈ [0, 1), default: 0.0 (disabled)
References:
- Tieleman, T., & Hinton, G. (2012). "Lecture 6.5 - RMSprop: Divide
the gradient by a running average of its recent magnitude."
COURSERA: Neural Networks for Machine Learning.
- Hinton, G., Srivastava, N., & Swersky, K. (2012). "Neural Networks for
Machine Learning, Lecture 6a: Overview of mini-batch gradient descent." Lecture slides.
Example:
>>> from neuroscope import MLP
>>> model = MLP([784, 128, 64, 10])
>>> # Standard RMSprop (recommended for RNNs)
>>> model.compile(optimizer="rmsprop", lr=0.001)
>>> history = model.fit(X_train, y_train, epochs=100)
>>> # Note: RMSprop uses built-in momentum=0.0 by default
>>> # For momentum-based training, use "sgdm" or "sgdnm" instead
Notes:
- Default rho=0.9 works well for most problems
- RMSprop handles sparse gradients better than standard SGD
- Adding momentum can improve convergence stability
- Less memory intensive than Adam (no first moment)
"""
def __init__(
self,
learning_rate: float = 0.001,
rho: float = 0.9,
eps: float = 1e-8,
momentum: float = 0.0,
):
super().__init__(learning_rate)
# Validate hyperparameters
if not 0 <= rho < 1:
raise ValueError(f"rho must be in [0, 1), got {rho}")
if eps <= 0:
raise ValueError(f"eps must be positive, got {eps}")
if not 0 <= momentum < 1:
raise ValueError(f"momentum must be in [0, 1), got {momentum}")
self.rho = rho
self.eps = eps
self.momentum = momentum
# Initialize state (created on first update)
self._state = {
"square_avg_weights": [],
"square_avg_biases": [],
"velocity_w": [], # Only used if momentum > 0
"velocity_b": [], # Only used if momentum > 0
"initialized": False,
}
def update(
self,
weights: List[np.ndarray],
biases: List[np.ndarray],
weight_grads: List[np.ndarray],
bias_grads: List[np.ndarray],
) -> None:
"""
Apply RMSprop adaptive gradient update.
Implements the RMSprop algorithm from Tieleman & Hinton (2012)
with optional momentum acceleration.
"""
# Initialize buffers on first call
if not self._state["initialized"]:
self._state["square_avg_weights"] = [np.zeros_like(w) for w in weights]
self._state["square_avg_biases"] = [np.zeros_like(b) for b in biases]
if self.momentum > 0:
self._state["velocity_w"] = [np.zeros_like(w) for w in weights]
self._state["velocity_b"] = [np.zeros_like(b) for b in biases]
self._state["initialized"] = True
# Retrieve buffers
sq_avg_w = self._state["square_avg_weights"]
sq_avg_b = self._state["square_avg_biases"]
# Update each layer
for i in range(len(weights)):
# --- Weight updates ---
# Update moving average of squared gradients: E[g²]_t = ρ * E[g²]_{t-1} + (1-ρ) * g_t²
sq_avg_w[i] = self.rho * sq_avg_w[i] + (1 - self.rho) * (
weight_grads[i] ** 2
)
# Compute adaptive update: g_t / √E[g²]_t
adaptive_grad_w = weight_grads[i] / (np.sqrt(sq_avg_w[i]) + self.eps)
if self.momentum > 0:
# Apply momentum: v_t = μ * v_{t-1} + α * adaptive_grad
velocity_w = self._state["velocity_w"]
velocity_w[i] = (
self.momentum * velocity_w[i] + self.learning_rate * adaptive_grad_w
)
weights[i] -= velocity_w[i]
else:
# Direct update: θ_t = θ_{t-1} - α * adaptive_grad
weights[i] -= self.learning_rate * adaptive_grad_w
# --- Bias updates (same logic) ---
sq_avg_b[i] = self.rho * sq_avg_b[i] + (1 - self.rho) * (bias_grads[i] ** 2)
adaptive_grad_b = bias_grads[i] / (np.sqrt(sq_avg_b[i]) + self.eps)
if self.momentum > 0:
velocity_b = self._state["velocity_b"]
velocity_b[i] = (
self.momentum * velocity_b[i] + self.learning_rate * adaptive_grad_b
)
biases[i] -= velocity_b[i]
else:
biases[i] -= self.learning_rate * adaptive_grad_b
def state_dict(self) -> Dict[str, Any]:
"""Get optimizer state including squared gradient averages and momentum."""
return {
"type": self.__class__.__name__,
"learning_rate": self.learning_rate,
"rho": self.rho,
"eps": self.eps,
"momentum": self.momentum,
"state": self._state,
}
def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
"""Load optimizer state including squared gradient averages and momentum."""
self.learning_rate = state_dict["learning_rate"]
self.rho = state_dict["rho"]
self.eps = state_dict["eps"]
self.momentum = state_dict["momentum"]
self._state = state_dict["state"]
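# Smoke-test sketch (illustrative, not part of the library API): minimize the
# quadratic f(θ) = 0.5 * ||θ||², whose gradient is θ itself, and report the
# final parameter norm for each optimizer. Hyperparameters are arbitrary.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    optimizers = [
        SGD(learning_rate=0.1),
        SGDMomentum(learning_rate=0.1, momentum=0.9),
        SGDMomentum(learning_rate=0.1, momentum=0.9, nesterov=True),
        Adam(learning_rate=0.05),
        RMSprop(learning_rate=0.01),
    ]
    for opt in optimizers:
        weights = [rng.standard_normal((4, 3))]
        biases = [rng.standard_normal(3)]
        for _ in range(200):
            # For f(θ) = 0.5 * ||θ||² the gradient equals the parameter itself;
            # copies are passed because update() mutates the parameters in place.
            opt.update(weights, biases, [weights[0].copy()], [biases[0].copy()])
        norm = np.linalg.norm(weights[0]) + np.linalg.norm(biases[0])
        print(f"{opt.__class__.__name__:12s} final parameter norm: {norm:.2e}")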