Source code for neuroscope.mlp.optimizers

"""
Optimizers for NeuroScope MLP.

"""

from abc import ABC, abstractmethod
from typing import Any, Dict, List

import numpy as np


class Optimizer(ABC):
    """
    Base class for all optimizers.

    Provides a common interface for parameter updates and state management.
    """

    def __init__(self, learning_rate: float = 0.01):
        """
        Initialize optimizer.

        Args:
            learning_rate: Step size for parameter updates
        """
        if learning_rate <= 0:
            raise ValueError(f"Learning rate must be positive, got {learning_rate}")
        self.learning_rate = learning_rate
        self._state: Dict[str, Any] = {}

    @abstractmethod
    def update(
        self,
        weights: List[np.ndarray],
        biases: List[np.ndarray],
        weight_grads: List[np.ndarray],
        bias_grads: List[np.ndarray],
    ) -> None:
        """
        Update parameters in place using gradients.

        Args:
            weights: List of weight matrices
            biases: List of bias vectors
            weight_grads: Gradients for weights
            bias_grads: Gradients for biases
        """
        pass

    def state_dict(self) -> Dict[str, Any]:
        """
        Get optimizer state for checkpointing.

        Returns:
            Dictionary containing optimizer configuration and state
        """
        return {
            "type": self.__class__.__name__,
            "learning_rate": self.learning_rate,
            "state": self._state,
        }

    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
        """
        Load optimizer state from checkpoint.

        Args:
            state_dict: State dictionary from state_dict()
        """
        self.learning_rate = state_dict["learning_rate"]
        self._state = state_dict["state"]
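

# Illustrative sketch (not part of the library): the state_dict() /
# load_state_dict() round trip used for checkpointing, shown here with the
# SGD subclass defined below. The variable names are made up for demonstration.
#
# >>> opt = SGD(learning_rate=0.05)
# >>> ckpt = opt.state_dict()
# >>> ckpt["type"], ckpt["learning_rate"]
# ('SGD', 0.05)
# >>> restored = SGD()                # starts at the default lr=0.01
# >>> restored.load_state_dict(ckpt)  # lr and internal state come from ckpt
# >>> restored.learning_rate
# 0.05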


class SGD(Optimizer):
    """
    Stochastic Gradient Descent optimizer.

    Implements basic gradient descent with a fixed learning rate:

        θ_t = θ_{t-1} - α * ∇L(θ_{t-1})

    Args:
        learning_rate: Learning rate (step size), default: 0.01

    References:
        Robbins, H., & Monro, S. (1951). "A Stochastic Approximation Method."
        Annals of Mathematical Statistics.

    Example:
        >>> from neuroscope import MLP
        >>> model = MLP([10, 20, 5])
        >>> model.compile(optimizer="sgd", lr=0.01)
        >>> history = model.fit(X_train, y_train, epochs=100)
    """

    def __init__(self, learning_rate: float = 0.01):
        super().__init__(learning_rate)

    def update(
        self,
        weights: List[np.ndarray],
        biases: List[np.ndarray],
        weight_grads: List[np.ndarray],
        bias_grads: List[np.ndarray],
    ) -> None:
        """Apply gradient descent update."""
        for i in range(len(weights)):
            weights[i] -= self.learning_rate * weight_grads[i]
            biases[i] -= self.learning_rate * bias_grads[i]
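

# Illustrative sketch (not part of the library): one SGD step on toy arrays,
# showing that update() mutates the parameter arrays in place. The shapes and
# values are made up; dyadic numbers keep the printed results exact.
#
# >>> w = [np.array([[1.0, 2.0]])]
# >>> b = [np.array([0.5])]
# >>> gw = [np.array([[0.5, 0.5]])]
# >>> gb = [np.array([1.0])]
# >>> opt = SGD(learning_rate=0.25)
# >>> opt.update(w, b, gw, gb)   # θ ← θ - α * g
# >>> w[0]
# array([[0.875, 1.875]])
# >>> b[0]
# array([0.25])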


class SGDMomentum(Optimizer):
    """
    SGD with Momentum optimizer.

    Implements momentum-accelerated gradient descent. Momentum accumulates
    gradients over time, allowing faster convergence and reduced oscillation.

    Standard Momentum (Polyak, 1964):
        v_t = μ * v_{t-1} + ∇L(θ_{t-1})
        θ_t = θ_{t-1} - α * v_t

    Nesterov Momentum (Nesterov, 1983):
        v_t = μ * v_{t-1} + ∇L(θ_{t-1})
        θ_t = θ_{t-1} - α * (μ * v_t + ∇L(θ_{t-1}))

    Args:
        learning_rate: Learning rate (step size), default: 0.01
        momentum: Momentum coefficient μ ∈ [0, 1), default: 0.9
        nesterov: Enable Nesterov accelerated gradient, default: False

    References:
        - Polyak, B. T. (1964). "Some methods of speeding up the convergence
          of iteration methods." USSR Computational Mathematics and
          Mathematical Physics, 4(5), 1-17.
        - Sutskever, I., Martens, J., Dahl, G., & Hinton, G. (2013). "On the
          importance of initialization and momentum in deep learning."
          ICML 2013.
        - Nesterov, Y. (1983). "A method for unconstrained convex minimization
          problem with the rate of convergence O(1/k^2)." Doklady AN USSR, 269.

    Example:
        >>> from neuroscope import MLP
        >>> model = MLP([784, 128, 64, 10])
        >>> # Standard momentum
        >>> model.compile(optimizer="sgdm", lr=0.01)
        >>> history = model.fit(X_train, y_train, epochs=100)
        >>> # Nesterov momentum (recommended for deep networks)
        >>> model.compile(optimizer="sgdnm", lr=0.01)
        >>> history = model.fit(X_train, y_train, epochs=100)

    Notes:
        - Typical momentum values: 0.9 (default) or 0.95 (aggressive)
        - Nesterov momentum often converges faster than standard momentum
        - Momentum helps escape local minima and traverse flat regions
    """

    def __init__(
        self, learning_rate: float = 0.01, momentum: float = 0.9, nesterov: bool = False
    ):
        super().__init__(learning_rate)

        # Validate momentum
        if not 0 <= momentum < 1:
            raise ValueError(f"Momentum must be in [0, 1), got {momentum}")

        self.momentum = momentum
        self.nesterov = nesterov

        # Initialize velocity buffers (created on first update)
        self._state = {"velocity_w": [], "velocity_b": [], "initialized": False}

    def update(
        self,
        weights: List[np.ndarray],
        biases: List[np.ndarray],
        weight_grads: List[np.ndarray],
        bias_grads: List[np.ndarray],
    ) -> None:
        """
        Apply momentum-accelerated gradient update.

        Implements the momentum update rule from Polyak (1964) with optional
        Nesterov acceleration from Nesterov (1983).
        """
        # Initialize velocity buffers on first call
        if not self._state["initialized"]:
            self._state["velocity_w"] = [np.zeros_like(w) for w in weights]
            self._state["velocity_b"] = [np.zeros_like(b) for b in biases]
            self._state["initialized"] = True

        velocity_w = self._state["velocity_w"]
        velocity_b = self._state["velocity_b"]

        # Update each layer
        for i in range(len(weights)):
            # --- Weight updates ---
            # Momentum update: v_t = μ * v_{t-1} + g_t
            velocity_w[i] = self.momentum * velocity_w[i] + weight_grads[i]

            if self.nesterov:
                # Nesterov: θ_t = θ_{t-1} - α * (μ * v_t + g_t)
                update_w = self.momentum * velocity_w[i] + weight_grads[i]
            else:
                # Standard: θ_t = θ_{t-1} - α * v_t
                update_w = velocity_w[i]

            weights[i] -= self.learning_rate * update_w

            # --- Bias updates (same logic) ---
            velocity_b[i] = self.momentum * velocity_b[i] + bias_grads[i]

            if self.nesterov:
                update_b = self.momentum * velocity_b[i] + bias_grads[i]
            else:
                update_b = velocity_b[i]

            biases[i] -= self.learning_rate * update_b

    def state_dict(self) -> Dict[str, Any]:
        """Get optimizer state including velocity buffers."""
        return {
            "type": self.__class__.__name__,
            "learning_rate": self.learning_rate,
            "momentum": self.momentum,
            "nesterov": self.nesterov,
            "state": self._state,
        }

    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
        """Load optimizer state including velocity buffers."""
        self.learning_rate = state_dict["learning_rate"]
        self.momentum = state_dict["momentum"]
        self.nesterov = state_dict["nesterov"]
        self._state = state_dict["state"]
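

# Illustrative sketch (not part of the library): with a constant gradient the
# velocity compounds toward the geometric-series limit g / (1 - μ). Toy values
# below (μ = 0.5, g = 2): v_1 = 2, v_2 = 0.5 * 2 + 2 = 3, limit = 4.
#
# >>> opt = SGDMomentum(learning_rate=1.0, momentum=0.5)
# >>> w, b = [np.zeros((1, 1))], [np.zeros(1)]
# >>> gw, gb = [np.full((1, 1), 2.0)], [np.zeros(1)]
# >>> for _ in range(2):
# ...     opt.update(w, b, gw, gb)
# >>> opt._state["velocity_w"][0]   # v_2 = 3
# array([[3.]])
# >>> w[0]                          # θ = -(v_1 + v_2) = -5
# array([[-5.]])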


class Adam(Optimizer):
    """
    Adam (Adaptive Moment Estimation) optimizer.

    Combines momentum with adaptive learning rates. Maintains exponential
    moving averages of gradients (first moment) and squared gradients
    (second moment). Includes bias correction to account for initialization
    at zero.

    Algorithm (Kingma & Ba, 2014):
        m_t = β₁ * m_{t-1} + (1 - β₁) * g_t      [First moment estimate]
        v_t = β₂ * v_{t-1} + (1 - β₂) * g_t²     [Second moment estimate]
        m̂_t = m_t / (1 - β₁ᵗ)                    [Bias-corrected first moment]
        v̂_t = v_t / (1 - β₂ᵗ)                    [Bias-corrected second moment]
        θ_t = θ_{t-1} - α * m̂_t / (√v̂_t + ε)     [Parameter update]

    Args:
        learning_rate: Learning rate α, default: 0.001
        beta1: First moment decay rate β₁ ∈ [0, 1), default: 0.9
        beta2: Second moment decay rate β₂ ∈ [0, 1), default: 0.999
        eps: Numerical stability constant ε, default: 1e-8

    References:
        Kingma, D. P., & Ba, J. (2014). "Adam: A Method for Stochastic
        Optimization." arXiv preprint arXiv:1412.6980.

    Example:
        >>> from neuroscope import MLP
        >>> model = MLP([784, 128, 64, 10])
        >>> # Standard Adam (recommended default)
        >>> model.compile(optimizer="adam", lr=0.001)
        >>> history = model.fit(X_train, y_train, epochs=100)
        >>> # Higher learning rate for faster convergence
        >>> model.compile(optimizer="adam", lr=0.01)
        >>> history = model.fit(X_train, y_train, epochs=100)

    Notes:
        - Default hyperparameters work well for most problems
        - Adam is particularly effective for sparse gradients and noisy data
        - Less sensitive to learning rate than SGD
        - Memory overhead: 2x parameters (stores m and v)
    """

    def __init__(
        self,
        learning_rate: float = 0.001,
        beta1: float = 0.9,
        beta2: float = 0.999,
        eps: float = 1e-8,
    ):
        super().__init__(learning_rate)

        # Validate hyperparameters
        if not 0 <= beta1 < 1:
            raise ValueError(f"beta1 must be in [0, 1), got {beta1}")
        if not 0 <= beta2 < 1:
            raise ValueError(f"beta2 must be in [0, 1), got {beta2}")
        if eps <= 0:
            raise ValueError(f"eps must be positive, got {eps}")

        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps

        # Initialize state (created on first update)
        self._state = {
            "m_weights": [],
            "v_weights": [],
            "m_biases": [],
            "v_biases": [],
            "t": 0,
            "initialized": False,
        }

    def update(
        self,
        weights: List[np.ndarray],
        biases: List[np.ndarray],
        weight_grads: List[np.ndarray],
        bias_grads: List[np.ndarray],
    ) -> None:
        """
        Apply Adam adaptive gradient update.

        Implements the Adam algorithm from Kingma & Ba (2014) with
        exponential moving averages and bias correction.
        """
        # Initialize moment buffers on first call
        if not self._state["initialized"]:
            self._state["m_weights"] = [np.zeros_like(w) for w in weights]
            self._state["v_weights"] = [np.zeros_like(w) for w in weights]
            self._state["m_biases"] = [np.zeros_like(b) for b in biases]
            self._state["v_biases"] = [np.zeros_like(b) for b in biases]
            self._state["initialized"] = True

        # Increment timestep
        self._state["t"] += 1
        t = self._state["t"]

        # Retrieve moment buffers
        m_w = self._state["m_weights"]
        v_w = self._state["v_weights"]
        m_b = self._state["m_biases"]
        v_b = self._state["v_biases"]

        # Update each layer
        for i in range(len(weights)):
            # --- Weight updates ---
            # Update biased first moment: m_t = β₁ * m_{t-1} + (1 - β₁) * g_t
            m_w[i] = self.beta1 * m_w[i] + (1 - self.beta1) * weight_grads[i]

            # Update biased second moment: v_t = β₂ * v_{t-1} + (1 - β₂) * g_t²
            v_w[i] = self.beta2 * v_w[i] + (1 - self.beta2) * (weight_grads[i] ** 2)

            # Bias correction
            m_hat_w = m_w[i] / (1 - self.beta1**t)
            v_hat_w = v_w[i] / (1 - self.beta2**t)

            # Parameter update: θ_t = θ_{t-1} - α * m̂_t / (√v̂_t + ε)
            weights[i] -= self.learning_rate * m_hat_w / (np.sqrt(v_hat_w) + self.eps)

            # --- Bias updates (same logic) ---
            m_b[i] = self.beta1 * m_b[i] + (1 - self.beta1) * bias_grads[i]
            v_b[i] = self.beta2 * v_b[i] + (1 - self.beta2) * (bias_grads[i] ** 2)

            m_hat_b = m_b[i] / (1 - self.beta1**t)
            v_hat_b = v_b[i] / (1 - self.beta2**t)

            biases[i] -= self.learning_rate * m_hat_b / (np.sqrt(v_hat_b) + self.eps)

    def state_dict(self) -> Dict[str, Any]:
        """Get optimizer state including moment estimates and timestep."""
        return {
            "type": self.__class__.__name__,
            "learning_rate": self.learning_rate,
            "beta1": self.beta1,
            "beta2": self.beta2,
            "eps": self.eps,
            "state": self._state,
        }

    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
        """Load optimizer state including moment estimates and timestep."""
        self.learning_rate = state_dict["learning_rate"]
        self.beta1 = state_dict["beta1"]
        self.beta2 = state_dict["beta2"]
        self.eps = state_dict["eps"]
        self._state = state_dict["state"]
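

# Illustrative sketch (not part of the library): the effect of bias correction
# on the very first step. At t = 1, m̂_1 = g and v̂_1 = g², so the update is
# α * g / (|g| + ε) ≈ α * sign(g): roughly a step of size α regardless of the
# gradient's scale. The toy values below are made up for demonstration.
#
# >>> opt = Adam(learning_rate=0.001)
# >>> w, b = [np.zeros((1, 1))], [np.zeros(1)]
# >>> gw, gb = [np.full((1, 1), 100.0)], [np.full(1, 1e-4)]
# >>> opt.update(w, b, gw, gb)
# >>> bool(np.allclose(w[0], -0.001))              # huge gradient: step ≈ -α
# True
# >>> bool(np.allclose(b[0], -0.001, rtol=1e-3))   # tiny gradient: step ≈ -α
# True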


class RMSprop(Optimizer):
    """
    RMSprop (Root Mean Square Propagation) optimizer.

    Maintains a moving average of squared gradients to normalize learning
    rates. Particularly effective for non-stationary objectives and recurrent
    networks. Can be seen as a precursor to Adam, using only second moment
    adaptation.

    Algorithm (Hinton, 2012; Tieleman & Hinton, 2012):
        E[g²]_t = ρ * E[g²]_{t-1} + (1 - ρ) * g_t²    [Moving average of squared gradients]
        θ_t = θ_{t-1} - α * g_t / (√E[g²]_t + ε)      [Parameter update]

    With momentum (optional):
        v_t = μ * v_{t-1} + α * g_t / (√E[g²]_t + ε)  [Momentum accumulation]
        θ_t = θ_{t-1} - v_t                           [Parameter update]

    Args:
        learning_rate: Learning rate α, default: 0.001
        rho: Decay rate for moving average ρ ∈ [0, 1), default: 0.9
        eps: Numerical stability constant ε, default: 1e-8
        momentum: Momentum coefficient μ ∈ [0, 1), default: 0.0 (disabled)

    References:
        - Tieleman, T., & Hinton, G. (2012). "Lecture 6.5 - RMSprop: Divide
          the gradient by a running average of its recent magnitude."
          COURSERA: Neural Networks for Machine Learning.
        - Hinton, G., Srivastava, N., & Swersky, K. (2012). "Neural Networks
          for Machine Learning, Lecture 6a: Overview of mini-batch gradient
          descent."

    Example:
        >>> from neuroscope import MLP
        >>> model = MLP([784, 128, 64, 10])
        >>> # Standard RMSprop (recommended for RNNs)
        >>> model.compile(optimizer="rmsprop", lr=0.001)
        >>> history = model.fit(X_train, y_train, epochs=100)
        >>> # Note: RMSprop uses momentum=0.0 by default
        >>> # For momentum-based training, use "sgdm" or "sgdnm" instead

    Notes:
        - Default rho=0.9 works well for most problems
        - RMSprop handles sparse gradients better than standard SGD
        - Adding momentum can improve convergence stability
        - Less memory intensive than Adam (no first moment)
    """

    def __init__(
        self,
        learning_rate: float = 0.001,
        rho: float = 0.9,
        eps: float = 1e-8,
        momentum: float = 0.0,
    ):
        super().__init__(learning_rate)

        # Validate hyperparameters
        if not 0 <= rho < 1:
            raise ValueError(f"rho must be in [0, 1), got {rho}")
        if eps <= 0:
            raise ValueError(f"eps must be positive, got {eps}")
        if not 0 <= momentum < 1:
            raise ValueError(f"momentum must be in [0, 1), got {momentum}")

        self.rho = rho
        self.eps = eps
        self.momentum = momentum

        # Initialize state (created on first update)
        self._state = {
            "square_avg_weights": [],
            "square_avg_biases": [],
            "velocity_w": [],  # Only used if momentum > 0
            "velocity_b": [],  # Only used if momentum > 0
            "initialized": False,
        }

    def update(
        self,
        weights: List[np.ndarray],
        biases: List[np.ndarray],
        weight_grads: List[np.ndarray],
        bias_grads: List[np.ndarray],
    ) -> None:
        """
        Apply RMSprop adaptive gradient update.

        Implements the RMSprop algorithm from Tieleman & Hinton (2012) with
        optional momentum acceleration.
        """
        # Initialize buffers on first call
        if not self._state["initialized"]:
            self._state["square_avg_weights"] = [np.zeros_like(w) for w in weights]
            self._state["square_avg_biases"] = [np.zeros_like(b) for b in biases]
            if self.momentum > 0:
                self._state["velocity_w"] = [np.zeros_like(w) for w in weights]
                self._state["velocity_b"] = [np.zeros_like(b) for b in biases]
            self._state["initialized"] = True

        # Retrieve buffers
        sq_avg_w = self._state["square_avg_weights"]
        sq_avg_b = self._state["square_avg_biases"]

        # Update each layer
        for i in range(len(weights)):
            # --- Weight updates ---
            # Moving average of squared gradients: E[g²]_t = ρ * E[g²]_{t-1} + (1-ρ) * g_t²
            sq_avg_w[i] = self.rho * sq_avg_w[i] + (1 - self.rho) * (
                weight_grads[i] ** 2
            )

            # Compute adaptive update: g_t / (√E[g²]_t + ε)
            adaptive_grad_w = weight_grads[i] / (np.sqrt(sq_avg_w[i]) + self.eps)

            if self.momentum > 0:
                # Apply momentum: v_t = μ * v_{t-1} + α * adaptive_grad
                velocity_w = self._state["velocity_w"]
                velocity_w[i] = (
                    self.momentum * velocity_w[i]
                    + self.learning_rate * adaptive_grad_w
                )
                weights[i] -= velocity_w[i]
            else:
                # Direct update: θ_t = θ_{t-1} - α * adaptive_grad
                weights[i] -= self.learning_rate * adaptive_grad_w

            # --- Bias updates (same logic) ---
            sq_avg_b[i] = self.rho * sq_avg_b[i] + (1 - self.rho) * (bias_grads[i] ** 2)
            adaptive_grad_b = bias_grads[i] / (np.sqrt(sq_avg_b[i]) + self.eps)

            if self.momentum > 0:
                velocity_b = self._state["velocity_b"]
                velocity_b[i] = (
                    self.momentum * velocity_b[i]
                    + self.learning_rate * adaptive_grad_b
                )
                biases[i] -= velocity_b[i]
            else:
                biases[i] -= self.learning_rate * adaptive_grad_b

    def state_dict(self) -> Dict[str, Any]:
        """Get optimizer state including squared gradient averages and momentum."""
        return {
            "type": self.__class__.__name__,
            "learning_rate": self.learning_rate,
            "rho": self.rho,
            "eps": self.eps,
            "momentum": self.momentum,
            "state": self._state,
        }

    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
        """Load optimizer state including squared gradient averages and momentum."""
        self.learning_rate = state_dict["learning_rate"]
        self.rho = state_dict["rho"]
        self.eps = state_dict["eps"]
        self.momentum = state_dict["momentum"]
        self._state = state_dict["state"]
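

# Illustrative sketch (not part of the library): E[g²] starts at zero and has
# no bias correction, so the first RMSprop step overshoots the learning rate.
# At t = 1, E[g²]_1 = (1 - ρ) * g², giving a step of roughly α / √(1 - ρ)
# ≈ 3.16 * α for the default ρ = 0.9, whatever the gradient's scale. (Adam's
# bias correction, above, is what removes this early-step inflation.)
#
# >>> opt = RMSprop(learning_rate=0.001, rho=0.9)
# >>> w, b = [np.zeros((1, 1))], [np.zeros(1)]
# >>> gw, gb = [np.full((1, 1), 5.0)], [np.full(1, 5.0)]
# >>> opt.update(w, b, gw, gb)
# >>> bool(np.allclose(w[0], -0.001 / np.sqrt(0.1), rtol=1e-4))
# True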