import numpy as np
from .optimizer import Optimizer
class Adadelta(Optimizer):
    """
    Implementation of the Adadelta algorithm proposed in [1].

    .. math::
       h_t = \\rho h_{t-1} + (1 - \\rho) g_t^2
    .. math::
       g'_t = \\sqrt{\\frac{\\Delta \\theta_{t-1} + \\epsilon}{h_t + \\epsilon}} \\cdot g_t
    .. math::
       \\Delta \\theta_t = \\rho \\Delta \\theta_{t-1} + (1 - \\rho) (g'_t)^2
    .. math::
       \\theta_t = \\theta_{t-1} - \\mathrm{lr} \\cdot g'_t

    where :math:`h` is the moving average of the squared gradients,
    :math:`\\Delta \\theta` is the moving average of the squared updates,
    and :math:`\\epsilon` is a small constant added for numerical stability.

    Parameters
    ----------
    params : iterable
        An iterable of Tensors to optimize
    rho : float, optional, default=0.9
        Coefficient used for computing a running average of squared gradients
    eps : float, optional, default=1e-6
        Term added to the denominator to improve numerical stability
    lr : float, optional, default=1.0
        Coefficient that scales the update before it is applied to the parameters
    weight_decay : float, optional, default=0
        Weight decay (L2 penalty)

    References
    ----------
    1. "`ADADELTA: An Adaptive Learning Rate Method. Matthew D. Zeiler. <https://arxiv.org/abs/1212.5701>`_" arXiv, 2012.
    """
    def __init__(
        self,
        params = None,
        rho: float = 0.9,
        eps: float = 1e-6,
        lr: float = 1.0,
        weight_decay: float = 0.
    ):
        super(Adadelta, self).__init__(params, lr, weight_decay)
        self.eps = eps
        self.rho = rho
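        # Per-parameter state buffers: h holds the running average of squared
        # gradients, delta holds the running average of squared updates.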
        self.h = [np.zeros_like(p.data) for p in self.params]
        self.delta = [np.zeros_like(p.data) for p in self.params]
    def step(self):
        for i, (h, delta, p) in enumerate(zip(self.h, self.delta, self.params)):
            if p.requires_grad:
                # apply the L2 penalty (weight decay) to the gradient
                p_grad = p.grad + self.weight_decay * p.data
                # update the moving average of the squared gradients
                h = self.rho * h + (1 - self.rho) * (p_grad ** 2)
                self.h[i] = h
                # compute the rescaled gradient g'_t and the moving average of squared updates
                g_ = np.sqrt(delta + self.eps) / np.sqrt(h + self.eps) * p_grad
                delta = self.rho * delta + (1 - self.rho) * (g_ ** 2)
                self.delta[i] = delta
                # apply the lr-scaled update to the parameters
                p.data -= self.lr * g_
        super(Adadelta, self).step()