Source code for flint.optim.adagrad

import numpy as np
from .optimizer import Optimizer

class Adagrad(Optimizer):
    """
    Implementation of the Adagrad algorithm proposed in [1].

    .. math::
        h_t = h_{t-1} + g_t^2

    .. math::
        \\theta_{t+1} = \\theta_t - \\frac{\\text{lr}}{\\sqrt{h_t + \\epsilon}} \\cdot g_t

    Parameters
    ----------
    params : iterable
        An iterable of Tensor
    lr : float, optional, default=0.01
        Learning rate
    eps : float, optional, default=1e-10
        Term added to the denominator to improve numerical stability
    weight_decay : float, optional, default=0.
        Weight decay (L2 penalty)

    References
    ----------
    1. "`Adaptive Subgradient Methods for Online Learning and Stochastic
       Optimization <https://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_"
       John Duchi, et al. JMLR 2011.
    """

    def __init__(
        self,
        params = None,
        lr: float = 0.01,
        eps: float = 1e-10,
        weight_decay: float = 0.
    ):
        super(Adagrad, self).__init__(params, lr, weight_decay)
        self.eps = eps
        # per-parameter accumulators of squared gradients (h in the equations above)
        self.h = [np.zeros_like(p.data) for p in self.params]
    def step(self):
        for i, (h, p) in enumerate(zip(self.h, self.params)):
            if p.requires_grad:
                # gradient with L2 penalty (weight decay)
                p_grad = p.grad + self.weight_decay * p.data
                # accumulate squared gradients
                h += p_grad ** 2
                self.h[i] = h
                # update parameters
                p.data -= self.lr * p_grad / np.sqrt(h + self.eps)
        super(Adagrad, self).step()
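
The following is a minimal standalone sketch of the same accumulate-and-scale update applied to a toy quadratic objective. It uses plain NumPy arrays rather than flint Tensors, and the values lr=0.5 and weight_decay=0.0 are arbitrary illustrative choices (not the optimizer's defaults), picked so the toy example converges quickly.

import numpy as np

# Toy objective: f(theta) = 0.5 * ||theta||^2, so the gradient is theta itself.
theta = np.array([1.0, -2.0, 3.0])
lr, eps, weight_decay = 0.5, 1e-10, 0.0   # illustrative values, not defaults

h = np.zeros_like(theta)                  # accumulated squared gradients (h_t)
for _ in range(100):
    grad = theta + weight_decay * theta   # gradient plus L2 penalty
    h += grad ** 2                        # h_t = h_{t-1} + g_t^2
    theta -= lr * grad / np.sqrt(h + eps) # theta_{t+1} = theta_t - lr * g_t / sqrt(h_t + eps)

print(theta)  # close to [0, 0, 0], the minimizer of the toy objective

Each coordinate accumulates its own h, so coordinates with larger past gradients take proportionally smaller steps, which is the per-coordinate scaling the equations in the docstring describe.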