"""
Some of the code is borrowed from: https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
"""
import math
import numpy as np
from typing import Optional, Union
from flint import Tensor

def calculate_gain(nonlinearity: str, param: Optional[Union[int, float]] = None) -> float:
    r"""
    Return the recommended gain value for the given nonlinearity function.
    The values are as follows:

    ================= ====================================================
    nonlinearity      gain
    ================= ====================================================
    Linear / Identity :math:`1`
    Conv{1,2,3}D      :math:`1`
    Sigmoid           :math:`1`
    Tanh              :math:`\frac{5}{3}`
    ReLU              :math:`\sqrt{2}`
    Leaky ReLU        :math:`\sqrt{\frac{2}{1 + \text{negative\_slope}^2}}`
    SELU              :math:`\frac{3}{4}`
    ================= ====================================================

    Parameters
    ----------
    nonlinearity : str
        Name of the non-linear function

    param : Union[int, float], optional
        Optional parameter for the non-linear function (e.g. the negative
        slope of a leaky ReLU)
    """
    linear_fns = ['linear', 'conv1d', 'conv2d', 'conv3d']
    if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
        return 1
    elif nonlinearity == 'tanh':
        return 5.0 / 3
    elif nonlinearity == 'relu':
        return math.sqrt(2.0)
    elif nonlinearity == 'leaky_relu':
        if param is None:
            negative_slope = 0.01
        elif not isinstance(param, bool) and isinstance(param, (int, float)):
            # True/False are instances of int, hence the explicit bool check above
            negative_slope = param
        else:
            raise ValueError("negative_slope {} not a valid number".format(param))
        return math.sqrt(2.0 / (1 + negative_slope ** 2))
    elif nonlinearity == 'selu':
        return 3.0 / 4  # value found empirically (https://github.com/pytorch/pytorch/pull/50664)
    else:
        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))

def zeros_(tensor: Tensor) -> None:
    """
    Fill the tensor with the scalar value ``0``.

    Parameters
    ----------
    tensor : Tensor
        A Tensor
    """
    tensor.zero_()


def ones_(tensor: Tensor) -> None:
    """
    Fill the tensor with the scalar value ``1``.

    Parameters
    ----------
    tensor : Tensor
        A Tensor
    """
    tensor.one_()


def constant_(tensor: Tensor, val: float) -> None:
    """
    Fill the tensor with the given scalar value ``val``.

    Parameters
    ----------
    tensor : Tensor
        A Tensor

    val : float
        The value to fill the tensor with
    """
    tensor.fill_(val)


def normal_(tensor: Tensor, mean: float = 0., std: float = 1.) -> None:
    """
    Fill the tensor with values drawn from the normal distribution
    ``N(mean, std^2)``.

    Parameters
    ----------
    tensor : Tensor
        A Tensor

    mean : float, optional, default=0.
        The mean of the normal distribution

    std : float, optional, default=1.
        The standard deviation of the normal distribution
    """
    tensor.normal_(mean=mean, std=std)
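
# Minimal usage sketch for the in-place fillers above. How a flint ``Tensor``
# is constructed is an assumption here (a NumPy-backed constructor is guessed
# for illustration, not confirmed by this module):
#
#     w = Tensor(np.empty((3, 4)))
#     zeros_(w)                        # all zeros
#     constant_(w, 0.5)                # all 0.5
#     normal_(w, mean=0., std=0.02)    # samples from N(0, 0.02^2)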

def _calculate_fan_in_and_fan_out(tensor: Tensor):
    """
    Compute the number of input and output nodes for a tensor.

    Parameters
    ----------
    tensor : Tensor
        A Tensor

    Returns
    -------
    fan_in : int
        Number of input nodes

    fan_out : int
        Number of output nodes
    """
    dimensions = tensor.ndim
    if dimensions < 2:
        raise ValueError('Fan in and fan out cannot be computed for a tensor with fewer than 2 dimensions')

    num_input_fmaps = tensor.shape[1]
    num_output_fmaps = tensor.shape[0]
    receptive_field_size = 1
    if dimensions > 2:
        # e.g. kernel_height * kernel_width for a conv weight
        receptive_field_size = int(np.prod(tensor.shape[2:]))

    fan_in = num_input_fmaps * receptive_field_size
    fan_out = num_output_fmaps * receptive_field_size
    return fan_in, fan_out
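
# Worked example: for a conv2d weight of shape (out_channels, in_channels, kH, kW)
# = (16, 8, 3, 3), the receptive field size is 3 * 3 = 9, so fan_in = 8 * 9 = 72
# and fan_out = 16 * 9 = 144. (The Tensor construction is an assumed API, shown
# only for illustration.)
#
#     w = Tensor(np.empty((16, 8, 3, 3)))
#     _calculate_fan_in_and_fan_out(w)   # -> (72, 144)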

def xavier_normal_(tensor: Tensor, gain: float = 1.) -> None:
    r"""
    Xavier initialization (also known as Glorot initialization), proposed
    in [1], using a normal distribution.

    The resulting tensor will have values sampled from :math:`N(0, \text{std}^2)`,
    where ``std = gain * sqrt(2 / (fan_in + fan_out))``.

    Parameters
    ----------
    tensor : Tensor
        A Tensor

    gain : float, optional, default=1.
        An optional scaling factor

    References
    ----------
    1. "`Understanding the Difficulty of Training Deep Feedforward Neural Networks. <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_" Xavier Glorot and Yoshua Bengio. AISTATS 2010.
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    std = gain * math.sqrt(2.0 / (fan_in + fan_out))
    tensor.normal_(mean=0, std=std)
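
# Usage sketch: initialize a linear layer's weight of shape (out_features,
# in_features) with a tanh-appropriate gain. The Tensor constructor is an
# assumed flint API, shown only to illustrate the call pattern:
#
#     w = Tensor(np.empty((256, 128)))
#     xavier_normal_(w, gain=calculate_gain('tanh'))
#     # std = (5/3) * sqrt(2 / (128 + 256)) ≈ 0.1203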

def _calculate_correct_fan(tensor: Tensor, mode: str):
    mode = mode.lower()
    valid_modes = ['fan_in', 'fan_out']
    if mode not in valid_modes:
        raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes))

    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    return fan_in if mode == 'fan_in' else fan_out

def kaiming_normal_(
    tensor: Tensor,
    a: float = 0.,
    mode: str = 'fan_in',
    nonlinearity: str = 'leaky_relu'
) -> None:
    r"""
    Kaiming initialization (also known as He initialization), proposed
    in [1], using a normal distribution.

    The resulting tensor will have values sampled from :math:`N(0, \text{std}^2)`,
    where ``std = gain / sqrt(fan_mode)``.

    Parameters
    ----------
    tensor : Tensor
        A Tensor

    a : float, optional, default=0.
        The negative slope of the rectifier used after this layer (only used
        with ``'leaky_relu'``)

    mode : str, optional, default='fan_in'
        Either ``'fan_in'`` or ``'fan_out'``. ``'fan_in'`` preserves the
        magnitude of the variance of the weights in the forward pass;
        ``'fan_out'`` preserves the magnitudes in the backward pass.

    nonlinearity : str, optional, default='leaky_relu'
        Name of the non-linear function, recommended for use only with
        ``'relu'`` or ``'leaky_relu'``

    References
    ----------
    1. "`Delving Deep into Rectifiers: Surpassing Human-level Performance on ImageNet Classification. <https://arxiv.org/pdf/1502.01852.pdf>`_" Kaiming He, et al. ICCV 2015.
    """
    fan = _calculate_correct_fan(tensor, mode)
    gain = calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    tensor.normal_(mean=0, std=std)
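
# Usage sketch for a conv layer followed by ReLU; ``mode='fan_in'`` preserves
# forward-pass variance. The Tensor construction is again an assumed API:
#
#     w = Tensor(np.empty((64, 32, 3, 3)))
#     kaiming_normal_(w, mode='fan_in', nonlinearity='relu')
#     # fan_in = 32 * 3 * 3 = 288, std = sqrt(2) / sqrt(288) ≈ 0.0833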

def lecun_normal_(tensor: Tensor) -> None:
    r"""
    LeCun initialization, as described in [1], using a normal distribution.

    The resulting tensor will have values sampled from :math:`N(0, \text{std}^2)`,
    where ``std = sqrt(1 / fan_in)``.

    Parameters
    ----------
    tensor : Tensor
        A Tensor

    References
    ----------
    1. "`Efficient Backprop. <http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf>`_" Yann LeCun, et al. 1998.
    """
    fan_in, _ = _calculate_fan_in_and_fan_out(tensor)
    std = math.sqrt(1.0 / fan_in)
    tensor.normal_(mean=0, std=std)
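
# Usage sketch (Tensor construction assumed, as above). LeCun initialization is
# the scheme paired with SELU activations in the self-normalizing network setup:
#
#     w = Tensor(np.empty((128, 64)))
#     lecun_normal_(w)   # fan_in = 64, std = sqrt(1 / 64) = 0.125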