# Source code for flint.nn.functional

import math
import numpy as np
from typing import Union, Tuple

import flint
from ..tensor import Tensor
from ..utils import *
from .types import _tuple_1_t, _tuple_2_t, _tuple_any_t, _size_2_t
from .utils import im2col
from .modules.utils import _pair

# ---------------------- activators ----------------------

def relu(input: Tensor) -> Tensor:
    """
    Compute ReLU (Rectified Linear Unit) element-wise.
    """
    out = Tensor(
        data = np.maximum(0., input.data),
        depends_on = [input],
        requires_grad = input.requires_grad
    )

    def grad_relu():
        if input.requires_grad:
            input.grad += out.grad * ((input.data > 0) * np.ones_like(input.data))

    if out.requires_grad:
        out.grad_fn = grad_relu

    return out
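# Illustrative sketch (not part of the library): a quick NumPy check of the ReLU forward
# rule and of the (input > 0) gradient mask used in ``grad_relu`` above. The ``_relu_*``
# names are hypothetical and exist only for this demonstration.
_relu_x = np.array([-1.5, 0.0, 2.0])
_relu_y = np.maximum(0., _relu_x)                   # forward: [0., 0., 2.]
_relu_mask = (_relu_x > 0) * np.ones_like(_relu_x)  # backward mask: [0., 0., 1.]
assert np.allclose(_relu_y, [0., 0., 2.]) and np.allclose(_relu_mask, [0., 0., 1.])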
def leaky_relu(input: Tensor, negative_slope: float = 0.01) -> Tensor:
    """
    Compute Leaky ReLU element-wise.

    .. math::
        \\text{LeakyReLU}(x) = \max(0, x) + \\text{negative\_slope} * \min(0, x)

    Parameters
    ----------
    negative_slope : float, optional, default=1e-2
        Controls the angle of the negative slope.
    """
    out = Tensor(
        data = np.maximum(negative_slope * input.data, input.data),
        depends_on = [input],
        requires_grad = input.requires_grad
    )

    def grad_leaky_relu():
        if input.requires_grad:
            grad = np.ones_like(input.data)
            grad[input.data < 0] = negative_slope
            input.grad += out.grad * grad

    if out.requires_grad:
        out.grad_fn = grad_leaky_relu

    return out
def sigmoid(input: Tensor) -> Tensor:
    """
    Compute Sigmoid element-wise.

    .. math::
        \\text{sigmoid}(x) = \\frac{1}{1 + \exp(-x)}
    """
    ret = 1 / (1 + np.exp(-input.data))

    out = Tensor(
        data = ret,
        depends_on = [input],
        requires_grad = input.requires_grad
    )

    def grad_sigmoid():
        if input.requires_grad:
            input.grad += out.grad * out.data * (1 - out.data)

    if out.requires_grad:
        out.grad_fn = grad_sigmoid

    return out
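# Illustrative sketch (not part of the library): numerically confirm the identity
# d(sigmoid)/dx = sigmoid(x) * (1 - sigmoid(x)) that ``grad_sigmoid`` relies on.
# The ``_sig_*`` names are hypothetical, used only for this check.
_sig_x = np.linspace(-3, 3, 7)
_sig_s = 1 / (1 + np.exp(-_sig_x))
_sig_eps = 1e-6
_sig_numeric = ((1 / (1 + np.exp(-(_sig_x + _sig_eps)))) - _sig_s) / _sig_eps  # finite difference
assert np.allclose(_sig_numeric, _sig_s * (1 - _sig_s), atol=1e-5)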
def tanh(input: Tensor) -> Tensor:
    """
    Compute Tanh (Hyperbolic Tangent) element-wise.

    .. math::
        \\text{tanh}(x) = \\frac{\sinh(x)}{\cosh(x)} = \\frac{\exp(x) - \exp(-x)}{\exp(x) + \exp(-x)}
    """
    ret = np.tanh(input.data)

    out = Tensor(
        data = ret,
        depends_on = [input],
        requires_grad = input.requires_grad
    )

    def grad_tanh():
        if input.requires_grad:
            input.grad += out.grad * (1 - np.square(out.data))

    if out.requires_grad:
        out.grad_fn = grad_tanh

    return out
def gelu(input: Tensor) -> Tensor:
    """
    Compute GELU (Gaussian Error Linear Units) [1] element-wise.

    .. math::
        \\text{GELU}(x) = x \cdot \Phi(x) = x \cdot \\frac{1}{2} [1 + \\text{erf} (x / \sqrt{2})]

    where :math:`\Phi(x)` is the Cumulative Distribution Function of the Gaussian distribution.
    We can approximate it with:

    .. math::
        \\text{GELU}(x) = 0.5 x (1 + \\text{tanh}[ \sqrt{2 / \pi} (x + 0.044715 x^3) ])

    or

    .. math::
        \\text{GELU}(x) = x \sigma(1.702 x)

    References
    ----------
    1. "`Gaussian Error Linear Units (GELUs). <https://arxiv.org/pdf/1606.08415.pdf>`_" \
       Dan Hendrycks and Kevin Gimpel. arXiv 2016.
    """
    out = 0.5 * input * (1.0 + tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * (input ** 3.0))))
    return out
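# Illustrative sketch (not part of the library): compare the tanh approximation used in
# ``gelu`` above with the exact erf-based definition, using plain math/NumPy. The
# ``_gelu_*`` names are hypothetical, used only for this comparison.
_gelu_x = np.linspace(-3, 3, 13)
_gelu_exact = _gelu_x * 0.5 * (1.0 + np.array([math.erf(v / math.sqrt(2.0)) for v in _gelu_x]))
_gelu_approx = 0.5 * _gelu_x * (1.0 + np.tanh(math.sqrt(2.0 / math.pi) * (_gelu_x + 0.044715 * _gelu_x ** 3)))
assert np.allclose(_gelu_exact, _gelu_approx, atol=1e-3)  # the two forms agree closely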
# ---------------------- loss functions ----------------------
def nll_loss(
    input: Tensor,
    target: Tensor,
    reduction: str = 'mean'
) -> Tensor:
    """
    Negative Log Likelihood Loss

    NOTE: Here I apply ``log()`` on the prediction data, which is DIFFERENT FROM
    ``nn.functional.nll_loss()`` IN PYTORCH!

    Parameters
    ----------
    input : Tensor
        A 2-dim (batch_size, n_classes) tensor

    target : Tensor
        A 1-dim (batch_size) tensor where each value: 0 <= target[i] <= n_classes - 1

    reduction : str, optional, default='mean'
        'none' / 'mean' / 'sum'
    """
    dim = input.ndim

    if dim != 2:
        raise ValueError("Expected 2 dimensions (got {})".format(dim))

    if input.shape[0] != target.shape[0]:
        raise ValueError(
            "Expected input batch_size ({}) to match target batch_size ({}).".format(input.shape[0], target.shape[0])
        )

    batch_size = input.shape[0]
    n_classes = input.shape[1]
    delta = 1e-7  # small constant to avoid log(0) when a predicted probability is 0

    ret = - np.log(input.data[np.arange(batch_size), target.data.astype(int)] + delta)

    if reduction in ['sum', 'mean']:
        ret = np.sum(ret)
    if reduction == 'mean':
        ret = ret / batch_size

    out = Tensor(
        data = ret,
        depends_on = [input],
        requires_grad = input.requires_grad
    )

    def grad_nll():
        if input.requires_grad:
            p = np.clip(input.data, 1e-15, 1 - 1e-15)
            y = to_categorical(target.data, n_classes=n_classes)

            if reduction == 'mean':
                input.grad += (p - y) / batch_size  # (batch_size, n_classes)
            elif reduction == 'sum':
                input.grad += (p - y)  # (batch_size, n_classes)

    if out.requires_grad and reduction != 'none':
        out.grad_fn = grad_nll

    return out
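# Illustrative sketch (not part of the library): a tiny worked example of the loss value
# and of the (p - y) / batch_size gradient computed in ``grad_nll`` above, written
# directly in NumPy. The ``_nll_*`` names are hypothetical.
_nll_p = np.array([[0.7, 0.2, 0.1],
                   [0.1, 0.8, 0.1]])        # predicted class probabilities
_nll_t = np.array([0, 1])                   # target class indices
_nll_loss_val = -np.log(_nll_p[np.arange(2), _nll_t]).mean()  # mean of -log p[target]
_nll_y = np.eye(3)[_nll_t]                  # one-hot targets, analogous to to_categorical
_nll_grad = (_nll_p - _nll_y) / 2           # gradient w.r.t. the input probabilities ('mean' reduction)
assert np.isclose(_nll_loss_val, -(np.log(0.7) + np.log(0.8)) / 2)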
def cross_entropy(
    input: Tensor,
    target: Tensor,
    reduction: str = 'mean'
) -> Tensor:
    """
    Cross Entropy Loss

    NOTE: Combine ``softmax()`` and ``nll_loss()``, which is DIFFERENT FROM
    ``nn.functional.cross_entropy()`` IN PYTORCH!

    Parameters
    ----------
    input : Tensor
        A 2-dim (batch_size, n_classes) tensor

    target : Tensor
        A 1-dim (batch_size) tensor where each value: 0 <= target[i] <= n_classes - 1

    reduction : str, optional, default='mean'
        'none' / 'mean' / 'sum'
    """
    after_softmax = input.softmax(dim=-1)
    out = nll_loss(after_softmax, target, reduction)
    return out
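# Illustrative sketch (not part of the library): in NumPy, applying softmax and then the
# negative log likelihood above is the same as the usual log-softmax cross entropy. The
# ``_ce_*`` names are hypothetical.
_ce_logits = np.array([[2.0, 0.5, -1.0]])
_ce_target = np.array([0])
_ce_soft = np.exp(_ce_logits) / np.exp(_ce_logits).sum(axis=1, keepdims=True)
_ce_via_nll = -np.log(_ce_soft[np.arange(1), _ce_target]).mean()          # softmax + NLL
_ce_direct = -(_ce_logits[0, 0] - np.log(np.exp(_ce_logits).sum()))       # -log-softmax of the target logit
assert np.isclose(_ce_via_nll, _ce_direct)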
def mse_loss(
    input: Tensor,
    target: Tensor,
    reduction: str = 'mean'
) -> Tensor:
    """
    Mean Squared Error Loss: :math:`(x - y)^2`

    Parameters
    ----------
    input : Tensor
        Tensor of shape (batch_size, *)

    target : Tensor
        Tensor of the same shape as input

    reduction : str, optional, default='mean'
        'none' / 'mean' / 'sum'
    """
    if target.shape != input.shape:
        raise ValueError(
            "The target size ({}) is different to the input size ({}). "
            "Please ensure they have the same size.".format(target.shape, input.shape)
        )

    n = input.numel
    out = (input - target) ** 2

    if reduction in ['sum', 'mean']:
        out = out.sum()
    if reduction == 'mean':
        out = out / n

    return out
def binary_cross_entropy(
    input: Tensor,
    target: Tensor,
    reduction: str = 'mean'
) -> Tensor:
    """
    Binary Cross Entropy Loss

    .. math::
        \\text{loss} = - (y \log(x) + (1 - y) \log(1 - x))

    Parameters
    ----------
    input : Tensor
        Tensor of shape (batch_size, *)

    target : Tensor
        Tensor of the same shape as input

    reduction : str, optional, default='mean'
        'none' / 'mean' / 'sum'
    """
    if target.shape != input.shape:
        raise ValueError(
            "The target size ({}) is different to the input size ({}). "
            "Please ensure they have the same size.".format(target.shape, input.shape)
        )

    n = input.numel
    out = - (target * input.log() + (-target + 1.) * (-input + 1.).log())

    if reduction in ['sum', 'mean']:
        out = out.sum()
    if reduction == 'mean':
        out = out / n

    return out
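# Illustrative sketch (not part of the library): the binary cross entropy formula above,
# evaluated directly in NumPy for a small batch with the 'mean' reduction. The ``_bce_*``
# names are hypothetical.
_bce_x = np.array([0.9, 0.2, 0.6])          # predicted probabilities
_bce_y = np.array([1.0, 0.0, 1.0])          # binary targets
_bce = -(_bce_y * np.log(_bce_x) + (1 - _bce_y) * np.log(1 - _bce_x))
_bce_mean = _bce.mean()                     # 'mean' reduction divides by the number of elements
assert np.isclose(_bce_mean, -(np.log(0.9) + np.log(0.8) + np.log(0.6)) / 3)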
# ---------------------- pad ----------------------
def pad(input: Tensor, pad: _tuple_any_t[int], value: int = 0) -> Tensor:
    """
    Pad tensor.

    Parameters
    ----------
    input : Tensor
        N-dimensional tensor

    pad : _tuple_any_t[int]
        Padding sizes, an m-element tuple, where ``m/2 <= input dimensions`` and ``m`` is even.
        The (before, after) pairs are applied to the last ``m/2`` dimensions of the input, in
        order from the ``m/2``-th-to-last dimension through the last dimension. That is, only
        the last ``m/2`` dimensions of the input will be padded.

    value : int, optional, default=0
        Fill value for 'constant' padding
    """
    n_pad_dims = int(len(pad) / 2)
    ndims = input.ndim

    no_pad_width = [(0, 0) for i in range(0, ndims - n_pad_dims)]
    pad_width = no_pad_width + [(pad[i * 2], pad[i * 2 + 1]) for i in range(0, n_pad_dims)]

    ret = np.pad(
        input.data,
        pad_width = pad_width,
        mode = 'constant',
        constant_values = value,
    )

    out = Tensor(
        data = ret,
        depends_on = [input],
        requires_grad = input.requires_grad
    )

    def unpad(x):
        slices = [slice(p[0], None if p[1] == 0 else -p[1]) for p in pad_width]
        return x[tuple(slices)]

    def grad_pad():
        if input.requires_grad:
            input.grad += unpad(out.grad)

    if out.requires_grad:
        out.grad_fn = grad_pad

    return out
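# Illustrative sketch (not part of the library): how an 8-element ``pad`` tuple maps to
# ``np.pad``'s pad_width for a 4-D (N, C, H, W) array, mirroring the logic in ``pad`` above.
# The ``_pad_*`` names are hypothetical.
_pad_sizes = (0, 0, 0, 0, 1, 1, 2, 2)       # (N: 0/0, C: 0/0, H: 1/1, W: 2/2)
_pad_n = len(_pad_sizes) // 2
_pad_width = [(0, 0)] * (4 - _pad_n) + [(_pad_sizes[2 * i], _pad_sizes[2 * i + 1]) for i in range(_pad_n)]
_pad_out = np.pad(np.zeros((2, 3, 5, 5)), pad_width=_pad_width, mode='constant', constant_values=0)
assert _pad_out.shape == (2, 3, 7, 9)       # H grows by 1 + 1, W grows by 2 + 2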
# ---------------------- linear ----------------------
def linear(input: Tensor, weight: Tensor, bias: Tensor = None):
    """
    Apply a linear transformation to the incoming data.

    .. math::
        y = x W + b

    where ``weight`` (:math:`W`) is expected in shape ``(in_features, out_features)``,
    matching the ``input @ weight`` product below.
    """
    out = input @ weight
    if bias is not None:
        out += bias
    return out
# ---------------------- unfold ----------------------
def unfold(
    input: Tensor,
    kernel_size: _size_2_t,
    stride: _size_2_t = 1,
    padding: _size_2_t = 0,
    dilation: _size_2_t = 1
):
    """
    Extract sliding local blocks from a batched input tensor.

    - input shape: :math:`(N, C, H, W)`
    - output shape: :math:`(N, C \\times \prod(\\text{kernel\_size}), L)`

    where:

    .. math::
        L = \prod_d \left( \\frac{\\text{spatial\_size[d] + 2 * padding[d] - dilation[d] * (kernel\_size[d] - 1) - 1}}{\\text{stride}[d]} + 1 \\right)

    where :math:`\\text{spatial\_size}` is formed by the spatial dimensions of ``input``
    (H and W above), and :math:`d` ranges over all spatial dimensions.

    Parameters
    ----------
    input : Tensor
        Input tensor

    kernel_size : int or tuple
        Size of the sliding blocks.

    stride : int or tuple, optional, default=1
        Stride of the sliding blocks in the input spatial dimensions.

    padding : int or tuple, optional, default=0
        Implicit zero padding to be added on both sides of input.

    dilation : int or tuple, optional, default=1
        A parameter that controls the stride of elements within the neighborhood.
    """
    # Union[int, Tuple[int, int]] -> Tuple[int, int]
    kernel_size = _pair(kernel_size)
    stride = _pair(stride)
    padding = _pair(padding)
    dilation = _pair(dilation)

    batch_size, in_channels, h_in, w_in = input.shape
    kernel_h, kernel_w = kernel_size

    # compute the spatial dimensions of the output
    h_out = int((h_in + 2 * padding[0] - dilation[0] * (kernel_h - 1) - 1) / stride[0] + 1)
    w_out = int((w_in + 2 * padding[1] - dilation[1] * (kernel_w - 1) - 1) / stride[1] + 1)

    # pad the input tensor
    padded_data = pad(input, (0, 0, 0, 0, padding[0], padding[0], padding[1], padding[1]))

    # unfold the padded input into column form (im2col)
    unfolded = im2col(padded_data, kernel_size, (h_out, w_out), stride, dilation)
    # (batch_size, kernel_h * kernel_w * in_channels, L = h_out * w_out)

    return unfolded, h_out, w_out
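# Illustrative sketch (not part of the library): the output-size arithmetic used above,
# evaluated for a concrete case. For a 32x32 input with a 3x3 kernel, stride 2, padding 1
# and dilation 1, each spatial dimension gives floor((32 + 2*1 - 1*(3-1) - 1)/2 + 1) = 16,
# so L = 16 * 16 = 256 sliding blocks. The ``_uf_*`` names are hypothetical.
_uf_h_out = int((32 + 2 * 1 - 1 * (3 - 1) - 1) / 2 + 1)
_uf_w_out = int((32 + 2 * 1 - 1 * (3 - 1) - 1) / 2 + 1)
assert (_uf_h_out, _uf_w_out) == (16, 16)
# for a (N, C, 32, 32) input, the unfolded tensor would then have shape (N, C * 3 * 3, 256)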
# ---------------------- conv ----------------------
def conv2d(
    input: Tensor,
    weight: Tensor,
    bias: Tensor = None,
    stride: _tuple_2_t[int] = (1, 1),
    padding: _tuple_2_t[int] = (0, 0),
    dilation: _tuple_2_t[int] = (1, 1)
):
    """
    Apply a 2D convolution over an input signal composed of several input planes.

    - input shape: ``(batch_size, in_channels, h_in, w_in)``
    - output shape: ``(batch_size, out_channels, h_out, w_out)``

    where:

    .. math::
        \\text{h\_out} = \\frac{\\text{h\_in + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1}}{\\text{stride}[0]} + 1

    .. math::
        \\text{w\_out} = \\frac{\\text{w\_in + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1}}{\\text{stride}[1]} + 1

    NOTE: Use the ``unfold`` function to perform the convolution as a single matrix
    multiplication. For more details, see [1].

    Parameters
    ----------
    input : Tensor
        Input tensor

    weight : Tensor
        Weight of the conv2d layer

    bias : Tensor, optional
        Bias of the conv2d layer

    stride : Tuple[int, int], optional, default=(1, 1)
        Stride of the convolution

    padding : Tuple[int, int], optional, default=(0, 0)
        Zero-padding added to both sides of the input

    dilation : Tuple[int, int], optional, default=(1, 1)
        Spacing between kernel elements

    References
    ----------
    1. `Why GEMM is at the heart of deep learning? Pete Warden. \
       <https://petewarden.com/2015/04/20/why-gemm-is-at-the-heart-of-deep-learning/>`_ 2015.
    """
    batch_size, in_channels, h_in, w_in = input.shape
    out_channels, in_channels, kernel_h, kernel_w = weight.shape

    input_col, h_out, w_out = unfold(input, (kernel_h, kernel_w), stride, padding, dilation)
    input_col = input_col.permute(1, 2, 0).view(kernel_h * kernel_w * in_channels, -1)
    # (kernel_h * kernel_w * in_channels, batch_size * h_out * w_out)

    weight_col = weight.view(out_channels, -1)

    out = (weight_col @ input_col).view(out_channels, h_out, w_out, batch_size).permute(3, 0, 1, 2)

    if bias is not None:
        out += bias

    return out
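# Illustrative sketch (not part of the library): the GEMM shapes behind the im2col-based
# convolution above, traced with plain NumPy arrays. The ``_cv_*`` names are hypothetical.
_cv_N, _cv_Cin, _cv_Cout, _cv_kh, _cv_kw, _cv_hout, _cv_wout = 2, 3, 8, 3, 3, 30, 30
_cv_input_col = np.zeros((_cv_kh * _cv_kw * _cv_Cin, _cv_N * _cv_hout * _cv_wout))  # unfolded patches
_cv_weight_col = np.zeros((_cv_Cout, _cv_kh * _cv_kw * _cv_Cin))                    # flattened kernels
_cv_out = (_cv_weight_col @ _cv_input_col).reshape(_cv_Cout, _cv_hout, _cv_wout, _cv_N).transpose(3, 0, 1, 2)
assert _cv_out.shape == (_cv_N, _cv_Cout, _cv_hout, _cv_wout)                        # (batch, channels, H, W)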
def conv1d(
    input: Tensor,
    weight: Tensor,
    bias: Tensor = None,
    stride: _tuple_1_t[int] = (1, ),
    padding: _tuple_1_t[int] = (0, ),
    dilation: _tuple_1_t[int] = (1, )
):
    """
    Apply a 1D convolution over an input signal composed of several input planes.

    - input shape: ``(batch_size, in_channels, L_in)``
    - output shape: ``(batch_size, out_channels, L_out)``

    where:

    .. math::
        \\text{L\_out} = \\frac{\\text{L\_in + 2 * padding - dilation * (kernel\_size - 1) - 1}}{\\text{stride}} + 1

    Parameters
    ----------
    input : Tensor
        Input tensor

    weight : Tensor
        Weight of the conv1d layer

    bias : Tensor, optional
        Bias of the conv1d layer

    stride : Tuple[int], optional, default=(1, )
        Stride of the convolution

    padding : Tuple[int], optional, default=(0, )
        Zero-padding added to both sides of the input

    dilation : Tuple[int], optional, default=(1, )
        Spacing between kernel elements
    """
    # add a spatial dimension to the tensors so we can reuse conv2d
    input_2d = input.unsqueeze(dim=2)
    weight_2d = weight.unsqueeze(dim=2)
    bias_2d = bias.unsqueeze(dim=2) if bias is not None else None

    stride_2d = (1, stride[0])
    pad_2d = (0, padding[0])
    dilation_2d = (1, dilation[0])

    out_2d = conv2d(input_2d, weight_2d, bias_2d, stride_2d, pad_2d, dilation_2d)
    # (batch_size, out_channels, 1, L_out)

    # drop the added dimension
    out = out_2d.squeeze(dim=2)

    return out
# ---------------------- max pooling ----------------------
def max_pool2d(
    input: Tensor,
    kernel_size: _tuple_2_t[int],
    stride: _tuple_2_t[int],
    padding: _tuple_2_t[int] = (0, 0),
    dilation: _tuple_2_t[int] = (1, 1),
    return_indices: bool = False
):
    """
    Apply a 2D max pooling over an input signal composed of several input planes.

    - input shape: ``(batch_size, in_channels, h_in, w_in)``
    - output shape: ``(batch_size, out_channels, h_out, w_out)``

    where:

    .. math::
        \\text{h\_out} = \\frac{\\text{h\_in + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1}}{\\text{stride}[0]} + 1

    .. math::
        \\text{w\_out} = \\frac{\\text{w\_in + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1}}{\\text{stride}[1]} + 1

    NOTE: Use the ``unfold`` function to perform the max pooling as a single column-wise
    max over the unfolded blocks. For more details, see [1].

    NOTE: PyTorch's `documentation <https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html>`_
    states that the input is implicitly zero-padded when ``padding`` is non-zero. In fact, it uses
    implicit **negative infinity** padding rather than zero-padding, see
    `this issue <https://github.com/pytorch/pytorch/issues/33384>`_. Here, zero-padding is used.

    Parameters
    ----------
    kernel_size : Tuple[int, int]
        Size of the sliding window, must be > 0.

    stride : Tuple[int, int]
        Stride/hop of the window, must be > 0.

    padding : Tuple[int, int], optional, default=(0, 0)
        Zero-padding added to both sides of the input, must be >= 0 and <= ``kernel_size / 2``.

    dilation : Tuple[int, int], optional, default=(1, 1)
        Spacing between the elements in the window, must be > 0.

    return_indices : bool, optional, default=False
        If ``True``, will return the max indices along with the outputs.

    References
    ----------
    1. `Why GEMM is at the heart of deep learning? Pete Warden. \
       <https://petewarden.com/2015/04/20/why-gemm-is-at-the-heart-of-deep-learning/>`_ 2015.
    """
    batch_size, in_channels, h_in, w_in = input.shape
    kernel_h, kernel_w = kernel_size

    input_col, h_out, w_out = unfold(input, kernel_size, stride, padding, dilation)
    input_col = input_col.permute(1, 2, 0).view(in_channels, kernel_h * kernel_w, -1)

    # take the maximum over each window
    # NOTE: return_indices is currently not implemented; only the pooled values are returned
    out_max = input_col.max(dim=1).view(in_channels, h_out, w_out, batch_size).permute(3, 0, 1, 2)

    return out_max
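# Illustrative sketch (not part of the library): a 2x2 / stride-2 max pool on a single
# 4x4 channel, written directly with NumPy reshapes to show the expected result of the
# column-wise max above. The ``_mp_*`` names are hypothetical.
_mp_x = np.arange(16.).reshape(4, 4)
_mp_cols = _mp_x.reshape(2, 2, 2, 2).transpose(0, 2, 1, 3).reshape(4, 4)  # one row per 2x2 block
_mp_out = _mp_cols.max(axis=1).reshape(2, 2)                              # max within each block
assert np.array_equal(_mp_out, np.array([[5., 7.], [13., 15.]]))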
def max_pool1d(
    input: Tensor,
    kernel_size: _tuple_1_t[int],
    stride: _tuple_1_t[int] = (1, ),
    padding: _tuple_1_t[int] = (0, ),
    dilation: _tuple_1_t[int] = (1, ),
    return_indices: bool = False
):
    """
    Apply a 1D max pooling over an input signal composed of several input planes.

    - input shape: ``(batch_size, in_channels, L_in)``
    - output shape: ``(batch_size, out_channels, L_out)``

    where:

    .. math::
        \\text{L\_out} = \\frac{\\text{L\_in + 2 * padding - dilation * (kernel\_size - 1) - 1}}{\\text{stride}} + 1

    NOTE: PyTorch's `documentation <https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html>`_
    states that the input is implicitly zero-padded when ``padding`` is non-zero. In fact, it uses
    implicit **negative infinity** padding rather than zero-padding, see
    `this issue <https://github.com/pytorch/pytorch/issues/33384>`_. Here, zero-padding is used.

    Parameters
    ----------
    kernel_size : Tuple[int]
        Size of the sliding window, must be > 0.

    stride : Tuple[int], optional, default=(1, )
        Stride of the window, must be > 0.

    padding : Tuple[int], optional, default=(0, )
        Zero-padding added to both sides of the input, must be >= 0 and <= ``kernel_size / 2``.

    dilation : Tuple[int], optional, default=(1, )
        Spacing between the elements in the window, must be > 0.

    return_indices : bool, optional, default=False
        If ``True``, will return the max indices along with the outputs.
    """
    # add a spatial dimension to the tensor so we can reuse max_pool2d
    input_2d = input.unsqueeze(dim=2)

    kernel_size_2d = (1, kernel_size[0])
    stride_2d = (1, stride[0])
    pad_2d = (0, padding[0])
    dilation_2d = (1, dilation[0])

    out_2d = max_pool2d(input_2d, kernel_size_2d, stride_2d, pad_2d, dilation_2d, return_indices)
    # (batch_size, out_channels, 1, L_out)

    # drop the added dimension
    out = out_2d.squeeze(dim=2)

    return out
# ---------------------- dropout ----------------------
def dropout(input: Tensor, p: float = 0.5, training: bool = True) -> Tensor:
    """
    During training, randomly zero some of the elements of the input tensor with
    probability ``p``, using samples from a Bernoulli distribution. The outputs are
    then scaled by a factor of :math:`\\frac{1}{1 - p}`. Each channel will be zeroed
    out independently on every forward call.

    During evaluation, this simply computes an identity function.

    This has proven to be an effective technique for regularization and preventing
    the co-adaptation of neurons as described in the paper [1].

    Parameters
    ----------
    p : float, optional, default=0.5
        Probability of an element to be zeroed

    training : bool
        Apply dropout if ``True``

    References
    ----------
    1. "`Improving Neural Networks by Preventing Co-adaptation of Feature Detectors. \
       <https://arxiv.org/abs/1207.0580>`_" Geoffrey E. Hinton, et al. arXiv 2012.
    """
    ret = input.data
    scaler = 1.0 / (1.0 - p)
    mask = np.random.binomial(1, 1 - p, size=input.shape)

    if training:
        ret = scaler * mask * ret

    out = Tensor(
        data = ret,
        depends_on = [input],
        requires_grad = input.requires_grad
    )

    def grad_dropout():
        if input.requires_grad:
            if training:
                input.grad += scaler * mask * out.grad
            else:
                # evaluation mode is an identity function, so the gradient passes through unchanged
                input.grad += out.grad

    if out.requires_grad:
        out.grad_fn = grad_dropout

    return out
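# Illustrative sketch (not part of the library): inverted dropout rescales the surviving
# activations by 1 / (1 - p), so the expected value of each unit is unchanged during
# training. The ``_do_*`` names are hypothetical.
_do_p = 0.5
_do_x = np.ones((10000,))
_do_mask = np.random.binomial(1, 1 - _do_p, size=_do_x.shape)
_do_train_out = (1.0 / (1.0 - _do_p)) * _do_mask * _do_x
assert abs(_do_train_out.mean() - _do_x.mean()) < 0.1   # mean stays close to 1 on average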
# ---------------------- flatten ----------------------
def flatten(input: Tensor) -> Tensor:
    """
    Flatten the input. Does not affect the batch size.

    NOTE: If inputs are shaped ``(batch,)`` without a feature axis, then flattening adds
    an extra channel dimension and the output shape is ``(batch, 1)``.
    """
    return input.view(input.size(0), -1)