"""Special expressions for neural networks."""
__all__ = ['sigmoid', 'rectify', 'hardtanh', 'softmax', 'logsoftmax', 'crossentropy', 'distance2', 'Layer', 'make_layer', 'Dropout']
from six.moves import range
from . import backend as numpy
from .expr import *
## Activation functions
class sigmoid(Unary):
    r"""Logistic sigmoid function, :math:`\frac{1}{1+\exp -x}`."""
    gain = 0.25
    @staticmethod
    def f(x):
        #with numpy.errstate(over='ignore'):
        try:
            return numpy.sigmoid(x)
        except AttributeError:
            return 1./(1.+numpy.exp(-x))
    @staticmethod
    def dfdx(x, y):
        try:
            return numpy.xonemx(y)
        except AttributeError:
            return y*(1.-y)
class rectify(Unary):
    r"""Rectified linear unit, :math:`\max(0, x)`."""
    gain = 0.5
    @staticmethod
    def f(x): return numpy.maximum(x, 0.)
    @staticmethod
    def dfdx(x, y): return numpy.where(x > 0., 1., 0.)
class hardtanh(Unary):
    """Hard tanh function, equivalent to clip(x, -1, 1)."""
    gain = 1.
    @staticmethod
    def f(x): return numpy.clip(x, -1., 1.)
    @staticmethod
    def dfdx(x, y): return numpy.where(numpy.logical_and(-1. < x, x < 1.), 1., 0.)
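
# A minimal sketch (the function name is illustrative, not part of the
# library) of the forward/backward protocol the activation classes above
# follow; it is the same pattern guess_gain() uses further down: seed a
# values dict for the inputs and a gradients dict for the outputs, then
# call forward() and backward() on the output expression.
def _activation_example():
    x = constant(0.)                 # input expression; its value is supplied below
    y = sigmoid(x)
    values = {x: 0.5}                # evaluate at x = 0.5
    gradients = {x: 0., y: 1.}       # seed dL/dy = 1 to recover dy/dx
    y.forward(values)                # values[y] == sigmoid(0.5)
    y.backward(values, gradients)    # gradients[x] == sigmoid'(0.5)
    return values[y], gradients[x]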
class softmax(Expression):
    r"""Softmax function, :math:`y_i = \exp x_i / \sum_{i'} \exp x_{i'}`.

    :param axis: axis along which to perform the softmax (default is last).
    """
    gain = 1.
    def __init__(self, arg, axis=-1):
        Expression.__init__(self, arg)
        self.axis = axis
    def forward(self, values):
        axis = self.axis
        v = values[self.args[0]]
        v = v - numpy.amax(v, axis=axis, keepdims=True)
        v = numpy.exp(v)
        values[self] = v / numpy.sum(v, axis=axis, keepdims=True)
    def backward(self, values, gradients):
        axis = self.axis
        arg = self.args[0]
        if arg in gradients:
            gradients[arg] += (gradients[self] - numpy.vecdot(values[self], gradients[self], axis=axis, keepdims=True)) * values[self]
class logsoftmax(Expression):
    r"""Log-softmax function, :math:`y_i = \log \left(\exp x_i / \sum_{i'} \exp x_{i'}\right)`.

    Use this instead of log(softmax(x)) for better numerical stability.

    :param axis: axis along which to perform the softmax (default is last).
    """
    gain = 1.
    def __init__(self, arg, axis=-1):
        Expression.__init__(self, arg)
        self.axis = axis
    def forward(self, values):
        axis = self.axis
        v = values[self.args[0]]
        v = v - numpy.amax(v, axis=axis, keepdims=True)
        values[self] = v - numpy.log(numpy.sum(numpy.exp(v), axis=axis, keepdims=True))
    def backward(self, values, gradients):
        axis = self.axis
        arg = self.args[0]
        if arg in gradients:
            gradients[arg] += gradients[self] - numpy.sum(gradients[self], axis=axis, keepdims=True) * numpy.exp(values[self])
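
# A minimal sketch (assuming a plain NumPy backend; the function name is
# illustrative) of why logsoftmax is preferred over log(softmax(x)): with a
# large spread of scores, softmax underflows to exactly 0 for the small
# entries, so taking its log gives -inf, whereas logsoftmax computes the
# same quantity without forming the underflowing ratio.
def _logsoftmax_example():
    x = constant(0.)
    y = logsoftmax(x)
    values = {x: numpy.array([0., -1000.])}   # exp(-1000.) underflows to 0.
    y.forward(values)
    return values[y]                          # approximately [0., -1000.], no -inf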
### Dropout
class _dropout_gate(Expression):
    def __init__(self, factory, arg):
        Expression.__init__(self, arg)
        self.factory = factory
    def forward(self, values):
        if self.factory.enabled:
            p = self.factory.p
            arg = self.args[0]
            # Inverted dropout: zero out units with probability p and scale
            # the survivors by 1/(1-p), so the expected value is unchanged.
            values[self] = (numpy.random.uniform(0., 1., values[arg].shape) > p) / (1-p)
        else:
            values[self] = 1.
class Dropout(object):
    """Factory for dropouts.

    Example usage::

        d = Dropout(0.5)
        y = d(x)

    The reason for the extra level of indirection is so that all the
    dropouts can be enabled or disabled together.

    :param p: probability of dropout
    :type p: float
    :return: dropout function
    :rtype: Expression -> Expression
    """
    def __init__(self, p=0.5):
        self.p = p
        self.enabled = True
    def __call__(self, arg):
        return arg * _dropout_gate(self, arg)
    def enable(self):
        """Enable all dropouts that originated from this factory."""
        self.enabled = True
    def disable(self):
        """Disable all dropouts that originated from this factory."""
        self.enabled = False
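
# A short usage sketch (names illustrative, assuming a NumPy-like backend):
# build the graph once with the dropout applied, then toggle the factory
# between training and evaluation.
def _dropout_example():
    d = Dropout(0.3)
    x = constant(numpy.zeros((10,)))
    y = d(rectify(x))    # dropout applied to an activation's output
    d.enable()           # stochastic masking while training
    d.disable()          # deterministic forward passes at evaluation time
    return y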
### Loss functions
def crossentropy(logp, correct):
    """Cross-entropy, a.k.a. log-loss.

    :param logp: vector of log-probabilities
    :param correct: observed probabilities
    """
    return -vecdot(logp, correct)
def distance2(x, y):
    """Squared Euclidean distance, a.k.a. squared-error loss."""
    d = x - y
    return vecdot(d, d)
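
# A common pairing (a sketch; the function name and arguments are
# illustrative): feed the log-probabilities produced by logsoftmax into
# crossentropy, with the observed distribution given as a one-hot (or soft)
# target expression over the same axis.
def _loss_example(scores, target):
    logp = logsoftmax(scores)         # scores: an Expression of unnormalized scores
    return crossentropy(logp, target)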
### Fully-connected layer
def guess_gain(f, d):
    """Try to figure out how the activation function affects the
    variance of inputs/gradients."""
    if f is None: return 1.
    if hasattr(f, "gain"): return f.gain
    # As is standard, use the gradient of f at zero. However, since f
    # might not be differentiable at zero (e.g., ReLU), compute the
    # gradient a little to the left and right of zero and average.
    delta = 0.1
    g = []
    for xv in [-delta, 0., delta]:
        x = constant(0.)
        y = f(x)
        values = {x: xv}
        gradients = {x: 0., y: 1.}
        y.forward(values)
        y.backward(values, gradients)
        g.append(gradients[x])
    if abs(g[2]-g[0])/2. > delta:
        return (g[0] + g[2]) / 2.
    else:
        return g[1]
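
# A small illustration (the function name is not part of the library): the
# activation classes above carry a precomputed .gain attribute, so
# guess_gain() returns it directly; a callable without that attribute would
# instead go through the finite-difference estimate of its derivative at zero.
def _guess_gain_example():
    g1 = guess_gain(rectify, 10)    # uses rectify.gain == 0.5
    g2 = guess_gain(sigmoid, 10)    # uses sigmoid.gain == 0.25
    return g1, g2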
class Layer(object):
    """Fully-connected layer.

    :param insize: Input size or sequence of input sizes.

        - If an input size is n > 0, then that input will expect an
          n-dimensional vector.
        - If an input size is n < 0, then that input will expect an
          integer in [0, -n), which you can either think of as a
          one-hot vector or as an index into a lookup table.
        - If an input size is "diag", then that input will have a
          diagonal weight matrix, so it must have the same size as
          the output.
    :param outsize: Output size.
    :param f: Activation function (default tanh).
    :param bias: Initial bias, or None for no bias."""
    def __init__(self, insize, outsize, f=tanh, gain=None, bias=0., model=None):
        if type(insize) is int: insize = [insize]
        if model is None: model = parameter.all
        if gain is None: gain = guess_gain(f, outsize)
        def random(variance, shape):
            #return numpy.random.normal(0., variance**0.5, shape)
            return numpy.random.uniform(-(variance*3)**0.5, (variance*3)**0.5, shape)
        # Although it is more conventional to left-multiply by the weight
        # matrix, we right-multiply so it works correctly with stacks of
        # vectors.
        total_insize = 0
        for d in insize:
            if d == "diag":
                # Var[input * weight] = Var[input] * Var[weight]
                total_insize += 1
            elif d >= 0:
                # Var[dot(input, weight)] = d * Var[input] * Var[weight]
                total_insize += d
            else:
                # Var[weight[input]] = Var[weight]
                total_insize += 1
        if bias is not None: total_insize += 1
        variance = 2. / (total_insize + outsize) / gain**2
        self.weight = []
        for a, d in enumerate(insize):
            if d == "diag":
                w = parameter(random(variance, (outsize,)), model=model)
                self.weight.append(w)
            elif d >= 0:
                w = parameter(random(variance, (d, outsize)), model=model)
                self.weight.append(w)
            else:
                # Use lots of small parameter vectors, because usually
                # only one is updated at a time.
                w = [parameter(random(variance, (outsize,)), model=model) for i in range(-d)]
                self.weight.append(w)
        if bias is not None:
            self.bias = parameter(random(variance, (outsize,)), model=model)
        else:
            self.bias = constant(0.)
        self.activation = f
    def __call__(self, *args):
        if len(args) != len(self.weight):
            raise TypeError("wrong number of inputs")
        s = self.bias
        for w, x in zip(self.weight, args):
            if isinstance(w, list): # lookup table
                if isinstance(x, int):
                    s += w[x]
                else:
                    # x should be a vector or list of ints
                    s += stack([w[i] for i in x])
            elif w.value.ndim == 1: # diagonal
                s += x * w
            else:
                s += dot(x, w)
        if self.activation:
            return self.activation(s)
        else:
            return s

make_layer = Layer # for backward compatibility
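
# A hedged usage sketch (sizes, names, and the wiring are illustrative, not
# part of the library): an embedding lookup (negative input size) feeding a
# rectified hidden layer, then an output layer with no activation whose
# scores are normalized with logsoftmax.
def _layer_example():
    embed = Layer(-1000, 50)             # token id in [0, 1000) -> 50-dim vector
    hidden = Layer(50, 20, f=rectify)    # dense 50 -> 20 with ReLU
    output = Layer(20, 5, f=None)        # 20 -> 5 unnormalized scores
    def network(token):
        h = hidden(embed(token))
        return logsoftmax(output(h))
    return network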