Source code for penne.nn

"""Special expressions for neural networks."""

__all__ = ['sigmoid', 'rectify', 'hardtanh', 'softmax', 'logsoftmax', 'crossentropy', 'distance2', 'Layer', 'make_layer', 'Dropout']

from six.moves import range
from . import backend as numpy
from .expr import *

## Activation functions

class sigmoid(Unary):
    r"""Logistic sigmoid function, :math:`\frac{1}{1+\exp -x}`."""
    gain = 0.25

    @staticmethod
    def f(x):
        #with numpy.errstate(over='ignore'):
        try:
            return numpy.sigmoid(x)
        except AttributeError:
            return 1./(1.+numpy.exp(-x))

    @staticmethod
    def dfdx(x, y):
        try:
            return numpy.xonemx(y)
        except AttributeError:
            return y*(1.-y)

class rectify(Unary):
    r"""Rectified linear unit, :math:`\max(0, x)`."""
    gain = 0.5

    @staticmethod
    def f(x):
        return numpy.maximum(x, 0.)

    @staticmethod
    def dfdx(x, y):
        return numpy.where(x > 0., 1., 0.)

class hardtanh(Unary):
    """Hard tanh function, equivalent to clip(x, -1, 1)."""
    gain = 1.

    @staticmethod
    def f(x):
        return numpy.clip(x, -1., 1.)

    @staticmethod
    def dfdx(x, y):
        return numpy.where(numpy.logical_and(-1. < x, x < 1.), 1., 0.)
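
# Illustrative sketch (the helper name _demo_activation is hypothetical, not
# part of the original module): each activation above is a Unary expression
# with a forward value f(x) and a derivative dfdx(x, y). A single node can be
# evaluated with the same values/gradients-dict pattern that guess_gain uses
# further down.
def _demo_activation():
    xv = 3.0
    x = constant(xv)
    y = rectify(x)
    values = {x: xv}
    gradients = {x: 0., y: 1.}
    y.forward(values)              # values[y] == 3.0, since max(3.0, 0) = 3.0
    y.backward(values, gradients)  # gradients[x] == 1.0: the unit is active (x > 0)
    return values[y], gradients[x]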

class softmax(Expression):
    r"""Softmax function, :math:`y_i = \exp x_i / \sum_{i'} \exp x_{i'}`.

    :param axis: axis along which to perform the softmax (default is last).
    """
    gain = 1.

    def __init__(self, arg, axis=-1):
        Expression.__init__(self, arg)
        self.axis = axis

    def forward(self, values):
        axis = self.axis
        v = values[self.args[0]]
        v = v - numpy.amax(v, axis=axis, keepdims=True)
        v = numpy.exp(v)
        values[self] = v / numpy.sum(v, axis=axis, keepdims=True)

    def backward(self, values, gradients):
        axis = self.axis
        arg = self.args[0]
        if arg in gradients:
            gradients[arg] += (gradients[self]
                               - numpy.vecdot(values[self], gradients[self],
                                              axis=axis, keepdims=True)) * values[self]
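
# Illustrative sketch (the helper name _demo_softmax is hypothetical, not part
# of the original module): forward-evaluating a softmax over a stack of score
# vectors. The normalization runs along the last axis by default, so every row
# of the result sums to 1.
def _demo_softmax():
    xv = numpy.random.uniform(-1., 1., (4, 10))  # a minibatch of 4 score vectors
    x = constant(xv)
    y = softmax(x)
    values = {x: xv}
    y.forward(values)
    return numpy.sum(values[y], axis=-1)         # approximately all ones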

class logsoftmax(Expression):
    r"""Log-softmax function, :math:`y_i = \log \left(\exp x_i / \sum_{i'} \exp x_{i'}\right)`.

    Use this instead of log(softmax(x)) for better numerical stability.

    :param axis: axis along which to perform the softmax (default is last).
    """
    gain = 1.

    def __init__(self, arg, axis=-1):
        Expression.__init__(self, arg)
        self.axis = axis

    def forward(self, values):
        axis = self.axis
        v = values[self.args[0]]
        v = v - numpy.amax(v, axis=axis, keepdims=True)
        values[self] = v - numpy.log(numpy.sum(numpy.exp(v), axis=axis, keepdims=True))

    def backward(self, values, gradients):
        axis = self.axis
        arg = self.args[0]
        if arg in gradients:
            gradients[arg] += gradients[self] - numpy.sum(gradients[self], axis=axis, keepdims=True) * numpy.exp(values[self])
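
# Illustrative sketch (the helper name _demo_logsoftmax is hypothetical, not
# part of the original module): for moderate inputs, logsoftmax(x) agrees with
# log(softmax(x)). The stability claim above matters when some softmax entries
# underflow to 0, where the log would give -inf while logsoftmax stays finite.
def _demo_logsoftmax():
    xv = numpy.random.uniform(-5., 5., (10,))
    x = constant(xv)
    values = {x: xv}
    ls = logsoftmax(x)
    ls.forward(values)
    sm = softmax(x)
    sm.forward(values)
    return values[ls] - numpy.log(values[sm])    # approximately all zeros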

### Dropout

class _dropout_gate(Expression):
    def __init__(self, factory, arg):
        Expression.__init__(self, arg)
        self.factory = factory

    def forward(self, values):
        if self.factory.enabled:
            p = self.factory.p
            arg = self.args[0]
            values[self] = (numpy.random.uniform(0., 1., values[arg].shape) > p) / (1-p)
        else:
            values[self] = 1.

class Dropout(object):
    """Factory for dropouts. Example usage::

        d = Dropout(0.5)
        y = d(x)

    The reason for the extra level of indirection is so that all the
    dropouts can be enabled or disabled together.

    :param p: probability of dropout
    :type p: float
    :return: dropout function
    :rtype: Expression -> Expression
    """

    def __init__(self, p=0.5):
        self.p = p
        self.enabled = True

    def __call__(self, arg):
        return arg * _dropout_gate(self, arg)

    def enable(self):
        """Enable all dropouts that originated from this factory."""
        self.enabled = True

    def disable(self):
        """Disable all dropouts that originated from this factory."""
        self.enabled = False
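
# Illustrative sketch (the helper name _demo_dropout is hypothetical, not part
# of the original module): d(x) multiplies x by a _dropout_gate whose entries
# are 0 or 1/(1-p) while dropout is enabled ("inverted" dropout, so the
# expected output matches x), and a constant 1 after d.disable(), e.g. at test
# time. Here a gate is built directly just to inspect its forward values.
def _demo_dropout():
    d = Dropout(0.5)
    xv = numpy.random.uniform(-1., 1., (8,))
    x = constant(xv)
    y = d(x)                      # the expression you would actually train with
    gate = _dropout_gate(d, x)    # peek at the private gate for illustration
    values = {x: xv}
    gate.forward(values)          # entries are 0. or 2. while enabled
    d.disable()
    gate.forward(values)          # now values[gate] == 1., so y behaves like x
    d.enable()
    return y, values[gate]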
### Loss functions

def crossentropy(logp, correct):
    """Cross-entropy, a.k.a. log-loss.

    :param logp: vector of log-probabilities
    :param correct: observed probabilities
    """
    return -vecdot(logp, correct)
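
# Illustrative sketch (the names _demo_classification_loss, scores and onehot
# are hypothetical, not part of the original module): crossentropy expects
# log-probabilities, so the usual pairing is with logsoftmax, which takes the
# log in a numerically stable way.
def _demo_classification_loss(scores, onehot):
    # scores: an Expression of unnormalized scores; onehot: the observed
    # target distribution, e.g. a one-hot vector.
    return crossentropy(logsoftmax(scores), onehot)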

def distance2(x, y):
    r"""Squared Euclidean distance, :math:`\sum_i (x_i - y_i)^2`, a.k.a. sum-of-squares loss."""
    d = x - y
    return vecdot(d, d)
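
# Illustrative note (the name _demo_regression_loss is hypothetical, not part
# of the original module): distance2 builds the expression sum_i (x_i - y_i)^2,
# so for the vectors (1, 2) and (0, 4) its value is 1 + 4 = 5. A typical use is
# as a regression loss between a prediction and its target.
def _demo_regression_loss(prediction, target):
    # prediction and target are vector-valued Expressions of the same shape
    return distance2(prediction, target)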

### Fully-connected layer

def guess_gain(f, d):
    """Try to figure out how the activation function affects the
    variance of inputs/gradients."""
    if f is None:
        return 1.
    if hasattr(f, "gain"):
        return f.gain

    # As is standard, use the gradient of f at zero. However, since f
    # might not be differentiable at zero (e.g., ReLU), compute
    # gradient a little bit to the left and right and average.
    delta = 0.1
    g = []
    for xv in [-delta, 0., delta]:
        x = constant(0.)
        y = f(x)
        values = {x: xv}
        gradients = {x: 0., y: 1.}
        y.forward(values)
        y.backward(values, gradients)
        g.append(gradients[x])
    if abs(g[2]-g[0])/2. > delta:
        return (g[0] + g[2]) / 2.
    else:
        return g[1]
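
# Illustrative sketch (the helper name _demo_guess_gain is hypothetical, not
# part of the original module): guess_gain returns the class-level gain when
# the activation defines one (e.g., rectify.gain is 0.5) and otherwise falls
# back to the finite-difference probe around zero; for tanh either route gives
# approximately 1, since tanh'(0) = 1. The second argument is accepted but not
# used in the estimate.
def _demo_guess_gain():
    return guess_gain(rectify, 10), guess_gain(tanh, 10)   # (0.5, roughly 1.0)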

class Layer(object):
    """Fully-connected layer.

    :param insize: Input size or sequence of input sizes.

        - If an input size is n > 0, then that input will expect an
          n-dimensional vector.
        - If an input size is n < 0, then that input will expect an
          integer in [0, -n), which you can either think of as a
          one-hot vector or as an index into a lookup table.
        - If an input size is "diag", then that input will have a
          diagonal weight matrix.

    :param outsize: Output size.
    :param f: Activation function (default tanh).
    :param bias: Initial bias, or None for no bias."""

    def __init__(self, insize, outsize, f=tanh, gain=None, bias=0., model=None):
        if type(insize) is int:
            insize = [insize]
        if model is None:
            model = parameter.all
        if gain is None:
            gain = guess_gain(f, outsize)

        def random(variance, shape):
            #return numpy.random.normal(0., variance**0.5, shape)
            return numpy.random.uniform(-(variance*3)**0.5, (variance*3)**0.5, shape)

        # Although it is more conventional to left-multiply by the weight
        # matrix, we right-multiply so it works correctly with stacks of
        # vectors.

        total_insize = 0
        for d in insize:
            if d == "diag":
                # Var[input * weight] = Var[input] * Var[weight]
                total_insize += 1
            elif d >= 0:
                # Var[dot(input, weight)] = d * Var[input] * Var[weight]
                total_insize += d
            else:
                # Var[weight[input]] = Var[weight]
                total_insize += 1
        if bias is not None:
            total_insize += 1
        variance = 2. / (total_insize + outsize) / gain**2

        self.weight = []
        for a, d in enumerate(insize):
            if d == "diag":
                w = parameter(random(variance, (outsize,)), model=model)
                self.weight.append(w)
            elif d >= 0:
                w = parameter(random(variance, (d, outsize)), model=model)
                self.weight.append(w)
            else:
                # Use lots of small parameter vectors, because usually
                # only one is updated at a time
                w = [parameter(random(variance, (outsize,)), model=model) for i in range(-d)]
                self.weight.append(w)

        if bias is not None:
            self.bias = parameter(random(variance, (outsize,)), model=model)
        else:
            self.bias = constant(0.)

        self.activation = f

    def __call__(self, *args):
        if len(args) != len(self.weight):
            raise TypeError("wrong number of inputs")
        s = self.bias
        for w, x in zip(self.weight, args):
            if isinstance(w, list):
                # lookup table
                if isinstance(x, int):
                    s += w[x]
                else:
                    # x should be a vector or list of ints
                    s += stack([w[i] for i in x])
            elif w.value.ndim == 1:
                # diagonal
                s += x * w
            else:
                s += dot(x, w)
        if self.activation:
            return self.activation(s)
        else:
            return s
make_layer = Layer # for backward compatibility
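
# Illustrative sketch (the names _demo_mlp and network are hypothetical, not
# part of the original module): a two-layer network mapping 100-dimensional
# inputs to log-probabilities over 10 classes. Layers are constructed once
# (each construction allocates fresh parameters) and then applied like
# functions to Expressions. A layer can also take several inputs, e.g.
# Layer([100, -5000], 50) expects a 100-dimensional vector plus an integer
# word id in [0, 5000) that indexes a lookup table.
def _demo_mlp():
    hidden = Layer(100, 50)           # tanh activation by default
    output = Layer(50, 10, f=None)    # no activation: raw scores
    def network(x):
        # x: an Expression whose value is a 100-dimensional vector
        # (or a stack of such vectors)
        return logsoftmax(output(hidden(x)))
    return network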