"""Special expressions for neural networks."""
__all__ = ['sigmoid', 'rectify', 'hardtanh', 'softmax', 'logsoftmax', 'crossentropy', 'distance2', 'Layer', 'make_layer', 'Dropout']
from six.moves import range
from . import backend as numpy
from .expr import *
## Activation functions
class sigmoid(Unary):
    r"""Logistic sigmoid function, :math:`\frac{1}{1+\exp -x}`."""
    gain = 0.25
    @staticmethod
    def f(x):
        #with numpy.errstate(over='ignore'):
        try:
            return numpy.sigmoid(x)
        except AttributeError:
            return 1./(1.+numpy.exp(-x))
    @staticmethod
    def dfdx(x, y):
        try:
            return numpy.xonemx(y)
        except AttributeError:
            return y*(1.-y)
class rectify(Unary):
    r"""Rectified linear unit, :math:`\max(0, x)`."""
    gain = 0.5
    @staticmethod
    def f(x): return numpy.maximum(x, 0.)
    @staticmethod
    def dfdx(x, y): return numpy.where(x > 0., 1., 0.)
class hardtanh(Unary):
    """Hard tanh function, equivalent to clip(x, -1, 1)."""
    gain = 1.
    @staticmethod
    def f(x): return numpy.clip(x, -1., 1.)
    @staticmethod
    def dfdx(x, y): return numpy.where(numpy.logical_and(-1. < x, x < 1.), 1., 0.)
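
# A minimal sketch (the function name is illustrative, not part of the
# library) of the forward/backward protocol the activation classes above
# follow; it is the same pattern guess_gain() uses further down: seed a
# values dict for the inputs and a gradients dict for the outputs, then
# call forward() and backward() on the output expression.
def _activation_example():
    x = constant(0.)                 # input expression; its value is supplied below
    y = sigmoid(x)
    values = {x: 0.5}                # evaluate at x = 0.5
    gradients = {x: 0., y: 1.}       # seed dL/dy = 1 to recover dy/dx
    y.forward(values)                # values[y] == sigmoid(0.5)
    y.backward(values, gradients)    # gradients[x] == sigmoid'(0.5)
    return values[y], gradients[x]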
class softmax(Expression):
    r"""Softmax function, :math:`y_i = \exp x_i / \sum_{i'} \exp x_{i'}`.

    :param axis: axis along which to perform the softmax (default is last).
    """
    gain = 1.
    def __init__(self, arg, axis=-1):
        Expression.__init__(self, arg)
        self.axis = axis
    def forward(self, values):
        axis = self.axis
        v = values[self.args[0]]
        v = v - numpy.amax(v, axis=axis, keepdims=True)
        v = numpy.exp(v)
        values[self] = v / numpy.sum(v, axis=axis, keepdims=True)
    def backward(self, values, gradients):
        axis = self.axis
        arg = self.args[0]
        if arg in gradients:
            gradients[arg] += (gradients[self] - numpy.vecdot(values[self], gradients[self], axis=axis, keepdims=True)) * values[self]
class logsoftmax(Expression):
    r"""Log-softmax function, :math:`y_i = \log \left(\exp x_i / \sum_{i'} \exp x_{i'}\right)`.

    Use this instead of log(softmax(x)) for better numerical stability.

    :param axis: axis along which to perform the softmax (default is last).
    """
    gain = 1.
    def __init__(self, arg, axis=-1):
        Expression.__init__(self, arg)
        self.axis = axis
    def forward(self, values):
        axis = self.axis
        v = values[self.args[0]]
        v = v - numpy.amax(v, axis=axis, keepdims=True)
        values[self] = v - numpy.log(numpy.sum(numpy.exp(v), axis=axis, keepdims=True))
    def backward(self, values, gradients):
        axis = self.axis
        arg = self.args[0]
        if arg in gradients:
            gradients[arg] += gradients[self] - numpy.sum(gradients[self], axis=axis, keepdims=True) * numpy.exp(values[self])
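
# A minimal sketch (assuming a plain NumPy backend; the function name is
# illustrative) of why logsoftmax is preferred over log(softmax(x)): with a
# large spread of scores, softmax underflows to exactly 0 for the small
# entries, so taking its log gives -inf, whereas logsoftmax computes the
# same quantity without forming the underflowing ratio.
def _logsoftmax_example():
    x = constant(0.)
    y = logsoftmax(x)
    values = {x: numpy.array([0., -1000.])}   # exp(-1000.) underflows to 0.
    y.forward(values)
    return values[y]                          # approximately [0., -1000.], no -inf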
### Dropout
class _dropout_gate(Expression):
    def __init__(self, factory, arg):
        Expression.__init__(self, arg)
        self.factory = factory
    def forward(self, values):
        if self.factory.enabled:
            p = self.factory.p
            arg = self.args[0]
            # Inverted dropout: zero out units with probability p and scale
            # the survivors by 1/(1-p), so the expected value is unchanged.
            values[self] = (numpy.random.uniform(0., 1., values[arg].shape) > p) / (1-p)
        else:
            values[self] = 1.
class Dropout(object):
    """Factory for dropouts.

    Example usage::

        d = Dropout(0.5)
        y = d(x)

    The reason for the extra level of indirection is so that all the
    dropouts can be enabled or disabled together.

    :param p: probability of dropout
    :type p: float
    :return: dropout function
    :rtype: Expression -> Expression
    """
    def __init__(self, p=0.5):
        self.p = p
        self.enabled = True
    def __call__(self, arg):
        return arg * _dropout_gate(self, arg)
    def enable(self):
        """Enable all dropouts that originated from this factory."""
        self.enabled = True
    def disable(self):
        """Disable all dropouts that originated from this factory."""
        self.enabled = False
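
# A short usage sketch (names illustrative, assuming a NumPy-like backend):
# build the graph once with the dropout applied, then toggle the factory
# between training and evaluation.
def _dropout_example():
    d = Dropout(0.3)
    x = constant(numpy.zeros((10,)))
    y = d(rectify(x))    # dropout applied to an activation's output
    d.enable()           # stochastic masking while training
    d.disable()          # deterministic forward passes at evaluation time
    return y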
### Loss functions
def crossentropy(logp, correct):
    """Cross-entropy, a.k.a. log-loss.

    :param logp: vector of log-probabilities
    :param correct: observed probabilities
    """
    return -vecdot(logp, correct)
def distance2(x, y):
    """Squared Euclidean distance, a.k.a. squared-error loss."""
    d = x - y
    return vecdot(d, d)
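
# A common pairing (a sketch; the function name and arguments are
# illustrative): feed the log-probabilities produced by logsoftmax into
# crossentropy, with the observed distribution given as a one-hot (or soft)
# target expression over the same axis.
def _loss_example(scores, target):
    logp = logsoftmax(scores)         # scores: an Expression of unnormalized scores
    return crossentropy(logp, target)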
### Fully-connected layer
def guess_gain(f, d):
    """Try to figure out how the activation function affects the
    variance of inputs/gradients."""
    if f is None: return 1.
    if hasattr(f, "gain"): return f.gain
    # As is standard, use the gradient of f at zero. However, since f
    # might not be differentiable at zero (e.g., ReLU), compute the
    # gradient a little to the left and right of zero and average.
    delta = 0.1
    g = []
    for xv in [-delta, 0., delta]:
        x = constant(0.)
        y = f(x)
        values = {x: xv}
        gradients = {x: 0., y: 1.}
        y.forward(values)
        y.backward(values, gradients)
        g.append(gradients[x])
    if abs(g[2]-g[0])/2. > delta:
        return (g[0] + g[2]) / 2.
    else:
        return g[1]
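
# A small illustration (the function name is not part of the library): the
# activation classes above carry a precomputed .gain attribute, so
# guess_gain() returns it directly; a callable without that attribute would
# instead go through the finite-difference estimate of its derivative at zero.
def _guess_gain_example():
    g1 = guess_gain(rectify, 10)    # uses rectify.gain == 0.5
    g2 = guess_gain(sigmoid, 10)    # uses sigmoid.gain == 0.25
    return g1, g2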
class Layer(object):
    """Fully-connected layer.

    :param insize: Input size or sequence of input sizes.

        - If an input size is n > 0, then that input will expect an
          n-dimensional vector.
        - If an input size is n < 0, then that input will expect an
          integer in [0, -n), which you can either think of as a
          one-hot vector or as an index into a lookup table.
        - If an input size is "diag", then that input will have a
          diagonal weight matrix, so it must have the same size as
          the output.
    :param outsize: Output size.
    :param f: Activation function (default tanh).
    :param bias: Initial bias, or None for no bias."""
    def __init__(self, insize, outsize, f=tanh, gain=None, bias=0., model=None):
        if type(insize) is int: insize = [insize]
        if model is None: model = parameter.all
        if gain is None: gain = guess_gain(f, outsize)
        def random(variance, shape):
            #return numpy.random.normal(0., variance**0.5, shape)
            return numpy.random.uniform(-(variance*3)**0.5, (variance*3)**0.5, shape)
        # Although it is more conventional to left-multiply by the weight
        # matrix, we right-multiply so it works correctly with stacks of
        # vectors.
        total_insize = 0
        for d in insize:
            if d == "diag":
                # Var[input * weight] = Var[input] * Var[weight]
                total_insize += 1
            elif d >= 0:
                # Var[dot(input, weight)] = d * Var[input] * Var[weight]
                total_insize += d
            else:
                # Var[weight[input]] = Var[weight]
                total_insize += 1
        if bias is not None: total_insize += 1
        variance = 2. / (total_insize + outsize) / gain**2
        self.weight = []
        for a, d in enumerate(insize):
            if d == "diag":
                w = parameter(random(variance, (outsize,)), model=model)
                self.weight.append(w)
            elif d >= 0:
                w = parameter(random(variance, (d, outsize)), model=model)
                self.weight.append(w)
            else:
                # Use lots of small parameter vectors, because usually
                # only one is updated at a time.
                w = [parameter(random(variance, (outsize,)), model=model) for i in range(-d)]
                self.weight.append(w)
        if bias is not None:
            self.bias = parameter(random(variance, (outsize,)), model=model)
        else:
            self.bias = constant(0.)
        self.activation = f
    def __call__(self, *args):
        if len(args) != len(self.weight):
            raise TypeError("wrong number of inputs")
        s = self.bias
        for w, x in zip(self.weight, args):
            if isinstance(w, list): # lookup table
                if isinstance(x, int):
                    s += w[x]
                else:
                    # x should be a vector or list of ints
                    s += stack([w[i] for i in x])
            elif w.value.ndim == 1: # diagonal
                s += x * w
            else:
                s += dot(x, w)
        if self.activation:
            return self.activation(s)
        else:
            return s

make_layer = Layer # for backward compatibility
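
# A hedged usage sketch (sizes, names, and the wiring are illustrative, not
# part of the library): an embedding lookup (negative input size) feeding a
# rectified hidden layer, then an output layer with no activation whose
# scores are normalized with logsoftmax.
def _layer_example():
    embed = Layer(-1000, 50)             # token id in [0, 1000) -> 50-dim vector
    hidden = Layer(50, 20, f=rectify)    # dense 50 -> 20 with ReLU
    output = Layer(20, 5, f=None)        # 20 -> 5 unnormalized scores
    def network(token):
        h = hidden(embed(token))
        return logsoftmax(output(h))
    return network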