"""Training neural networks."""
__all__ = ['StochasticGradientDescent', 'SGD',
'AdaGrad', 'Adagrad', 'AdaDelta', 'Adadelta',
'Momentum', 'NesterovMomentum',
'RMSprop',
'Adam', 'Adamax']
from . import backend as numpy
from . import compute
from . import expr # parameter
def parameters(d):
    """Yield the keys of d that are expr.parameter objects."""
    for param in d:
        if isinstance(param, expr.parameter):
            yield param
def clip_gradients(gradients, p, r):
"""Clip gradients so that the p-norm of the gradients is at most r.
:param gradients: gradients to clip
:type gradients: NumPy array
:param p: 2 = squared l2 norm (currently theonly allowed value)
:param r: maximum radius
"""
if p == 2:
s = 0.
for param in parameters(gradients):
g = gradients[param]
s += numpy.sum(g * g)
s = numpy.sqrt(s)
if s > r:
for param in parameters(gradients):
gradients[param] *= r/s
else:
        raise ValueError("only the l2 norm (p = 2) is supported")
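# A quick check of the clipping arithmetic (sketch only; assumes `w` is an existing
# expr.parameter, that the backend aliased as numpy provides array(), and that w's
# entry in a gradients mapping is the vector [3., 4.], whose l2 norm is 5):
#
#     grads = {w: numpy.array([3., 4.])}
#     clip_gradients(grads, 2, 1.)
#     # grads[w] is now [0.6, 0.8]: scaled by r/s = 1/5, so its l2 norm is exactly 1.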
# to do: max-norm regularization
def regularize(values, p, strength):
"""Apply regularization to values.
Sets values to minimizer of
:math:`\min_{\mathbf{v}'} \frac12\|\mathbf{v}'-\mathbf{v}\|^2 + R(\mathbf{v})`
:param values: parameters to regularize
:type values: NumPy array
:param p: 2 = squared l2 regularizer (currently the only allowed value)
:param strength: regularizer strength
"""
if p == 2:
for param in parameters(values):
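            # Closed-form proximal step: the minimizer of
            # (1/2)||v' - v||^2 + (strength/2)||v'||^2 is v' = v / (1 + strength).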
values[param] *= 1./(1.+strength)
else:
        raise ValueError("only the squared l2 regularizer (p = 2) is supported")
class StochasticGradientDescent(object):
"""Stochastic gradient descent.
:param learning_rate: learning rate
    :param clip_gradients: maximum l2 norm of the gradients, or None to disable clipping
"""
def __init__(self, learning_rate=0.1, clip_gradients=None):
self.learning_rate = learning_rate
self.clip_gradients = clip_gradients
def receive(self, x):
values = compute.compute_values(x)
gradients = compute.compute_gradients(x, values)
if self.clip_gradients:
clip_gradients(gradients, 2, self.clip_gradients)
for param in parameters(gradients):
param.value -= self.learning_rate * gradients[param]
#regularize(values, 2, 1e-6)
return values[x]
SGD = StochasticGradientDescent
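# Minimal usage sketch (assumptions: `build_loss` is a hypothetical function that
# returns a scalar loss expression over expr.parameter objects for one training
# example, and `training_data` is a hypothetical iterable of examples; every
# trainer in this module is driven the same way through `receive`, which returns
# the computed value of the expression it is given):
#
#     trainer = SGD(learning_rate=0.1, clip_gradients=5.0)
#     for epoch in range(10):
#         for example in training_data:
#             loss = trainer.receive(build_loss(example))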
class AdaGrad(object):
"""AdaGrad (diagonal version).
John Duchi, Elad Hazan, and Yoram Singer. Adaptive subgradient
methods for online learning and stochastic optimization. JMLR
12:2121-2159, 2011.
:param learning_rate: Learning rate.
:param epsilon: Small constant to prevent division by zero.
"""
def __init__(self, learning_rate=0.1, epsilon=1e-8):
self.learning_rate = learning_rate
self.epsilon = epsilon
self.sum_gradients2 = {}
def receive(self, x):
values = compute.compute_values(x)
gradients = compute.compute_gradients(x, values)
for param in parameters(gradients):
if param not in self.sum_gradients2:
self.sum_gradients2[param] = numpy.zeros_like(param.value)
self.sum_gradients2[param] += gradients[param] ** 2
denom = numpy.sqrt(self.sum_gradients2[param]) + self.epsilon
param.value -= self.learning_rate * gradients[param] / denom
return values[x]
Adagrad = AdaGrad
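# Worked illustration of AdaGrad's shrinking steps: with learning_rate=0.1,
# epsilon ignored, and a gradient component equal to 1 on every update, the
# accumulated sum of squares after t updates is t, so the successive step sizes
# for that component are 0.1/sqrt(1), 0.1/sqrt(2), 0.1/sqrt(3), ... -- the
# effective learning rate only ever decreases, unlike RMSprop/AdaDelta below,
# which use decaying averages instead of a running sum.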
class AdaDelta(object):
"""AdaDelta.
Matthew D. Zeiler. ADADELTA: An adaptive learning rate
method. arXiv:1212.5701, 2012.
:param decay: Decay rate of RMS average of updates and gradients.
:param epsilon: Small constant to prevent division by zero.
"""
def __init__(self, decay=0.95, epsilon=1e-6):
self.decay = decay
self.epsilon = epsilon
self.ave_gradients2 = {}
self.ave_updates2 = {}
def receive(self, x):
values = compute.compute_values(x)
gradients = compute.compute_gradients(x, values)
for param in self.ave_gradients2:
self.ave_gradients2[param] *= self.decay
self.ave_updates2[param] *= self.decay
for param in parameters(gradients):
if param not in self.ave_gradients2:
self.ave_gradients2[param] = numpy.zeros_like(param.value)
self.ave_updates2[param] = numpy.zeros_like(param.value)
self.ave_gradients2[param] += (1-self.decay) * gradients[param] ** 2
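            # ave_updates2 was already multiplied by decay above, so dividing by
            # decay here recovers the previous step's accumulator: the update uses
            # RMS[delta x]_{t-1} / RMS[g]_t, as in Zeiler's update rule.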
update = gradients[param] * numpy.sqrt((self.ave_updates2[param]/self.decay + self.epsilon) /
(self.ave_gradients2[param] + self.epsilon))
self.ave_updates2[param] += (1-self.decay) * update ** 2
param.value -= update
return values[x]
Adadelta = AdaDelta
class Momentum(object):
"""Stochastic gradient descent with momentum.
:param learning_rate: Learning rate.
:param decay: Decay rate of sum of gradients (also known as the momentum coefficient).
"""
def __init__(self, learning_rate=0.01, decay=0.9):
self.learning_rate = learning_rate
self.decay = decay
self.sum_gradients = {}
def receive(self, x):
values = compute.compute_values(x)
gradients = compute.compute_gradients(x, values)
for param in self.sum_gradients:
self.sum_gradients[param] *= self.decay
for param in parameters(gradients):
if param not in self.sum_gradients:
self.sum_gradients[param] = numpy.zeros_like(param.value)
self.sum_gradients[param] += gradients[param]
for param in self.sum_gradients:
param.value -= self.learning_rate * self.sum_gradients[param]
return values[x]
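# Note on the momentum scale: if the gradient stays roughly constant at g, the
# velocity sum_gradients converges to g / (1 - decay) (a geometric series), so
# with decay=0.9 the steady-state step is about 10 * learning_rate * g.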
class NesterovMomentum(object):
"""Momentum-like version of Nesterov accelerated gradient.
Ilya Sutskever, James Martens, George Dahl, and Geoffrey
Hinton. On the importance of initialization and momentum in deep
learning. In Proc. ICML, 2013.
:param learning_rate: Learning rate.
:param decay: Decay rate of sum of gradients (also known as the momentum coefficient).
"""
def __init__(self, learning_rate=0.01, decay=0.9):
self.learning_rate = learning_rate
self.decay = decay
self.sum_gradients = {}
def receive(self, x):
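        # Lookahead: apply the decayed velocity first, so that the gradient below
        # is evaluated at the shifted parameters (Sutskever et al.'s momentum-style
        # reformulation of Nesterov's method); the plain gradient step follows.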
for param in self.sum_gradients:
self.sum_gradients[param] *= self.decay
param.value -= self.learning_rate * self.sum_gradients[param]
values = compute.compute_values(x)
gradients = compute.compute_gradients(x, values)
for param in parameters(gradients):
if param not in self.sum_gradients:
self.sum_gradients[param] = numpy.zeros_like(param.value)
self.sum_gradients[param] += gradients[param]
param.value -= self.learning_rate * gradients[param]
return values[x]
class RMSprop(object):
"""RMSprop.
    Geoffrey Hinton. Neural Networks for Machine Learning, Lecture 6:
    Overview of mini-batch gradient descent.
    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
:param learning_rate: Learning rate.
:param decay: Decay rate of RMS average of gradients.
:param epsilon: Small constant to prevent division by zero.
"""
def __init__(self, learning_rate=0.01, decay=0.9, epsilon=1e-8):
self.learning_rate = learning_rate
self.decay = decay
self.epsilon = epsilon
self.ave_gradients2 = {} # moving average of square of gradients
def receive(self, x):
values = compute.compute_values(x)
gradients = compute.compute_gradients(x, values)
for param in self.ave_gradients2:
self.ave_gradients2[param] *= self.decay
for param in parameters(gradients):
if param not in self.ave_gradients2:
self.ave_gradients2[param] = numpy.zeros_like(param.value)
self.ave_gradients2[param] += (1-self.decay) * gradients[param] ** 2
denom = numpy.sqrt(self.ave_gradients2[param]) + self.epsilon
param.value -= self.learning_rate * gradients[param] / denom
return values[x]
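# Note: because RMSprop tracks a moving average of squared gradients rather than
# AdaGrad's ever-growing sum, ave_gradients2 approaches g**2 when the gradient
# magnitude is roughly constant, so the step approaches learning_rate * sign(g)
# (ignoring epsilon) instead of shrinking toward zero.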
class Adam(object):
"""Adam.
Diederik P. Kingma and Jimmy Lei Ba. Adam: A method for stochastic optimization.
In ICLR 2015. http://arxiv.org/pdf/1412.6980.pdf
:param learning_rate: Learning rate.
:param decay1: Decay rate of average of gradients.
:param decay2: Decay rate of RMS average of gradients.
:param epsilon: Small constant to prevent division by zero.
"""
def __init__(self, learning_rate=1e-3, decay1=0.9, decay2=0.999, epsilon=1e-8):
self.learning_rate = learning_rate
self.decay1 = decay1
self.decay2 = decay2
self.epsilon = epsilon
self.ave_gradients1 = {} # moving average of gradients
self.ave_gradients2 = {} # moving average of square of gradients
self.t = 0
def receive(self, x):
values = compute.compute_values(x)
gradients = compute.compute_gradients(x, values)
self.t += 1
for param in self.ave_gradients1:
self.ave_gradients1[param] *= self.decay1
self.ave_gradients2[param] *= self.decay2
for param in parameters(gradients):
if param not in self.ave_gradients1:
self.ave_gradients1[param] = numpy.zeros_like(param.value)
self.ave_gradients2[param] = numpy.zeros_like(param.value)
self.ave_gradients1[param] += (1-self.decay1) * gradients[param]
self.ave_gradients2[param] += (1-self.decay2) * gradients[param] ** 2
for param in self.ave_gradients1:
denom = numpy.sqrt(self.ave_gradients2[param]) + self.epsilon
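            # Fold both bias corrections into the step size,
            # learning_rate * sqrt(1 - decay2**t) / (1 - decay1**t),
            # the more efficient formulation suggested in the Adam paper.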
bias_corr1 = 1 - self.decay1 ** self.t
bias_corr2 = 1 - self.decay2 ** self.t
step_size = self.learning_rate * numpy.sqrt(bias_corr2)/bias_corr1
param.value -= step_size * self.ave_gradients1[param]/denom
return values[x]
class Adamax(object):
"""Adamax.
Diederik P. Kingma and Jimmy Lei Ba. Adam: A method for stochastic optimization.
In ICLR 2015. http://arxiv.org/pdf/1412.6980.pdf
:param learning_rate: Learning rate.
:param decay1: Decay rate of average of gradients.
:param decaymax: Decay rate of maximum of gradients.
:param epsilon: Small constant to prevent division by zero.
"""
def __init__(self, learning_rate=2e-3, decay1=0.9, decaymax=0.999, epsilon=1e-8):
self.learning_rate = learning_rate
self.decay1 = decay1
self.decaymax = decaymax
self.epsilon = epsilon
self.ave_gradients = {}
self.max_gradients = {}
self.t = 0
def receive(self, x):
values = compute.compute_values(x)
gradients = compute.compute_gradients(x, values)
self.t += 1
for param in self.ave_gradients:
self.ave_gradients[param] *= self.decay1
self.max_gradients[param] *= self.decaymax
for param in parameters(gradients):
if param not in self.ave_gradients:
self.ave_gradients[param] = numpy.zeros_like(param.value)
self.max_gradients[param] = numpy.zeros_like(param.value)
self.ave_gradients[param] += (1-self.decay1) * gradients[param]
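            # max_gradients was pre-multiplied by decaymax above, so this computes
            # u_t = max(decaymax * u_{t-1}, |g_t|), the infinity-norm accumulator.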
self.max_gradients[param] = numpy.maximum(self.max_gradients[param], numpy.absolute(gradients[param]))
for param in self.ave_gradients:
step_size = self.learning_rate/(1-self.decay1**self.t)
param.value -= step_size * self.ave_gradients[param]/(self.max_gradients[param]+self.epsilon)
return values[x]