Source code for penne.optimize

"""Training neural networks."""

__all__ = ['StochasticGradientDescent', 'SGD',
           'AdaGrad', 'Adagrad', 'AdaDelta', 'Adadelta',
           'Momentum', 'NesterovMomentum',
           'RMSprop',
           'Adam', 'Adamax']

from . import backend as numpy
from . import compute
from . import expr # parameter

def parameters(d):
    """Yield the expr.parameter objects in d (for example, the keys of a gradients dict)."""
    for param in d:
        if isinstance(param, expr.parameter):
            yield param

def clip_gradients(gradients, p, r):
    """Clip gradients so that the p-norm of the gradients is at most r.

    :param gradients: gradients to clip
    :type gradients:  NumPy array
    :param p:         2 = l2 norm (currently the only allowed value)
    :param r:         maximum radius
    """

    if p == 2:
        s = 0.
        for param in parameters(gradients):
            g = gradients[param]
            s += numpy.sum(g * g)
        s = numpy.sqrt(s)
        if s > r:
            for param in parameters(gradients):
                gradients[param] *= r/s
    else:
        raise ValueError('only p=2 is supported')
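
# Note (added for illustration, not part of the original code): with p=2,
# clipping rescales every gradient by the same factor r/s whenever the global
# l2 norm s = sqrt(sum_i ||g_i||^2) exceeds r.  For example, if s = 10 and
# r = 5, every entry of every gradient array is halved; the direction of the
# overall gradient is preserved, only its length changes.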

# to do: max-norm regularization

def regularize(values, p, strength):
    """Apply regularization to values.

    Sets values to minimizer of 
    :math:`\min_{\mathbf{v}'} \frac12\|\mathbf{v}'-\mathbf{v}\|^2 + R(\mathbf{v})`

    :param values:   parameters to regularize
    :type values:    NumPy array
    :param p:        2 = squared l2 regularizer (currently the only allowed value)
    :param strength: regularizer strength
    """
    if p == 2:
        for param in parameters(values):
            values[param] *= 1./(1.+strength)
    else:
        raise ValueError('only p=2 is supported')
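
# Derivation of the closed form above (illustrative note, not original code):
# for the squared l2 regularizer R(v') = (strength/2) * ||v'||^2, setting the
# gradient of 0.5*||v' - v||^2 + R(v') to zero gives (v' - v) + strength*v' = 0,
# i.e. v' = v / (1 + strength), which is exactly the scaling applied above.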

class StochasticGradientDescent(object):
    """Stochastic gradient descent.

    :param learning_rate: learning rate
    :param clip_gradients: maximum l2 norm of gradients, or None
    """

    def __init__(self, learning_rate=0.1, clip_gradients=None):
        self.learning_rate = learning_rate
        self.clip_gradients = clip_gradients

    def receive(self, x):
        values = compute.compute_values(x)
        gradients = compute.compute_gradients(x, values)
        if self.clip_gradients:
            clip_gradients(gradients, 2, self.clip_gradients)
        for param in parameters(gradients):
            param.value -= self.learning_rate * gradients[param]
        #regularize(values, 2, 1e-6)
        return values[x]
SGD = StochasticGradientDescent
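
# Usage sketch (illustrative, not part of the library): an optimizer is driven
# by repeatedly calling receive() on the expression to be minimized; receive()
# runs the forward and backward passes, updates every expr.parameter reachable
# from the expression in place, and returns the expression's current value.
# The model/loss construction below is hypothetical pseudocode, since it
# depends on how the rest of penne is used:
#
#     trainer = SGD(learning_rate=0.1, clip_gradients=5.)
#     for epoch in range(10):
#         for example in data:                            # `data` and `make_loss`
#             loss = trainer.receive(make_loss(example))  # are placeholders
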
class AdaGrad(object):
    """AdaGrad (diagonal version).

    John Duchi, Elad Hazan, and Yoram Singer. Adaptive subgradient methods
    for online learning and stochastic optimization. JMLR 12:2121-2159, 2011.

    :param learning_rate: Learning rate.
    :param epsilon: Small constant to prevent division by zero.
    """

    def __init__(self, learning_rate=0.1, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.sum_gradients2 = {}

    def receive(self, x):
        values = compute.compute_values(x)
        gradients = compute.compute_gradients(x, values)
        for param in parameters(gradients):
            if param not in self.sum_gradients2:
                self.sum_gradients2[param] = numpy.zeros_like(param.value)
            self.sum_gradients2[param] += gradients[param] ** 2
            denom = numpy.sqrt(self.sum_gradients2[param]) + self.epsilon
            param.value -= self.learning_rate * gradients[param] / denom
        return values[x]
Adagrad = AdaGrad
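
# Update rule implemented above (per parameter, elementwise):
#     G_t     = G_{t-1} + g_t**2
#     theta_t = theta_{t-1} - learning_rate * g_t / (sqrt(G_t) + epsilon)
# i.e. the effective step size shrinks for coordinates that have accumulated
# large gradients so far.
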
class AdaDelta(object):
    """AdaDelta.

    Matthew D. Zeiler. ADADELTA: An adaptive learning rate method.
    arXiv:1212.5701, 2012.

    :param decay: Decay rate of RMS average of updates and gradients.
    :param epsilon: Small constant to prevent division by zero.
    """

    def __init__(self, decay=0.95, epsilon=1e-6):
        self.decay = decay
        self.epsilon = epsilon
        self.ave_gradients2 = {}
        self.ave_updates2 = {}

    def receive(self, x):
        values = compute.compute_values(x)
        gradients = compute.compute_gradients(x, values)
        for param in self.ave_gradients2:
            self.ave_gradients2[param] *= self.decay
            self.ave_updates2[param] *= self.decay
        for param in parameters(gradients):
            if param not in self.ave_gradients2:
                self.ave_gradients2[param] = numpy.zeros_like(param.value)
                self.ave_updates2[param] = numpy.zeros_like(param.value)
            self.ave_gradients2[param] += (1-self.decay) * gradients[param] ** 2
            update = gradients[param] * numpy.sqrt(
                (self.ave_updates2[param]/self.decay + self.epsilon) /
                (self.ave_gradients2[param] + self.epsilon))
            self.ave_updates2[param] += (1-self.decay) * update ** 2
            param.value -= update
        return values[x]
Adadelta = AdaDelta
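
# Update rule implemented above (per parameter, elementwise), following
# Zeiler (2012):
#     E[g^2]_t  = decay * E[g^2]_{t-1}  + (1-decay) * g_t**2
#     delta_t   = g_t * sqrt((E[dx^2]_{t-1} + epsilon) / (E[g^2]_t + epsilon))
#     E[dx^2]_t = decay * E[dx^2]_{t-1} + (1-decay) * delta_t**2
#     theta_t   = theta_{t-1} - delta_t
# The division by self.decay in receive() undoes the decay already applied to
# ave_updates2 at the top of the call, so the ratio uses the previous step's
# average E[dx^2]_{t-1}.
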
class Momentum(object):
    """Stochastic gradient descent with momentum.

    :param learning_rate: Learning rate.
    :param decay: Decay rate of sum of gradients (also known as the
                  momentum coefficient).
    """

    def __init__(self, learning_rate=0.01, decay=0.9):
        self.learning_rate = learning_rate
        self.decay = decay
        self.sum_gradients = {}

    def receive(self, x):
        values = compute.compute_values(x)
        gradients = compute.compute_gradients(x, values)
        for param in self.sum_gradients:
            self.sum_gradients[param] *= self.decay
        for param in parameters(gradients):
            if param not in self.sum_gradients:
                self.sum_gradients[param] = numpy.zeros_like(param.value)
            self.sum_gradients[param] += gradients[param]
        for param in self.sum_gradients:
            param.value -= self.learning_rate * self.sum_gradients[param]
        return values[x]
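
# Update rule implemented above (classical momentum, per parameter):
#     v_t     = decay * v_{t-1} + g_t
#     theta_t = theta_{t-1} - learning_rate * v_t
# Note that the learning rate is applied to the accumulated velocity, not to
# each gradient before accumulation.
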
class NesterovMomentum(object):
    """Momentum-like version of Nesterov accelerated gradient.

    Ilya Sutskever, James Martens, George Dahl, and Geoffrey Hinton.
    On the importance of initialization and momentum in deep learning.
    In Proc. ICML, 2013.

    :param learning_rate: Learning rate.
    :param decay: Decay rate of sum of gradients (also known as the
                  momentum coefficient).
    """

    def __init__(self, learning_rate=0.01, decay=0.9):
        self.learning_rate = learning_rate
        self.decay = decay
        self.sum_gradients = {}

    def receive(self, x):
        for param in self.sum_gradients:
            self.sum_gradients[param] *= self.decay
            param.value -= self.learning_rate * self.sum_gradients[param]
        values = compute.compute_values(x)
        gradients = compute.compute_gradients(x, values)
        for param in parameters(gradients):
            if param not in self.sum_gradients:
                self.sum_gradients[param] = numpy.zeros_like(param.value)
            self.sum_gradients[param] += gradients[param]
            param.value -= self.learning_rate * gradients[param]
        return values[x]
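
# Update scheme implemented above (the momentum-like form of Nesterov's
# accelerated gradient from Sutskever et al.): at the start of each call, the
# decayed velocity step -learning_rate * decay * v_{t-1} is applied first, the
# gradient is then computed at that lookahead point, and the parameters take
# an additional step -learning_rate * g_t while the velocity accumulates as
#     v_t = decay * v_{t-1} + g_t.
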
class RMSprop(object):
    """RMSprop.

    Hinton. Overview of mini-batch gradient descent.
    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    :param learning_rate: Learning rate.
    :param decay: Decay rate of RMS average of gradients.
    :param epsilon: Small constant to prevent division by zero.
    """

    def __init__(self, learning_rate=0.01, decay=0.9, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.decay = decay
        self.epsilon = epsilon
        self.ave_gradients2 = {} # moving average of square of gradients

    def receive(self, x):
        values = compute.compute_values(x)
        gradients = compute.compute_gradients(x, values)
        for param in self.ave_gradients2:
            self.ave_gradients2[param] *= self.decay
        for param in parameters(gradients):
            if param not in self.ave_gradients2:
                self.ave_gradients2[param] = numpy.zeros_like(param.value)
            self.ave_gradients2[param] += (1-self.decay) * gradients[param] ** 2
            denom = numpy.sqrt(self.ave_gradients2[param]) + self.epsilon
            param.value -= self.learning_rate * gradients[param] / denom
        return values[x]
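
# Update rule implemented above (per parameter, elementwise):
#     E[g^2]_t = decay * E[g^2]_{t-1} + (1-decay) * g_t**2
#     theta_t  = theta_{t-1} - learning_rate * g_t / (sqrt(E[g^2]_t) + epsilon)
# Unlike AdaGrad, the squared-gradient statistic is an exponential moving
# average rather than a running sum, so old gradients eventually decay away.
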
class Adam(object):
    """Adam.

    Diederik P. Kingma and Jimmy Lei Ba. Adam: A method for stochastic
    optimization. In ICLR 2015. http://arxiv.org/pdf/1412.6980.pdf

    :param learning_rate: Learning rate.
    :param decay1: Decay rate of average of gradients.
    :param decay2: Decay rate of RMS average of gradients.
    :param epsilon: Small constant to prevent division by zero.
    """

    def __init__(self, learning_rate=1e-3, decay1=0.9, decay2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.decay1 = decay1
        self.decay2 = decay2
        self.epsilon = epsilon
        self.ave_gradients1 = {} # moving average of gradients
        self.ave_gradients2 = {} # moving average of square of gradients
        self.t = 0

    def receive(self, x):
        values = compute.compute_values(x)
        gradients = compute.compute_gradients(x, values)
        self.t += 1
        for param in self.ave_gradients1:
            self.ave_gradients1[param] *= self.decay1
            self.ave_gradients2[param] *= self.decay2
        for param in parameters(gradients):
            if param not in self.ave_gradients1:
                self.ave_gradients1[param] = numpy.zeros_like(param.value)
                self.ave_gradients2[param] = numpy.zeros_like(param.value)
            self.ave_gradients1[param] += (1-self.decay1) * gradients[param]
            self.ave_gradients2[param] += (1-self.decay2) * gradients[param] ** 2
        for param in self.ave_gradients1:
            denom = numpy.sqrt(self.ave_gradients2[param]) + self.epsilon
            bias_corr1 = 1 - self.decay1 ** self.t
            bias_corr2 = 1 - self.decay2 ** self.t
            step_size = self.learning_rate * numpy.sqrt(bias_corr2)/bias_corr1
            param.value -= step_size * self.ave_gradients1[param]/denom
        return values[x]
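
# Update rule implemented above, in the efficient form from the Adam paper:
#     m_t = decay1 * m_{t-1} + (1-decay1) * g_t
#     v_t = decay2 * v_{t-1} + (1-decay2) * g_t**2
#     step_size_t = learning_rate * sqrt(1 - decay2**t) / (1 - decay1**t)
#     theta_t = theta_{t-1} - step_size_t * m_t / (sqrt(v_t) + epsilon)
# Folding the bias corrections into step_size_t avoids materializing the
# bias-corrected averages m_hat and v_hat separately.
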
class Adamax(object):
    """Adamax.

    Diederik P. Kingma and Jimmy Lei Ba. Adam: A method for stochastic
    optimization. In ICLR 2015. http://arxiv.org/pdf/1412.6980.pdf

    :param learning_rate: Learning rate.
    :param decay1: Decay rate of average of gradients.
    :param decaymax: Decay rate of maximum of gradients.
    :param epsilon: Small constant to prevent division by zero.
    """

    def __init__(self, learning_rate=2e-3, decay1=0.9, decaymax=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.decay1 = decay1
        self.decaymax = decaymax
        self.epsilon = epsilon
        self.ave_gradients = {}
        self.max_gradients = {}
        self.t = 0

    def receive(self, x):
        values = compute.compute_values(x)
        gradients = compute.compute_gradients(x, values)
        self.t += 1
        for param in self.ave_gradients:
            self.ave_gradients[param] *= self.decay1
            self.max_gradients[param] *= self.decaymax
        for param in parameters(gradients):
            if param not in self.ave_gradients:
                self.ave_gradients[param] = numpy.zeros_like(param.value)
                self.max_gradients[param] = numpy.zeros_like(param.value)
            self.ave_gradients[param] += (1-self.decay1) * gradients[param]
            self.max_gradients[param] = numpy.maximum(self.max_gradients[param],
                                                      numpy.absolute(gradients[param]))
        for param in self.ave_gradients:
            step_size = self.learning_rate/(1-self.decay1**self.t)
            param.value -= step_size * self.ave_gradients[param]/(self.max_gradients[param]+self.epsilon)
        return values[x]
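
# Update rule implemented above (Adamax, per parameter, elementwise):
#     m_t = decay1 * m_{t-1} + (1-decay1) * g_t
#     u_t = max(decaymax * u_{t-1}, |g_t|)
#     theta_t = theta_{t-1} - (learning_rate / (1 - decay1**t)) * m_t / (u_t + epsilon)
# u_t is an exponentially weighted infinity norm of past gradients, so it needs
# no bias correction; epsilon is added here only to guard against division by
# zero before any nonzero gradient has been seen.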