Replace literals with K.epsilon() (keras-team#8710)
* Replace literal 1e-7 with K.epsilon()

* Replace literal 1e-3 with K.epsilon()

* Replace literals 1e-8 with K.epsilon()

* Revert using K.epsilon() for BatchNormalization

* Replace literals with K.epsilon()

* Don't call K.epsilon() in signatures

* Add default epsilon to docstrings
ozabluda authored and fchollet committed Dec 11, 2017
1 parent cb3ee31 commit 321d838
Showing 4 changed files with 29 additions and 17 deletions.
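
The change follows one pattern throughout: the literal fuzz factor is removed from the function signature (so the backend setting is read when the object is constructed, not once at import time) and `None` is resolved to `K.epsilon()` inside the body. A minimal sketch of the idiom, using a hypothetical `Example` class that is not part of this diff:

    from keras import backend as K

    class Example(object):
        """Hypothetical class illustrating the idiom used in this commit."""

        def __init__(self, epsilon=None):
            # Resolve the default here, not in the signature, so the current
            # value of K.epsilon() is picked up when the object is created.
            if epsilon is None:
                epsilon = K.epsilon()
            self.epsilon = epsilon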
4 changes: 2 additions & 2 deletions examples/conv_filter_visualization.py
@@ -26,7 +26,7 @@
 def deprocess_image(x):
     # normalize tensor: center on 0., ensure std is 0.1
     x -= x.mean()
-    x /= (x.std() + 1e-5)
+    x /= (x.std() + K.epsilon())
     x *= 0.1

     # clip to [0, 1]
@@ -55,7 +55,7 @@ def deprocess_image(x):

 def normalize(x):
     # utility function to normalize a tensor by its L2 norm
-    return x / (K.sqrt(K.mean(K.square(x))) + 1e-5)
+    return x / (K.sqrt(K.mean(K.square(x))) + K.epsilon())


 kept_filters = []
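
In this example the replaced literal was 1e-5, while `K.epsilon()` returns the backend's configurable fuzz factor, which defaults to 1e-7, so the example's behaviour shifts slightly unless the user overrides the setting. A short sketch of inspecting and changing that value through the standard backend API:

    from keras import backend as K

    print(K.epsilon())    # 1e-07 by default
    K.set_epsilon(1e-5)   # restores the value this example previously hard-coded
    print(K.epsilon())    # 1e-05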
2 changes: 1 addition & 1 deletion examples/deep_dream.py
@@ -96,7 +96,7 @@ def deprocess_image(x):
 # Compute the gradients of the dream wrt the loss.
 grads = K.gradients(loss, dream)[0]
 # Normalize gradients.
-grads /= K.maximum(K.mean(K.abs(grads)), 1e-7)
+grads /= K.maximum(K.mean(K.abs(grads)), K.epsilon())

 # Set up function to retrieve the value
 # of the loss and gradients given an input image.
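
The `K.maximum(..., K.epsilon())` floor keeps the gradient normalization from exploding when the mean gradient magnitude is close to zero. A rough NumPy illustration of the same idea (not part of the diff; the 1e-7 stands in for `K.epsilon()`):

    import numpy as np

    grads = np.array([1e-12, -1e-12, 2e-12])  # pathologically small gradients
    eps = 1e-7                                 # stand-in for K.epsilon()

    naive = grads / np.mean(np.abs(grads))                     # blows up to ~1e0 scale
    guarded = grads / np.maximum(np.mean(np.abs(grads)), eps)  # stays at ~1e-5 scale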
4 changes: 2 additions & 2 deletions keras/backend/tensorflow_backend.py
@@ -3756,7 +3756,7 @@ def ctc_batch_cost(y_true, y_pred, input_length, label_length):
     input_length = tf.to_int32(tf.squeeze(input_length))
     sparse_labels = tf.to_int32(ctc_label_dense_to_sparse(y_true, label_length))

-    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
+    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + epsilon())

     return tf.expand_dims(ctc.ctc_loss(inputs=y_pred,
                                        labels=sparse_labels,
@@ -3792,7 +3792,7 @@ def ctc_decode(y_pred, input_length, greedy=True, beam_width=100,
             Tensor `(top_paths, )` that contains
                 the log probability of each decoded sequence.
     """
-    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
+    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + epsilon())
     input_length = tf.to_int32(input_length)

     if greedy:
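
In `ctc_batch_cost` and `ctc_decode` the epsilon keeps `tf.log` away from exactly zero probabilities, where the logarithm is -inf. A small NumPy sketch of the effect, assuming the default epsilon of 1e-7:

    import numpy as np

    probs = np.array([0.0, 1e-9, 0.5])
    print(np.log(probs))          # [-inf, -20.7, -0.69] plus a divide-by-zero warning
    print(np.log(probs + 1e-7))   # [-16.1, -16.1, -0.69], finite everywhere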
36 changes: 24 additions & 12 deletions keras/optimizers.py
@@ -203,21 +203,23 @@ class RMSprop(Optimizer):
     # Arguments
         lr: float >= 0. Learning rate.
         rho: float >= 0.
-        epsilon: float >= 0. Fuzz factor.
+        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
         decay: float >= 0. Learning rate decay over each update.
     # References
         - [rmsprop: Divide the gradient by a running average of its recent magnitude](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
     """

-    def __init__(self, lr=0.001, rho=0.9, epsilon=1e-8, decay=0.,
+    def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0.,
                  **kwargs):
         super(RMSprop, self).__init__(**kwargs)
         with K.name_scope(self.__class__.__name__):
             self.lr = K.variable(lr, name='lr')
             self.rho = K.variable(rho, name='rho')
             self.decay = K.variable(decay, name='decay')
             self.iterations = K.variable(0, dtype='int64', name='iterations')
+        if epsilon is None:
+            epsilon = K.epsilon()
         self.epsilon = epsilon
         self.initial_decay = decay

@@ -263,19 +265,21 @@ class Adagrad(Optimizer):
     # Arguments
         lr: float >= 0. Learning rate.
-        epsilon: float >= 0.
+        epsilon: float >= 0. If `None`, defaults to `K.epsilon()`.
         decay: float >= 0. Learning rate decay over each update.
     # References
         - [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
     """

-    def __init__(self, lr=0.01, epsilon=1e-8, decay=0., **kwargs):
+    def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
         super(Adagrad, self).__init__(**kwargs)
         with K.name_scope(self.__class__.__name__):
             self.lr = K.variable(lr, name='lr')
             self.decay = K.variable(decay, name='decay')
             self.iterations = K.variable(0, dtype='int64', name='iterations')
+        if epsilon is None:
+            epsilon = K.epsilon()
         self.epsilon = epsilon
         self.initial_decay = decay

@@ -322,20 +326,22 @@ class Adadelta(Optimizer):
         lr: float >= 0. Learning rate.
             It is recommended to leave it at the default value.
         rho: float >= 0.
-        epsilon: float >= 0. Fuzz factor.
+        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
         decay: float >= 0. Learning rate decay over each update.
     # References
         - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
     """

-    def __init__(self, lr=1.0, rho=0.95, epsilon=1e-8, decay=0.,
+    def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0.,
                  **kwargs):
         super(Adadelta, self).__init__(**kwargs)
         with K.name_scope(self.__class__.__name__):
             self.lr = K.variable(lr, name='lr')
             self.decay = K.variable(decay, name='decay')
             self.iterations = K.variable(0, dtype='int64', name='iterations')
+        if epsilon is None:
+            epsilon = K.epsilon()
         self.rho = rho
         self.epsilon = epsilon
         self.initial_decay = decay
@@ -392,7 +398,7 @@ class Adam(Optimizer):
         lr: float >= 0. Learning rate.
         beta_1: float, 0 < beta < 1. Generally close to 1.
         beta_2: float, 0 < beta < 1. Generally close to 1.
-        epsilon: float >= 0. Fuzz factor.
+        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
         decay: float >= 0. Learning rate decay over each update.
         amsgrad: boolean. Whether to apply the AMSGrad variant of this
             algorithm from the paper "On the Convergence of Adam and
@@ -404,14 +410,16 @@ class Adam(Optimizer):
     """

     def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999,
-                 epsilon=1e-8, decay=0., amsgrad=False, **kwargs):
+                 epsilon=None, decay=0., amsgrad=False, **kwargs):
         super(Adam, self).__init__(**kwargs)
         with K.name_scope(self.__class__.__name__):
             self.iterations = K.variable(0, dtype='int64', name='iterations')
             self.lr = K.variable(lr, name='lr')
             self.beta_1 = K.variable(beta_1, name='beta_1')
             self.beta_2 = K.variable(beta_2, name='beta_2')
             self.decay = K.variable(decay, name='decay')
+        if epsilon is None:
+            epsilon = K.epsilon()
         self.epsilon = epsilon
         self.initial_decay = decay
         self.amsgrad = amsgrad

@@ -479,22 +487,24 @@ class Adamax(Optimizer):
     # Arguments
         lr: float >= 0. Learning rate.
         beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
-        epsilon: float >= 0. Fuzz factor.
+        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
         decay: float >= 0. Learning rate decay over each update.
     # References
         - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
     """

     def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
-                 epsilon=1e-8, decay=0., **kwargs):
+                 epsilon=None, decay=0., **kwargs):
         super(Adamax, self).__init__(**kwargs)
         with K.name_scope(self.__class__.__name__):
             self.iterations = K.variable(0, dtype='int64', name='iterations')
             self.lr = K.variable(lr, name='lr')
             self.beta_1 = K.variable(beta_1, name='beta_1')
             self.beta_2 = K.variable(beta_2, name='beta_2')
             self.decay = K.variable(decay, name='decay')
+        if epsilon is None:
+            epsilon = K.epsilon()
         self.epsilon = epsilon
         self.initial_decay = decay

@@ -558,22 +568,24 @@ class Nadam(Optimizer):
     # Arguments
         lr: float >= 0. Learning rate.
         beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
-        epsilon: float >= 0. Fuzz factor.
+        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
     # References
         - [Nadam report](http://cs229.stanford.edu/proj2015/054_report.pdf)
         - [On the importance of initialization and momentum in deep learning](http://www.cs.toronto.edu/~fritz/absps/momentum.pdf)
     """

     def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
-                 epsilon=1e-8, schedule_decay=0.004, **kwargs):
+                 epsilon=None, schedule_decay=0.004, **kwargs):
         super(Nadam, self).__init__(**kwargs)
         with K.name_scope(self.__class__.__name__):
             self.iterations = K.variable(0, dtype='int64', name='iterations')
             self.m_schedule = K.variable(1., name='m_schedule')
             self.lr = K.variable(lr, name='lr')
             self.beta_1 = K.variable(beta_1, name='beta_1')
             self.beta_2 = K.variable(beta_2, name='beta_2')
+        if epsilon is None:
+            epsilon = K.epsilon()
         self.epsilon = epsilon
         self.schedule_decay = schedule_decay

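With these changes the optimizers keep working as before when constructed with an explicit epsilon, while the default now tracks the backend setting. A brief usage sketch against the post-commit API:

    from keras.optimizers import Adam

    opt_default = Adam()               # epsilon resolves to K.epsilon(), i.e. 1e-7
    opt_explicit = Adam(epsilon=1e-8)  # reproduces the old hard-coded default
    print(opt_default.epsilon, opt_explicit.epsilon)   # 1e-07 1e-08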
