Replace literals with K.epsilon() (keras-team#8710)
* Replace literal 1e-7 with K.epsilon()

* Replace literal 1e-3 with K.epsilon()

* Replace literals 1e-8 with K.epsilon()

* Revert using K.epsilon() for BatchNormalization

* Replace literals with K.epsilon()

* Don't call K.epsilon() in signatures

* Add default epsilon to docstrings
ozabluda authored and fchollet committed Dec 11, 2017
1 parent cb3ee31 commit 321d838
Showing 4 changed files with 29 additions and 17 deletions.
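
The change follows one pattern throughout: the literal fuzz factor is removed from the function signature (so the backend setting is read when the object is constructed, not once at import time) and `None` is resolved to `K.epsilon()` inside the body. A minimal sketch of the idiom, using a hypothetical `Example` class that is not part of this diff:

    from keras import backend as K

    class Example(object):
        """Hypothetical class illustrating the idiom used in this commit."""

        def __init__(self, epsilon=None):
            # Resolve the default here, not in the signature, so the current
            # value of K.epsilon() is picked up when the object is created.
            if epsilon is None:
                epsilon = K.epsilon()
            self.epsilon = epsilon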
4 changes: 2 additions & 2 deletions examples/conv_filter_visualization.py
@@ -26,7 +26,7 @@
 def deprocess_image(x):
     # normalize tensor: center on 0., ensure std is 0.1
     x -= x.mean()
-    x /= (x.std() + 1e-5)
+    x /= (x.std() + K.epsilon())
     x *= 0.1

     # clip to [0, 1]
@@ -55,7 +55,7 @@ def deprocess_image(x):

 def normalize(x):
     # utility function to normalize a tensor by its L2 norm
-    return x / (K.sqrt(K.mean(K.square(x))) + 1e-5)
+    return x / (K.sqrt(K.mean(K.square(x))) + K.epsilon())


 kept_filters = []
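
In this example the replaced literal was 1e-5, while `K.epsilon()` returns the backend's configurable fuzz factor, which defaults to 1e-7, so the example's behaviour shifts slightly unless the user overrides the setting. A short sketch of inspecting and changing that value through the standard backend API:

    from keras import backend as K

    print(K.epsilon())    # 1e-07 by default
    K.set_epsilon(1e-5)   # restores the value this example previously hard-coded
    print(K.epsilon())    # 1e-05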
2 changes: 1 addition & 1 deletion examples/deep_dream.py
@@ -96,7 +96,7 @@ def deprocess_image(x):
 # Compute the gradients of the dream wrt the loss.
 grads = K.gradients(loss, dream)[0]
 # Normalize gradients.
-grads /= K.maximum(K.mean(K.abs(grads)), 1e-7)
+grads /= K.maximum(K.mean(K.abs(grads)), K.epsilon())

 # Set up function to retrieve the value
 # of the loss and gradients given an input image.
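
The `K.maximum(..., K.epsilon())` floor keeps the gradient normalization from exploding when the mean gradient magnitude is close to zero. A rough NumPy illustration of the same idea (not part of the diff; the 1e-7 stands in for `K.epsilon()`):

    import numpy as np

    grads = np.array([1e-12, -1e-12, 2e-12])  # pathologically small gradients
    eps = 1e-7                                 # stand-in for K.epsilon()

    naive = grads / np.mean(np.abs(grads))                     # blows up to ~1e0 scale
    guarded = grads / np.maximum(np.mean(np.abs(grads)), eps)  # stays at ~1e-5 scale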
4 changes: 2 additions & 2 deletions keras/backend/tensorflow_backend.py
@@ -3756,7 +3756,7 @@ def ctc_batch_cost(y_true, y_pred, input_length, label_length):
     input_length = tf.to_int32(tf.squeeze(input_length))
     sparse_labels = tf.to_int32(ctc_label_dense_to_sparse(y_true, label_length))

-    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
+    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + epsilon())

     return tf.expand_dims(ctc.ctc_loss(inputs=y_pred,
                                        labels=sparse_labels,
@@ -3792,7 +3792,7 @@ def ctc_decode(y_pred, input_length, greedy=True, beam_width=100,
             Tensor `(top_paths, )` that contains
                 the log probability of each decoded sequence.
     """
-    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
+    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + epsilon())
     input_length = tf.to_int32(input_length)

     if greedy:
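
In `ctc_batch_cost` and `ctc_decode` the epsilon keeps `tf.log` away from exactly zero probabilities, where the logarithm is -inf. A small NumPy sketch of the effect, assuming the default epsilon of 1e-7:

    import numpy as np

    probs = np.array([0.0, 1e-9, 0.5])
    print(np.log(probs))          # [-inf, -20.7, -0.69] plus a divide-by-zero warning
    print(np.log(probs + 1e-7))   # [-16.1, -16.1, -0.69], finite everywhere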
36 changes: 24 additions & 12 deletions keras/optimizers.py
@@ -203,21 +203,23 @@ class RMSprop(Optimizer):
     # Arguments
         lr: float >= 0. Learning rate.
         rho: float >= 0.
-        epsilon: float >= 0. Fuzz factor.
+        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
         decay: float >= 0. Learning rate decay over each update.
     # References
         - [rmsprop: Divide the gradient by a running average of its recent magnitude](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
     """

-    def __init__(self, lr=0.001, rho=0.9, epsilon=1e-8, decay=0.,
+    def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0.,
                  **kwargs):
         super(RMSprop, self).__init__(**kwargs)
         with K.name_scope(self.__class__.__name__):
             self.lr = K.variable(lr, name='lr')
             self.rho = K.variable(rho, name='rho')
             self.decay = K.variable(decay, name='decay')
             self.iterations = K.variable(0, dtype='int64', name='iterations')
+        if epsilon is None:
+            epsilon = K.epsilon()
         self.epsilon = epsilon
         self.initial_decay = decay

@@ -263,19 +265,21 @@ class Adagrad(Optimizer):
     # Arguments
         lr: float >= 0. Learning rate.
-        epsilon: float >= 0.
+        epsilon: float >= 0. If `None`, defaults to `K.epsilon()`.
         decay: float >= 0. Learning rate decay over each update.
     # References
         - [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
     """

-    def __init__(self, lr=0.01, epsilon=1e-8, decay=0., **kwargs):
+    def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
         super(Adagrad, self).__init__(**kwargs)
         with K.name_scope(self.__class__.__name__):
             self.lr = K.variable(lr, name='lr')
             self.decay = K.variable(decay, name='decay')
             self.iterations = K.variable(0, dtype='int64', name='iterations')
+        if epsilon is None:
+            epsilon = K.epsilon()
         self.epsilon = epsilon
         self.initial_decay = decay

@@ -322,20 +326,22 @@ class Adadelta(Optimizer):
         lr: float >= 0. Learning rate.
             It is recommended to leave it at the default value.
         rho: float >= 0.
-        epsilon: float >= 0. Fuzz factor.
+        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
         decay: float >= 0. Learning rate decay over each update.
     # References
         - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
     """

-    def __init__(self, lr=1.0, rho=0.95, epsilon=1e-8, decay=0.,
+    def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0.,
                  **kwargs):
         super(Adadelta, self).__init__(**kwargs)
         with K.name_scope(self.__class__.__name__):
             self.lr = K.variable(lr, name='lr')
             self.decay = K.variable(decay, name='decay')
             self.iterations = K.variable(0, dtype='int64', name='iterations')
+        if epsilon is None:
+            epsilon = K.epsilon()
         self.rho = rho
         self.epsilon = epsilon
         self.initial_decay = decay
@@ -392,7 +398,7 @@ class Adam(Optimizer):
         lr: float >= 0. Learning rate.
         beta_1: float, 0 < beta < 1. Generally close to 1.
         beta_2: float, 0 < beta < 1. Generally close to 1.
-        epsilon: float >= 0. Fuzz factor.
+        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
         decay: float >= 0. Learning rate decay over each update.
         amsgrad: boolean. Whether to apply the AMSGrad variant of this
             algorithm from the paper "On the Convergence of Adam and
@@ -404,14 +410,16 @@ class Adam(Optimizer):
     """

     def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999,
-                 epsilon=1e-8, decay=0., amsgrad=False, **kwargs):
+                 epsilon=None, decay=0., amsgrad=False, **kwargs):
         super(Adam, self).__init__(**kwargs)
         with K.name_scope(self.__class__.__name__):
             self.iterations = K.variable(0, dtype='int64', name='iterations')
             self.lr = K.variable(lr, name='lr')
             self.beta_1 = K.variable(beta_1, name='beta_1')
             self.beta_2 = K.variable(beta_2, name='beta_2')
             self.decay = K.variable(decay, name='decay')
+        if epsilon is None:
+            epsilon = K.epsilon()
         self.epsilon = epsilon
         self.initial_decay = decay
         self.amsgrad = amsgrad

@@ -479,22 +487,24 @@ class Adamax(Optimizer):
     # Arguments
         lr: float >= 0. Learning rate.
         beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
-        epsilon: float >= 0. Fuzz factor.
+        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
         decay: float >= 0. Learning rate decay over each update.
     # References
         - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
     """

     def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
-                 epsilon=1e-8, decay=0., **kwargs):
+                 epsilon=None, decay=0., **kwargs):
         super(Adamax, self).__init__(**kwargs)
         with K.name_scope(self.__class__.__name__):
             self.iterations = K.variable(0, dtype='int64', name='iterations')
             self.lr = K.variable(lr, name='lr')
             self.beta_1 = K.variable(beta_1, name='beta_1')
             self.beta_2 = K.variable(beta_2, name='beta_2')
             self.decay = K.variable(decay, name='decay')
+        if epsilon is None:
+            epsilon = K.epsilon()
         self.epsilon = epsilon
         self.initial_decay = decay

@@ -558,22 +568,24 @@ class Nadam(Optimizer):
     # Arguments
         lr: float >= 0. Learning rate.
         beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
-        epsilon: float >= 0. Fuzz factor.
+        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
     # References
         - [Nadam report](http://cs229.stanford.edu/proj2015/054_report.pdf)
         - [On the importance of initialization and momentum in deep learning](http://www.cs.toronto.edu/~fritz/absps/momentum.pdf)
     """

     def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
-                 epsilon=1e-8, schedule_decay=0.004, **kwargs):
+                 epsilon=None, schedule_decay=0.004, **kwargs):
         super(Nadam, self).__init__(**kwargs)
         with K.name_scope(self.__class__.__name__):
             self.iterations = K.variable(0, dtype='int64', name='iterations')
             self.m_schedule = K.variable(1., name='m_schedule')
             self.lr = K.variable(lr, name='lr')
             self.beta_1 = K.variable(beta_1, name='beta_1')
             self.beta_2 = K.variable(beta_2, name='beta_2')
+        if epsilon is None:
+            epsilon = K.epsilon()
         self.epsilon = epsilon
         self.schedule_decay = schedule_decay

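With these changes the optimizers keep working as before when constructed with an explicit epsilon, while the default now tracks the backend setting. A brief usage sketch against the post-commit API:

    from keras.optimizers import Adam

    opt_default = Adam()               # epsilon resolves to K.epsilon(), i.e. 1e-7
    opt_explicit = Adam(epsilon=1e-8)  # reproduces the old hard-coded default
    print(opt_default.epsilon, opt_explicit.epsilon)   # 1e-07 1e-08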
