@@ -546,6 +546,42 @@ def update(self, index, weight, grad, state):
         mean[:] = mean_t
         variance[:] = variance_t
 
+@register
+class AdaGrad(Optimizer):
+    """AdaGrad optimizer of Duchi et al., 2011.
+
+    This code follows the version in http://arxiv.org/pdf/1212.5701v1.pdf Eq(5)
+    by Matthew D. Zeiler, 2012. AdaGrad can help the network converge faster
+    in some cases.
+
+    Parameters
+    ----------
+    learning_rate : float, optional
+        Step size.
+        Default value is set to 0.05.
+    wd : float, optional
+        L2 regularization coefficient added to all the weights.
+    rescale_grad : float, optional
+        Rescaling factor of the gradient.
+    eps : float, optional
+        A small float number to keep the update numerically stable.
+        Default value is set to 1e-7.
+    """
+    def __init__(self, learning_rate=0.05, wd=0., rescale_grad=1, eps=1e-7, arg_names=None):
+        super(AdaGrad, self).__init__(rescale_grad, arg_names, wd)
+        self.lr = learning_rate
+        self.float_stable_eps = eps
+        self.rescale_grad = rescale_grad
+    def create_state(self, index, weight):
+        return zeros(weight.shape, weight.context)  # history of squared gradients
+    def update(self, index, weight, grad, state):
+        assert(isinstance(weight, NDArray))
+        assert(isinstance(grad, NDArray))
+        grad = grad * self.rescale_grad
+        history = state
+        history[:] += (grad * grad)
+        weight[:] += -self.lr * (grad / sqrt(history + self.float_stable_eps) + self.wd * weight)
+
 @register
 class RMSProp(Optimizer):
     """RMSProp optimizer of Tieleman & Hinton, 2012,
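For context, a minimal usage sketch of the added class (illustrative only, not part of the diff): it drives AdaGrad directly through the create_state/update interface shown above. The import paths and the ones() helper are assumptions based on how this file already uses zeros, sqrt, and NDArray from mxnet.ndarray.

    # Illustrative sketch only -- not part of this change. Assumes the AdaGrad
    # class above lives in mxnet.optimizer and that mxnet.ndarray provides
    # ones(), mirroring the zeros()/sqrt() helpers the file already uses.
    from mxnet.ndarray import ones
    from mxnet.optimizer import AdaGrad

    weight = ones((2, 3))                 # parameter, updated in place
    grad = ones((2, 3)) * 0.1             # gradient for this step
    opt = AdaGrad(learning_rate=0.05, wd=0.0, eps=1e-7)
    state = opt.create_state(0, weight)   # per-weight history of squared gradients
    opt.update(0, weight, grad, state)    # w[:] -= lr * (g / sqrt(history + eps) + wd * w)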