Docstring updates
definitelynotmcarilli committed Aug 27, 2019
1 parent ea7c209 commit 17e8a55
Showing 4 changed files with 30 additions and 22 deletions.
16 changes: 9 additions & 7 deletions apex/optimizers/fused_adam.py
@@ -6,31 +6,33 @@ class FusedAdam(torch.optim.Optimizer):

"""Implements Adam algorithm.
Currently GPU-only. Requires Apex to be installed via
Currently GPU-only. Requires Apex to be installed via
``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.
This version of fused Adam implements 2 fusions:
- Fusion of the Adam update's elementwise operations
- A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
This version of fused Adam implements 2 fusions.
:class:`apex.optimizers.FusedAdam` may be used as a drop-in replacement for torch.optim.Adam::
* Fusion of the Adam update's elementwise operations
* A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
:class:`apex.optimizers.FusedAdam` may be used as a drop-in replacement for ``torch.optim.Adam``::
opt = apex.optimizers.FusedAdam(model.parameters(), lr = ....)
...
opt.step()
:class:`apex.optimizers.FusedAdam` may be used with or without Amp. If you wish to use :class:`FusedAdam` with Amp,
you may choose any `opt_level`::
opt = apex.optimizers.FusedAdam(model.parameters(), lr = ....)
model, opt = amp.initialize(model, opt, opt_level="O0" or "O1 or "O2")
...
opt.step()
In general, `opt_level="O1"` is recommended.
In general, ``opt_level="O1"`` is recommended.
.. warning::
A previous version of :class:`FusedAdam` allowed a number of additional arguments to `step`. These additional arguments
A previous version of :class:`FusedAdam` allowed a number of additional arguments to ``step``. These additional arguments
are now deprecated and unnecessary.
Adam was been proposed in `Adam: A Method for Stochastic Optimization`_.
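
For illustration, a minimal runnable sketch of the FusedAdam-with-Amp usage documented above, assuming Apex is built with the C++/CUDA extensions and a CUDA device is available; the toy ``torch.nn.Linear`` model, tensor shapes, and learning rate are placeholders, and the backward pass follows the usual Amp ``amp.scale_loss`` pattern::

    import torch
    from apex import amp
    from apex.optimizers import FusedAdam

    model = torch.nn.Linear(16, 4).cuda()        # FusedAdam is GPU-only
    opt = FusedAdam(model.parameters(), lr=1e-3)

    # opt_level="O1" is the generally recommended setting
    model, opt = amp.initialize(model, opt, opt_level="O1")

    x = torch.randn(8, 16, device="cuda")
    loss = model(x).sum()
    with amp.scale_loss(loss, opt) as scaled_loss:   # Amp-scaled backward
        scaled_loss.backward()
    opt.step()
    opt.zero_grad()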
12 changes: 7 additions & 5 deletions apex/optimizers/fused_lamb.py
@@ -8,9 +8,10 @@ class FusedLAMB(torch.optim.Optimizer):
  Currently GPU-only. Requires Apex to be installed via
  ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.

- This version of fused LAMB implements 2 fusions:
- - Fusion of the LAMB update's elementwise operations
- - A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
+ This version of fused LAMB implements 2 fusions.
+
+ * Fusion of the LAMB update's elementwise operations
+ * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.

  :class:`apex.optimizers.FusedLAMB`'s usage is identical to any ordinary PyTorch optimizer::
@@ -20,12 +21,13 @@ class FusedLAMB(torch.optim.Optimizer):
  :class:`apex.optimizers.FusedLAMB` may be used with or without Amp. If you wish to use :class:`FusedLAMB` with Amp,
  you may choose any `opt_level`::

      opt = apex.optimizers.FusedLAMB(model.parameters(), lr = ....)
      model, opt = amp.initialize(model, opt, opt_level="O0" or "O1" or "O2")
      ...
      opt.step()

- In general, `opt_level="O1"` is recommended.
+ In general, ``opt_level="O1"`` is recommended.

  LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.
@@ -50,7 +52,7 @@ class FusedLAMB(torch.optim.Optimizer):
      max_grad_norm (float, optional): value used to clip global grad norm
          (default: 1.0)

- .. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes
+ .. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes:
      https://arxiv.org/abs/1904.00962
  .. _On the Convergence of Adam and Beyond:
      https://openreview.net/forum?id=ryQu7f-RZ
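
For illustration, a sketch of the plain (no-Amp) FusedLAMB usage described above, assuming Apex is built with the C++/CUDA extensions and a CUDA device is available; the toy model, batch size, and learning rate are placeholders::

    import torch
    from apex.optimizers import FusedLAMB

    model = torch.nn.Linear(16, 4).cuda()          # FusedLAMB is GPU-only
    opt = FusedLAMB(model.parameters(), lr=1e-3)   # constructed like any ordinary PyTorch optimizer

    for _ in range(3):                             # toy training loop
        x = torch.randn(8, 16, device="cuda")
        loss = model(x).sum()
        loss.backward()
        opt.step()
        opt.zero_grad()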
10 changes: 6 additions & 4 deletions apex/optimizers/fused_novograd.py
@@ -8,9 +8,10 @@ class FusedNovoGrad(torch.optim.Optimizer):
  Currently GPU-only. Requires Apex to be installed via
  ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.

- This version of fused NovoGrad implements 2 fusions:
- - Fusion of the NovoGrad update's elementwise operations
- - A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
+ This version of fused NovoGrad implements 2 fusions.
+
+ * Fusion of the NovoGrad update's elementwise operations
+ * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.

  :class:`apex.optimizers.FusedNovoGrad`'s usage is identical to any PyTorch optimizer::
@@ -20,12 +21,13 @@ class FusedNovoGrad(torch.optim.Optimizer):
  :class:`apex.optimizers.FusedNovoGrad` may be used with or without Amp. If you wish to use :class:`FusedNovoGrad` with Amp,
  you may choose any `opt_level`::

      opt = apex.optimizers.FusedNovoGrad(model.parameters(), lr = ....)
      model, opt = amp.initialize(model, opt, opt_level="O0" or "O1" or "O2")
      ...
      opt.step()

- In general, `opt_level="O1"` is recommended.
+ In general, ``opt_level="O1"`` is recommended.

  It has been proposed in `Jasper: An End-to-End Convolutional Neural Acoustic Model`_.
  More info: https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html#novograd
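
For illustration, a sketch of FusedNovoGrad under Amp at the recommended ``opt_level="O1"``, assuming Apex is built with the C++/CUDA extensions and a CUDA device is available; the toy model, shapes, and learning rate are placeholders::

    import torch
    from apex import amp
    from apex.optimizers import FusedNovoGrad

    model = torch.nn.Linear(16, 4).cuda()              # FusedNovoGrad is GPU-only
    opt = FusedNovoGrad(model.parameters(), lr=1e-2)
    model, opt = amp.initialize(model, opt, opt_level="O1")

    x = torch.randn(8, 16, device="cuda")
    loss = model(x).sum()
    with amp.scale_loss(loss, opt) as scaled_loss:      # Amp-scaled backward
        scaled_loss.backward()
    opt.step()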
14 changes: 8 additions & 6 deletions apex/optimizers/fused_sgd.py
@@ -6,27 +6,29 @@
  class FusedSGD(Optimizer):
  r"""Implements stochastic gradient descent (optionally with momentum).

- Currently GPU-only. Requires Apex to be installed via
+ Currently GPU-only. Requires Apex to be installed via
  ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.

- This version of fused SGD implements 2 fusions:
- - Fusion of the SGD update's elementwise operations
- - A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
- :class:`apex.optimizers.FusedSGD` may be used as a drop-in replacement for torch.optim.SGD::
+ This version of fused SGD implements 2 fusions.
+
+ * Fusion of the SGD update's elementwise operations
+ * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
+
+ :class:`apex.optimizers.FusedSGD` may be used as a drop-in replacement for ``torch.optim.SGD``::

      opt = apex.optimizers.FusedSGD(model.parameters(), lr = ....)
      ...
      opt.step()

  :class:`apex.optimizers.FusedSGD` may be used with or without Amp. If you wish to use :class:`FusedSGD` with Amp,
  you may choose any `opt_level`::

      opt = apex.optimizers.FusedSGD(model.parameters(), lr = ....)
      model, opt = amp.initialize(model, opt, opt_level="O0" or "O1" or "O2")
      ...
      opt.step()

- In general, `opt_level="O1"` is recommended.
+ In general, ``opt_level="O1"`` is recommended.

  Nesterov momentum is based on the formula from
  `On the importance of initialization and momentum in deep learning`__.
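
For illustration, a sketch of FusedSGD as a drop-in replacement for ``torch.optim.SGD``, assuming Apex is built with the C++/CUDA extensions and a CUDA device is available; the toy model, shapes, and the ``momentum``/``nesterov`` values are placeholders chosen to mirror common ``torch.optim.SGD`` arguments::

    import torch
    from apex.optimizers import FusedSGD

    model = torch.nn.Linear(16, 4).cuda()   # FusedSGD is GPU-only
    opt = FusedSGD(model.parameters(), lr=0.1, momentum=0.9, nesterov=True)

    x = torch.randn(8, 16, device="cuda")
    loss = model(x).sum()
    loss.backward()
    opt.step()
    opt.zero_grad()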
