This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

LBSGD documentation fix #13465

Merged 1 commit on Dec 5, 2018
7 changes: 5 additions & 2 deletions python/mxnet/optimizer/optimizer.py
@@ -686,8 +686,11 @@ class LBSGD(Optimizer):
state = momentum * state + lr * rescale_grad * clip(grad, clip_gradient) + wd * weight
weight = weight - state
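
As an aside, the update rule above can be sketched in plain NumPy. This is a
minimal illustration of the docstring's pseudocode, not MXNet's actual fused
kernel; the function name and NumPy types are assumptions:

    import numpy as np

    def sgd_mom_step(weight, state, grad, lr,
                     momentum=0.9, wd=0.0,
                     rescale_grad=1.0, clip_gradient=None):
        # Mirrors the pseudocode: clip the gradient, rescale it,
        # fold in weight decay, then apply the momentum update.
        if clip_gradient is not None:
            grad = np.clip(grad, -clip_gradient, clip_gradient)
        state = momentum * state + lr * rescale_grad * grad + wd * weight
        weight = weight - state
        return weight, state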

- For details of the update algorithm see :class:`~mxnet.ndarray.lbsgd_update` and
- :class:`~mxnet.ndarray.lbsgd_mom_update`.
+ For details of the update algorithm see :class:`~mxnet.ndarray.sgd_update`
+ and :class:`~mxnet.ndarray.sgd_mom_update`.
+ In addition to the SGD updates, the LBSGD optimizer uses LARS (Layer-wise
+ Adaptive Rate Scaling) to maintain a separate learning rate for each layer
+ of the network, which leads to better stability with large batch sizes.
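
For context, LARS sets each layer's effective learning rate from the ratio of
the layer's weight norm to its gradient norm. Below is a minimal sketch
following the LARS paper (You et al., 2017); the trust coefficient `eta` and
the epsilon term are assumptions, and MXNet's implementation may differ in
detail:

    import numpy as np

    def lars_layer_lr(weight, grad, base_lr, eta=0.001, wd=0.0, eps=1e-9):
        # Trust ratio: ||w|| / (||g|| + wd * ||w||), scaled by eta.
        w_norm = np.linalg.norm(weight)
        g_norm = np.linalg.norm(grad)
        if w_norm > 0.0 and g_norm > 0.0:
            return base_lr * eta * w_norm / (g_norm + wd * w_norm + eps)
        return base_lr  # fall back to the global rate for degenerate layers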

This optimizer accepts the following parameters in addition to those accepted
by :class:`.Optimizer`.
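
A hypothetical usage sketch, setting only hyperparameters that appear in the
pseudocode above (the truncated diff does not show LBSGD's own extra
parameters, so none of those are guessed here):

    import mxnet as mx

    # Construct the optimizer; learning_rate, wd, and clip_gradient come
    # from the Optimizer base class, momentum from the SGD-style update.
    opt = mx.optimizer.LBSGD(learning_rate=0.1, momentum=0.9,
                             wd=1e-4, clip_gradient=5.0)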