Commit

further refactoring
mohammad committed Dec 25, 2020
1 parent dfd8ed4 commit 0888a3e
Showing 4 changed files with 194 additions and 157 deletions.
80 changes: 80 additions & 0 deletions megatron/optimizer/__init__.py
@@ -0,0 +1,80 @@
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from apex.optimizers import FusedAdam as Adam
from megatron import get_args
from megatron.model import import_layernorm

from .grad_scaler import ConstantGradScaler, DynamicGradScaler
from .optimizer import FP16OptimizerWithFP16Params, FP32Optimizer


def _get_params_for_weight_decay_optimization(module):
    """Divide params into with-weight-decay and without-weight-decay groups.
    Layernorms and biases will have no weight decay but the rest will.
    """

    args = get_args()
    LayerNorm = import_layernorm(args.fp32_residual_connection)

    weight_decay_params = {'params': []}
    no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
    for module_ in module.modules():
        if isinstance(module_, LayerNorm):
            no_weight_decay_params['params'].extend(
                [p for p in list(module_._parameters.values())
                 if p is not None])
        else:
            weight_decay_params['params'].extend(
                [p for n, p in list(module_._parameters.items())
                 if p is not None and n != 'bias'])
            no_weight_decay_params['params'].extend(
                [p for n, p in list(module_._parameters.items())
                 if p is not None and n == 'bias'])

    return weight_decay_params, no_weight_decay_params


def get_megatron_optimizer(model):

    args = get_args()

    # Base optimizer.
    param_groups = _get_params_for_weight_decay_optimization(model)
    optimizer = Adam(param_groups,
                     lr=args.lr,
                     weight_decay=args.weight_decay,
                     betas=(args.adam_beta1, args.adam_beta2),
                     eps=args.adam_eps)

    if args.fp16:
        # Constant loss scale.
        if args.loss_scale:
            grad_scaler = ConstantGradScaler(args.loss_scale)
        # Dynamic loss scale.
        else:
            grad_scaler = DynamicGradScaler(
                initial_scale=args.initial_loss_scale,
                min_scale=args.min_loss_scale,
                growth_factor=2.0,
                backoff_factor=0.5,
                growth_interval=args.loss_scale_window,
                hysteresis=args.hysteresis)
        # Megatron optimizer.
        return FP16OptimizerWithFP16Params(optimizer, grad_scaler,
                                           args.clip_grad)

    # FP32.
    return FP32Optimizer(optimizer, model, args.clip_grad)
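
For illustration, the same grouping pattern can be reproduced with plain PyTorch; the sketch below uses torch.nn.LayerNorm and torch.optim.AdamW as stand-ins for Megatron's fused LayerNorm and Apex's FusedAdam, and the toy model and hyperparameters are assumptions, not part of this commit.

# Standalone sketch of the weight-decay grouping above (illustrative only).
import torch

model = torch.nn.Sequential(
    torch.nn.Linear(16, 16),
    torch.nn.LayerNorm(16),
    torch.nn.Linear(16, 4),
)

weight_decay_params = {'params': []}
no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
for module_ in model.modules():
    if isinstance(module_, torch.nn.LayerNorm):
        # LayerNorm weights and biases are exempt from weight decay.
        no_weight_decay_params['params'].extend(
            p for p in module_._parameters.values() if p is not None)
    else:
        # Biases are exempt; all other parameters get weight decay.
        weight_decay_params['params'].extend(
            p for n, p in module_._parameters.items()
            if p is not None and n != 'bias')
        no_weight_decay_params['params'].extend(
            p for n, p in module_._parameters.items()
            if p is not None and n == 'bias')

# The first group inherits the optimizer-level weight_decay; the second
# group overrides it with 0.0.
optimizer = torch.optim.AdamW(
    [weight_decay_params, no_weight_decay_params],
    lr=1e-3, weight_decay=0.01)

Because the second group carries an explicit weight_decay of 0.0, a single optimizer instance applies decay to ordinary weights while leaving LayerNorm parameters and biases untouched.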
113 changes: 113 additions & 0 deletions megatron/optimizer/grad_scaler.py
@@ -0,0 +1,113 @@
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Megatron grad scaler."""

from abc import ABC
from abc import abstractmethod

import torch


class MegatronGradScaler(ABC):

    def __init__(self, initial_scale):
        """Initialize scale value with the input initial scale."""
        assert initial_scale > 0.0
        self._scale = torch.cuda.FloatTensor([initial_scale])

    @property
    def scale(self):
        return self._scale

    @property
    def inv_scale(self):
        return self._scale.double().reciprocal().float()

    @abstractmethod
    def update(self, found_inf):
        pass

    '''
    @abstractmethod
    def state_dict(self):
        pass
    @abstractmethod
    def load_state_dict(self, state_dict):
        pass
    '''


class ConstantGradScaler(MegatronGradScaler):

    def update(self, found_inf):
        pass


class DynamicGradScaler(MegatronGradScaler):

    def __init__(self, initial_scale, min_scale,
                 growth_factor, backoff_factor,
                 growth_interval, hysteresis):
        """Grad scaler with dynamic scale that gets adjusted
        during training."""
        super(DynamicGradScaler, self).__init__(initial_scale)

        # Lower bound on the scale.
        assert min_scale > 0.0
        assert min_scale <= initial_scale
        self.min_scale = torch.cuda.FloatTensor([min_scale])
        # Growth and backoff factors for the scale.
        assert growth_factor > 1.0
        self.growth_factor = torch.cuda.FloatTensor([growth_factor])
        assert backoff_factor < 1.0
        assert backoff_factor > 0.0
        self.backoff_factor = torch.cuda.FloatTensor([backoff_factor])
        # Interval over which if we don't see any inf/nan,
        # we will scale the grad scale by the growth factor.
        assert growth_interval > 0
        self.growth_interval = growth_interval
        # Number of inf/nans we should see before scaling down
        # the grad scale by the backoff factor.
        assert hysteresis > 0
        self.hysteresis = hysteresis

        # Trackers.
        self._growth_tracker = 0
        self._hysteresis_tracker = self.hysteresis


    def update(self, found_inf):

        # If we have an inf/nan, growth tracker is set to 0
        # and hysteresis tracker is reduced by 1.
        if found_inf:
            self._growth_tracker = 0
            self._hysteresis_tracker -= 1
            # Now if we are out of hysteresis count, scale down the loss scale.
            if self._hysteresis_tracker <= 0:
                self._scale = torch.max(self._scale * self.backoff_factor,
                                        self.min_scale)
        else:
            # If there is no nan/inf, increment the growth tracker.
            self._growth_tracker += 1
            # If we have had enough consecutive intervals with no nan/inf:
            if self._growth_tracker == self.growth_interval:
                # Reset the tracker and hysteresis trackers,
                self._growth_tracker = 0
                self._hysteresis_tracker = self.hysteresis
                # and scale up the loss scale.
                self._scale = self._scale * self.growth_factor
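
As a rough sketch of the dynamic scaling behaviour, the snippet below drives update() by hand; it assumes a CUDA device is available (the scaler stores its state in torch.cuda.FloatTensor) and uses deliberately small growth_interval and hysteresis values so the effect shows up within a few iterations.

# Sketch: exercise DynamicGradScaler.update() to watch the scale adjust.
import torch
from megatron.optimizer.grad_scaler import DynamicGradScaler

scaler = DynamicGradScaler(initial_scale=2.0 ** 16, min_scale=1.0,
                           growth_factor=2.0, backoff_factor=0.5,
                           growth_interval=4, hysteresis=2)

# Eight overflow-free steps: the scale doubles after every 4 clean updates.
for _ in range(8):
    scaler.update(found_inf=False)
print(scaler.scale)      # tensor([262144.], ...)  i.e. 2**18

# One overflow only consumes hysteresis; a second one backs the scale off.
scaler.update(found_inf=True)
scaler.update(found_inf=True)
print(scaler.scale)      # tensor([131072.], ...)  i.e. 2**17

# The inverse scale is what is used to unscale fp16 gradients.
print(scaler.inv_scale)

In real training, found_inf comes from checking the fp16 gradients after the backward pass, and inv_scale is used to unscale them before the optimizer step.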
156 changes: 0 additions & 156 deletions megatron/optimizer/optimizer.py
@@ -22,166 +22,10 @@
from torch._six import inf

from apex.multi_tensor_apply import multi_tensor_applier
from apex.optimizers import FusedAdam as Adam
import amp_C

from megatron import get_args
from megatron import get_timers
from megatron import mpu
from megatron.model import import_layernorm


def get_params_for_weight_decay_optimization(module):
    """Divide params into with-weight-decay and without-weight-decay groups.
    Layernorms and biases will have no weight decay but the rest will.
    """

    args = get_args()
    LayerNorm = import_layernorm(args.fp32_residual_connection)

    weight_decay_params = {'params': []}
    no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
    for module_ in module.modules():
        if isinstance(module_, LayerNorm):
            no_weight_decay_params['params'].extend(
                [p for p in list(module_._parameters.values())
                 if p is not None])
        else:
            weight_decay_params['params'].extend(
                [p for n, p in list(module_._parameters.items())
                 if p is not None and n != 'bias'])
            no_weight_decay_params['params'].extend(
                [p for n, p in list(module_._parameters.items())
                 if p is not None and n == 'bias'])

    return weight_decay_params, no_weight_decay_params


def get_megatron_optimizer(model):

    args = get_args()

    # Base optimizer.
    param_groups = get_params_for_weight_decay_optimization(model)
    optimizer = Adam(param_groups,
                     lr=args.lr,
                     weight_decay=args.weight_decay,
                     betas=(args.adam_beta1, args.adam_beta2),
                     eps=args.adam_eps)

    if args.fp16:
        # Constant loss scale.
        if args.loss_scale:
            grad_scaler = ConstantGradScaler(args.loss_scale)
        # Dynamic loss scale.
        else:
            grad_scaler = DynamicGradScaler(
                initial_scale=args.initial_loss_scale,
                min_scale=args.min_loss_scale,
                growth_factor=2.0,
                backoff_factor=0.5,
                growth_interval=args.loss_scale_window,
                hysteresis=args.hysteresis)
        # Megatron optimizer.
        return FP16OptimizerWithFP16Params(optimizer, grad_scaler,
                                           args.clip_grad)

    # FP32.
    return FP32Optimizer(optimizer, model, args.clip_grad)



class MegatronGradScaler(ABC):

    def __init__(self, initial_scale):
        """Initialize scale value with the input initial scale."""
        assert initial_scale > 0.0
        self._scale = torch.cuda.FloatTensor([initial_scale])

    @property
    def scale(self):
        return self._scale

    @property
    def inv_scale(self):
        return self._scale.double().reciprocal().float()

    @abstractmethod
    def update(self, found_inf):
        pass

    '''
    @abstractmethod
    def state_dict(self):
        pass
    @abstractmethod
    def load_state_dict(self, state_dict):
        pass
    '''


class ConstantGradScaler(MegatronGradScaler):

    def update(self, found_inf):
        pass


class DynamicGradScaler(MegatronGradScaler):

    def __init__(self, initial_scale, min_scale,
                 growth_factor, backoff_factor,
                 growth_interval, hysteresis):
        """Grad scaler with dynamic scale that gets adjusted
        during training."""
        super(DynamicGradScaler, self).__init__(initial_scale)

        # Lower bound on the scale.
        assert min_scale > 0.0
        assert min_scale <= initial_scale
        self.min_scale = torch.cuda.FloatTensor([min_scale])
        # Growth and backoff factors for the scale.
        assert growth_factor > 1.0
        self.growth_factor = torch.cuda.FloatTensor([growth_factor])
        assert backoff_factor < 1.0
        assert backoff_factor > 0.0
        self.backoff_factor = torch.cuda.FloatTensor([backoff_factor])
        # Interval over which if we don't see any inf/nan,
        # we will scale the grad scale by the growth factor.
        assert growth_interval > 0
        self.growth_interval = growth_interval
        # Number of inf/nans we should see before scaling down
        # the grad scale by the backoff factor.
        assert hysteresis > 0
        self.hysteresis = hysteresis

        # Trackers.
        self._growth_tracker = 0
        self._hysteresis_tracker = self.hysteresis


    def update(self, found_inf):

        # If we have an inf/nan, growth tracker is set to 0
        # and hysteresis tracker is reduced by 1.
        if found_inf:
            self._growth_tracker = 0
            self._hysteresis_tracker -= 1
            # Now if we are out of hysteresis count, scale down the loss scale.
            if self._hysteresis_tracker <= 0:
                self._scale = torch.max(self._scale * self.backoff_factor,
                                        self.min_scale)
        else:
            # If there is no nan/inf, increment the growth tracker.
            self._growth_tracker += 1
            # If we have had enough consecutive intervals with no nan/inf:
            if self._growth_tracker == self.growth_interval:
                # Reset the tracker and hysteresis trackers,
                self._growth_tracker = 0
                self._hysteresis_tracker = self.hysteresis
                # and scale up the loss scale.
                self._scale = self._scale * self.growth_factor



def _zero_grad_group_helper(group, set_to_none):
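
To show where a grad scaler like the one above fits, here is a generic, hand-rolled sketch of an fp16 loss-scaling step; it is not the FP16OptimizerWithFP16Params implementation kept in this file, the helper name fp16_step is hypothetical, and it assumes the model, loss, and scaler state all live on a CUDA device.

import torch

def fp16_step(model, optimizer, scaler, loss):
    """Generic loss-scaling step (illustrative only)."""
    optimizer.zero_grad()
    # Scale the loss so small fp16 gradients do not underflow.
    (loss * scaler.scale).backward()
    # Check the gradients for inf/nan produced by the scaled backward pass.
    found_inf = any(
        not torch.isfinite(p.grad).all()
        for p in model.parameters() if p.grad is not None)
    if not found_inf:
        # Unscale in place and take the step only on clean iterations.
        for p in model.parameters():
            if p.grad is not None:
                p.grad.mul_(scaler.inv_scale)
        optimizer.step()
    # Let the scaler grow after clean streaks or back off after overflows.
    scaler.update(found_inf)
    return not found_inf

With ConstantGradScaler the scale stays fixed and overflowing steps are simply skipped, while DynamicGradScaler also adapts the scale as shown earlier.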
2 changes: 1 addition & 1 deletion megatron/training.py
@@ -38,7 +38,7 @@
from megatron.checkpointing import load_checkpoint
from megatron.checkpointing import save_checkpoint
from megatron.fp16 import FP16_Module
from megatron.optimizer.optimizer import get_megatron_optimizer
from megatron.optimizer import get_megatron_optimizer

from megatron.initialize import initialize_megatron
from megatron.initialize import write_args_to_tensorboard
