fused layernorm (#1105)
* Add simple util for CUDA timings

* Add fused layernorm kernel from Megatron

Closes #952

* change default fused layernorm to false

* Update test_setup.yml

* Update test_train_base.yml

---------

Co-authored-by: Yang Zhang <[email protected]>
Co-authored-by: jahatef <[email protected]>
Co-authored-by: Jacob Hatef <[email protected]>
4 people committed Jan 26, 2024
1 parent 7a8fa2f commit 3d8fec0
Showing 23 changed files with 285 additions and 1 deletion.
1 change: 1 addition & 0 deletions configs/1-3B.yml
@@ -21,6 +21,7 @@
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,

# init methods
"init_method": "small_init",
1 change: 1 addition & 0 deletions configs/125M-json.yml
@@ -16,6 +16,7 @@
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,

"init_method": "small_init",
"output_layer_init_method": "wang_init",
1 change: 1 addition & 0 deletions configs/125M.yml
@@ -21,6 +21,7 @@
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,

# init methods
"init_method": "small_init",
1 change: 1 addition & 0 deletions configs/13B.yml
@@ -21,6 +21,7 @@
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,

# init methods
"init_method": "small_init",
1 change: 1 addition & 0 deletions configs/175B.yml
@@ -21,6 +21,7 @@
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,

# init methods
"init_method": "small_init",
1 change: 1 addition & 0 deletions configs/19M.yml
@@ -16,6 +16,7 @@
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,

# init methods
"init_method": "small_init",
1 change: 1 addition & 0 deletions configs/2-7B.yml
@@ -21,6 +21,7 @@
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,

# init methods
"init_method": "small_init",
1 change: 1 addition & 0 deletions configs/20B.yml
@@ -31,6 +31,7 @@
"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,
"rope_fusion": false,
"layernorm_fusion": false,

# init methods
"init_method": "small_init",
1 change: 1 addition & 0 deletions configs/350M.yml
@@ -21,6 +21,7 @@
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,

# init methods
"init_method": "small_init",
1 change: 1 addition & 0 deletions configs/49M.yml
@@ -19,6 +19,7 @@
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,

# init methods
"init_method": "small_init",
1 change: 1 addition & 0 deletions configs/6-7B.yml
@@ -21,6 +21,7 @@
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,

# init methods
"init_method": "small_init",
1 change: 1 addition & 0 deletions configs/760M.yml
@@ -21,6 +21,7 @@
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,

# init methods
"init_method": "small_init",
1 change: 1 addition & 0 deletions configs/800M.yml
@@ -16,6 +16,7 @@
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,

# init methods
"init_method": "small_init",
1 change: 1 addition & 0 deletions configs/bf16_125M.yml
@@ -19,6 +19,7 @@
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,


# optimizer settings
1 change: 1 addition & 0 deletions configs/bnb_125M.yml
@@ -20,6 +20,7 @@
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,


# optimizer settings
1 change: 1 addition & 0 deletions configs/slurm_125M.yml
@@ -12,6 +12,7 @@
"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,
"rope_fusion": false,
"layernorm_fusion": false,
"optimizer": {
"type": "Adam",
"params": {
51 changes: 51 additions & 0 deletions megatron/devutil.py
@@ -0,0 +1,51 @@
import torch.cuda


class Metric:
    """
    Dumb utility to collect wall-time measurements and report their deciles.
    """

    def __init__(self, label):
        self.label = label
        self.measurements = []

    def collect(self, measurement):
        self.measurements.append(measurement)

    def get_measurements(self):
        return self.measurements[:]

    def report(self):
        print(
            self.label,
            torch.quantile(torch.tensor(self.measurements), torch.arange(10) / 10.0),
        )


def monitor_method_cuda_wall_times(metric, obj, methodname):
    """
    Measure timings for a method on an object or class.
    For instance:
    >>> metric = Metric('!LNORM')
    >>> monitor_method_cuda_wall_times(metric, LayerNorm, 'forward')
    """
    oldmeth = getattr(obj, methodname)

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    def newmeth(*args, **kw):
        start_event.record()
        try:
            return oldmeth(*args, **kw)
        finally:
            end_event.record()
            torch.cuda.synchronize()
            elapsed = start_event.elapsed_time(end_event)
            metric.collect(elapsed)
            metric.report()

    setattr(obj, methodname, newmeth)
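
A minimal usage sketch for the utility above, assuming a CUDA device is available; the patched class (torch.nn.LayerNorm) and the tensor shapes are illustrative:

import torch
from torch.nn import LayerNorm

from megatron.devutil import Metric, monitor_method_cuda_wall_times

# Patch LayerNorm.forward at the class level so every call records its CUDA wall time.
metric = Metric("!LNORM")
monitor_method_cuda_wall_times(metric, LayerNorm, "forward")

ln = LayerNorm(1024).cuda()
x = torch.randn(8, 2048, 1024, device="cuda")
for _ in range(10):
    ln(x)  # each call collects one measurement; report() prints decile timings

Because the patch is applied to the class, it covers every instance, which is what makes it handy for spot-profiling a whole model.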
150 changes: 150 additions & 0 deletions megatron/model/fused_layer_norm.py
@@ -0,0 +1,150 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""This code is copied from NVIDIA apex:
https://github.com/NVIDIA/apex
with some changes. """

import numbers
import torch
from torch.nn.parameter import Parameter
from torch.nn import init
import importlib
from torch.nn import functional as F
import inspect

from megatron.utils import make_viewless_tensor

try:
    from apex.contrib.layer_norm.layer_norm import FastLayerNormFN

    HAVE_PERSIST_LAYER_NORM = True
except:
    HAVE_PERSIST_LAYER_NORM = False

from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction


global fused_layer_norm_cuda
fused_layer_norm_cuda = None


class MixedFusedLayerNorm(torch.nn.Module):
    def __init__(
        self,
        normalized_shape,
        eps=1e-5,
        no_persist_layer_norm=True,
        sequence_parallel=False,
        apply_layernorm_1p=False,
        mem_efficient_ln=True,
    ):
        super(MixedFusedLayerNorm, self).__init__()

        self.apply_layernorm_1p = apply_layernorm_1p
        self.mem_efficient_ln = mem_efficient_ln

        global fused_layer_norm_cuda
        fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")

        # List of hidden sizes supported in the persistent layer norm kernel.
        # If the hidden size is not supported, fall back to the non-persistent
        # kernel.
        persist_ln_hidden_sizes = [
            1024,
            1536,
            2048,
            2304,
            3072,
            3840,
            4096,
            5120,
            6144,
            8192,
            10240,
            12288,
            12800,
            15360,
            16384,
            18432,
            20480,
            24576,
            25600,
            30720,
            32768,
            40960,
            49152,
            65536,
        ]
        if (
            normalized_shape not in persist_ln_hidden_sizes
            or not HAVE_PERSIST_LAYER_NORM
        ):
            no_persist_layer_norm = True

        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = (normalized_shape,)
        self.normalized_shape = torch.Size(normalized_shape)
        self.eps = eps
        self.weight = Parameter(torch.Tensor(*normalized_shape))
        self.bias = Parameter(torch.Tensor(*normalized_shape))
        self.reset_parameters()
        self.no_persist_layer_norm = no_persist_layer_norm
        self.sequence_parallel = sequence_parallel

        # set sequence parallelism flag on weight and bias parameters
        setattr(self.weight, "sequence_parallel", self.sequence_parallel)
        setattr(self.bias, "sequence_parallel", self.sequence_parallel)

    def reset_parameters(self):

        if self.apply_layernorm_1p:
            init.zeros_(self.weight)
            init.zeros_(self.bias)
        else:
            init.ones_(self.weight)
            init.zeros_(self.bias)

    def forward(self, input):

        weight = self.weight + 1 if self.apply_layernorm_1p else self.weight
        # CPU path is here for unittest sake.
        if not input.is_cuda:
            print(
                "WARNING! The input of FusedLayerNorm should be on the GPU. "
                "This warning should only be triggered in the FusedLayerNorm unit tests."
            )
            return F.layer_norm(
                input, self.normalized_shape, weight, self.bias, self.eps
            )

        if self.no_persist_layer_norm:
            # Apex does not have versions yet (https://github.com/NVIDIA/apex/pull/1648), so we need to inspect
            # the function manually on whether the extra arg introduced in https://github.com/NVIDIA/apex/pull/1715 exists yet
            if (
                "memory_efficient"
                in inspect.getfullargspec(FusedLayerNormAffineFunction.forward).args
            ):
                return FusedLayerNormAffineFunction.apply(
                    input,
                    weight,
                    self.bias,
                    self.normalized_shape,
                    self.eps,
                    self.mem_efficient_ln,
                )
            else:
                return FusedLayerNormAffineFunction.apply(
                    input, weight, self.bias, self.normalized_shape, self.eps
                )
        else:
            output = FastLayerNormFN.apply(input, weight, self.bias, self.eps)

            # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
            # a populated '_base' field). This will result in schedule.py's
            # deallocate_output_tensor() throwing an error, so a viewless tensor is
            # created to prevent this.
            output = make_viewless_tensor(
                inp=output, requires_grad=input.requires_grad, keep_graph=True
            )

            return output
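
A rough sanity check for the module above, assuming apex is installed with its fused_layer_norm_cuda extension and a GPU is present; the hidden size and input shape are illustrative:

import torch
from torch.nn import functional as F

from megatron.model.fused_layer_norm import MixedFusedLayerNorm

hidden = 4096
# Default arguments keep no_persist_layer_norm=True, so the
# FusedLayerNormAffineFunction path is exercised here.
ln = MixedFusedLayerNorm(hidden, eps=1e-5).cuda()

x = torch.randn(4, 2048, hidden, device="cuda")
out = ln(x)

# Compare against the unfused PyTorch op; the two should agree to within
# floating-point tolerance.
ref = F.layer_norm(x, (hidden,), ln.weight, ln.bias, ln.eps)
print((out - ref).abs().max().item())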
3 changes: 2 additions & 1 deletion megatron/model/norms.py
@@ -14,6 +14,7 @@

import torch
from torch.nn import LayerNorm as LayerNorm
from .fused_layer_norm import MixedFusedLayerNorm


def get_norm(neox_args):
@@ -22,7 +23,7 @@ def get_norm(neox_args):
        eps = neox_args.rms_norm_epsilon
    elif neox_args.norm == "layernorm":
        eps = neox_args.layernorm_epsilon
        norm = LayerNorm
        norm = MixedFusedLayerNorm if neox_args.layernorm_fusion else LayerNorm
    elif neox_args.norm == "scalenorm":
        eps = neox_args.scalenorm_epsilon
        norm = ScaleNorm
5 changes: 5 additions & 0 deletions megatron/neox_arguments/neox_args.py
@@ -125,6 +125,11 @@ class NeoXArgsModel(NeoXArgsTemplate):
    Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm".
    """

    layernorm_fusion: bool = False
    """
    Use fused layer norm kernel (if `norm` is `layernorm`).
    """

    use_qk_layernorm: bool = False
    """
    Use QK Normalization
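
Downstream, the new layernorm_fusion flag only changes which class get_norm hands back. A sketch of how a caller picks it up, assuming get_norm keeps its (norm_class, eps) return value and that the config path below (hypothetical) sets "norm": "layernorm" and "layernorm_fusion": true:

from megatron.neox_arguments import NeoXArgs
from megatron.model.norms import get_norm

# Hypothetical config enabling the fused kernel.
neox_args = NeoXArgs.from_ymls(["configs/my_fused_ln.yml"])

# MixedFusedLayerNorm when layernorm_fusion is true, torch.nn.LayerNorm otherwise.
norm_cls, eps = get_norm(neox_args)
layer_norm = norm_cls(neox_args.hidden_size, eps=eps)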