Deepspeed benchmarking #878

Status: Draft
Wants to merge 24 commits into base: main
This view shows changes from 1 commit of the 24.
Commits:
f4706e0  add flash_attn_kvpacked (satpalsr, Mar 29, 2023)
f4a9106  Changed is_pipe_parallel setting to fix pipeline-parallel inference (curt-tigges, Mar 31, 2023)
83a7b9a  Update NeoXArgs docs automatically (invalid-email-address, Mar 31, 2023)
45d7052  fix formatting (satpalsr, Apr 11, 2023)
857c556  gpt benchmark script (cr458, Apr 3, 2023)
1ab5bf3  remove duplicate argparse (cr458, Apr 4, 2023)
afb6b29  HF inference (cr458, Apr 4, 2023)
3f7d605  benchmarking configs + script changes (cr458, Apr 11, 2023)
d99d2ce  plot directly, runs deepspeed and hf for single benchmark (cr458, Apr 12, 2023)
b0e9745  remove plotting comments (cr458, Apr 12, 2023)
9c645dd  accept changes from main & resolve conflicts (satpalsr, Apr 15, 2023)
ee99945  Merge branch 'main' into flash_attn_infer (satpalsr, Apr 15, 2023)
9b1733e  tmp changes (cr458, Apr 17, 2023)
22cac56  Merge remote-tracking branch 'satpalsr/flash_attn_infer' into deepspe… (cr458, Apr 17, 2023)
466749b  merge conflict git hash (cr458, Apr 17, 2023)
b10739f  separate scripts for Deepspeed/HF and neox (cr458, Apr 18, 2023)
4990f9b  debugging: works when world size > 1 but not otherwise (cr458, Apr 18, 2023)
88981b2  working ( but not serially) (cr458, Apr 19, 2023)
5e3ca7f  working ish gpt-neox just need to figure out how to get dataframe back (cr458, Apr 20, 2023)
3ee9d3b  get dataframe output from stdout (cr458, Apr 20, 2023)
2a6e8cd  remove gpt neox inference from script (cr458, May 21, 2023)
7ea22d9  remove lines (cr458, May 21, 2023)
ef4fdd4  device error (cr458, May 21, 2023)
d8184f3  Add DS inference (satpalsr, May 22, 2023)
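One of the commits above (3ee9d3b, "get dataframe output from stdout") runs a benchmark in a subprocess and recovers the results as a DataFrame from its stdout. The sketch below shows one way such a handoff could work; the script name, flags, and sentinel protocol are assumptions for illustration, not taken from this PR.

```python
import io
import subprocess

import pandas as pd

BEGIN, END = "===BEGIN_DF===", "===END_DF==="

# Hypothetical: the child benchmark script is assumed to print its results DataFrame
# as CSV between the two sentinel markers, e.g.
#   print(BEGIN); print(df.to_csv(index=False)); print(END)
proc = subprocess.run(
    ["python", "benchmark_inference.py", "--model", "gpt-neox"],  # assumed script and flags
    capture_output=True,
    text=True,
    check=True,
)

# Parse the CSV text between the sentinels back into a DataFrame on the parent side.
csv_text = proc.stdout.split(BEGIN, 1)[1].split(END, 1)[0]
df = pd.read_csv(io.StringIO(csv_text))
print(df.head())
```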
accept changes from main & resolve conflicts
satpalsr committed Apr 15, 2023
commit 9c645dd8629eb0f83719d902d3380539cdc3b4b1
10 changes: 9 additions & 1 deletion configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

- **git_hash**: str

-    Default = 142b4b6
+    Default = ce9bee3

current git hash of repository

@@ -1951,6 +1951,14 @@ Args for deepspeed runner (deepspeed.launcher.runner).



- **force_multi**: bool

Default = False

Force multi-node training even if only one node is specified.



- **detect_nvlink_pairs**: bool

Default = False
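For context on the new **force_multi** flag documented in this hunk, here is a minimal sketch of how it might appear in a GPT-NeoX YAML config. Only the `force_multi` key and its meaning come from the docs above; the other keys and values are illustrative assumptions.

```yaml
# Hypothetical runner section of a GPT-NeoX config file (values are placeholders).
{
  # DeepSpeed runner settings
  "hostfile": "/path/to/hostfile",

  # Force the multi-node launch path even if only one node is specified.
  "force_multi": true,
}
```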
29 changes: 18 additions & 11 deletions megatron/model/flash_attention.py
@@ -4,10 +4,17 @@
import torch.nn as nn
import torch.nn.functional as F

+from flash_attn import flash_attn_triton
import flash_attn_cuda


-def _flash_attn_forward(
+def flash_attn_unpadded_unpacked_func_triton(
+    q, k, v, bias=None, causal=False, softmax_scale=None
+):
+    return flash_attn_triton.flash_attn_func(q, k, v, bias, causal, softmax_scale)


+def _flash_attn_forward_cuda(
q,
k,
v,
@@ -51,7 +58,7 @@ def _flash_attn_forward(
return out, softmax_lse, S_dmask


-def _flash_attn_backward(
+def _flash_attn_backward_cuda(
dout,
q,
k,
@@ -120,7 +127,7 @@ def forward(
rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
if softmax_scale is None:
softmax_scale = qkv.shape[-1] ** (-0.5)
-out, softmax_lse, S_dmask = _flash_attn_forward(
+out, softmax_lse, S_dmask = _flash_attn_forward_cuda(
qkv[:, 0],
qkv[:, 1],
qkv[:, 2],
@@ -148,7 +155,7 @@ def backward(ctx, dout, *args):
cur_rng_state = torch.cuda.get_rng_state()
torch.cuda.set_rng_state(rng_state)
dqkv = torch.empty_like(qkv)
-_flash_attn_backward(
+_flash_attn_backward_cuda(
dout,
qkv[:, 0],
qkv[:, 1],
@@ -171,7 +178,7 @@ def backward(ctx, dout, *args):
return dqkv, None, None, None, None, None, None


-def flash_attn_unpadded_qkvpacked_func(
+def flash_attn_unpadded_qkvpacked_func_cuda(
qkv,
cu_seqlens,
max_seqlen,
@@ -204,7 +211,7 @@ def forward(
rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
if softmax_scale is None:
softmax_scale = q.shape[-1] ** (-0.5)
-out, softmax_lse, S_dmask = _flash_attn_forward(
+out, softmax_lse, S_dmask = _flash_attn_forward_cuda(
q,
kv[:, 0],
kv[:, 1],
@@ -244,7 +251,7 @@ def backward(ctx, dout, *args):
torch.cuda.set_rng_state(rng_state)
dq = torch.empty_like(q)
dkv = torch.empty_like(kv)
-_flash_attn_backward(
+_flash_attn_backward_cuda(
dout,
q,
kv[:, 0],
@@ -267,7 +274,7 @@ def backward(ctx, dout, *args):
return dq, dkv, None, None, None, None, None, None, None, None


-def flash_attn_unpadded_kvpacked_func(
+def flash_attn_unpadded_kvpacked_func_cuda(
q,
kv,
cu_seqlens_q,
@@ -339,7 +346,7 @@ def forward(
rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
if softmax_scale is None:
softmax_scale = q.shape[-1] ** (-0.5)
-out, softmax_lse, S_dmask = _flash_attn_forward(
+out, softmax_lse, S_dmask = _flash_attn_forward_cuda(
q,
k,
v,
@@ -379,7 +386,7 @@ def backward(ctx, dout, *args):
cur_rng_state = torch.cuda.get_rng_state()
torch.cuda.set_rng_state(rng_state)
dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v)
-_flash_attn_backward(
+_flash_attn_backward_cuda(
dout,
q,
k,
@@ -402,7 +409,7 @@ def backward(ctx, dout, *args):
return dq, dk, dv, None, None, None, None, None, None, None, None


-def flash_attn_unpadded_func(
+def flash_attn_unpadded_func_cuda(
q,
k,
v,
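The hunks above rename the CUDA-backed helpers with a `_cuda` suffix and add a thin Triton wrapper, `flash_attn_unpadded_unpacked_func_triton`, whose full signature is visible in the first hunk. Below is a minimal usage sketch, assuming the flash-attn Triton kernel's usual (batch, seqlen, nheads, headdim) layout; the shapes and the bias tensor are illustrative assumptions, not taken from this PR.

```python
import torch

from megatron.model.flash_attention import flash_attn_unpadded_unpacked_func_triton

# Shapes follow the flash-attn Triton convention (batch, seqlen, nheads, headdim);
# fp16 on GPU, since the Triton kernel does not run on CPU.
q = torch.randn(2, 128, 16, 64, device="cuda", dtype=torch.float16)
k = torch.randn(2, 128, 16, 64, device="cuda", dtype=torch.float16)
v = torch.randn(2, 128, 16, 64, device="cuda", dtype=torch.float16)

# Hypothetical additive attention bias, e.g. an AliBi matrix broadcast over the batch:
# (batch, nheads, seqlen_q, seqlen_k)
bias = torch.zeros(2, 16, 128, 128, device="cuda", dtype=torch.float16)

out = flash_attn_unpadded_unpacked_func_triton(q, k, v, bias=bias, causal=True)
print(out.shape)  # expected: (2, 128, 16, 64)
```

The packed `*_cuda` variants keep their unpadded, cu_seqlens-based calling convention; this commit only renames them.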
45 changes: 45 additions & 0 deletions megatron/model/positional_embeddings.py
@@ -132,6 +132,51 @@ def get_slopes_power_of_2(n):
]
)

    def bias(self, seq_len_q, seq_len_k, device, dtype):
        # [b, np, sq, sk]
        # seq_len_q = x.shape[-2]
        # seq_len_k = x.shape[-1]

        # Initialize the AliBi matrix to match the first provided key length; grow it exponentially
        # afterwards if longer inputs are provided. This is important for inference, where we will
        # encounter progressively longer samples; it should have no effect at training time.
        if self.cached_seq_len is not None and self.cached_seq_len >= seq_len_k:
            a = self.cached_matrix
        else:
            target_seq_len = (
                seq_len_k if self.cached_seq_len is None else self.cached_seq_len * 4
            )
            a = -torch.tril(
                torch.arange(target_seq_len)
                .view(target_seq_len, 1)
                .repeat(1, target_seq_len)
                + torch.arange(0, -target_seq_len, -1)
            )
            a = a.to(device).to(dtype)
            slopes = self.slopes.to(a.device).to(a.dtype)
            a = a * slopes.view(self.slopes.shape[0], 1, 1)
            self.cached_seq_len = target_seq_len
            self.cached_matrix = a

        # If the cached AliBi matrix is larger than the key length, clip it.
        if self.cached_seq_len > seq_len_k:
            a = self.cached_matrix[:, :seq_len_k, :seq_len_k]

        if seq_len_q != seq_len_k:
            # At training time x has dimensionality [b, np, sq, sk] with sq == sk, i.e. the
            # number of query tokens equals the number of key tokens. At inference time with a
            # cache in layer_past, sq != sk: sq contains only one token (the last one in the full
            # sequence), so we select the matching row of the cached matrix. Because the cached
            # matrix may be larger than the current sequence (from a previous inference step),
            # we index by the last key position rather than the last query position.
            assert (
                seq_len_q == 1
            ), "assumption sq == sk unless at inference time with cache in layer_past with sq == 1"
            a = a[:, seq_len_k - 1, :].view(
                a.shape[0], 1, a.shape[2]
            )  # seq_len_k - 1 points to the last token index in the current inference batch.

        return a

    def forward(self, x):
        # [b, np, sq, sk]
        seq_len_q = x.shape[-2]
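To illustrate the caching behavior of the new `bias` method, here is a small self-contained sketch. The `ToyAlibi` holder is an assumption for illustration (not the actual AliBi class in megatron/model/positional_embeddings.py); it only mirrors the `slopes` / `cached_seq_len` / `cached_matrix` attributes so the logic above can be run in isolation.

```python
import torch


class ToyAlibi:
    """Toy stand-in with the same slopes / cached_seq_len / cached_matrix attributes."""

    def __init__(self, slopes):
        self.slopes = slopes          # (num_heads,) AliBi slopes
        self.cached_seq_len = None
        self.cached_matrix = None

    def bias(self, seq_len_q, seq_len_k, device="cpu", dtype=torch.float32):
        # Reuse the cached matrix when it already covers the requested key length.
        if self.cached_seq_len is not None and self.cached_seq_len >= seq_len_k:
            a = self.cached_matrix
        else:
            # Otherwise (re)build it, growing 4x past the previous size so the cache
            # is rebuilt only rarely during incremental decoding.
            target = seq_len_k if self.cached_seq_len is None else self.cached_seq_len * 4
            a = -torch.tril(
                torch.arange(target).view(target, 1).repeat(1, target)
                + torch.arange(0, -target, -1)
            )
            a = a.to(device).to(dtype) * self.slopes.view(-1, 1, 1).to(device).to(dtype)
            self.cached_seq_len, self.cached_matrix = target, a

        # Clip the cache down to the requested key length.
        if self.cached_seq_len > seq_len_k:
            a = self.cached_matrix[:, :seq_len_k, :seq_len_k]

        # During incremental decoding only the last query token is present.
        if seq_len_q != seq_len_k:
            assert seq_len_q == 1
            a = a[:, seq_len_k - 1, :].view(a.shape[0], 1, a.shape[2])
        return a


alibi = ToyAlibi(slopes=torch.tensor([1.0, 0.5]))
print(alibi.bias(8, 8).shape)  # prefill with 8 tokens -> torch.Size([2, 8, 8])
print(alibi.bias(1, 9).shape)  # one decode step       -> torch.Size([2, 1, 9]); cache grows to 32
```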