Remove unnecessary fp32/bf16 conversion #1169

Merged
feat: remove unnecessary bf16 conversions since no collective op is performed

DayOfThePenguin committed Mar 3, 2024
commit 6abad8b2fea886e30bf34258a813a6bc50e16afa
9 changes: 0 additions & 9 deletions megatron/mpu/mappings.py
@@ -57,22 +57,13 @@ def _split(input_):
     if world_size == 1:
         return input_

-    # Bf16 convert
-    dt = input_.dtype
-    if dt == torch.bfloat16 and get_fp32_allreduce():
-        input_ = input_.float()
-
     # Split along last dimension.
     input_list = split_tensor_along_last_dim(input_, world_size)

     # Note: torch.split does not create contiguous tensors by default.
     rank = get_model_parallel_rank()
     output = input_list[rank].contiguous()

-    # Bf16 convert
-    if dt == torch.bfloat16 and get_fp32_allreduce():
-        output = output.bfloat16()
-
     return output
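For orientation, after this deletion the _split helper performs only a local slice of the tensor, which is why the commit message notes that no collective op is performed and hence no fp32 round trip is needed. Below is a minimal sketch of the resulting function, reconstructed from the context lines above; the get_model_parallel_world_size() call that defines world_size is an assumption about the surrounding file, which this hunk does not show:

    def _split(input_):
        """Slice the input along its last dimension and keep the chunk owned
        by this model-parallel rank. No cross-rank communication happens here,
        so no bf16 -> fp32 upcast is required."""
        world_size = get_model_parallel_world_size()  # assumed helper, not shown in this hunk
        if world_size == 1:
            return input_

        # Split along last dimension.
        input_list = split_tensor_along_last_dim(input_, world_size)

        # Note: torch.split does not create contiguous tensors by default.
        rank = get_model_parallel_rank()
        output = input_list[rank].contiguous()

        return output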


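By contrast, the fp32 round trip guarded by get_fp32_allreduce() is still meaningful wherever ranks actually accumulate values, such as an all-reduce, because bf16 summation across ranks compounds rounding error. The sketch below is purely illustrative and not part of this diff; the _reduce name, get_model_parallel_world_size(), and get_model_parallel_group() are assumptions about the rest of mappings.py:

    import torch
    import torch.distributed as dist

    def _reduce(input_):
        """All-reduce the input across model-parallel ranks (illustrative sketch).
        Here the optional bf16 -> fp32 upcast does matter, since the cross-rank
        summation would otherwise accumulate rounding error in bf16."""
        if get_model_parallel_world_size() == 1:  # assumed helper
            return input_

        # Optionally upcast so the summation across ranks happens in fp32.
        dt = input_.dtype
        if dt == torch.bfloat16 and get_fp32_allreduce():
            input_ = input_.float()

        dist.all_reduce(input_, group=get_model_parallel_group())  # assumed group helper

        # Cast back after the collective.
        if dt == torch.bfloat16 and get_fp32_allreduce():
            input_ = input_.bfloat16()

        return input_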