Fix SequentialWrapper Generation (pipe_parallel_size = 0) (#1031)
* Fix SequentialGeneration

* Fix SequentialGeneration
xu-song committed Sep 18, 2023
1 parent fcd5f92 commit 70af6e8
Showing 2 changed files with 29 additions and 0 deletions.
12 changes: 12 additions & 0 deletions megatron/model/utils.py
@@ -97,6 +97,7 @@ def __init__(
         self.activation_checkpoint_interval = activation_checkpoint_interval
         self.parent_class_name = parent_class_name
         self.activation_checkpoint_func = activation_checkpoint_func
+        self.batch_fn = None
 
     def _is_checkpointable(self, funcs):
         if self.parent_class_name == "GPT2ModelPipe":
@@ -106,6 +107,14 @@ def _is_checkpointable(self, funcs):
         params = [f.parameters() for f in funcs if isinstance(f, torch.nn.Module)]
         return any(len(list(p)) > 0 for p in params)
 
+    def set_batch_fn(self, fn):
+        """Execute a post-processing function on input data.
+
+        Args:
+            fn (function): The function to run.
+        """
+        self.batch_fn = fn
+
     def inference_mode(self, use_cache=True):
         """
         Sets up the model for inference by turning on k/v caching (if specified) and setting `parallel output` of the final layer to false,
@@ -127,6 +136,9 @@ def forward(
         self, forward_input, curriculum_seqlen=None, labels=None, neox_args=None
     ):
 
+        if self.batch_fn:
+            forward_input = self.batch_fn(forward_input)
+
         if (
             curriculum_seqlen is not None
             and isinstance(forward_input, tuple)
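
The three hunks above give SequentialWrapper an optional batch_fn hook that rewrites the forward input before it reaches the layers. Below is a minimal, standalone sketch of that pattern, assuming nothing from the gpt-neox codebase; ToyWrapper and the lambda hook are illustrative names, not project code.

import torch


class ToyWrapper(torch.nn.Module):
    """Illustrative stand-in for a sequential wrapper with a batch_fn hook."""

    def __init__(self, layers):
        super().__init__()
        self.sequential = torch.nn.Sequential(*layers)
        self.batch_fn = None  # no hook installed by default

    def set_batch_fn(self, fn):
        """Install a post-processing function for the raw forward input."""
        self.batch_fn = fn

    def forward(self, forward_input):
        # Same guard as the patched SequentialWrapper.forward: apply the hook first.
        if self.batch_fn:
            forward_input = self.batch_fn(forward_input)
        return self.sequential(forward_input)


model = ToyWrapper([torch.nn.Linear(4, 4), torch.nn.ReLU()])
model.set_batch_fn(lambda x: x / x.norm())  # e.g. rescale the batch before the layers see it
out = model(torch.ones(1, 4))
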
17 changes: 17 additions & 0 deletions megatron/training.py
@@ -351,6 +351,16 @@ def get_batch_pipe(data, neox_args, curr_scheduler=None):
     return (tokens, position_ids, attention_mask), (labels, loss_mask)
 
 
+def get_batch_sequential(forward_input, neox_args):
+    """A modification of get_batch() to work with the latest batch instead of an iterator."""
+    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
+        data=forward_input[0],
+        eod_token=neox_args.tokenizer.eod,
+        eod_mask_loss=neox_args.eod_mask_loss,
+    )
+    return (forward_input[0], forward_input[1], attention_mask)
+
+
 def forward_step(
     data_iterator, model, neox_args, timers, return_logits=False, is_train=False
 ):
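
get_batch_sequential takes a forward_input tuple whose first two entries are the token batch and its position ids, and attaches a left-to-right (causal) attention mask derived from the tokens. A simplified, self-contained stand-in for that mask step is sketched below; toy_ltor_mask is an assumption-level illustration, not the real get_ltor_masks_and_position_ids.

import torch


def toy_ltor_mask(tokens: torch.Tensor) -> torch.Tensor:
    """Lower-triangular mask: position i may only attend to positions <= i."""
    _, seq_len = tokens.shape
    return torch.tril(torch.ones(1, seq_len, seq_len, dtype=torch.bool))


tokens = torch.randint(0, 1000, (2, 8))        # [batch, seq]
position_ids = torch.arange(8).repeat(2, 1)    # [batch, seq]
forward_input = (tokens, position_ids)
# Mirror the return shape of get_batch_sequential: (tokens, position_ids, attention_mask).
rebuilt = (forward_input[0], forward_input[1], toy_ltor_mask(forward_input[0]))
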
@@ -653,6 +663,13 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None):
                     get_batch_pipe, neox_args=neox_args, curr_scheduler=curr_scheduler
                 )
             )
+        else:
+            model.module.set_batch_fn(
+                partial(
+                    get_batch_sequential, neox_args=neox_args
+                )
+            )
+
     else:
         raise ValueError("Must be using deepspeed to run neox")
 
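
The partial() call above freezes neox_args into the hook, because SequentialWrapper.forward only passes batch_fn the forward input itself. A minimal sketch of that binding follows, with a placeholder dict standing in for the real NeoXArgs object; toy_get_batch_sequential only mimics the return shape.

from functools import partial


def toy_get_batch_sequential(forward_input, neox_args):
    # Mimic the real return shape: (tokens, position_ids, attention_mask).
    attention_mask = f"ltor_mask(eod_token={neox_args['eod']})"
    return (forward_input[0], forward_input[1], attention_mask)


fake_args = {"eod": 0}  # placeholder for NeoXArgs
batch_fn = partial(toy_get_batch_sequential, neox_args=fake_args)

# At generation time the wrapper only supplies the batch; neox_args is already bound.
print(batch_fn(("tokens", "position_ids")))
# ('tokens', 'position_ids', 'ltor_mask(eod_token=0)')
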
