diff --git a/megatron/training.py b/megatron/training.py
index f8becb74b..6c88a21fa 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -528,7 +528,8 @@ def evaluate(neox_args, forward_step_fn, data_iterator, model, verbose=False, ti
 
             # although we're not accumulating gradients here, we count one iter as train_batch_size_per_gpu * g.a.s
             # to be consistent with deepspeed's pipe parallel engine
-            for _ in range(neox_args.gradient_accumulation_steps):
+            # since pipe parallel already takes gas into account - default to 1 here if pipe parallel is true
+            for _ in range(1 if neox_args.is_pipe_parallel else neox_args.gradient_accumulation_steps):
                 # Forward evaluation
                 loss = forward_step_fn(model=model, data_iterator=data_iterator, neox_args=neox_args, timers=timers)
                 losses.append(loss)
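
For context, a minimal sketch of the rationale behind this change (not part of the patch; `eval_micro_steps` and the use of `SimpleNamespace` here are illustrative, not gpt-neox code): under pipeline parallelism, DeepSpeed's pipe engine already consumes `gradient_accumulation_steps` micro-batches per forward call, so looping again inside `evaluate` would multiply the work; without pipeline parallelism, the loop still has to drive each micro-batch itself.

    # Hypothetical sketch, not from the patch: how many times the eval loop
    # should call forward_step_fn per iteration under each configuration.
    from types import SimpleNamespace

    def eval_micro_steps(cfg):
        # The pipe engine runs gradient_accumulation_steps micro-batches
        # internally per call, so the outer loop runs only once in that case.
        return 1 if cfg.is_pipe_parallel else cfg.gradient_accumulation_steps

    assert eval_micro_steps(SimpleNamespace(is_pipe_parallel=True, gradient_accumulation_steps=8)) == 1
    assert eval_micro_steps(SimpleNamespace(is_pipe_parallel=False, gradient_accumulation_steps=8)) == 8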