diff --git a/examples/pretrain_gpt3_175B.sh b/examples/pretrain_gpt3_175B.sh index 3b775c1ff1..ad0d244d7b 100755 --- a/examples/pretrain_gpt3_175B.sh +++ b/examples/pretrain_gpt3_175B.sh @@ -46,6 +46,7 @@ options=" \ --weight-decay 0.1 \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ + --init-method-std 0.006 \ --tensorboard-dir \ --fp16 \ --checkpoint-activations "