
Commit

Update neox_args.py (#1107)
* Update neox_args.py

Changed some default values to match the values we generally recommend people use.

* Update NeoXArgs docs automatically

---------

Co-authored-by: github-actions <[email protected]>
StellaAthena and github-actions committed Dec 26, 2023
1 parent 1148a0f commit e5a7ea7
Showing 2 changed files with 11 additions and 11 deletions.
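
The changed defaults, collected in one place. This is a minimal sketch: the field names and values are copied from the diff below, old values are noted in comments, and the wrapper class here is illustrative; in the repository these fields live on `NeoXArgsTraining` in `megatron/neox_arguments/neox_args.py`.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class RecommendedTrainingDefaults:
    """Illustrative summary of the defaults changed in this commit (old values in comments)."""

    weighted_sampler_alpha: float = 1.0   # was 0.3
    attention_dropout: float = 0.0        # was 0.1
    hidden_dropout: float = 0.0           # was 0.1
    weight_decay: float = 0.1             # was 0.01
    clip_grad: Optional[float] = 1.0      # was None
```

Any of these can still be overridden in a training config as before; only the defaults change.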
configs/neox_arguments.md (12 changes: 6 additions & 6 deletions)
@@ -111,7 +111,7 @@ Logging Arguments

- **git_hash**: str

- Default = 79befef
+ Default = 31cb364

current git hash of repository

@@ -1143,7 +1143,7 @@ Training Arguments

- **weighted_sampler_alpha**: float

- Default = 0.3
+ Default = 1.0

Alpha value for `weight_by_num_documents`. Only has an effect if `weight_by_num_documents` = True.

@@ -1373,23 +1373,23 @@ Training Arguments

- **attention_dropout**: float

- Default = 0.1
+ Default = 0.0

Post attention dropout probability.



- **hidden_dropout**: float

- Default = 0.1
+ Default = 0.0

Dropout probability for hidden state transformer.



- **weight_decay**: float

- Default = 0.01
+ Default = 0.1

Weight decay coefficient for L2 regularization.

@@ -1470,7 +1470,7 @@ Training Arguments

- **clip_grad**: float

- Default = None
+ Default = 1.0

Gradient clipping based on global L2 norm.

megatron/neox_arguments/neox_args.py (10 changes: 5 additions & 5 deletions)
@@ -774,7 +774,7 @@ class NeoXArgsTraining(NeoXArgsTemplate):
See https://arxiv.org/abs/1911.02116 for more details
"""

- weighted_sampler_alpha: float = 0.3
+ weighted_sampler_alpha: float = 1.0
"""
Alpha value for `weight_by_num_documents`. Only has an effect if `weight_by_num_documents` = True.
@@ -923,17 +923,17 @@ class NeoXArgsTraining(NeoXArgsTemplate):
Exit the program after the iteration is divisible by this value.
"""

- attention_dropout: float = 0.1
+ attention_dropout: float = 0.0
"""
Post attention dropout probability.
"""

- hidden_dropout: float = 0.1
+ hidden_dropout: float = 0.0
"""
Dropout probability for hidden state transformer.
"""

- weight_decay: float = 0.01
+ weight_decay: float = 0.1
"""
Weight decay coefficient for L2 regularization.
"""
@@ -982,7 +982,7 @@ class NeoXArgsTraining(NeoXArgsTemplate):
gas: int = None
"""gradient_accumulation_steps""" # TODO this is a duplicate, remove?

- clip_grad: float = None
+ clip_grad: float = 1.0
"""
Gradient clipping based on global L2 norm.
"""
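
For context on the new `clip_grad = 1.0` default: clipping on the global L2 norm rescales every gradient by the same factor whenever the combined norm of all gradients exceeds the threshold. Below is a minimal PyTorch-style sketch of that operation, illustrative only; the helper name is made up and this is not the implementation NeoX/DeepSpeed actually uses.

```python
import torch


def clip_grad_global_norm(parameters, max_norm: float = 1.0) -> float:
    """Rescale gradients in place so their global L2 norm is at most max_norm."""
    grads = [p.grad for p in parameters if p.grad is not None]
    if not grads:
        return 0.0
    # Global norm = L2 norm of the per-tensor gradient norms.
    total_norm = torch.norm(torch.stack([g.detach().norm(2) for g in grads]), 2)
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1.0:
        for g in grads:
            g.detach().mul_(clip_coef)
    return float(total_norm)


# Typical call site: after loss.backward(), before optimizer.step().
```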
