Merge pull request EleutherAI#915 from EleutherAI/disable-row-parallel
Disable row-parallelism for now
StellaAthena committed May 2, 2023
2 parents dee7528 + d47a207 commit b608043
Showing 3 changed files with 34 additions and 5 deletions.
34 changes: 30 additions & 4 deletions configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

- **git_hash**: str

- Default = 507ad04
+ Default = 5d2d78a

current git hash of repository

@@ -399,11 +399,11 @@ Model Arguments



- - **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish']
+ - **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish', 'silu']

Default = gelu

- Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish"]
+ Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"]



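For orientation, the newly allowed `silu` option is the standard SiLU activation, x · sigmoid(x) (equivalent to swish with β = 1). A minimal PyTorch sketch, as an illustration rather than the NeoX implementation:

```python
import torch
import torch.nn.functional as F

x = torch.randn(4)

# SiLU is x * sigmoid(x); PyTorch ships it as F.silu
assert torch.allclose(F.silu(x), x * torch.sigmoid(x))
```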
@@ -547,6 +547,32 @@ Model Arguments



- **use_bias_in_norms**: bool

Default = True

If false, norms (e.g. LayerNorm) will not have bias terms



- **use_bias_in_attn_linear**: bool

Default = True

If false, attn_linear (e.g. QKVO) will not have bias terms


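To illustrate what these two flags control, here is a hedged PyTorch sketch (not the NeoX code itself; `hidden` is a made-up size): with `use_bias_in_attn_linear = False` the QKV/output projections are built without bias terms, and with `use_bias_in_norms = False` the norms keep only their scale parameter.

```python
import torch.nn as nn

hidden = 512  # hypothetical hidden size

# Roughly what use_bias_in_attn_linear = False implies for the attention projections
qkv_proj = nn.Linear(hidden, 3 * hidden, bias=False)  # no bias parameter registered
out_proj = nn.Linear(hidden, hidden, bias=False)

# Roughly what use_bias_in_norms = False implies for LayerNorm
# (recent PyTorch exposes a `bias` kwarg; older versions would need a custom norm)
norm = nn.LayerNorm(hidden, bias=False)
```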

- **mlp_type**: str

Default = regular

Types:
regular: Megatron implementation
llama: LLaMA MLP (SiLU-gated MLP)


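For context, the `llama` option denotes the LLaMA-style gated MLP (often called SwiGLU): a SiLU-activated gate projection is multiplied element-wise with a parallel up projection before the down projection, typically without biases. A rough single-device sketch, not the Megatron/NeoX tensor-parallel implementation:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class LlamaStyleMLP(nn.Module):
    """SiLU-gated MLP: down_proj(silu(gate_proj(x)) * up_proj(x))."""

    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
```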

- **soft_prompt_tuning**: dict

Default = None
@@ -563,7 +589,7 @@ Model Arguments

- **output_layer_parallelism**: typing.Literal['row', 'column']

- Default = row
+ Default = column

Parameter controlling whether the output layer is parallelized over the hidden dim (row) or the vocab dim (column)

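To make the row/column distinction concrete, here is a small NumPy sketch (an illustration only, not the Megatron/mpu implementation): column parallelism shards the output weight over the vocab dimension, so each rank produces a slice of the logits that is gathered afterwards; row parallelism shards over the hidden dimension, so each rank produces a partial sum that must be all-reduced. The row path is the one affected by EleutherAI/gpt-neox#905, hence the new default.

```python
import numpy as np

rng = np.random.default_rng(0)
hidden, vocab, world_size = 8, 10, 2

x = rng.standard_normal((1, hidden))      # activations, replicated on every rank
W = rng.standard_normal((hidden, vocab))  # full output-layer weight

# Column parallelism: shard W over the vocab dim; concatenate the logit slices (all-gather).
col_shards = np.split(W, world_size, axis=1)
col_logits = np.concatenate([x @ w for w in col_shards], axis=1)

# Row parallelism: shard W over the hidden dim (and x to match); sum the partials (all-reduce).
row_shards = np.split(W, world_size, axis=0)
x_shards = np.split(x, world_size, axis=1)
row_logits = sum(xs @ ws for xs, ws in zip(x_shards, row_shards))

assert np.allclose(col_logits, x @ W)
assert np.allclose(row_logits, x @ W)
```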
2 changes: 2 additions & 0 deletions megatron/model/transformer.py
@@ -219,6 +219,8 @@ def __init__(
mup_rescale_parameters=is_last_layer, # rescale params only called if neox_args.use_mup = True, despite it not being included here
)
else:
+ print('ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.')
+ exit()
self.final_linear = mpu.RowParallelLinear(
neox_args=neox_args,
input_size=neox_args.hidden_size,
3 changes: 2 additions & 1 deletion megatron/neox_arguments/neox_args.py
@@ -370,7 +370,8 @@ class NeoXArgsModel(NeoXArgsTemplate):
'init_range': float = 0.5 # if no init string is provided, initialize the soft prompt with a uniform distribution between -init_range and init_rang
"""

- output_layer_parallelism: Literal["row", "column"] = "row"
+ # Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905)
+ output_layer_parallelism: Literal["row", "column"] = "column"

"""
Parameter controlling whether the output layer is parallelized over the hidden dim (row) or the vocab dim (column)
