Skip to content

Commit

Permalink
Remove row parallelism (EleutherAI#946)
Browse files Browse the repository at this point in the history
* remove row parallelism

* Update NeoXArgs docs automatically

---------

Co-authored-by: Quentin-Anthony <[email protected]>
Co-authored-by: github-actions <[email protected]>
  • Loading branch information
3 people committed May 19, 2023
1 parent b70d004 commit 649c309
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 19 deletions.
4 changes: 2 additions & 2 deletions configs/neox_arguments.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ Logging Arguments

- **git_hash**: str

Default = 83e820c
Default = a6b9622

current git hash of repository

Expand Down Expand Up @@ -587,7 +587,7 @@ Model Arguments



- **output_layer_parallelism**: typing.Literal['row', 'column']
- **output_layer_parallelism**: typing.Literal['column']

Default = column

Expand Down
32 changes: 16 additions & 16 deletions megatron/model/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,22 +222,22 @@ def __init__(
skip_bias_add=False,
mup_rescale_parameters=is_last_layer, # rescale params only called if neox_args.use_mup = True, despite it not being included here
)
else:
print(
'ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.'
)
exit()
self.final_linear = mpu.RowParallelLinear(
neox_args=neox_args,
input_size=neox_args.hidden_size,
output_size=neox_args.padded_vocab_size,
bias=False,
input_is_parallel=False,
init_method=init_method,
parallel_output=parallel_output,
skip_bias_add=False,
mup_rescale_parameters=is_last_layer, # only called if neox_args.use_mup = True, despite it not being included here
)
# else:
# print(
# 'ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.'
# )
# exit()
# self.final_linear = mpu.RowParallelLinear(
# neox_args=neox_args,
# input_size=neox_args.hidden_size,
# output_size=neox_args.padded_vocab_size,
# bias=False,
# input_is_parallel=False,
# init_method=init_method,
# parallel_output=parallel_output,
# skip_bias_add=False,
# mup_rescale_parameters=is_last_layer, # only called if neox_args.use_mup = True, despite it not being included here
# )

def forward(self, hidden_states):
    """Project *hidden_states* through the final output linear layer.

    Delegates entirely to ``self.final_linear`` (constructed in ``__init__``)
    and returns its result unchanged.
    """
    projection = self.final_linear
    return projection(hidden_states)
Expand Down
2 changes: 1 addition & 1 deletion megatron/neox_arguments/neox_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ class NeoXArgsModel(NeoXArgsTemplate):
"""

# Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905)
output_layer_parallelism: Literal["row", "column"] = "column"
output_layer_parallelism: Literal["column"] = "column"

"""
Parameter controlling whether the output layer is parallelized over the hidden dim (row) or the vocab dim (column)
Expand Down

0 comments on commit 649c309

Please sign in to comment.