[Fix] pre-commit and update-documentation checks
austinburnett committed May 9, 2023
1 parent 9900071 commit a4e9f24
Showing 17 changed files with 362 additions and 195 deletions.
2 changes: 1 addition & 1 deletion configs/README.md
@@ -9,7 +9,7 @@ Below is an example configuration `.yaml` to train a ~160M parameter GPT model.

For a detailed list of all the arguments available for neox, see [neox_arguments.md](neox_arguments.md)

Note: yaml arguments may be formatted with either '-' or '_'. The standard seperator used is a '_' as shown in the example configurations below. However, the use of '-' as a seperator may be deprecated in the future.
Note: yaml arguments may be formatted with either '-' or '_'. The standard separator used is a '_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future.
```yaml
# GPT-3 pretraining setup
{
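As a side note on the separator convention above: treating `-` and `_` as interchangeable in YAML keys usually comes down to a simple string normalization at load time. The sketch below is illustrative only (the helper name and exact behavior are assumptions, not the gpt-neox loader):

```python
import yaml

def normalize_keys(config: dict) -> dict:
    # Hypothetical helper: accept either '-' or '_' in top-level keys by
    # normalizing everything to the standard '_' separator.
    return {key.replace("-", "_"): value for key, value in config.items()}

raw = yaml.safe_load('{"train-batch-size": 32, "lr_decay_iters": 143000}')
print(normalize_keys(raw))  # {'train_batch_size': 32, 'lr_decay_iters': 143000}
```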
7 changes: 3 additions & 4 deletions configs/neox_arguments.md
@@ -952,7 +952,7 @@ Text Generation arguments

- **prompt_end**: str

Default =
Default =


a single prompt's end. Defaults to newline
@@ -994,7 +994,7 @@ Text Generation arguments

- **eval_results_prefix**: str

Default =
Default =

prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json

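The `{eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json` pattern described above maps onto a standard `strftime` call; a minimal sketch that mirrors the documented pattern (how gpt-neox actually assembles the path may differ):

```python
from datetime import datetime

def eval_results_path(eval_results_prefix: str) -> str:
    # Mirrors the documented pattern:
    # {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json
    timestamp = datetime.now().strftime("%y-%m-%d-%H-%M")
    return f"{eval_results_prefix}_eval_results_{timestamp}.json"

print(eval_results_path("runs/pythia-160m"))
# e.g. runs/pythia-160m_eval_results_23-05-09-14-30.json
```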
@@ -1712,7 +1712,7 @@ Args for deepspeed config

Default = None





@@ -2014,4 +2014,3 @@ Args for deepspeed runner (deepspeed.launcher.runner).
Default = None

Adds a `--comment` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard.

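For context on the `--comment` argument above, the runner simply forwards a configured comment string onto the launch command line; a rough sketch under that assumption (function and argument names here are made up for illustration):

```python
from typing import List, Optional

def build_launch_command(comment: Optional[str]) -> List[str]:
    # Hypothetical sketch: append a configured comment to the DeepSpeed
    # launch command so schedulers (e.g. Slurm) can see it.
    cmd = ["deepspeed", "train.py"]
    if comment:
        cmd.append(f"--comment={comment}")
    return cmd

print(build_launch_command("reserved-for-experiments"))
# ['deepspeed', 'train.py', '--comment=reserved-for-experiments']
```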
3 changes: 2 additions & 1 deletion configs/pythia/1-4B.yml
@@ -12,7 +12,7 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 24]],

"scaled_upper_triang_masked_softmax_fusion": true,
Expand Down Expand Up @@ -83,3 +83,4 @@
"wall_clock_breakdown": true,

"tokenizer_type": "HFTokenizer"
}
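The `attention_config` entries added throughout these Pythia configs use a run-length shorthand: each `[["flash"], N]` pair stands for N consecutive layers using flash attention. A minimal sketch of how such a shorthand expands per layer (the helper below is illustrative; the real expansion lives in gpt-neox's argument processing and may differ in detail):

```python
import itertools

def expand_attention_shorthand(attention_config, num_layers):
    # Interpret [[["flash"], 24]] as "cycle through ['flash'] for 24 layers",
    # producing one attention type per transformer layer.
    per_layer = []
    for pattern, repeats in attention_config:
        per_layer.extend(itertools.islice(itertools.cycle(pattern), repeats))
    return per_layer[:num_layers]

print(expand_attention_shorthand([[["flash"], 24]], num_layers=24))
# ['flash', 'flash', ..., 'flash'] (24 entries)
```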
4 changes: 2 additions & 2 deletions configs/pythia/12B.yml
@@ -13,7 +13,7 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 36]],

"scaled_upper_triang_masked_softmax_fusion": true,
@@ -63,7 +63,7 @@
"hysteresis": 2,
"min_loss_scale": 1
},

"train_iters": 143000,
"lr_decay_iters": 143000,
"distributed_backend": "nccl",
2 changes: 1 addition & 1 deletion configs/pythia/160M.yml
@@ -12,7 +12,7 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 12]],

"scaled_upper_triang_masked_softmax_fusion": true,
6 changes: 3 additions & 3 deletions configs/pythia/1B.yml
@@ -12,7 +12,7 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,

@@ -38,7 +38,7 @@
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},
},

"fp16": {
"enabled": true,
@@ -49,7 +49,7 @@
"initial_scale_power": 12,
"hysteresis": 2,
"min_loss_scale": 1
},
},

"fp32_allreduce": true,

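For reference, the `fp16` block above configures DeepSpeed-style dynamic loss scaling: the scale starts at 2**initial_scale_power (2**12 = 4096 in this config) and is reduced on overflow, never dropping below `min_loss_scale`. A simplified sketch of that back-off logic (not DeepSpeed's actual implementation):

```python
def step_loss_scale(scale: float, overflowed: bool, bad_steps: int,
                    hysteresis: int = 2, min_loss_scale: float = 1.0):
    # Simplified dynamic loss scaling: after `hysteresis` consecutive
    # overflowing steps, halve the scale, but never go below min_loss_scale.
    if overflowed:
        bad_steps += 1
        if bad_steps >= hysteresis:
            scale = max(scale / 2.0, min_loss_scale)
            bad_steps = 0
    else:
        bad_steps = 0
    return scale, bad_steps

print(step_loss_scale(2 ** 12, overflowed=True, bad_steps=1))  # (2048.0, 0)
```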
2 changes: 1 addition & 1 deletion configs/pythia/2-8B.yml
@@ -12,7 +12,7 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 32]],

"scaled_upper_triang_masked_softmax_fusion": true,
2 changes: 1 addition & 1 deletion configs/pythia/410M.yml
@@ -12,7 +12,7 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 24]],

"scaled_upper_triang_masked_softmax_fusion": true,
6 changes: 3 additions & 3 deletions configs/pythia/6-9B.yml
@@ -13,9 +13,9 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 32]],

"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,

@@ -28,7 +28,7 @@
"eps": 1.0e-8
}
},

"min_lr": 0.000012,

"zero_optimization": {
2 changes: 1 addition & 1 deletion configs/pythia/70M.yml
@@ -12,7 +12,7 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 6]],

"scaled_upper_triang_masked_softmax_fusion": true,
1 change: 0 additions & 1 deletion megatron/data/helpers.cpp
@@ -173,7 +173,6 @@ py::array build_sample_idx_int32(const py::array_t<int32_t>& sizes_,
free_when_done); // numpy array references
}


py::array build_sample_idx_int64(const py::array_t<int32_t>& sizes_,
const py::array_t<int32_t>& doc_idx_,
const int32_t seq_length,
2 changes: 1 addition & 1 deletion megatron/fused_kernels/__init__.py
@@ -38,7 +38,7 @@ def load_fused_kernels():
print(e)
print("=" * 100)
print(
f'ERROR: Fused kernels configured but not properly installed. Please run `pip install {str(srcpath)}` to install them'
f"ERROR: Fused kernels configured but not properly installed. Please run `pip install {str(srcpath)}` to install them"
)
print("=" * 100)
exit()
21 changes: 16 additions & 5 deletions megatron/model/transformer.py
@@ -146,7 +146,11 @@ class LLaMAParallelMLP(nn.Module):
"""

def __init__(
self, neox_args, init_method, output_layer_init_method, parallel_output=False,
self,
neox_args,
init_method,
output_layer_init_method,
parallel_output=False,
multiple_of=256,
):
super().__init__()
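The `multiple_of` parameter surfaced in the reformatted signature above is the knob LLaMA-style MLPs use to round the feed-forward width up to a hardware-friendly multiple; a sketch of the reference LLaMA sizing (assumed here for illustration, gpt-neox's exact computation may differ):

```python
def llama_ffn_dim(hidden_size: int, multiple_of: int = 256) -> int:
    # Reference-LLaMA sizing: take 2/3 of 4*hidden (SwiGLU uses three weight
    # matrices instead of two), then round up to the nearest multiple_of.
    dim = int(2 * (4 * hidden_size) / 3)
    return multiple_of * ((dim + multiple_of - 1) // multiple_of)

print(llama_ffn_dim(4096))  # 11008, the feed-forward width of LLaMA-7B
```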
@@ -219,7 +223,9 @@ def __init__(
mup_rescale_parameters=is_last_layer, # rescale params only called if neox_args.use_mup = True, despite it not being included here
)
else:
print('ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.')
print(
'ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.'
)
exit()
self.final_linear = mpu.RowParallelLinear(
neox_args=neox_args,
@@ -864,9 +870,14 @@ def forward(self, x, attention_mask, layer_past=None):
)
else:
# Otherwise just apply dropout + residual
attention_output = torch.nn.functional.dropout(
attention_output, p=self.hidden_dropout, training=self.training
) + residual
attention_output = (
torch.nn.functional.dropout(
attention_output,
p=self.hidden_dropout,
training=self.training,
)
+ residual
)

# output = x + mlp(ln2(x))
mlp_output, mlp_bias = self.mlp(
4 changes: 3 additions & 1 deletion megatron/neox_arguments/neox_args.py
@@ -228,7 +228,9 @@ class NeoXArgsModel(NeoXArgsTemplate):
Pad the vocab size to be divisible by this value. This is added for computational efficiency reasons.
"""

activation: Literal["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] = "gelu"
activation: Literal[
"gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"
] = "gelu"
"""
Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"]
"""
6 changes: 4 additions & 2 deletions megatron/training.py
@@ -447,9 +447,11 @@ def get_optimizer(model, neox_args):
"""Set up the optimizer."""
if neox_args.no_load_optim:
return None, None

if neox_args.optimizer is None:
print_rank_0(f'ERROR: Optimizer is None. Either set the optimizer dict in your config (if training) or set no_load_optim in your config (if inference)')
print_rank_0(
f"ERROR: Optimizer is None. Either set the optimizer dict in your config (if training) or set no_load_optim in your config (if inference)"
)
exit()
# Build parameter groups (weight decay and non-decay).
param_groups = get_params_for_weight_decay_optimization(model, neox_args)
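The `get_params_for_weight_decay_optimization` call at the end of the hunk above builds the usual decay/no-decay split; a minimal, self-contained sketch of that convention (exempting biases and normalization parameters is assumed here, the real gpt-neox grouping may differ):

```python
import torch

def split_weight_decay_groups(model: torch.nn.Module, weight_decay: float = 0.1):
    # Common convention: apply weight decay to matrix weights only and
    # exempt biases plus 1-D (normalization) parameters.
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if param.ndim == 1 or name.endswith(".bias"):
            no_decay.append(param)
        else:
            decay.append(param)
    return [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_decay, "weight_decay": 0.0},
    ]

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.LayerNorm(8))
optimizer = torch.optim.AdamW(split_weight_decay_groups(model))
```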