[Fix] pre-commit and update-documentation checks
austinburnett committed May 9, 2023
1 parent 9900071 commit a4e9f24
Showing 17 changed files with 362 additions and 195 deletions.
2 changes: 1 addition & 1 deletion configs/README.md
@@ -9,7 +9,7 @@ Below is an example configuration `.yaml` to train a ~160M parameter GPT model.

For a detailed list of all the arguments available for neox, see [neox_arguments.md](neox_arguments.md)

Note: yaml arguments may be formatted with either '-' or '_'. The standard seperator used is a '_' as shown in the example configurations below. However, the use of '-' as a seperator may be deprecated in the future.
Note: yaml arguments may be formatted with either '-' or '_'. The standard separator used is a '_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future.
```yaml
# GPT-3 pretraining setup
{
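As a side note on the separator convention above: treating `-` and `_` as interchangeable in YAML keys usually comes down to a simple string normalization at load time. The sketch below is illustrative only (the helper name and exact behavior are assumptions, not the gpt-neox loader):

```python
import yaml

def normalize_keys(config: dict) -> dict:
    # Hypothetical helper: accept either '-' or '_' in top-level keys by
    # normalizing everything to the standard '_' separator.
    return {key.replace("-", "_"): value for key, value in config.items()}

raw = yaml.safe_load('{"train-batch-size": 32, "lr_decay_iters": 143000}')
print(normalize_keys(raw))  # {'train_batch_size': 32, 'lr_decay_iters': 143000}
```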
7 changes: 3 additions & 4 deletions configs/neox_arguments.md
@@ -952,7 +952,7 @@ Text Generation arguments

- **prompt_end**: str

Default =
Default =


a single prompt's end. Defaults to newline
@@ -994,7 +994,7 @@ Text Generation arguments

- **eval_results_prefix**: str

Default =
Default =

prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json

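The `{eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json` pattern described above maps onto a standard `strftime` call; a minimal sketch that mirrors the documented pattern (how gpt-neox actually assembles the path may differ):

```python
from datetime import datetime

def eval_results_path(eval_results_prefix: str) -> str:
    # Mirrors the documented pattern:
    # {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json
    timestamp = datetime.now().strftime("%y-%m-%d-%H-%M")
    return f"{eval_results_prefix}_eval_results_{timestamp}.json"

print(eval_results_path("runs/pythia-160m"))
# e.g. runs/pythia-160m_eval_results_23-05-09-14-30.json
```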
@@ -1712,7 +1712,7 @@ Args for deepspeed config

Default = None





@@ -2014,4 +2014,3 @@ Args for deepspeed runner (deepspeed.launcher.runner).
Default = None

Adds a `--comment` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard.

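For context on the `--comment` argument above, the runner simply forwards a configured comment string onto the launch command line; a rough sketch under that assumption (function and argument names here are made up for illustration):

```python
from typing import List, Optional

def build_launch_command(comment: Optional[str]) -> List[str]:
    # Hypothetical sketch: append a configured comment to the DeepSpeed
    # launch command so schedulers (e.g. Slurm) can see it.
    cmd = ["deepspeed", "train.py"]
    if comment:
        cmd.append(f"--comment={comment}")
    return cmd

print(build_launch_command("reserved-for-experiments"))
# ['deepspeed', 'train.py', '--comment=reserved-for-experiments']
```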
3 changes: 2 additions & 1 deletion configs/pythia/1-4B.yml
@@ -12,7 +12,7 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 24]],

"scaled_upper_triang_masked_softmax_fusion": true,
Expand Down Expand Up @@ -83,3 +83,4 @@
"wall_clock_breakdown": true,

"tokenizer_type": "HFTokenizer"
}
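The `attention_config` entries added throughout these Pythia configs use a run-length shorthand: each `[["flash"], N]` pair stands for N consecutive layers using flash attention. A minimal sketch of how such a shorthand expands per layer (the helper below is illustrative; the real expansion lives in gpt-neox's argument processing and may differ in detail):

```python
import itertools

def expand_attention_shorthand(attention_config, num_layers):
    # Interpret [[["flash"], 24]] as "cycle through ['flash'] for 24 layers",
    # producing one attention type per transformer layer.
    per_layer = []
    for pattern, repeats in attention_config:
        per_layer.extend(itertools.islice(itertools.cycle(pattern), repeats))
    return per_layer[:num_layers]

print(expand_attention_shorthand([[["flash"], 24]], num_layers=24))
# ['flash', 'flash', ..., 'flash'] (24 entries)
```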
4 changes: 2 additions & 2 deletions configs/pythia/12B.yml
@@ -13,7 +13,7 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 36]],

"scaled_upper_triang_masked_softmax_fusion": true,
@@ -63,7 +63,7 @@
"hysteresis": 2,
"min_loss_scale": 1
},

"train_iters": 143000,
"lr_decay_iters": 143000,
"distributed_backend": "nccl",
2 changes: 1 addition & 1 deletion configs/pythia/160M.yml
@@ -12,7 +12,7 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 12]],

"scaled_upper_triang_masked_softmax_fusion": true,
6 changes: 3 additions & 3 deletions configs/pythia/1B.yml
@@ -12,7 +12,7 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,

@@ -38,7 +38,7 @@
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},
},

"fp16": {
"enabled": true,
@@ -49,7 +49,7 @@
"initial_scale_power": 12,
"hysteresis": 2,
"min_loss_scale": 1
},
},

"fp32_allreduce": true,

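For reference, the `fp16` block above configures DeepSpeed-style dynamic loss scaling: the scale starts at 2**initial_scale_power (2**12 = 4096 in this config) and is reduced on overflow, never dropping below `min_loss_scale`. A simplified sketch of that back-off logic (not DeepSpeed's actual implementation):

```python
def step_loss_scale(scale: float, overflowed: bool, bad_steps: int,
                    hysteresis: int = 2, min_loss_scale: float = 1.0):
    # Simplified dynamic loss scaling: after `hysteresis` consecutive
    # overflowing steps, halve the scale, but never go below min_loss_scale.
    if overflowed:
        bad_steps += 1
        if bad_steps >= hysteresis:
            scale = max(scale / 2.0, min_loss_scale)
            bad_steps = 0
    else:
        bad_steps = 0
    return scale, bad_steps

print(step_loss_scale(2 ** 12, overflowed=True, bad_steps=1))  # (2048.0, 0)
```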
2 changes: 1 addition & 1 deletion configs/pythia/2-8B.yml
@@ -12,7 +12,7 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 32]],

"scaled_upper_triang_masked_softmax_fusion": true,
2 changes: 1 addition & 1 deletion configs/pythia/410M.yml
@@ -12,7 +12,7 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 24]],

"scaled_upper_triang_masked_softmax_fusion": true,
6 changes: 3 additions & 3 deletions configs/pythia/6-9B.yml
@@ -13,9 +13,9 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 32]],

"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,

@@ -28,7 +28,7 @@
"eps": 1.0e-8
}
},

"min_lr": 0.000012,

"zero_optimization": {
2 changes: 1 addition & 1 deletion configs/pythia/70M.yml
@@ -12,7 +12,7 @@
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 6]],

"scaled_upper_triang_masked_softmax_fusion": true,
1 change: 0 additions & 1 deletion megatron/data/helpers.cpp
@@ -173,7 +173,6 @@ py::array build_sample_idx_int32(const py::array_t<int32_t>& sizes_,
free_when_done); // numpy array references
}


py::array build_sample_idx_int64(const py::array_t<int32_t>& sizes_,
const py::array_t<int32_t>& doc_idx_,
const int32_t seq_length,
2 changes: 1 addition & 1 deletion megatron/fused_kernels/__init__.py
@@ -38,7 +38,7 @@ def load_fused_kernels():
print(e)
print("=" * 100)
print(
f'ERROR: Fused kernels configured but not properly installed. Please run `pip install {str(srcpath)}` to install them'
f"ERROR: Fused kernels configured but not properly installed. Please run `pip install {str(srcpath)}` to install them"
)
print("=" * 100)
exit()
21 changes: 16 additions & 5 deletions megatron/model/transformer.py
@@ -146,7 +146,11 @@ class LLaMAParallelMLP(nn.Module):
"""

def __init__(
self, neox_args, init_method, output_layer_init_method, parallel_output=False,
self,
neox_args,
init_method,
output_layer_init_method,
parallel_output=False,
multiple_of=256,
):
super().__init__()
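The `multiple_of` parameter surfaced in the reformatted signature above is the knob LLaMA-style MLPs use to round the feed-forward width up to a hardware-friendly multiple; a sketch of the reference LLaMA sizing (assumed here for illustration, gpt-neox's exact computation may differ):

```python
def llama_ffn_dim(hidden_size: int, multiple_of: int = 256) -> int:
    # Reference-LLaMA sizing: take 2/3 of 4*hidden (SwiGLU uses three weight
    # matrices instead of two), then round up to the nearest multiple_of.
    dim = int(2 * (4 * hidden_size) / 3)
    return multiple_of * ((dim + multiple_of - 1) // multiple_of)

print(llama_ffn_dim(4096))  # 11008, the feed-forward width of LLaMA-7B
```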
@@ -219,7 +223,9 @@ def __init__(
mup_rescale_parameters=is_last_layer, # rescale params only called if neox_args.use_mup = True, despite it not being included here
)
else:
print('ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.')
print(
'ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.'
)
exit()
self.final_linear = mpu.RowParallelLinear(
neox_args=neox_args,
@@ -864,9 +870,14 @@ def forward(self, x, attention_mask, layer_past=None):
)
else:
# Otherwise just apply dropout + residual
attention_output = torch.nn.functional.dropout(
attention_output, p=self.hidden_dropout, training=self.training
) + residual
attention_output = (
torch.nn.functional.dropout(
attention_output,
p=self.hidden_dropout,
training=self.training,
)
+ residual
)

# output = x + mlp(ln2(x))
mlp_output, mlp_bias = self.mlp(
4 changes: 3 additions & 1 deletion megatron/neox_arguments/neox_args.py
@@ -228,7 +228,9 @@ class NeoXArgsModel(NeoXArgsTemplate):
Pad the vocab size to be divisible by this value. This is added for computational efficiency reasons.
"""

activation: Literal["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] = "gelu"
activation: Literal[
"gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"
] = "gelu"
"""
Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"]
"""
6 changes: 4 additions & 2 deletions megatron/training.py
@@ -447,9 +447,11 @@ def get_optimizer(model, neox_args):
"""Set up the optimizer."""
if neox_args.no_load_optim:
return None, None

if neox_args.optimizer is None:
print_rank_0(f'ERROR: Optimizer is None. Either set the optimizer dict in your config (if training) or set no_load_optim in your config (if inference)')
print_rank_0(
f"ERROR: Optimizer is None. Either set the optimizer dict in your config (if training) or set no_load_optim in your config (if inference)"
)
exit()
# Build parameter groups (weight decay and non-decay).
param_groups = get_params_for_weight_decay_optimization(model, neox_args)
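The `get_params_for_weight_decay_optimization` call at the end of the hunk above builds the usual decay/no-decay split; a minimal, self-contained sketch of that convention (exempting biases and normalization parameters is assumed here, the real gpt-neox grouping may differ):

```python
import torch

def split_weight_decay_groups(model: torch.nn.Module, weight_decay: float = 0.1):
    # Common convention: apply weight decay to matrix weights only and
    # exempt biases plus 1-D (normalization) parameters.
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if param.ndim == 1 or name.endswith(".bias"):
            no_decay.append(param)
        else:
            decay.append(param)
    return [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_decay, "weight_decay": 0.0},
    ]

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.LayerNorm(8))
optimizer = torch.optim.AdamW(split_weight_decay_groups(model))
```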