From 0d921f79e5c1f73f5e4c9418afcf5de05196f1e4 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 03:02:02 +0000 Subject: [PATCH 01/94] changed ordering for setting up norm_factor --- megatron/model/transformer.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 63f4122e2..eeb141fa1 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -295,14 +295,14 @@ def __init__( bias=neox_args.use_bias_in_attn_linear, ) - coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - if self.apply_query_key_layer_scaling: - coeff = max(1, self.layer_number) - self.norm_factor *= coeff - if neox_args.use_mup: self.norm_factor = self.hidden_size_per_attention_head + else: + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = max(1, self.layer_number) + self.norm_factor *= coeff self.rpe = rpe @@ -956,6 +956,12 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non else: logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) + + # if self.neox_args.use_mup: + # # Since we're using pipeline parallelism, we can't directly use MuReadout. Instead, use this workaround that does the same thing as MuReadout. + # # https://github.com/microsoft/mup/issues/6#issuecomment-1082156274 + # logits_parallel /= self.tied_modules.embed.word_embeddings.weight.infshape.width_mult() + # Gather if needed. if parallel_output: return logits_parallel From abee54daef5a0ca7e27a7f143ca8d93111dea54c Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 1 Dec 2023 03:02:58 +0000 Subject: [PATCH 02/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index bc2e8fc57..aa7b72d29 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 2da1083 + Default = 0d921f7 current git hash of repository From a08c3efbf1688e9e46ea654b2f0a8195a0ae404e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 03:49:53 +0000 Subject: [PATCH 03/94] updated muP args to the minimum required --- megatron/model/gpt2_model.py | 12 +++----- megatron/model/init_functions.py | 43 ++++++---------------------- megatron/model/transformer.py | 9 ++---- megatron/neox_arguments/neox_args.py | 33 +++++++-------------- 4 files changed, 26 insertions(+), 71 deletions(-) diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 2725614cd..5fd70c49f 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -119,6 +119,9 @@ def __init__( self.init_method, self.output_layer_init_method = get_init_methods( self.neox_args ) + self.init_method, self.output_layer_init_method = get_init_methods( + self.neox_args + ) self.__topology__ = topology self.specs = [] @@ -268,16 +271,9 @@ def init_specs(self): def _logits_helper(embedding, lm_output): """Just a wrapper to massage inputs/outputs from pipeline.""" - if self.neox_args.use_mup: - # Since we're using pipeline parallelism, we can't directly use MuReadout. Instead, use this workaround that does the same thing as MuReadout. 
- # https://github.com/microsoft/mup/issues/6#issuecomment-1082156274 - lm_output = ( - lm_output - / self.tied_modules.embed.word_embeddings.weight.infshape.width_mult() - ) logits = parallel_lm_logits( - lm_output, embedding.word_embeddings_weight, self.parallel_output + lm_output, embedding.word_embeddings_weight, self.parallel_output, self.neox_args ) return logits diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 11bcdc310..ff4c36b53 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -16,41 +16,22 @@ import torch -try: - import mup -except ImportError: - pass - -def init_method_normal(sigma, use_mup_outer=False, mup_init_scale=1.0): +def init_method_normal(sigma): """Init method based on N(0, sigma).""" - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.normal_(tensor, mean=0.0, std=sigma) - with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) return init_ -def scaled_init_method_normal( - sigma, num_layers, use_mup_outer=False, mup_init_scale=1.0 -): +def scaled_init_method_normal(sigma, num_layers): """Init method based on N(0, sigma/sqrt(2*num_layers).""" std = sigma / math.sqrt(2.0 * num_layers) - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.normal_(tensor, mean=0.0, std=std) - with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.normal_(tensor, mean=0.0, std=std) + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) return init_ @@ -169,21 +150,15 @@ def init_(tensor, use_mup=use_mup_outer): def get_init_methods(args): - if args.use_mup: - try: - import mup - except ModuleNotFoundError: - print("Please install mup https://github.com/microsoft/mup") - raise Exception - def _get(name): if name == "normal": return init_method_normal( - args.init_method_std, args.use_mup, args.mup_init_scale + sigma=args.init_method_std*args.mup_init_scale ) elif name == "scaled_normal": return scaled_init_method_normal( - args.init_method_std, args.num_layers, args.use_mup, args.mup_init_scale + sigma=args.init_method_std*args.mup_init_scale, + num_layers=args.num_layers ) elif name == "orthogonal": return orthogonal_init_method(args.use_mup, args.mup_init_scale) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index eeb141fa1..0785561cb 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -945,7 +945,7 @@ def forward(self, args): return self.norm(args) -def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None): +def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None, args=None): """LM logits using word embedding weights.""" # Parallel logits. input_parallel = mpu.copy_to_model_parallel_region(input_) @@ -956,11 +956,8 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non else: logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) - - # if self.neox_args.use_mup: - # # Since we're using pipeline parallelism, we can't directly use MuReadout. Instead, use this workaround that does the same thing as MuReadout. 
- # # https://github.com/microsoft/mup/issues/6#issuecomment-1082156274 - # logits_parallel /= self.tied_modules.embed.word_embeddings.weight.infshape.width_mult() + if args is not None and args.use_mup: + logits_parallel *= args.mup_output_logit_multiplier # Gather if needed. if parallel_output: diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 957960832..58780881b 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -263,6 +263,7 @@ class NeoXArgsModel(NeoXArgsTemplate): init_method_std: float = 0.02 """ Standard deviation of the zero mean normal distribution used for weight initialization. + When using muP this is the base std """ apply_query_key_layer_scaling: bool = False @@ -427,6 +428,7 @@ class NeoXArgsOptimizer(NeoXArgsTemplate): lr: float = None """ Max Learning rate during training + When using muP, this is the base learning rate """ @@ -1015,7 +1017,7 @@ class NeoXArgsTraining(NeoXArgsTemplate): use_mup: bool = False """ - Whether to use Microsoft's Mup https://github.com/microsoft/mup + Whether to use muP """ coord_check: bool = False @@ -1033,35 +1035,20 @@ class NeoXArgsTraining(NeoXArgsTemplate): Path to the base shapes to save to/load from """ - mup_init_scale: float = 1.0 + mup_emb: int = 1 """ - Initialization scale: All the parameters are multiplied by this value + Embedding output multiplier """ - mup_attn_temp: float = 1.0 + mup_m_width: int = 1 """ - Attention temperature: Reciprocal of the multiplier applied to the input to attention softmax + Manually set the layer width multiplier (d_model/d_model,base) """ - mup_output_temp: float = 1.0 + mup_d_model_base: int = 64 """ - Output temperature: Reciprocal of the multiplier applied to the input to softmax that - produces the distribution over output tokens. 
- """ - - mup_embedding_mult: float = 1.0 - """ - Scalar by which we multiply the output of the embedding layer - """ - - mup_rp_embedding_mult: float = 1.0 - """ - Scalar by which we multiply vectors representing relative position - """ - - mup_width_scale: int = 2 - """ - What to scale width by when creating the delta model for mup + d_model,base + Proxy (base) model's layer width """ From c35e8309a6f5b1e73f8d1dd888c23c481011b818 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 03:55:29 +0000 Subject: [PATCH 04/94] calculate m_width --- megatron/training.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index ed9c0bcd0..0dea5ab17 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -439,11 +439,9 @@ def get_model(neox_args, use_cache=False): neox_args.use_mup = old_use_mup if neox_args.use_mup: - try: - import mup - except ModuleNotFoundError: - print("Please install mup https://github.com/microsoft/mup") - raise Exception + + if neox_args.mup_m_width == 1: + neox_args.mup_m_width = neox_args.hidden_size / neox_args.mup_d_model_base base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" From 81fdc4d1f7b7558aa55c97ad9adc04cd2e7bf693 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 1 Dec 2023 09:30:44 +0000 Subject: [PATCH 05/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 50 +++++++++++---------------------------- 1 file changed, 14 insertions(+), 36 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index aa7b72d29..93c0328fe 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 0d921f7 + Default = 2d127df current git hash of repository @@ -452,6 +452,7 @@ Model Arguments Default = 0.02 Standard deviation of the zero mean normal distribution used for weight initialization. + When using muP this is the base std @@ -663,6 +664,7 @@ Optimizer Arguments Default = None Max Learning rate during training + When using muP, this is the base learning rate @@ -1521,7 +1523,7 @@ Training Arguments Default = False - Whether to use Microsoft's Mup https://github.com/microsoft/mup + Whether to use muP @@ -1549,52 +1551,28 @@ Training Arguments -- **mup_init_scale**: float +- **mup_emb**: int - Default = 1.0 - - Initialization scale: All the parameters are multiplied by this value - - - -- **mup_attn_temp**: float - - Default = 1.0 - - Attention temperature: Reciprocal of the multiplier applied to the input to attention softmax - - - -- **mup_output_temp**: float - - Default = 1.0 - - Output temperature: Reciprocal of the multiplier applied to the input to softmax that - produces the distribution over output tokens. 
- - - -- **mup_embedding_mult**: float - - Default = 1.0 + Default = 1 - Scalar by which we multiply the output of the embedding layer + Embedding output multiplier -- **mup_rp_embedding_mult**: float +- **mup_m_width**: int - Default = 1.0 + Default = 1 - Scalar by which we multiply vectors representing relative position + Manually set the layer width multiplier (d_model/d_model,base) -- **mup_width_scale**: int +- **mup_d_model_base**: int - Default = 2 + Default = 64 - What to scale width by when creating the delta model for mup + d_model,base + Proxy (base) model's layer width From 7d6b2468b5634157a90550d85a55605728a80918 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 03:02:02 +0000 Subject: [PATCH 06/94] changed ordering for setting up norm_factor --- megatron/model/transformer.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 63f4122e2..eeb141fa1 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -295,14 +295,14 @@ def __init__( bias=neox_args.use_bias_in_attn_linear, ) - coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - if self.apply_query_key_layer_scaling: - coeff = max(1, self.layer_number) - self.norm_factor *= coeff - if neox_args.use_mup: self.norm_factor = self.hidden_size_per_attention_head + else: + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = max(1, self.layer_number) + self.norm_factor *= coeff self.rpe = rpe @@ -956,6 +956,12 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non else: logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) + + # if self.neox_args.use_mup: + # # Since we're using pipeline parallelism, we can't directly use MuReadout. Instead, use this workaround that does the same thing as MuReadout. + # # https://github.com/microsoft/mup/issues/6#issuecomment-1082156274 + # logits_parallel /= self.tied_modules.embed.word_embeddings.weight.infshape.width_mult() + # Gather if needed. if parallel_output: return logits_parallel From a0d1929aa99b070b3c3f6b5a7c921d469a2ad08a Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 03:49:53 +0000 Subject: [PATCH 07/94] updated muP args to the minimum required --- megatron/model/gpt2_model.py | 12 +++----- megatron/model/init_functions.py | 43 ++++++---------------------- megatron/model/transformer.py | 9 ++---- megatron/neox_arguments/neox_args.py | 33 +++++++-------------- 4 files changed, 26 insertions(+), 71 deletions(-) diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 2725614cd..5fd70c49f 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -119,6 +119,9 @@ def __init__( self.init_method, self.output_layer_init_method = get_init_methods( self.neox_args ) + self.init_method, self.output_layer_init_method = get_init_methods( + self.neox_args + ) self.__topology__ = topology self.specs = [] @@ -268,16 +271,9 @@ def init_specs(self): def _logits_helper(embedding, lm_output): """Just a wrapper to massage inputs/outputs from pipeline.""" - if self.neox_args.use_mup: - # Since we're using pipeline parallelism, we can't directly use MuReadout. Instead, use this workaround that does the same thing as MuReadout. 
- # https://github.com/microsoft/mup/issues/6#issuecomment-1082156274 - lm_output = ( - lm_output - / self.tied_modules.embed.word_embeddings.weight.infshape.width_mult() - ) logits = parallel_lm_logits( - lm_output, embedding.word_embeddings_weight, self.parallel_output + lm_output, embedding.word_embeddings_weight, self.parallel_output, self.neox_args ) return logits diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 11bcdc310..ff4c36b53 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -16,41 +16,22 @@ import torch -try: - import mup -except ImportError: - pass - -def init_method_normal(sigma, use_mup_outer=False, mup_init_scale=1.0): +def init_method_normal(sigma): """Init method based on N(0, sigma).""" - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.normal_(tensor, mean=0.0, std=sigma) - with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) return init_ -def scaled_init_method_normal( - sigma, num_layers, use_mup_outer=False, mup_init_scale=1.0 -): +def scaled_init_method_normal(sigma, num_layers): """Init method based on N(0, sigma/sqrt(2*num_layers).""" std = sigma / math.sqrt(2.0 * num_layers) - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.normal_(tensor, mean=0.0, std=std) - with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.normal_(tensor, mean=0.0, std=std) + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) return init_ @@ -169,21 +150,15 @@ def init_(tensor, use_mup=use_mup_outer): def get_init_methods(args): - if args.use_mup: - try: - import mup - except ModuleNotFoundError: - print("Please install mup https://github.com/microsoft/mup") - raise Exception - def _get(name): if name == "normal": return init_method_normal( - args.init_method_std, args.use_mup, args.mup_init_scale + sigma=args.init_method_std*args.mup_init_scale ) elif name == "scaled_normal": return scaled_init_method_normal( - args.init_method_std, args.num_layers, args.use_mup, args.mup_init_scale + sigma=args.init_method_std*args.mup_init_scale, + num_layers=args.num_layers ) elif name == "orthogonal": return orthogonal_init_method(args.use_mup, args.mup_init_scale) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index eeb141fa1..0785561cb 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -945,7 +945,7 @@ def forward(self, args): return self.norm(args) -def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None): +def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None, args=None): """LM logits using word embedding weights.""" # Parallel logits. input_parallel = mpu.copy_to_model_parallel_region(input_) @@ -956,11 +956,8 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non else: logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) - - # if self.neox_args.use_mup: - # # Since we're using pipeline parallelism, we can't directly use MuReadout. Instead, use this workaround that does the same thing as MuReadout. 
- # # https://github.com/microsoft/mup/issues/6#issuecomment-1082156274 - # logits_parallel /= self.tied_modules.embed.word_embeddings.weight.infshape.width_mult() + if args is not None and args.use_mup: + logits_parallel *= args.mup_output_logit_multiplier # Gather if needed. if parallel_output: diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 957960832..58780881b 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -263,6 +263,7 @@ class NeoXArgsModel(NeoXArgsTemplate): init_method_std: float = 0.02 """ Standard deviation of the zero mean normal distribution used for weight initialization. + When using muP this is the base std """ apply_query_key_layer_scaling: bool = False @@ -427,6 +428,7 @@ class NeoXArgsOptimizer(NeoXArgsTemplate): lr: float = None """ Max Learning rate during training + When using muP, this is the base learning rate """ @@ -1015,7 +1017,7 @@ class NeoXArgsTraining(NeoXArgsTemplate): use_mup: bool = False """ - Whether to use Microsoft's Mup https://github.com/microsoft/mup + Whether to use muP """ coord_check: bool = False @@ -1033,35 +1035,20 @@ class NeoXArgsTraining(NeoXArgsTemplate): Path to the base shapes to save to/load from """ - mup_init_scale: float = 1.0 + mup_emb: int = 1 """ - Initialization scale: All the parameters are multiplied by this value + Embedding output multiplier """ - mup_attn_temp: float = 1.0 + mup_m_width: int = 1 """ - Attention temperature: Reciprocal of the multiplier applied to the input to attention softmax + Manually set the layer width multiplier (d_model/d_model,base) """ - mup_output_temp: float = 1.0 + mup_d_model_base: int = 64 """ - Output temperature: Reciprocal of the multiplier applied to the input to softmax that - produces the distribution over output tokens. 
- """ - - mup_embedding_mult: float = 1.0 - """ - Scalar by which we multiply the output of the embedding layer - """ - - mup_rp_embedding_mult: float = 1.0 - """ - Scalar by which we multiply vectors representing relative position - """ - - mup_width_scale: int = 2 - """ - What to scale width by when creating the delta model for mup + d_model,base + Proxy (base) model's layer width """ From d63b3b85014b8dbdf72df5c4962400d62db947da Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 03:55:29 +0000 Subject: [PATCH 08/94] calculate m_width --- megatron/training.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index ed9c0bcd0..0dea5ab17 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -439,11 +439,9 @@ def get_model(neox_args, use_cache=False): neox_args.use_mup = old_use_mup if neox_args.use_mup: - try: - import mup - except ModuleNotFoundError: - print("Please install mup https://github.com/microsoft/mup") - raise Exception + + if neox_args.mup_m_width == 1: + neox_args.mup_m_width = neox_args.hidden_size / neox_args.mup_d_model_base base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" From 9be82fed4d761111091b78c261cd3a9ed8a25506 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 1 Dec 2023 03:02:58 +0000 Subject: [PATCH 09/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index bc2e8fc57..aa7b72d29 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 2da1083 + Default = 0d921f7 current git hash of repository From 66214d949b09e3dbc1557bd993715a5f49b4daaf Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 09:31:32 +0000 Subject: [PATCH 10/94] removed redundant line --- megatron/model/gpt2_model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 5fd70c49f..012711a62 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -119,9 +119,6 @@ def __init__( self.init_method, self.output_layer_init_method = get_init_methods( self.neox_args ) - self.init_method, self.output_layer_init_method = get_init_methods( - self.neox_args - ) self.__topology__ = topology self.specs = [] From a6bad07f39d3364f74c39da3d844d61d722ae729 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 1 Dec 2023 09:34:08 +0000 Subject: [PATCH 11/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 93c0328fe..236226fdb 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 2d127df + Default = 17b7183 current git hash of repository From 63984bdfcac7e296439046b3b80b9a6c8501766d Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 09:34:36 +0000 Subject: [PATCH 12/94] removed redundant lines --- megatron/model/gpt2_model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 5fd70c49f..012711a62 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -119,9 +119,6 @@ def __init__( self.init_method, self.output_layer_init_method = 
get_init_methods( self.neox_args ) - self.init_method, self.output_layer_init_method = get_init_methods( - self.neox_args - ) self.__topology__ = topology self.specs = [] From 11114e27958ed76b9d4b76969546b7325c126314 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 1 Dec 2023 09:35:05 +0000 Subject: [PATCH 13/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 236226fdb..0a30acdbe 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 17b7183 + Default = 02687a8 current git hash of repository From 05c4de35aeb1d51e0a12d64dd5b109aa8f5f031e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 13:29:25 +0000 Subject: [PATCH 14/94] modify init with mup --- megatron/model/init_functions.py | 78 ++++++++++++++------------------ 1 file changed, 35 insertions(+), 43 deletions(-) diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index ff4c36b53..44666b229 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -68,12 +68,12 @@ def _orthogonal(tensor, gain=1): return tensor -def orthogonal_init_method(n_layers=1, use_mup=False, mup_init_scale=1.0): +def orthogonal_init_method(n_layers=1, mup_m_width=1.0): """Fills the input Tensor with a (semi) orthogonal matrix, as described in Exact solutions to the nonlinear dynamics of learning in deep linear neural networks - Saxe, A. et al. (2013) Optionally scaling by number of layers possible, as introduced in OBST - Nestler et. al. (2021, to be released)""" - if use_mup: + if mup_m_width != 1: raise ValueError( "Orthogonal init needs to be patched to support mup. Disable mup or use a different init method to avoid this error" ) @@ -84,67 +84,59 @@ def init_(tensor): return init_ -def xavier_uniform_init_method(use_mup_outer=False, mup_init_scale=1.0): +def xavier_uniform_init_method(mup_m_width=1.0): """Fills the input Tensor with values according to the method described in Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a uniform distribution.""" - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.xavier_uniform_(tensor) + def init_(tensor, mup_m_width=mup_m_width): + init_weight = torch.nn.init.xavier_uniform_(tensor) + if mup_m_width != 1: with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.xavier_uniform_(tensor) + init_weight.div_(mup_m_width) + return init_weight return init_ -def xavier_normal_init_method(use_mup_outer=False, mup_init_scale=1.0): +def xavier_normal_init_method(mup_m_width=1.0): """Fills the input Tensor with values according to the method described in Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. 
(2010), using a normal distribution.""" - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.xavier_normal_(tensor) + def init_(tensor, mup_m_width=mup_m_width): + init_weight = torch.nn.init.xavier_normal_(tensor) + if mup_m_width != 1: with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.xavier_normal_(tensor) + init_weight.div_(mup_m_width) + return init_weight return init_ -def small_init_init_method(dim, use_mup_outer=False, mup_init_scale=1.0): +def small_init_init_method(dim, mup_m_width=1.0): """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010), using a normal distribution.""" std = math.sqrt(2 / (5 * dim)) - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.normal_(tensor, mean=0.0, std=std) + def init_(tensor, mup_m_width=mup_m_width): + init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) + if mup_m_width != 1: with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.normal_(tensor, mean=0.0, std=std) + init_weight.div_(mup_m_width) + return init_weight return init_ -def wang_init_method(n_layers, dim, use_mup_outer=False, mup_init_scale=1.0): +def wang_init_method(n_layers, dim, mup_m_width=1.0): std = 2 / n_layers / math.sqrt(dim) - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.normal_(tensor, mean=0.0, std=std) + def init_(tensor, mup_m_width=mup_m_width): + init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) + if mup_m_width != 1: with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.normal_(tensor, mean=0.0, std=std) - + init_weight.div_(mup_m_width) + return init_weight + return init_ @@ -153,30 +145,30 @@ def get_init_methods(args): def _get(name): if name == "normal": return init_method_normal( - sigma=args.init_method_std*args.mup_init_scale + sigma=args.init_method_std/math.sqrt(args.mup_m_width) ) elif name == "scaled_normal": return scaled_init_method_normal( - sigma=args.init_method_std*args.mup_init_scale, + sigma=args.init_method_std/math.sqrt(args.mup_m_width), num_layers=args.num_layers ) elif name == "orthogonal": - return orthogonal_init_method(args.use_mup, args.mup_init_scale) + return orthogonal_init_method(args.mup_m_width) elif name == "scaled_orthogonal": return orthogonal_init_method( - args.num_layers, args.use_mup, args.mup_init_scale + args.num_layers, args.mup_m_width ) elif name == "xavier_uniform": - return xavier_uniform_init_method(args.use_mup, args.mup_init_scale) + return xavier_uniform_init_method(args.mup_m_width) elif name == "xavier_normal": - return xavier_normal_init_method(args.use_mup, args.mup_init_scale) + return xavier_normal_init_method(args.mup_m_width) elif name == "wang_init": return wang_init_method( - args.num_layers, args.hidden_size, args.use_mup, args.mup_init_scale + args.num_layers, args.hidden_size, args.mup_m_width ) elif name == "small_init": return small_init_init_method( - args.hidden_size, args.use_mup, args.mup_init_scale + args.hidden_size, args.mup_m_width ) else: raise NotImplementedError(f"Unknown init method {name}") From 71a91e40455203c4cbab9ded0588094fc23920c9 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 13:29:47 +0000 Subject: [PATCH 15/94] divide logits by the m_width --- megatron/model/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 0785561cb..9f48f1342 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -957,7 +957,7 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) if args is not None and args.use_mup: - logits_parallel *= args.mup_output_logit_multiplier + logits_parallel /= args.mup_m_width # Gather if needed. if parallel_output: From 99c8ce05a792029b0f04f5c914fec35832c09dcf Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 13:52:30 +0000 Subject: [PATCH 16/94] moved position of mup parameters being processed --- megatron/training.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 0dea5ab17..4463134e7 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -402,8 +402,23 @@ def get_model(neox_args, use_cache=False): # Temporarily disable mup so that the base model does not use the mup init functions before set_base_shapes is called below. # If mup isn't being used anyways, this has no effect. - old_use_mup = neox_args.use_mup - neox_args.use_mup = False + # old_use_mup = neox_args.use_mup + # neox_args.use_mup = False + if neox_args.use_mup: + + if neox_args.mup_m_width == 1: + neox_args.mup_m_width = neox_args.hidden_size / neox_args.mup_d_model_base + + base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" + + if neox_args.save_base_shapes: + save_base_shapes(neox_args, base_shapes, use_cache) + + # mup.set_base_shapes(model, base_shapes) + + # Call the mup replacement init functions on the model now that set_base_shapes has given each weight a .infshape attribute + # mup_weights_reinit(neox_args, model) + model = GPT2ModelPipe( neox_args=neox_args, num_tokentypes=0, @@ -436,22 +451,7 @@ def get_model(neox_args, use_cache=False): # Export PipeParallel model to nn.Sequential model to avoid the overhead of deepspeed's pipe parallel training model = model.to_sequential() - neox_args.use_mup = old_use_mup - - if neox_args.use_mup: - - if neox_args.mup_m_width == 1: - neox_args.mup_m_width = neox_args.hidden_size / neox_args.mup_d_model_base - - base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" - - if neox_args.save_base_shapes: - save_base_shapes(neox_args, base_shapes, use_cache) - - mup.set_base_shapes(model, base_shapes) - - # Call the mup replacement init functions on the model now that set_base_shapes has given each weight a .infshape attribute - mup_weights_reinit(neox_args, model) + # neox_args.use_mup = old_use_mup if neox_args.deepspeed: # DeepSpeed handles CUDA, FP16, and DDP components. 
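Taken together, PATCH 14-16 pin down the core muP scaling used in the rest of the series: the width multiplier is the ratio of the target model's hidden size to the base (proxy) model's width, initialization standard deviations shrink by the square root of that multiplier, and the output logits are divided by it. The sketch below is an editor's illustration only — the helper function and the example sizes are not part of the diffs — but it shows the arithmetic these patches wire into NeoX:

import math

def mup_width_scaling(hidden_size, mup_d_model_base, init_method_std=0.02):
    # Illustrative only: mirrors the scaling introduced in PATCH 14-16.
    m_width = hidden_size / mup_d_model_base         # PATCH 16: neox_args.mup_m_width
    init_std = init_method_std / math.sqrt(m_width)  # PATCH 14: sigma passed to init_method_normal
    logit_divisor = m_width                          # PATCH 15: logits_parallel /= mup_m_width
    return m_width, init_std, logit_divisor

# Example: a 1024-wide model against a 64-wide base gives m_width = 16.0,
# init_std = 0.02 / 4 = 0.005, and logits divided by 16.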
From b253ab6a41e4f703f7c869148ac9c5896ccdb70c Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 13:53:04 +0000 Subject: [PATCH 17/94] add note --- megatron/model/gpt2_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 012711a62..6a44fc8ca 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -175,6 +175,7 @@ def init_specs(self): # Embedding layer # input will be (input_ids, position_ids, attention_mask) + # TODO Initilized weights here should not be divided by m_width if weight_tying: self.specs.append( TiedLayerSpec( From 1919499fa01a5526ee726e9b3565c5c667501caf Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 6 Dec 2023 04:29:41 +0000 Subject: [PATCH 18/94] made param groups to hold flag for mup scaling --- megatron/model/utils.py | 57 ++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 6beac5ca2..d505fdbb3 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -18,6 +18,7 @@ """Utilities for models.""" import torch +from megatron.mpu import VocabParallelEmbedding from megatron.model.norms import LayerNorm, RMSNorm, ScaleNorm from megatron.model.fused_softmax import SoftmaxFusionTypes from types import GeneratorType @@ -28,8 +29,11 @@ def get_params_for_weight_decay_optimization(module, neox_args): """Divide params into with-weight-decay and without-weight-decay groups. Layernorms and biases will have no weight decay but the rest will. """ - weight_decay_params = {"params": []} - no_weight_decay_params = {"params": [], "weight_decay": 0.0} + weight_decay_params = {"params": [], "lr_adjust": True} + no_weight_decay_params = {"params": [], "lr_adjust": False, "weight_decay": 0.0} + embedding_weight_decay_params = {"params": [], "lr_adjust": False} + embedding_no_weight_decay_params = {"params": [], "lr_adjust": False, "weight_decay": 0.0} + for module_ in module.modules(): if any( [ @@ -44,26 +48,49 @@ def get_params_for_weight_decay_optimization(module, neox_args): [p for p in list(module_._parameters.values()) if p is not None] ) else: - weight_decay_params["params"].extend( - [ - p - for n, p in list(module_._parameters.items()) - if p is not None and n != "bias" - ] - ) - no_weight_decay_params["params"].extend( + if any( [ - p - for n, p in list(module_._parameters.items()) - if p is not None and n == "bias" + isinstance(module_, VocabParallelEmbedding), ] - ) + ): + + embedding_weight_decay_params["params"].extend( + [ + p + for n, p in list(module_._parameters.items()) + if p is not None and n != "bias" + ] + ) + embedding_no_weight_decay_params["params"].extend( + [ + p + for n, p in list(module_._parameters.items()) + if p is not None and n == "bias" + ] + ) + else: + + weight_decay_params["params"].extend( + [ + p + for n, p in list(module_._parameters.items()) + if p is not None and n != "bias" + ] + ) + no_weight_decay_params["params"].extend( + [ + p + for n, p in list(module_._parameters.items()) + if p is not None and n == "bias" + ] + ) + if neox_args.weight_decay == 0.0: # only return a single param group # with onebitadam, we want to minimize the calls to compressed_allreduce. Every param group calls it once. # to avoid this, only use a single param group when weight decay is off. 
return [no_weight_decay_params] - return weight_decay_params, no_weight_decay_params + return weight_decay_params, no_weight_decay_params, embedding_weight_decay_params, embedding_no_weight_decay_params def exists(x): From 17678e01f445a76892ea6f8ac7127e8a30e7a8ae Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 6 Dec 2023 04:30:06 +0000 Subject: [PATCH 19/94] lr scale --- megatron/training.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 4463134e7..fad9655bc 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -535,13 +535,15 @@ def get_optimizer(model, neox_args): elif neox_args.optimizer_type.lower() == "adam": # Use Adam if neox_args.use_mup: - try: - from mup import MuAdam - - adam_optimizer = MuAdam - except ModuleNotFoundError: - print("Please install mup https://github.com/microsoft/mup") - raise Exception + # try: + # from mup import MuAdam + + # adam_optimizer = MuAdam + # except ModuleNotFoundError: + # print("Please install mup https://github.com/microsoft/mup") + # raise Exception + from deepspeed.ops.adam import FusedAdam as Adam + adam_optimizer = Adam else: if neox_args.use_bnb_optimizer: try: @@ -583,6 +585,12 @@ def get_optimizer(model, neox_args): else: raise ValueError(f"Optimizer type {neox_args.optimizer_type} not recognized") + # This is where the LR scaling is applied + if neox_args.use_mup: + for pg in optimizer.param_groups: + if ("lr_adjust" in pg) and pg["lr_adjust"] is True: + pg["lr"] /= neox_args.mup_m_width + if neox_args.deepspeed: # fp16 wrapper is not required for DeepSpeed. return optimizer, param_groups From 2bd5ae6d15a8de982daf628cb8f0a82f34a524a0 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 6 Dec 2023 04:30:22 +0000 Subject: [PATCH 20/94] update config --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 0a30acdbe..3d714a7c7 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -664,7 +664,7 @@ Optimizer Arguments Default = None Max Learning rate during training - When using muP, this is the base learning rate + When using muP, this is the base lr From 66422913280ebb2048e7764d27b04470f8479005 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 6 Dec 2023 04:30:47 +0000 Subject: [PATCH 21/94] adjust process of mup variables --- megatron/model/transformer.py | 2 +- megatron/model/word_embeddings.py | 8 ++++---- megatron/mpu/layers.py | 8 ++++++-- megatron/neox_arguments/neox_args.py | 7 ++++--- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 9f48f1342..437f0b38e 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -295,10 +295,10 @@ def __init__( bias=neox_args.use_bias_in_attn_linear, ) + coeff = None if neox_args.use_mup: self.norm_factor = self.hidden_size_per_attention_head else: - coeff = None self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) if self.apply_query_key_layer_scaling: coeff = max(1, self.layer_number) diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index 488baf042..29b20b320 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -51,8 +51,8 @@ def __init__( self.init_method = init_method self.num_tokentypes = num_tokentypes self.use_mup = neox_args.use_mup - self.mup_embedding_mult = 
neox_args.mup_embedding_mult - self.mup_rp_embedding_mult = neox_args.mup_rp_embedding_mult + self.mup_m_emb = float(neox_args.mup_m_emb) + # self.mup_rp_embedding_mult = neox_args.mup_rp_embedding_mult # Word embeddings (parallel). self.word_embeddings = mpu.VocabParallelEmbedding( @@ -142,7 +142,6 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): # OPT always adds 2 for some reason, according to the HF implementation position_ids = position_ids + self.opt_pos_emb_offset position_embeddings = self.position_embeddings(position_ids) - position_embeddings.mul_(self.mup_rp_embedding_mult) embeddings = words_embeddings + position_embeddings else: embeddings = words_embeddings @@ -155,9 +154,10 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): # Dropout. embeddings = self.embedding_dropout(embeddings) + # Y_emb = m_emb * embed(X) if self.use_mup: with torch.no_grad(): - embeddings.mul_(self.mup_embedding_mult) + embeddings = torch.mul(embeddings, self.mup_m_emb) return embeddings diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 92edbd6eb..859553551 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -429,6 +429,7 @@ def __init__( self.stride = stride self.mup_rescale_parameters = mup_rescale_parameters self.use_mup = neox_args.use_mup + self.m_width = neox_args.mup_m_width # Parameters. # Note: torch.nn.functional.linear performs XA^T + b and as a result @@ -547,8 +548,10 @@ def set_parallel_output(self, value: bool): ) # if gather_output is True, parallel output is False, so we set the opposite def forward(self, input_): + + # Y_logits = W_unembed * X / m_width if self.use_mup and self.mup_rescale_parameters: - input_ /= self.width_mult() + input_ /= self.m_width # Set up backprop all-reduce. input_parallel = copy_to_model_parallel_region(input_) # Matrix multiply. @@ -624,6 +627,7 @@ def __init__( self.keep_master_weight_for_test = keep_master_weight_for_test self.mup_rescale_parameters = mup_rescale_parameters self.use_mup = neox_args.use_mup + self.m_width = neox_args.mup_m_width # Parameters. # Note: torch.nn.functional.linear performs XA^T + b and as a result @@ -735,7 +739,7 @@ def set_parallel_output(self, parallel_output: bool): def forward(self, input_): if self.use_mup and self.mup_rescale_parameters: - input_ /= self.width_mult() + input_ /= self.m_width # Set up backprop all-reduce. if self.input_is_parallel: input_parallel = input_ diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 58780881b..de7169654 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -103,6 +103,7 @@ class NeoXArgsModel(NeoXArgsTemplate): hidden_size: int = None """ Transformer hidden size. 
+ When using muP, this is d_model """ num_attention_heads: int = None @@ -1035,17 +1036,17 @@ class NeoXArgsTraining(NeoXArgsTemplate): Path to the base shapes to save to/load from """ - mup_emb: int = 1 + mup_m_emb: float = 1.0 """ Embedding output multiplier """ - mup_m_width: int = 1 + mup_m_width: float = None """ Manually set the layer width multiplier (d_model/d_model,base) """ - mup_d_model_base: int = 64 + mup_d_model_base: int = 256 """ d_model,base Proxy (base) model's layer width From 8be6c66b2add4b704417b46644602b375fc240d3 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 18 Dec 2023 06:32:44 +0000 Subject: [PATCH 22/94] remove calling save_base_shapes --- megatron/training.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index fad9655bc..221fa6cd3 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -409,10 +409,10 @@ def get_model(neox_args, use_cache=False): if neox_args.mup_m_width == 1: neox_args.mup_m_width = neox_args.hidden_size / neox_args.mup_d_model_base - base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" + # base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" - if neox_args.save_base_shapes: - save_base_shapes(neox_args, base_shapes, use_cache) + # if neox_args.save_base_shapes: + # save_base_shapes(neox_args, base_shapes, use_cache) # mup.set_base_shapes(model, base_shapes) From c9fb18ba12b8974f9310e5094e75317c71666192 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 18 Dec 2023 07:48:47 +0000 Subject: [PATCH 23/94] lr adjustments is done in train_step to address lr being reset due to lr_scheduling --- megatron/training.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 221fa6cd3..994ff5fd9 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -585,12 +585,6 @@ def get_optimizer(model, neox_args): else: raise ValueError(f"Optimizer type {neox_args.optimizer_type} not recognized") - # This is where the LR scaling is applied - if neox_args.use_mup: - for pg in optimizer.param_groups: - if ("lr_adjust" in pg) and pg["lr_adjust"] is True: - pg["lr"] /= neox_args.mup_m_width - if neox_args.deepspeed: # fp16 wrapper is not required for DeepSpeed. return optimizer, param_groups @@ -729,6 +723,11 @@ def backward_step(neox_args, timers, optimizer, model, loss): def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler): """Single training step.""" + if neox_args.use_mup: + for pg in optimizer.param_groups: + if ("lr_adjust" in pg) and pg["lr_adjust"] is True: + pg["lr"] /= neox_args.mup_m_width + # Pipeline parallelism schedules forward/backward/step if neox_args.is_pipe_parallel: reduced_loss = train_step_pipe( From 795371c3525ea3537074d2c5f416c7f2ab7e4207 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 18 Dec 2023 09:45:26 +0000 Subject: [PATCH 24/94] lr scaling for mup is moved here instead --- megatron/learning_rates.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index d5d2640c9..424aee20c 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -37,6 +37,7 @@ def __init__( use_checkpoint_lr_scheduler=True, override_lr_scheduler=False, use_mup=False, + mup_m_width=1, ): # Class values. 
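# Editor's sketch (not part of this diff): the "lr_adjust" flag attached to the
# parameter groups in PATCH 18 is what the scheduler in this patch keys on --
# only the hidden-weight groups have their learning rate divided by the width
# multiplier, while the embedding and bias/norm groups keep the base learning
# rate. The numbers below are hypothetical examples, not values from the configs.
base_lr, mup_m_width = 6.0e-4, 16.0
hidden_weight_lr = base_lr / mup_m_width  # groups with lr_adjust=True  -> 3.75e-05
embedding_lr = base_lr                    # groups with lr_adjust=False -> 6.0e-04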
@@ -51,6 +52,7 @@ def __init__( self.override_lr_scheduler = override_lr_scheduler self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler self.use_mup = use_mup + self.mup_m_width = mup_m_width if self.override_lr_scheduler: assert not self.use_checkpoint_lr_scheduler, ( "both override and " "use-checkpoint are set." @@ -95,8 +97,8 @@ def step(self, step_num=None): self.num_iters = step_num new_lr = self.get_lr() for group in self.optimizer.param_groups: - if self.use_mup and "width_mult" in group: - group["lr"] = new_lr / group["width_mult"] + if self.use_mup and ("lr_adjust" in group) and group["lr_adjust"] is True: + group["lr"] = new_lr / self.mup_m_width else: group["lr"] = new_lr From 087beee884442ce3f98a6de34ed1aff533169e9a Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 3 Jan 2024 14:27:33 +0000 Subject: [PATCH 25/94] removed mup usage for coord check --- megatron/mup_substitute.py | 75 +++++++++++++++++++++++--------------- megatron/training.py | 20 +++++----- 2 files changed, 56 insertions(+), 39 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index e16a21589..78ddedff0 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -10,7 +10,7 @@ import torch import torch.nn.functional as F -from mup import coord_check as mup_coord_check +# from mup import coord_check as mup_coord_check from megatron.training import train_step @@ -39,31 +39,44 @@ def _get_coord_data( ): df = [] + def word_embedding_coord_check_hook(module, input, output): + with torch.no_grad(): + word_embedding_act_abs_mean_list.append(output.abs().mean().item()) + for i in range(nseeds): torch.manual_seed(i) for width, model in models.items(): model = model() model.train() - optimizer = optcls(model) + # optimizer = optcls(model) + optimizer, _ = optcls(model, neox_args) for step in range(nsteps + 1): + word_embedding_act_abs_mean_list = [] remove_hooks = [] # add hooks - for name, module in model.named_modules(): - if filter_module_by_name and not filter_module_by_name(name): - continue - remove_hooks.append( - module.register_forward_hook( - mup_coord_check._record_coords( - df, - width, - name, - step + 1, - output_fdict=output_fdict, - input_fdict=input_fdict, - param_fdict=param_fdict, - ) - ) - ) + # for name, module in model.named_modules(): + # if name.endswith(".embedding.word_embeddings"): + # print("yess") + # import sys; sys.exit + # remove_hook.append( + # module.register_forward_hook(word_embedding_coord_check_hook)) + + # # if filter_module_by_name and not filter_module_by_name(name): + # # continue + # # pass + # # remove_hooks.append( + # # module.register_forward_hook( + # # mup_coord_check._record_coords( + # # df, + # # width, + # # name, + # # step + 1, + # # output_fdict=output_fdict, + # # input_fdict=input_fdict, + # # param_fdict=param_fdict, + # # ) + # # ) + # # ) # train for a step loss_dict, skipped_iter = train_step( @@ -79,6 +92,8 @@ def _get_coord_data( for handle in remove_hooks: handle.remove() + print("word_embedding_act_abs_mean_list") + print(word_embedding_act_abs_mean_list) import gc del model @@ -180,9 +195,10 @@ def get_coord_data( if lr is None: lr = 0.1 if optimizer == "sgd" else 1e-3 if mup: - from mup.optim import MuAdam as Adam - from mup.optim import MuAdamW as AdamW - from mup.optim import MuSGD as SGD + # from mup.optim import MuAdam as Adam + # from mup.optim import MuAdamW as AdamW + # from mup.optim import MuSGD as SGD + from deepspeed.ops.adam import FusedAdam as Adam else: from torch.optim 
import SGD, Adam, AdamW @@ -195,14 +211,15 @@ def get_trainable(model): params.append(p) return params - if optimizer == "sgd": - optcls = lambda model: SGD(get_trainable(model), lr=lr) - elif optimizer == "adam": - optcls = lambda model: Adam(get_trainable(model), lr=lr) - elif optimizer == "adamw": - optcls = lambda model: AdamW(get_trainable(model), lr=lr) - elif optimizer is None: - raise ValueError("optimizer should be sgd|adam|adamw or a custom function") + # if optimizer == "sgd": + # optcls = lambda model: SGD(get_trainable(model), lr=lr) + # elif optimizer == "adam": + # optcls = lambda model: Adam(get_trainable(model), lr=lr) + # elif optimizer == "adamw": + # optcls = lambda model: AdamW(get_trainable(model), lr=lr) + # elif optimizer is None: + # raise ValueError("optimizer should be sgd|adam|adamw or a custom function") + optcls = optimizer data = _get_coord_data( neox_args, timers, lr_scheduler, models, dataloader, optcls, **kwargs diff --git a/megatron/training.py b/megatron/training.py index 994ff5fd9..999b857d3 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -126,7 +126,7 @@ def save_base_shapes(neox_args, base_shapes, use_cache): def mup_coord_check(neox_args, timers, lr_scheduler, train_data_iterator): from megatron.mup_substitute import get_coord_data - from mup.coord_check import plot_coord_data + # from mup.coord_check import plot_coord_data def lazy_model(hidden_size): def gen(): @@ -149,17 +149,19 @@ def gen(): for hidden_size in (neox_args.num_attention_heads * (2**p) for p in range(2, 9)): models[hidden_size] = lazy_model(hidden_size) + # optimizer, _ = get_optimizer(model, neox_args) + neox_args.use_mup = True df_up = get_coord_data( - neox_args, timers, lr_scheduler, models, train_data_iterator, mup=True + neox_args, timers, lr_scheduler, models, train_data_iterator, mup=True, optimizer=get_optimizer ) neox_args.use_mup = False df_sp = get_coord_data( - neox_args, timers, lr_scheduler, models, train_data_iterator, mup=False + neox_args, timers, lr_scheduler, models, train_data_iterator, mup=False, optimizer=get_optimizer ) - plot_coord_data(df_up, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") - plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") + # plot_coord_data(df_up, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") + # plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") print_rank_0("Saved coord check plots... exiting") sys.exit(1) @@ -204,6 +206,7 @@ def pretrain(neox_args): timers("train/valid/test data iterators").stop() if neox_args.use_mup and neox_args.coord_check: + print_rank_0("Do muP Coord Check") mup_coord_check(neox_args, timers, lr_scheduler, train_data_iterator) # Print setup timing. 
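# Editor's sketch (not part of this diff): what the coord check above is looking
# for. For each model width and a handful of training steps, a forward hook
# records the mean absolute activation of a module (here the word-embedding
# output); under muP these scales should stay roughly flat as width grows,
# whereas under standard parametrization they drift upward with width. The hook
# below paraphrases word_embedding_coord_check_hook in mup_substitute.py, with
# hypothetical names.
def make_coord_check_hook(records, width, step):
    def hook(module, inputs, output):
        records.append((width, step, output.abs().mean().item()))
    return hook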
@@ -408,6 +411,7 @@ def get_model(neox_args, use_cache=False): if neox_args.mup_m_width == 1: neox_args.mup_m_width = neox_args.hidden_size / neox_args.mup_d_model_base + print_rank_0(f"mup_m_width set to {neox_args.mup_m_width}") # base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" @@ -623,6 +627,7 @@ def get_learning_rate_scheduler(optimizer, neox_args): use_checkpoint_lr_scheduler=neox_args.use_checkpoint_lr_scheduler, override_lr_scheduler=neox_args.override_lr_scheduler, use_mup=neox_args.use_mup, + mup_m_width=neox_args.mup_m_width, ) return lr_scheduler @@ -723,11 +728,6 @@ def backward_step(neox_args, timers, optimizer, model, loss): def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler): """Single training step.""" - if neox_args.use_mup: - for pg in optimizer.param_groups: - if ("lr_adjust" in pg) and pg["lr_adjust"] is True: - pg["lr"] /= neox_args.mup_m_width - # Pipeline parallelism schedules forward/backward/step if neox_args.is_pipe_parallel: reduced_loss = train_step_pipe( From e7b7bf67a437f69a46ba6b90b1257519d5e48465 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 24 Jan 2024 15:52:21 +0000 Subject: [PATCH 26/94] latest update on coord check implementation --- megatron/mup_substitute.py | 89 ++++++++++++++++++++------------------ megatron/training.py | 33 +++++++++----- 2 files changed, 71 insertions(+), 51 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 78ddedff0..6b54d904f 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -10,6 +10,8 @@ import torch import torch.nn.functional as F +from megatron import print_rank_0 + # from mup import coord_check as mup_coord_check from megatron.training import train_step @@ -30,7 +32,7 @@ def _get_coord_data( filter_module_by_name=None, fix_data=True, cuda=True, - nseeds=1, + nseeds=3, output_fdict=None, input_fdict=None, param_fdict=None, @@ -43,40 +45,47 @@ def word_embedding_coord_check_hook(module, input, output): with torch.no_grad(): word_embedding_act_abs_mean_list.append(output.abs().mean().item()) + word_embedding_act_abs_mean_list = [] + _seeds = [] + _steps = [] + remove_hooks = [] + for i in range(nseeds): torch.manual_seed(i) for width, model in models.items(): model = model() model.train() - # optimizer = optcls(model) - optimizer, _ = optcls(model, neox_args) + optimizer = optcls(model) + # optimizer, _ = optcls(model, neox_args) + for step in range(nsteps + 1): - word_embedding_act_abs_mean_list = [] - remove_hooks = [] + # add hooks - # for name, module in model.named_modules(): - # if name.endswith(".embedding.word_embeddings"): - # print("yess") - # import sys; sys.exit - # remove_hook.append( - # module.register_forward_hook(word_embedding_coord_check_hook)) - - # # if filter_module_by_name and not filter_module_by_name(name): - # # continue - # # pass - # # remove_hooks.append( - # # module.register_forward_hook( - # # mup_coord_check._record_coords( - # # df, - # # width, - # # name, - # # step + 1, - # # output_fdict=output_fdict, - # # input_fdict=input_fdict, - # # param_fdict=param_fdict, - # # ) - # # ) - # # ) + for name, module in model.named_modules(): + if name.endswith(".word_embeddings"): + remove_hooks.append( + module.register_forward_hook(word_embedding_coord_check_hook)) + + _steps.append(step) + _seeds.append(i) + + + # if filter_module_by_name and not filter_module_by_name(name): + # continue + # pass + # remove_hooks.append( + # module.register_forward_hook( + # 
mup_coord_check._record_coords( + # df, + # width, + # name, + # step + 1, + # output_fdict=output_fdict, + # input_fdict=input_fdict, + # param_fdict=param_fdict, + # ) + # ) + # ) # train for a step loss_dict, skipped_iter = train_step( @@ -91,14 +100,13 @@ def word_embedding_coord_check_hook(module, input, output): # remove hooks for handle in remove_hooks: handle.remove() - - print("word_embedding_act_abs_mean_list") - print(word_embedding_act_abs_mean_list) import gc - del model gc.collect() + for _i,_j,_k in zip(_seeds, _steps, word_embedding_act_abs_mean_list): + print_rank_0(_i, _j, _k) + return pd.DataFrame(df) @@ -211,15 +219,14 @@ def get_trainable(model): params.append(p) return params - # if optimizer == "sgd": - # optcls = lambda model: SGD(get_trainable(model), lr=lr) - # elif optimizer == "adam": - # optcls = lambda model: Adam(get_trainable(model), lr=lr) - # elif optimizer == "adamw": - # optcls = lambda model: AdamW(get_trainable(model), lr=lr) - # elif optimizer is None: - # raise ValueError("optimizer should be sgd|adam|adamw or a custom function") - optcls = optimizer + if optimizer == "sgd": + optcls = lambda model: SGD(get_trainable(model), lr=lr) + elif optimizer == "adam": + optcls = lambda model: Adam(get_trainable(model), lr=lr) + elif optimizer == "adamw": + optcls = lambda model: AdamW(get_trainable(model), lr=lr) + elif optimizer is None: + raise ValueError("optimizer should be sgd|adam|adamw or a custom function") data = _get_coord_data( neox_args, timers, lr_scheduler, models, dataloader, optcls, **kwargs diff --git a/megatron/training.py b/megatron/training.py index 4825141ed..86e2d5fa3 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -55,7 +55,7 @@ CharCounter, ) from megatron.model.gpt2_model import cross_entropy -from eval_tasks import run_eval_harness +# from eval_tasks import run_eval_harness def mup_weights_reinit(neox_args, model): @@ -124,7 +124,7 @@ def save_base_shapes(neox_args, base_shapes, use_cache): sys.exit(1) -def mup_coord_check(neox_args, timers, lr_scheduler, train_data_iterator): +def mup_coord_check(neox_args, timers, train_data_iterator): from megatron.mup_substitute import get_coord_data # from mup.coord_check import plot_coord_data @@ -133,7 +133,7 @@ def gen(): old_hidden_size = neox_args.hidden_size neox_args.hidden_size = hidden_size - model, optimizer, _ = setup_model_and_optimizer( + model, optimizer, lr_scheduler = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) @@ -145,24 +145,35 @@ def gen(): models = {} - # Hidden size needs to be divisible by num attention heads - for hidden_size in (neox_args.num_attention_heads * (2**p) for p in range(2, 9)): - models[hidden_size] = lazy_model(hidden_size) + # # Hidden size needs to be divisible by num attention heads + # for hidden_size in (neox_args.num_attention_heads * (2**p) for p in range(2, 9)): + # models[hidden_size] = lazy_model(hidden_size) - # optimizer, _ = get_optimizer(model, neox_args) + # 128 + # 256 + # 512 + # 1024 + # 2048 + # 4096 + # 8192 + models[neox_args.hidden_size] = lazy_model(neox_args.hidden_size) + + print_rank_0("df_up") neox_args.use_mup = True df_up = get_coord_data( - neox_args, timers, lr_scheduler, models, train_data_iterator, mup=True, optimizer=get_optimizer + neox_args, timers, None, models, train_data_iterator, mup=True, optimizer="adam" ) + print_rank_0("df_sp") neox_args.use_mup = False df_sp = get_coord_data( - neox_args, timers, lr_scheduler, models, train_data_iterator, mup=False, optimizer=get_optimizer + 
neox_args, timers, None, models, train_data_iterator, mup=False, optimizer="adam" ) # plot_coord_data(df_up, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") # plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") + print_rank_0("Saved coord check plots... exiting") sys.exit(1) @@ -207,7 +218,9 @@ def pretrain(neox_args): if neox_args.use_mup and neox_args.coord_check: print_rank_0("Do muP Coord Check") - mup_coord_check(neox_args, timers, lr_scheduler, train_data_iterator) + mup_coord_check(neox_args, timers, train_data_iterator) + else: + pass # Print setup timing. print_rank_0("done with setups ...") From 8dea9cef9eed9e0b1e6434addde5d529a3a0e36b Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 06:02:20 +0000 Subject: [PATCH 27/94] fix merge conflict --- configs/neox_arguments.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 5d58932fa..3b95b7e67 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,11 +111,7 @@ Logging Arguments - **git_hash**: str -<<<<<<< HEAD - Default = 02687a8 -======= Default = 31cb364 ->>>>>>> e5a7ea71e96eeada636c9612036dc85e886d973d current git hash of repository From 3664ebab5a0eb1614a1883c313584a1d623f5256 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 06:04:41 +0000 Subject: [PATCH 28/94] changed `mup_m_width` to `mup_width_multiplier` --- configs/neox_arguments.md | 4 +-- megatron/learning_rates.py | 6 ++-- megatron/model/init_functions.py | 52 ++++++++++++++-------------- megatron/model/transformer.py | 2 +- megatron/mpu/layers.py | 4 +-- megatron/neox_arguments/neox_args.py | 2 +- megatron/training.py | 8 ++--- 7 files changed, 39 insertions(+), 39 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 3b95b7e67..409baab3d 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -1567,7 +1567,7 @@ Training Arguments -- **mup_m_width**: int +- **mup_width_multiplier**: int Default = 1 @@ -1577,7 +1577,7 @@ Training Arguments - **mup_d_model_base**: int - Default = 64 + Default = 256 d_model,base Proxy (base) model's layer width diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 424aee20c..da0d3a353 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -37,7 +37,7 @@ def __init__( use_checkpoint_lr_scheduler=True, override_lr_scheduler=False, use_mup=False, - mup_m_width=1, + mup_width_multiplier=1, ): # Class values. @@ -52,7 +52,7 @@ def __init__( self.override_lr_scheduler = override_lr_scheduler self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler self.use_mup = use_mup - self.mup_m_width = mup_m_width + self.mup_width_multiplier = mup_width_multiplier if self.override_lr_scheduler: assert not self.use_checkpoint_lr_scheduler, ( "both override and " "use-checkpoint are set." 
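# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): how the scheduler change above
# plays out. Under muP, optimizer parameter groups tagged "lr_adjust" have
# their learning rate divided by mup_width_multiplier, while untagged
# (vector-like) groups keep the base rate. The helper name and the toy tensors
# below are hypothetical; only the division mirrors the patched
# AnnealingLR.step() behaviour.
import torch

def apply_mup_lr(optimizer, new_lr, use_mup, mup_width_multiplier):
    for group in optimizer.param_groups:
        if use_mup and group.get("lr_adjust", False):
            group["lr"] = new_lr / mup_width_multiplier  # width-sensitive params
        else:
            group["lr"] = new_lr

# usage: hidden-to-hidden weights get the adjusted rate, biases do not
w = torch.nn.Parameter(torch.randn(256, 256))
b = torch.nn.Parameter(torch.zeros(256))
opt = torch.optim.Adam([{"params": [w], "lr_adjust": True}, {"params": [b]}], lr=1e-3)
apply_mup_lr(opt, new_lr=1e-3, use_mup=True, mup_width_multiplier=256 / 64)
# ---------------------------------------------------------------------------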
@@ -98,7 +98,7 @@ def step(self, step_num=None): new_lr = self.get_lr() for group in self.optimizer.param_groups: if self.use_mup and ("lr_adjust" in group) and group["lr_adjust"] is True: - group["lr"] = new_lr / self.mup_m_width + group["lr"] = new_lr / self.mup_width_multiplier else: group["lr"] = new_lr diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 44666b229..3eecd7308 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -68,12 +68,12 @@ def _orthogonal(tensor, gain=1): return tensor -def orthogonal_init_method(n_layers=1, mup_m_width=1.0): +def orthogonal_init_method(n_layers=1, mup_width_multiplier=1.0): """Fills the input Tensor with a (semi) orthogonal matrix, as described in Exact solutions to the nonlinear dynamics of learning in deep linear neural networks - Saxe, A. et al. (2013) Optionally scaling by number of layers possible, as introduced in OBST - Nestler et. al. (2021, to be released)""" - if mup_m_width != 1: + if mup_width_multiplier != 1: raise ValueError( "Orthogonal init needs to be patched to support mup. Disable mup or use a different init method to avoid this error" ) @@ -84,57 +84,57 @@ def init_(tensor): return init_ -def xavier_uniform_init_method(mup_m_width=1.0): +def xavier_uniform_init_method(mup_width_multiplier=1.0): """Fills the input Tensor with values according to the method described in Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a uniform distribution.""" - def init_(tensor, mup_m_width=mup_m_width): + def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.xavier_uniform_(tensor) - if mup_m_width != 1: + if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_m_width) + init_weight.div_(mup_width_multiplier) return init_weight return init_ -def xavier_normal_init_method(mup_m_width=1.0): +def xavier_normal_init_method(mup_width_multiplier=1.0): """Fills the input Tensor with values according to the method described in Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a normal distribution.""" - def init_(tensor, mup_m_width=mup_m_width): + def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.xavier_normal_(tensor) - if mup_m_width != 1: + if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_m_width) + init_weight.div_(mup_width_multiplier) return init_weight return init_ -def small_init_init_method(dim, mup_m_width=1.0): +def small_init_init_method(dim, mup_width_multiplier=1.0): """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving the Normalization of Self-Attention - Nguyen, T. & Salazar, J. 
(2010), using a normal distribution.""" std = math.sqrt(2 / (5 * dim)) - def init_(tensor, mup_m_width=mup_m_width): + def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) - if mup_m_width != 1: + if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_m_width) + init_weight.div_(mup_width_multiplier) return init_weight return init_ -def wang_init_method(n_layers, dim, mup_m_width=1.0): +def wang_init_method(n_layers, dim, mup_width_multiplier=1.0): std = 2 / n_layers / math.sqrt(dim) - def init_(tensor, mup_m_width=mup_m_width): + def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) - if mup_m_width != 1: + if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_m_width) + init_weight.div_(mup_width_multiplier) return init_weight return init_ @@ -145,30 +145,30 @@ def get_init_methods(args): def _get(name): if name == "normal": return init_method_normal( - sigma=args.init_method_std/math.sqrt(args.mup_m_width) + sigma=args.init_method_std/math.sqrt(args.mup_width_multiplier) ) elif name == "scaled_normal": return scaled_init_method_normal( - sigma=args.init_method_std/math.sqrt(args.mup_m_width), + sigma=args.init_method_std/math.sqrt(args.mup_width_multiplier), num_layers=args.num_layers ) elif name == "orthogonal": - return orthogonal_init_method(args.mup_m_width) + return orthogonal_init_method(args.mup_width_multiplier) elif name == "scaled_orthogonal": return orthogonal_init_method( - args.num_layers, args.mup_m_width + args.num_layers, args.mup_width_multiplier ) elif name == "xavier_uniform": - return xavier_uniform_init_method(args.mup_m_width) + return xavier_uniform_init_method(args.mup_width_multiplier) elif name == "xavier_normal": - return xavier_normal_init_method(args.mup_m_width) + return xavier_normal_init_method(args.mup_width_multiplier) elif name == "wang_init": return wang_init_method( - args.num_layers, args.hidden_size, args.mup_m_width + args.num_layers, args.hidden_size, args.mup_width_multiplier ) elif name == "small_init": return small_init_init_method( - args.hidden_size, args.mup_m_width + args.hidden_size, args.mup_width_multiplier ) else: raise NotImplementedError(f"Unknown init method {name}") diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 660a0ad6d..79424ca44 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -972,7 +972,7 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) if args is not None and args.use_mup: - logits_parallel /= args.mup_m_width + logits_parallel /= args.mup_width_multiplier # Gather if needed. if parallel_output: diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 859553551..db125375f 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -429,7 +429,7 @@ def __init__( self.stride = stride self.mup_rescale_parameters = mup_rescale_parameters self.use_mup = neox_args.use_mup - self.m_width = neox_args.mup_m_width + self.m_width = neox_args.mup_width_multiplier # Parameters. 
# Note: torch.nn.functional.linear performs XA^T + b and as a result @@ -627,7 +627,7 @@ def __init__( self.keep_master_weight_for_test = keep_master_weight_for_test self.mup_rescale_parameters = mup_rescale_parameters self.use_mup = neox_args.use_mup - self.m_width = neox_args.mup_m_width + self.m_width = neox_args.mup_width_multiplier # Parameters. # Note: torch.nn.functional.linear performs XA^T + b and as a result diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 19864b821..b7b0bcb8c 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1046,7 +1046,7 @@ class NeoXArgsTraining(NeoXArgsTemplate): Embedding output multiplier """ - mup_m_width: float = None + mup_width_multiplier: float = None """ Manually set the layer width multiplier (d_model/d_model,base) """ diff --git a/megatron/training.py b/megatron/training.py index 86e2d5fa3..d8461c1eb 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -422,9 +422,9 @@ def get_model(neox_args, use_cache=False): # neox_args.use_mup = False if neox_args.use_mup: - if neox_args.mup_m_width == 1: - neox_args.mup_m_width = neox_args.hidden_size / neox_args.mup_d_model_base - print_rank_0(f"mup_m_width set to {neox_args.mup_m_width}") + if neox_args.mup_width_multiplier == 1: + neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base + print_rank_0(f"mup_width_multiplier set to {neox_args.mup_width_multiplier}") # base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" @@ -640,7 +640,7 @@ def get_learning_rate_scheduler(optimizer, neox_args): use_checkpoint_lr_scheduler=neox_args.use_checkpoint_lr_scheduler, override_lr_scheduler=neox_args.override_lr_scheduler, use_mup=neox_args.use_mup, - mup_m_width=neox_args.mup_m_width, + mup_width_multiplier=neox_args.mup_width_multiplier, ) return lr_scheduler From 6a462476a0db172657accc13c1e12dc7b2f97bbc Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 06:14:53 +0000 Subject: [PATCH 29/94] fixed notations --- megatron/model/init_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 3eecd7308..57666e567 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -18,7 +18,7 @@ def init_method_normal(sigma): - """Init method based on N(0, sigma).""" + """Init method based on N(0, sigma^2).""" def init_(tensor): return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) @@ -27,7 +27,7 @@ def init_(tensor): def scaled_init_method_normal(sigma, num_layers): - """Init method based on N(0, sigma/sqrt(2*num_layers).""" + """Init method based on N(0, sigma^2/(2*num_layers).""" std = sigma / math.sqrt(2.0 * num_layers) def init_(tensor): From 7439f9a1cd8d63baefa2c29b5d523c7cc54fb91d Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 06:19:17 +0000 Subject: [PATCH 30/94] correct scale --- megatron/model/init_functions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 57666e567..bbf109413 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -92,7 +92,7 @@ def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.xavier_uniform_(tensor) if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_width_multiplier) + 
init_weight.div_(math.sqrt(mup_width_multiplier)) return init_weight return init_ @@ -106,7 +106,7 @@ def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.xavier_normal_(tensor) if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_width_multiplier) + init_weight.div_(math.sqrt(mup_width_multiplier)) return init_weight return init_ @@ -121,7 +121,7 @@ def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_width_multiplier) + init_weight.div_(math.sqrt(mup_width_multiplier)) return init_weight return init_ @@ -134,7 +134,7 @@ def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_width_multiplier) + init_weight.div_(math.sqrt(mup_width_multiplier)) return init_weight return init_ From 5b2d31c9f2fe88da0a03800bd999a159934f9518 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 06:54:18 +0000 Subject: [PATCH 31/94] m_emb * embed(X) --- megatron/model/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 79424ca44..42e3f1893 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -972,7 +972,7 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) if args is not None and args.use_mup: - logits_parallel /= args.mup_width_multiplier + logits_parallel *= args.mup_emb # Gather if needed. if parallel_output: From 98caa82f92b51b2897a3407dfc0103fa184138e7 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 06:59:14 +0000 Subject: [PATCH 32/94] removed mup rescale in the layers --- megatron/model/transformer.py | 1 - megatron/mpu/layers.py | 151 ---------------------------------- 2 files changed, 152 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 42e3f1893..0fe907569 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -220,7 +220,6 @@ def __init__( init_method=init_method, gather_output=not parallel_output, skip_bias_add=False, - mup_rescale_parameters=is_last_layer, # rescale params only called if neox_args.use_mup = True, despite it not being included here ) # else: diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index db125375f..20dd5dec3 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -162,25 +162,6 @@ def __init__( self.weight, init_method, partition_dim=0, stride=1 ) - def mup_reinitialize_weights(self, neox_args): - if neox_args.use_cpu_initialization: - _initialize_affine_weight_cpu( - neox_args, - self.weight, - self.num_embeddings, - self.embedding_dim, - self.num_embeddings_per_partition, - 0, - partial(self.init_method, use_mup=True), - ) - else: - _initialize_affine_weight_gpu( - self.weight, - partial(self.init_method, use_mup=True), - partition_dim=0, - stride=1, - ) - def forward(self, input_): if self.model_parallel_size > 1: # Build the mask. 
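# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the "correct scale" change above
# divides the initializer's standard deviation by sqrt(mup_width_multiplier),
# so the weight variance shrinks linearly with the width multiplier instead of
# quadratically. The function mirrors small_init_init_method; the shapes in
# the usage line are arbitrary.
import math
import torch

def small_init(dim, mup_width_multiplier=1.0):
    std = math.sqrt(2 / (5 * dim))
    def init_(tensor):
        torch.nn.init.normal_(tensor, mean=0.0, std=std)
        if mup_width_multiplier != 1:
            with torch.no_grad():
                tensor.div_(math.sqrt(mup_width_multiplier))
        return tensor
    return init_

weight = torch.empty(1024, 1024)
small_init(dim=1024, mup_width_multiplier=1024 / 256)(weight)  # effective std ~ sqrt(2/(5*1024)) / 2
# ---------------------------------------------------------------------------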
@@ -292,25 +273,6 @@ def __init__( self._k_len_cached = None self._rel_pos_bucket_cached = None - def mup_reinitialize_weights(self, neox_args): - if self.use_cpu_initialization: - _initialize_affine_weight_cpu( - neox_args, - self.weight, - self.num_buckets, - self.heads, - self.num_heads_per_partition, - partition_dim=1, - init_method=partial(self.init_method, use_mup=True), - ) - else: - _initialize_affine_weight_gpu( - self.weight, - partial(self.init_method, use_mup=True), - partition_dim=1, - stride=1, - ) - @staticmethod def get_heads_range(global_n_heads, rank, world_size): per_partition_n_heads = divide(global_n_heads, world_size) @@ -413,7 +375,6 @@ def __init__( stride=1, keep_master_weight_for_test=False, skip_bias_add=False, - mup_rescale_parameters=False, ): super(ColumnParallelLinear, self).__init__() @@ -427,9 +388,6 @@ def __init__( self.skip_bias_add = skip_bias_add self.init_method = init_method self.stride = stride - self.mup_rescale_parameters = mup_rescale_parameters - self.use_mup = neox_args.use_mup - self.m_width = neox_args.mup_width_multiplier # Parameters. # Note: torch.nn.functional.linear performs XA^T + b and as a result @@ -491,56 +449,6 @@ def __init__( else: self.register_parameter("bias", None) - # Copied from Mup - def width_mult(self): - assert hasattr(self.weight, "infshape"), ( - "Please call set_base_shapes(...). If using torch.nn.DataParallel, " - "switch to distributed training with " - "torch.nn.parallel.DistributedDataParallel instead" - ) - return self.weight.infshape.width_mult() - - # Copied from Mup - def _rescale_parameters(self): - """Rescale parameters to convert SP initialization to μP initialization. - Warning: This method is NOT idempotent and should be called only once - unless you know what you are doing. - """ - if hasattr(self, "_has_rescaled_params") and self._has_rescaled_params: - raise RuntimeError( - "`_rescale_parameters` has been called once before already. " - "Unless you know what you are doing, usually you should not be calling `_rescale_parameters` more than once.\n" - "If you called `set_base_shapes` on a model loaded from a checkpoint, " - "or just want to re-set the base shapes of an existing model, " - "make sure to set the flag `rescale_params=False`.\n" - "To bypass this error and *still rescale parameters*, set `self._has_rescaled_params=False` before this call." - ) - if self.bias is not None: - self.bias.data *= self.width_mult() ** 0.5 - self.weight.data *= self.width_mult() ** 0.5 - self._has_rescaled_params = True - - def mup_reinitialize_weights(self, neox_args): - if neox_args.use_cpu_initialization: - self.master_weight = _initialize_affine_weight_cpu( - neox_args, - self.weight, - self.output_size, - self.input_size, - self.output_size_per_partition, - 0, - partial(self.init_method, use_mup=True), - stride=self.stride, - return_master_weight=keep_master_weight_for_test, - ) - else: - _initialize_affine_weight_gpu( - self.weight, - partial(self.init_method, use_mup=True), - partition_dim=0, - stride=self.stride, - ) - def set_parallel_output(self, value: bool): assert isinstance(value, bool) self.gather_output = ( @@ -549,9 +457,6 @@ def set_parallel_output(self, value: bool): def forward(self, input_): - # Y_logits = W_unembed * X / m_width - if self.use_mup and self.mup_rescale_parameters: - input_ /= self.m_width # Set up backprop all-reduce. input_parallel = copy_to_model_parallel_region(input_) # Matrix multiply. 
@@ -609,7 +514,6 @@ def __init__( keep_master_weight_for_test=False, skip_bias_add=False, parallel_output=False, - mup_rescale_parameters=False, ): super(RowParallelLinear, self).__init__() @@ -625,9 +529,6 @@ def __init__( self.init_method = init_method self.stride = stride self.keep_master_weight_for_test = keep_master_weight_for_test - self.mup_rescale_parameters = mup_rescale_parameters - self.use_mup = neox_args.use_mup - self.m_width = neox_args.mup_width_multiplier # Parameters. # Note: torch.nn.functional.linear performs XA^T + b and as a result @@ -683,63 +584,11 @@ def __init__( else: self.register_parameter("bias", None) - # Copied from Mup - def width_mult(self): - assert hasattr(self.weight, "infshape"), ( - "Please call set_base_shapes(...). If using torch.nn.DataParallel, " - "switch to distributed training with " - "torch.nn.parallel.DistributedDataParallel instead" - ) - return self.weight.infshape.width_mult() - - # Copied from Mup - def _rescale_parameters(self): - """Rescale parameters to convert SP initialization to μP initialization. - Warning: This method is NOT idempotent and should be called only once - unless you know what you are doing. - """ - if hasattr(self, "_has_rescaled_params") and self._has_rescaled_params: - raise RuntimeError( - "`_rescale_parameters` has been called once before already. " - "Unless you know what you are doing, usually you should not be calling `_rescale_parameters` more than once.\n" - "If you called `set_base_shapes` on a model loaded from a checkpoint, " - "or just want to re-set the base shapes of an existing model, " - "make sure to set the flag `rescale_params=False`.\n" - "To bypass this error and *still rescale parameters*, set `self._has_rescaled_params=False` before this call." - ) - if self.bias is not None: - self.bias.data *= self.width_mult() ** 0.5 - self.weight.data *= self.width_mult() ** 0.5 - self._has_rescaled_params = True - - def mup_reinitialize_weights(self, neox_args): - if neox_args.use_cpu_initialization: - self.master_weight = _initialize_affine_weight_cpu( - neox_args, - self.weight, - self.output_size, - self.input_size, - self.input_size_per_partition, - 1, - partial(self.init_method, use_mup=True), - stride=self.stride, - return_master_weight=self.keep_master_weight_for_test, - ) - else: - _initialize_affine_weight_gpu( - self.weight, - partial(self.init_method, use_mup=True), - partition_dim=1, - stride=self.stride, - ) - def set_parallel_output(self, parallel_output: bool): assert isinstance(parallel_output, bool) self.parallel_output = parallel_output def forward(self, input_): - if self.use_mup and self.mup_rescale_parameters: - input_ /= self.m_width # Set up backprop all-reduce. 
if self.input_is_parallel: input_parallel = input_ From 5c99637c50bd2163bc43eb369c651099ec5e6aea Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 07:03:08 +0000 Subject: [PATCH 33/94] removed mup rescale in the layers --- megatron/model/transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 0fe907569..347126b6e 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -236,7 +236,6 @@ def __init__( # init_method=init_method, # parallel_output=parallel_output, # skip_bias_add=False, - # mup_rescale_parameters=is_last_layer, # only called if neox_args.use_mup = True, despite it not being included here # ) def forward(self, hidden_states): From a636f0610c23f4239ee753367ffb2bff5a4bbcbf Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 07:16:59 +0000 Subject: [PATCH 34/94] adjust mup_m_emb to mup_embedding_multiplier --- configs/neox_arguments.md | 2 +- megatron/model/transformer.py | 2 +- megatron/model/word_embeddings.py | 5 ++--- megatron/neox_arguments/neox_args.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 409baab3d..7acbde83d 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -1559,7 +1559,7 @@ Training Arguments -- **mup_emb**: int +- **mup_embedding_multiplier**: int Default = 1 diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 347126b6e..b5cf1754d 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -970,7 +970,7 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) if args is not None and args.use_mup: - logits_parallel *= args.mup_emb + logits_parallel /= args.mup_width_multiplier # Gather if needed. if parallel_output: diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index 29b20b320..bac8d2808 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -51,8 +51,7 @@ def __init__( self.init_method = init_method self.num_tokentypes = num_tokentypes self.use_mup = neox_args.use_mup - self.mup_m_emb = float(neox_args.mup_m_emb) - # self.mup_rp_embedding_mult = neox_args.mup_rp_embedding_mult + self.mup_embedding_multiplier = float(neox_args.mup_embedding_multiplier) # Word embeddings (parallel). 
self.word_embeddings = mpu.VocabParallelEmbedding( @@ -157,7 +156,7 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): # Y_emb = m_emb * embed(X) if self.use_mup: with torch.no_grad(): - embeddings = torch.mul(embeddings, self.mup_m_emb) + embeddings = torch.mul(embeddings, self.mup_embedding_multiplier) return embeddings diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index b7b0bcb8c..50ae34156 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1041,7 +1041,7 @@ class NeoXArgsTraining(NeoXArgsTemplate): Path to the base shapes to save to/load from """ - mup_m_emb: float = 1.0 + mup_embedding_multiplier: float = 1.0 """ Embedding output multiplier """ From 39190c59308e3c1229de96b24831904a63b1c668 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 20 Feb 2024 15:00:58 +0000 Subject: [PATCH 35/94] add multiplier mup_output_multiplier --- megatron/model/transformer.py | 1 + megatron/neox_arguments/neox_args.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index b5cf1754d..2f9004e0e 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -971,6 +971,7 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non if args is not None and args.use_mup: logits_parallel /= args.mup_width_multiplier + logits_parallel *= args.mup_output_multiplier # Gather if needed. if parallel_output: diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 50ae34156..dc134f883 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1046,6 +1046,11 @@ class NeoXArgsTraining(NeoXArgsTemplate): Embedding output multiplier """ + mup_output_multiplier: float = 1.0 + """ + Output logits multiplier + """ + mup_width_multiplier: float = None """ Manually set the layer width multiplier (d_model/d_model,base) From 2489cc062507e652b72dac1edd373188801be534 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 20 Feb 2024 15:31:14 +0000 Subject: [PATCH 36/94] reorder model loading --- megatron/mup_substitute.py | 2 - megatron/training.py | 78 ++++---------------------------------- 2 files changed, 8 insertions(+), 72 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 6b54d904f..bdde503bd 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -11,8 +11,6 @@ import torch.nn.functional as F from megatron import print_rank_0 - -# from mup import coord_check as mup_coord_check from megatron.training import train_step diff --git a/megatron/training.py b/megatron/training.py index d8461c1eb..6807459f6 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -71,59 +71,6 @@ def has_method(o, name): layer.mup_reinitialize_weights(neox_args) -def save_base_shapes(neox_args, base_shapes, use_cache): - - # Instantiation of the base model fails in the init function (init_functions.py) because we haven't called set_base_shapes on it at this point, so disable it temporarily here - neox_args.use_mup = False - - base_model = GPT2ModelPipe( - neox_args=neox_args, - num_tokentypes=0, - parallel_output=True, - topology=mpu.get_topology(), - use_cache=use_cache, - ) - - if not neox_args.is_pipe_parallel: - base_model = base_model.to_sequential() - - try: - import mup - except ModuleNotFoundError: - print("Please install mup https://github.com/microsoft/mup") - raise Exception 
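# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the two forward-pass scalings
# introduced around here, shown on a toy tied-embedding model. The embedding
# output is multiplied by mup_embedding_multiplier and the tied output logits
# are divided by mup_width_multiplier. `TinyLM` and its sizes are
# hypothetical; only the two scalings mirror the patched word_embeddings.py
# and parallel_lm_logits code paths.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyLM(nn.Module):
    def __init__(self, vocab=100, hidden=256, m_emb=10.0, m_width=4.0):
        super().__init__()
        self.embed = nn.Embedding(vocab, hidden)
        self.m_emb, self.m_width = m_emb, m_width

    def forward(self, tokens):
        h = self.embed(tokens) * self.m_emb      # Y_emb = m_emb * embed(X)
        logits = F.linear(h, self.embed.weight)  # tied unembedding
        return logits / self.m_width             # logits scaled by 1 / m_width

logits = TinyLM()(torch.randint(0, 100, (2, 8)))  # shape (2, 8, 100)
# ---------------------------------------------------------------------------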
- - base_shapes = mup.get_shapes(base_model) - - del base_model - - old_hidden_size = neox_args.hidden_size - neox_args.hidden_size = neox_args.hidden_size * neox_args.mup_width_scale - - delta_model = GPT2ModelPipe( - neox_args=neox_args, - num_tokentypes=0, - parallel_output=True, - topology=mpu.get_topology(), - use_cache=use_cache, - ) - - if not neox_args.is_pipe_parallel: - delta_model = delta_model.to_sequential() - - delta_shapes = mup.get_shapes(delta_model) - - # change back - neox_args.use_mup = True - neox_args.hidden_size = old_hidden_size - - save_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" - print(f"saving base shapes at {save_shapes}") - mup.make_base_shapes(base_shapes, delta_shapes, savefile=save_shapes) - print(f"base shapes saved...exiting") - sys.exit(1) - - def mup_coord_check(neox_args, timers, train_data_iterator): from megatron.mup_substitute import get_coord_data # from mup.coord_check import plot_coord_data @@ -200,13 +147,6 @@ def pretrain(neox_args): # Initialize and get arguments, timers, and Tensorboard writer. initialize_megatron(neox_args=neox_args) - # Model, optimizer, and learning rate. - timers("model and optimizer").start() - model, optimizer, lr_scheduler = setup_model_and_optimizer( - neox_args=neox_args, use_cache=False, iteration=neox_args.iteration - ) - timers("model and optimizer").stop() - # Data stuff. timers("train/valid/test data iterators").start() ( @@ -219,9 +159,17 @@ def pretrain(neox_args): if neox_args.use_mup and neox_args.coord_check: print_rank_0("Do muP Coord Check") mup_coord_check(neox_args, timers, train_data_iterator) + sys.exit() else: pass + # Model, optimizer, and learning rate. + timers("model and optimizer").start() + model, optimizer, lr_scheduler = setup_model_and_optimizer( + neox_args=neox_args, use_cache=False, iteration=neox_args.iteration + ) + timers("model and optimizer").stop() + # Print setup timing. 
print_rank_0("done with setups ...") timers.log(["model and optimizer", "train/valid/test data iterators"]) @@ -426,16 +374,6 @@ def get_model(neox_args, use_cache=False): neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base print_rank_0(f"mup_width_multiplier set to {neox_args.mup_width_multiplier}") - # base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" - - # if neox_args.save_base_shapes: - # save_base_shapes(neox_args, base_shapes, use_cache) - - # mup.set_base_shapes(model, base_shapes) - - # Call the mup replacement init functions on the model now that set_base_shapes has given each weight a .infshape attribute - # mup_weights_reinit(neox_args, model) - model = GPT2ModelPipe( neox_args=neox_args, num_tokentypes=0, From 23b877670ac7e388894233ec72b2388c2cacc9ae Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 20 Feb 2024 15:46:25 +0000 Subject: [PATCH 37/94] removed comments --- megatron/training.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 6807459f6..4fa74e010 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -73,7 +73,6 @@ def has_method(o, name): def mup_coord_check(neox_args, timers, train_data_iterator): from megatron.mup_substitute import get_coord_data - # from mup.coord_check import plot_coord_data def lazy_model(hidden_size): def gen(): @@ -92,19 +91,9 @@ def gen(): models = {} - # # Hidden size needs to be divisible by num attention heads - # for hidden_size in (neox_args.num_attention_heads * (2**p) for p in range(2, 9)): - # models[hidden_size] = lazy_model(hidden_size) - - # 128 - # 256 - # 512 - # 1024 - # 2048 - # 4096 - # 8192 - - models[neox_args.hidden_size] = lazy_model(neox_args.hidden_size) + # Hidden size needs to be divisible by num attention heads + for hidden_size in [2**p for p in range(7,14)]: + models[hidden_size] = lazy_model(hidden_size) print_rank_0("df_up") neox_args.use_mup = True From 10e935e9919ed4b50d66d2ea51f76ee7ca5b01ba Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 20 Feb 2024 15:46:56 +0000 Subject: [PATCH 38/94] removed comments --- megatron/training.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 4fa74e010..4fbe9dcd2 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -108,8 +108,6 @@ def gen(): # plot_coord_data(df_up, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") # plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") - - print_rank_0("Saved coord check plots... 
exiting") sys.exit(1) From a0aca99adab69df3874d4c259e255f1290d3d3d6 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 20 Feb 2024 16:48:38 +0000 Subject: [PATCH 39/94] implement full process --- megatron/mup_substitute.py | 80 +++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index bdde503bd..7baa687b9 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -37,16 +37,14 @@ def _get_coord_data( show_progress=True, one_hot_target=False, ): - df = [] - - def word_embedding_coord_check_hook(module, input, output): - with torch.no_grad(): - word_embedding_act_abs_mean_list.append(output.abs().mean().item()) - - word_embedding_act_abs_mean_list = [] - _seeds = [] - _steps = [] - remove_hooks = [] + df = { + "seed": [], + "step": [], + "we_act": [], + "ao_act": [], + "fo_act": [], + "width": [], + } for i in range(nseeds): torch.manual_seed(i) @@ -54,36 +52,31 @@ def word_embedding_coord_check_hook(module, input, output): model = model() model.train() optimizer = optcls(model) - # optimizer, _ = optcls(model, neox_args) for step in range(nsteps + 1): - # add hooks + word_embedding_act_abs_mean_list = [] + attn_output_act_abs_mean_list = [] + ffn_output_act_abs_mean_list = [] + remove_hooks = [] + + def word_embedding_coord_check_hook(module, input, output): + with torch.no_grad(): + word_embedding_act_abs_mean_list.append(output.abs().mean().item()) + + def attn_output_coord_check_hook(module, input, output): + with torch.no_grad(): + attn_output_act_abs_mean_list.append(output[0].abs().mean().item()) + + def ffn_output_coord_check_hook(module, input, output): + with torch.no_grad(): + ffn_output_act_abs_mean_list.append(output[0].abs().mean().item()) + for name, module in model.named_modules(): if name.endswith(".word_embeddings"): remove_hooks.append( - module.register_forward_hook(word_embedding_coord_check_hook)) - - _steps.append(step) - _seeds.append(i) - - - # if filter_module_by_name and not filter_module_by_name(name): - # continue - # pass - # remove_hooks.append( - # module.register_forward_hook( - # mup_coord_check._record_coords( - # df, - # width, - # name, - # step + 1, - # output_fdict=output_fdict, - # input_fdict=input_fdict, - # param_fdict=param_fdict, - # ) - # ) - # ) + module.register_forward_hook(word_embedding_coord_check_hook) + ) # train for a step loss_dict, skipped_iter = train_step( @@ -95,15 +88,30 @@ def word_embedding_coord_check_hook(module, input, output): lr_scheduler=lr_scheduler, ) + word_embedding_act_abs_mean = None + attn_output_act_abs_mean = None + ffn_output_act_abs_mean = None + # remove hooks for handle in remove_hooks: handle.remove() + word_embedding_act_abs_mean = np.mean(word_embedding_act_abs_mean_list) + attn_output_act_abs_mean = np.mean(attn_output_act_abs_mean_list) + ffn_output_act_abs_mean = np.mean(ffn_output_act_abs_mean_list) + + df["seed"].append(i) + df["step"].append(step) + df["we_act"].append(word_embedding_act_abs_mean) + # df["ao_act"].append(attn_output_act_abs_mean) + # df["fo_act"].append(ffn_output_act_abs_mean) + df["width"].append(width) + import gc del model gc.collect() - for _i,_j,_k in zip(_seeds, _steps, word_embedding_act_abs_mean_list): - print_rank_0(_i, _j, _k) + # for _i,_j,_k in zip(_seeds, _steps, word_embedding_act_abs_mean_list): + # print_rank_0(_i, _j, _k) return pd.DataFrame(df) From 9472b35917524ef104c496002987a050dbea4c7e Mon Sep 17 00:00:00 2001 From: lintangsutawika 
Date: Wed, 21 Feb 2024 07:01:43 +0000 Subject: [PATCH 40/94] set neox_args.iteration to 0 for coord_check mode --- megatron/training.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 4fbe9dcd2..b3a0207b7 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -134,17 +134,18 @@ def pretrain(neox_args): # Initialize and get arguments, timers, and Tensorboard writer. initialize_megatron(neox_args=neox_args) - # Data stuff. - timers("train/valid/test data iterators").start() - ( - train_data_iterator, - valid_data_iterator, - test_data_iterator, - ) = build_train_valid_test_data_iterators(neox_args=neox_args) - timers("train/valid/test data iterators").stop() - if neox_args.use_mup and neox_args.coord_check: print_rank_0("Do muP Coord Check") + # Data stuff + neox_args.iteration = 0 + timers("train/valid/test data iterators").start() + ( + train_data_iterator, + valid_data_iterator, + test_data_iterator, + ) = build_train_valid_test_data_iterators(neox_args=neox_args) + timers("train/valid/test data iterators").stop() + mup_coord_check(neox_args, timers, train_data_iterator) sys.exit() else: @@ -157,6 +158,15 @@ def pretrain(neox_args): ) timers("model and optimizer").stop() + # Data stuff. + timers("train/valid/test data iterators").start() + ( + train_data_iterator, + valid_data_iterator, + test_data_iterator, + ) = build_train_valid_test_data_iterators(neox_args=neox_args) + timers("train/valid/test data iterators").stop() + # Print setup timing. print_rank_0("done with setups ...") timers.log(["model and optimizer", "train/valid/test data iterators"]) From 5c5f2df265328b3f510c25e489e5391f36e284e2 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 21 Feb 2024 07:20:18 +0000 Subject: [PATCH 41/94] move mup_width_multiplier init --- megatron/training.py | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index b3a0207b7..4ce2d5904 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -134,22 +134,26 @@ def pretrain(neox_args): # Initialize and get arguments, timers, and Tensorboard writer. initialize_megatron(neox_args=neox_args) - if neox_args.use_mup and neox_args.coord_check: - print_rank_0("Do muP Coord Check") - # Data stuff - neox_args.iteration = 0 - timers("train/valid/test data iterators").start() - ( - train_data_iterator, - valid_data_iterator, - test_data_iterator, - ) = build_train_valid_test_data_iterators(neox_args=neox_args) - timers("train/valid/test data iterators").stop() - - mup_coord_check(neox_args, timers, train_data_iterator) - sys.exit() - else: - pass + if neox_args.use_mup: + + if neox_args.mup_width_multiplier is None: + neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base + print_rank_0(f"mup_width_multiplier set to {neox_args.mup_width_multiplier}") + + if neox_args.coord_check: + print_rank_0("---- Do muP Coord Check ----") + # Data stuff + neox_args.iteration = 0 + timers("train/valid/test data iterators").start() + ( + train_data_iterator, + valid_data_iterator, + test_data_iterator, + ) = build_train_valid_test_data_iterators(neox_args=neox_args) + timers("train/valid/test data iterators").stop() + + mup_coord_check(neox_args, timers, train_data_iterator) + sys.exit() # Model, optimizer, and learning rate. 
timers("model and optimizer").start() @@ -365,11 +369,6 @@ def get_model(neox_args, use_cache=False): # If mup isn't being used anyways, this has no effect. # old_use_mup = neox_args.use_mup # neox_args.use_mup = False - if neox_args.use_mup: - - if neox_args.mup_width_multiplier == 1: - neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base - print_rank_0(f"mup_width_multiplier set to {neox_args.mup_width_multiplier}") model = GPT2ModelPipe( neox_args=neox_args, From 7eca3e7944bd47cbc043f55dc94fa2a318b1d12e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 21 Feb 2024 07:21:37 +0000 Subject: [PATCH 42/94] mup_coord_check returns 2 df --- megatron/training.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 4ce2d5904..c54bf0ef2 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -95,9 +95,9 @@ def gen(): for hidden_size in [2**p for p in range(7,14)]: models[hidden_size] = lazy_model(hidden_size) - print_rank_0("df_up") + print_rank_0("df_mup") neox_args.use_mup = True - df_up = get_coord_data( + df_mup = get_coord_data( neox_args, timers, None, models, train_data_iterator, mup=True, optimizer="adam" ) print_rank_0("df_sp") @@ -109,8 +109,7 @@ def gen(): # plot_coord_data(df_up, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") # plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") print_rank_0("Saved coord check plots... exiting") - sys.exit(1) - + return df_mup, df_sp def pretrain(neox_args): """Main training program. @@ -152,7 +151,7 @@ def pretrain(neox_args): ) = build_train_valid_test_data_iterators(neox_args=neox_args) timers("train/valid/test data iterators").stop() - mup_coord_check(neox_args, timers, train_data_iterator) + df_mup, df_sp = mup_coord_check(neox_args, timers, train_data_iterator) sys.exit() # Model, optimizer, and learning rate. 
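# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): how the width multiplier is
# resolved before model construction in the patches above. When the config
# leaves mup_width_multiplier unset, it defaults to hidden_size divided by the
# proxy model's base width (mup_d_model_base, default 256). The function name
# is hypothetical; the rule itself mirrors the patched setup code.
def resolve_mup_width_multiplier(hidden_size, mup_d_model_base=256,
                                 mup_width_multiplier=None):
    if mup_width_multiplier is None:
        mup_width_multiplier = hidden_size / mup_d_model_base
    return mup_width_multiplier

assert resolve_mup_width_multiplier(1024) == 4.0   # scaled-up model
assert resolve_mup_width_multiplier(256) == 1.0    # proxy (base) width
# ---------------------------------------------------------------------------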
From c9a3a6560f90b533a5683ea54d29366bd452c8d2 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 21 Feb 2024 16:36:05 +0000 Subject: [PATCH 43/94] can run --- megatron/mup_substitute.py | 11 ++++------- megatron/training.py | 21 +++++++++++++-------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 7baa687b9..55edecec1 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -51,6 +51,7 @@ def _get_coord_data( for width, model in models.items(): model = model() model.train() + neox_args.hidden_size = width optimizer = optcls(model) for step in range(nsteps + 1): @@ -208,13 +209,9 @@ def get_coord_data( """ if lr is None: lr = 0.1 if optimizer == "sgd" else 1e-3 - if mup: - # from mup.optim import MuAdam as Adam - # from mup.optim import MuAdamW as AdamW - # from mup.optim import MuSGD as SGD - from deepspeed.ops.adam import FusedAdam as Adam - else: - from torch.optim import SGD, Adam, AdamW + + from torch.optim import SGD, AdamW, Adam + # from deepspeed.ops.adam import FusedAdam as Adam def get_trainable(model): params = model.parameters() diff --git a/megatron/training.py b/megatron/training.py index c54bf0ef2..1396862d0 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -79,7 +79,7 @@ def gen(): old_hidden_size = neox_args.hidden_size neox_args.hidden_size = hidden_size - model, optimizer, lr_scheduler = setup_model_and_optimizer( + model, *_ = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) @@ -92,23 +92,26 @@ def gen(): models = {} # Hidden size needs to be divisible by num attention heads - for hidden_size in [2**p for p in range(7,14)]: + for hidden_size in [2**p for p in range(8,14)]: models[hidden_size] = lazy_model(hidden_size) - print_rank_0("df_mup") + print_rank_0(">>> Coord Check for mu Parameterization") neox_args.use_mup = True df_mup = get_coord_data( neox_args, timers, None, models, train_data_iterator, mup=True, optimizer="adam" ) - print_rank_0("df_sp") + print_rank_0(">>> Coord Check for standard Parameterization") neox_args.use_mup = False df_sp = get_coord_data( neox_args, timers, None, models, train_data_iterator, mup=False, optimizer="adam" ) - # plot_coord_data(df_up, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") + print_rank_0(df_mup) + # plot_coord_data(df_mup, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") # plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") print_rank_0("Saved coord check plots... 
exiting") + + import sys; sys.exit() return df_mup, df_sp def pretrain(neox_args): @@ -492,8 +495,9 @@ def get_optimizer(model, neox_args): # except ModuleNotFoundError: # print("Please install mup https://github.com/microsoft/mup") # raise Exception - from deepspeed.ops.adam import FusedAdam as Adam - adam_optimizer = Adam + # from deepspeed.ops.adam import FusedAdam as Adam + # adam_optimizer = Adam + adam_optimizer = torch.optim.Adam else: if neox_args.use_bnb_optimizer: try: @@ -514,7 +518,8 @@ def get_optimizer(model, neox_args): print( "WARNING: APEX not installed - defaulting to deepspeed's fused adam" ) - from deepspeed.ops.adam import FusedAdam as Adam + # from deepspeed.ops.adam import FusedAdam as Adam + from torch.optim import Adam adam_optimizer = Adam optimizer = adam_optimizer( param_groups, From a7877d4fb07699406f97de8a1cf7a68b351b9dc7 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 06:31:03 +0000 Subject: [PATCH 44/94] remove commehts --- megatron/mup_substitute.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 55edecec1..7c9e0534b 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -21,7 +21,7 @@ def _get_coord_data( models, dataloader, optcls, - nsteps=3, + nsteps=10, dict_in_out=False, flatten_input=False, flatten_output=False, @@ -30,7 +30,7 @@ def _get_coord_data( filter_module_by_name=None, fix_data=True, cuda=True, - nseeds=3, + nseeds=1, output_fdict=None, input_fdict=None, param_fdict=None, @@ -111,9 +111,6 @@ def ffn_output_coord_check_hook(module, input, output): del model gc.collect() - # for _i,_j,_k in zip(_seeds, _steps, word_embedding_act_abs_mean_list): - # print_rank_0(_i, _j, _k) - return pd.DataFrame(df) From bd9d399f61e25ac098fc2243e10b1b226e3d1ec7 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 06:45:02 +0000 Subject: [PATCH 45/94] add hooks --- megatron/mup_substitute.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 7c9e0534b..ebf98d64d 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -74,13 +74,22 @@ def ffn_output_coord_check_hook(module, input, output): ffn_output_act_abs_mean_list.append(output[0].abs().mean().item()) for name, module in model.named_modules(): + print_rank_0(name) if name.endswith(".word_embeddings"): remove_hooks.append( module.register_forward_hook(word_embedding_coord_check_hook) ) + elif name.endswith(".attention.dense"): + remove_hooks.append( + module.register_forward_hook(attn_output_coord_check_hook) + ) + elif name.endswith(".mlp.dense_4h_to_h"): + remove_hooks.append( + module.register_forward_hook(ffn_output_coord_check_hook) + ) # train for a step - loss_dict, skipped_iter = train_step( + train_step( neox_args=neox_args, timers=timers, data_iterator=dataloader, From fe180d3679053781e9a2c5b099407c4428ccddae Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 06:45:15 +0000 Subject: [PATCH 46/94] remove comments --- megatron/mup_substitute.py | 1 - 1 file changed, 1 deletion(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index ebf98d64d..44e3e1d66 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -74,7 +74,6 @@ def ffn_output_coord_check_hook(module, input, output): ffn_output_act_abs_mean_list.append(output[0].abs().mean().item()) for name, module in model.named_modules(): - 
print_rank_0(name) if name.endswith(".word_embeddings"): remove_hooks.append( module.register_forward_hook(word_embedding_coord_check_hook) From b240c19826825dfa7d1cff9f0760fe7bd1ba3acf Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 06:45:46 +0000 Subject: [PATCH 47/94] uncomment activation data --- megatron/mup_substitute.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 44e3e1d66..673ef40b1 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -111,8 +111,8 @@ def ffn_output_coord_check_hook(module, input, output): df["seed"].append(i) df["step"].append(step) df["we_act"].append(word_embedding_act_abs_mean) - # df["ao_act"].append(attn_output_act_abs_mean) - # df["fo_act"].append(ffn_output_act_abs_mean) + df["ao_act"].append(attn_output_act_abs_mean) + df["fo_act"].append(ffn_output_act_abs_mean) df["width"].append(width) import gc From 93b424165be4dd49731bf9507396ac23b9d3db2f Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 09:56:25 +0000 Subject: [PATCH 48/94] plot coords --- megatron/mup_substitute.py | 14 ++++++++------ megatron/training.py | 36 +++++++++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 673ef40b1..5c369ca29 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -30,7 +30,7 @@ def _get_coord_data( filter_module_by_name=None, fix_data=True, cuda=True, - nseeds=1, + nseeds=2, output_fdict=None, input_fdict=None, param_fdict=None, @@ -46,10 +46,11 @@ def _get_coord_data( "width": [], } - for i in range(nseeds): - torch.manual_seed(i) - for width, model in models.items(): - model = model() + for width, model_obj in models.items(): + for i in range(nseeds): + torch.manual_seed(10**i) + print_rank_0(f">>> Running Model with width: {width} on seed: {i}") + model = model_obj() model.train() neox_args.hidden_size = width optimizer = optcls(model) @@ -116,8 +117,9 @@ def ffn_output_coord_check_hook(module, input, output): df["width"].append(width) import gc - del model + del model, optimizer gc.collect() + torch.cuda.empty_cache() return pd.DataFrame(df) diff --git a/megatron/training.py b/megatron/training.py index 1396862d0..f3b5e0fc8 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -57,6 +57,31 @@ from megatron.model.gpt2_model import cross_entropy # from eval_tasks import run_eval_harness +import seaborn as sns +import matplotlib.pyplot as plt + + +def plot_coord_data(df, activation, graph_name): + + """If distributed is initialized print only on rank 0.""" + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + _plot_data(df, activation, graph_name) + else: + _plot_data(df, activation, graph_name) + + def _plot_data(df, activation, graph_name): + df = df.groupby(['step', 'width']).mean().reset_index() + sns.lineplot( + data=df, + x="width", y=activation, hue="step", errorbar=None, style="step", + marker="o", dashes=False, legend='full' + ) + plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0) + plt.savefig(f"{graph_name}.png") + return 0 + + return 0 def mup_weights_reinit(neox_args, model): def has_method(o, name): @@ -92,7 +117,7 @@ def gen(): models = {} # Hidden size needs to be divisible by num attention heads - for hidden_size in [2**p for p in range(8,14)]: + for hidden_size in [2**p for p in range(8,11)]: models[hidden_size] = 
lazy_model(hidden_size) print_rank_0(">>> Coord Check for mu Parameterization") @@ -106,12 +131,13 @@ def gen(): neox_args, timers, None, models, train_data_iterator, mup=False, optimizer="adam" ) - print_rank_0(df_mup) - # plot_coord_data(df_mup, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") - # plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") + df_mup.to_csv("df_mup.csv", index=False) + df_sp.to_csv("df_sp.csv", index=False) + for activation in ["we_act", "ao_act", "fo_act"]: + plot_coord_data(df_mup, activation, graph_name=f"coord_check_up.{activation}.jpg") + plot_coord_data(df_sp, activation, graph_name=f"coord_check_sp.{activation}.jpg") print_rank_0("Saved coord check plots... exiting") - import sys; sys.exit() return df_mup, df_sp def pretrain(neox_args): From d4899fc2d384ccb7d77f4ed2e96896baa400d2ce Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 13:14:42 +0000 Subject: [PATCH 49/94] removed variables, add way to plot only from rank 0 --- megatron/mup_substitute.py | 4 ++-- megatron/training.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 5c369ca29..a8534baae 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -241,6 +241,6 @@ def get_trainable(model): data = _get_coord_data( neox_args, timers, lr_scheduler, models, dataloader, optcls, **kwargs ) - data["optimizer"] = optimizer - data["lr"] = lr + # data["optimizer"] = optimizer + # data["lr"] = lr return data diff --git a/megatron/training.py b/megatron/training.py index f3b5e0fc8..b15d0b811 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -63,13 +63,6 @@ def plot_coord_data(df, activation, graph_name): - """If distributed is initialized print only on rank 0.""" - if torch.distributed.is_initialized(): - if torch.distributed.get_rank() == 0: - _plot_data(df, activation, graph_name) - else: - _plot_data(df, activation, graph_name) - def _plot_data(df, activation, graph_name): df = df.groupby(['step', 'width']).mean().reset_index() sns.lineplot( @@ -81,6 +74,13 @@ def _plot_data(df, activation, graph_name): plt.savefig(f"{graph_name}.png") return 0 + """If distributed is initialized print only on rank 0.""" + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + _plot_data(df, activation, graph_name) + else: + _plot_data(df, activation, graph_name) + return 0 def mup_weights_reinit(neox_args, model): @@ -134,8 +134,8 @@ def gen(): df_mup.to_csv("df_mup.csv", index=False) df_sp.to_csv("df_sp.csv", index=False) for activation in ["we_act", "ao_act", "fo_act"]: - plot_coord_data(df_mup, activation, graph_name=f"coord_check_up.{activation}.jpg") - plot_coord_data(df_sp, activation, graph_name=f"coord_check_sp.{activation}.jpg") + plot_coord_data(df_mup, activation, graph_name=f"coord_check_mup-{activation}") + plot_coord_data(df_sp, activation, graph_name=f"coord_check_sp-{activation}") print_rank_0("Saved coord check plots... 
exiting") return df_mup, df_sp From f589e29d6a3a7354c1761cf8d29e197719f0e53e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 13:48:27 +0000 Subject: [PATCH 50/94] changed key name in dict --- megatron/mup_substitute.py | 30 +++++++++++++++++++++++------- megatron/training.py | 29 ++++++++++++++++++++--------- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index a8534baae..fb9511d56 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -40,9 +40,10 @@ def _get_coord_data( df = { "seed": [], "step": [], - "we_act": [], - "ao_act": [], - "fo_act": [], + "word_embedding_act_abs_mean": [], + "attn_output_act_abs_mean": [], + "ffn_output_act_abs_mean": [], + "output_logits_act_abs_mean": [], "width": [], } @@ -60,6 +61,7 @@ def _get_coord_data( word_embedding_act_abs_mean_list = [] attn_output_act_abs_mean_list = [] ffn_output_act_abs_mean_list = [] + output_logits_act_abs_mean_list = [] remove_hooks = [] def word_embedding_coord_check_hook(module, input, output): @@ -74,7 +76,14 @@ def ffn_output_coord_check_hook(module, input, output): with torch.no_grad(): ffn_output_act_abs_mean_list.append(output[0].abs().mean().item()) + def output_logits_coord_check_hook(module, input, output): + with torch.no_grad(): + # print("output_logits_coord_check_hook") + # print_rank_0(output.shape) + output_logits_act_abs_mean_list.append(output[0].abs().mean().item()) + for name, module in model.named_modules(): + print_rank_0(name) if name.endswith(".word_embeddings"): remove_hooks.append( module.register_forward_hook(word_embedding_coord_check_hook) @@ -87,6 +96,10 @@ def ffn_output_coord_check_hook(module, input, output): remove_hooks.append( module.register_forward_hook(ffn_output_coord_check_hook) ) + elif name.endswith(".final_linear"): + remove_hooks.append( + module.register_forward_hook(output_logits_coord_check_hook) + ) # train for a step train_step( @@ -101,6 +114,7 @@ def ffn_output_coord_check_hook(module, input, output): word_embedding_act_abs_mean = None attn_output_act_abs_mean = None ffn_output_act_abs_mean = None + output_logits_act_abs_mean = None # remove hooks for handle in remove_hooks: @@ -108,12 +122,14 @@ def ffn_output_coord_check_hook(module, input, output): word_embedding_act_abs_mean = np.mean(word_embedding_act_abs_mean_list) attn_output_act_abs_mean = np.mean(attn_output_act_abs_mean_list) ffn_output_act_abs_mean = np.mean(ffn_output_act_abs_mean_list) + output_logits_act_abs_mean = np.mean(output_logits_act_abs_mean_list) df["seed"].append(i) - df["step"].append(step) - df["we_act"].append(word_embedding_act_abs_mean) - df["ao_act"].append(attn_output_act_abs_mean) - df["fo_act"].append(ffn_output_act_abs_mean) + df["step"].append(f"t={step}") + df["word_embedding_act_abs_mean"].append(word_embedding_act_abs_mean) + df["attn_output_act_abs_mean"].append(attn_output_act_abs_mean) + df["ffn_output_act_abs_mean"].append(ffn_output_act_abs_mean) + df["output_logits_act_abs_mean"].append(output_logits_act_abs_mean) df["width"].append(width) import gc diff --git a/megatron/training.py b/megatron/training.py index b15d0b811..15f16a56c 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -61,9 +61,9 @@ import matplotlib.pyplot as plt -def plot_coord_data(df, activation, graph_name): +def plot_coord_data(df, graph_name_prefix, mup=True): - def _plot_data(df, activation, graph_name): + def _plot_data(df, activation, graph_name_prefix): df = df.groupby(['step', 
'width']).mean().reset_index() sns.lineplot( data=df, @@ -71,15 +71,28 @@ def _plot_data(df, activation, graph_name): marker="o", dashes=False, legend='full' ) plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0) - plt.savefig(f"{graph_name}.png") + plt.xlabel("Width") + plt.ylabel("Activation with {}".format("muP" if mup else "SP")) + plt.title(f"{activation}") + plt.savefig(f"{graph_name_prefix}-{activation}.png") + plt.close() + return 0 + activation_list = [ + "word_embedding_act_abs_mean", + "attn_output_act_abs_mean", + "ffn_output_act_abs_mean", + "output_logits_act_abs_mean", + ] """If distributed is initialized print only on rank 0.""" if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: - _plot_data(df, activation, graph_name) + for activation in activation_list: + _plot_data(df, activation, graph_name_prefix) else: - _plot_data(df, activation, graph_name) + for activation in activation_list: + _plot_data(df, activation, graph_name_prefix) return 0 @@ -115,7 +128,6 @@ def gen(): return gen models = {} - # Hidden size needs to be divisible by num attention heads for hidden_size in [2**p for p in range(8,11)]: models[hidden_size] = lazy_model(hidden_size) @@ -133,9 +145,8 @@ def gen(): df_mup.to_csv("df_mup.csv", index=False) df_sp.to_csv("df_sp.csv", index=False) - for activation in ["we_act", "ao_act", "fo_act"]: - plot_coord_data(df_mup, activation, graph_name=f"coord_check_mup-{activation}") - plot_coord_data(df_sp, activation, graph_name=f"coord_check_sp-{activation}") + plot_coord_data(df_mup, graph_name_prefix=f"coord_check_mup", mup=True) + plot_coord_data(df_sp, graph_name_prefix=f"coord_check_sp", mup=False) print_rank_0("Saved coord check plots... exiting") return df_mup, df_sp From 8261e0dc35fd679676e360eb8504619ceae0f86a Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 13:50:51 +0000 Subject: [PATCH 51/94] remove print --- megatron/mup_substitute.py | 1 - 1 file changed, 1 deletion(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index fb9511d56..8cf29973d 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -83,7 +83,6 @@ def output_logits_coord_check_hook(module, input, output): output_logits_act_abs_mean_list.append(output[0].abs().mean().item()) for name, module in model.named_modules(): - print_rank_0(name) if name.endswith(".word_embeddings"): remove_hooks.append( module.register_forward_hook(word_embedding_coord_check_hook) From 25aa786c334384daec9bb3b4cada2f5bb26b615e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 14:49:48 +0000 Subject: [PATCH 52/94] fix how width_multiplier is applied --- megatron/mup_substitute.py | 1 + megatron/training.py | 41 +++++++++++++++++++------------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 8cf29973d..a8931e249 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -253,6 +253,7 @@ def get_trainable(model): elif optimizer is None: raise ValueError("optimizer should be sgd|adam|adamw or a custom function") + neox_args.use_mup = mup data = _get_coord_data( neox_args, timers, lr_scheduler, models, dataloader, optcls, **kwargs ) diff --git a/megatron/training.py b/megatron/training.py index 15f16a56c..d8b5e779f 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -116,6 +116,7 @@ def lazy_model(hidden_size): def gen(): old_hidden_size = neox_args.hidden_size neox_args.hidden_size = 
hidden_size + neox_args.mup_width_multiplier = None model, *_ = setup_model_and_optimizer( neox_args=neox_args, use_cache=False @@ -129,7 +130,7 @@ def gen(): models = {} # Hidden size needs to be divisible by num attention heads - for hidden_size in [2**p for p in range(8,11)]: + for hidden_size in [2**p for p in range(7,11)]: models[hidden_size] = lazy_model(hidden_size) print_rank_0(">>> Coord Check for mu Parameterization") @@ -173,26 +174,20 @@ def pretrain(neox_args): # Initialize and get arguments, timers, and Tensorboard writer. initialize_megatron(neox_args=neox_args) - if neox_args.use_mup: - - if neox_args.mup_width_multiplier is None: - neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base - print_rank_0(f"mup_width_multiplier set to {neox_args.mup_width_multiplier}") - - if neox_args.coord_check: - print_rank_0("---- Do muP Coord Check ----") - # Data stuff - neox_args.iteration = 0 - timers("train/valid/test data iterators").start() - ( - train_data_iterator, - valid_data_iterator, - test_data_iterator, - ) = build_train_valid_test_data_iterators(neox_args=neox_args) - timers("train/valid/test data iterators").stop() - - df_mup, df_sp = mup_coord_check(neox_args, timers, train_data_iterator) - sys.exit() + if neox_args.use_mup and neox_args.coord_check: + print_rank_0("---- Do muP Coord Check ----") + # Data stuff + neox_args.iteration = 0 + timers("train/valid/test data iterators").start() + ( + train_data_iterator, + valid_data_iterator, + test_data_iterator, + ) = build_train_valid_test_data_iterators(neox_args=neox_args) + timers("train/valid/test data iterators").stop() + + df_mup, df_sp = mup_coord_check(neox_args, timers, train_data_iterator) + sys.exit() # Model, optimizer, and learning rate. timers("model and optimizer").start() @@ -623,6 +618,10 @@ def get_learning_rate_scheduler(optimizer, neox_args): def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): """Setup model and optimizer.""" + if neox_args.mup_width_multiplier is None: + neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base + print_rank_0(f"mup_width_multiplier set to {neox_args.mup_width_multiplier}") + model = get_model(neox_args=neox_args, use_cache=use_cache) optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) From 4d246a15c1dc6fe504a9de5c50d9f9e99e0a3cac Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 15:54:51 +0000 Subject: [PATCH 53/94] updated plot config --- megatron/mup_substitute.py | 5 +++-- megatron/training.py | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index a8931e249..7cd5cb1ea 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -124,7 +124,7 @@ def output_logits_coord_check_hook(module, input, output): output_logits_act_abs_mean = np.mean(output_logits_act_abs_mean_list) df["seed"].append(i) - df["step"].append(f"t={step}") + df["step"].append(step) df["word_embedding_act_abs_mean"].append(word_embedding_act_abs_mean) df["attn_output_act_abs_mean"].append(attn_output_act_abs_mean) df["ffn_output_act_abs_mean"].append(ffn_output_act_abs_mean) @@ -134,7 +134,8 @@ def output_logits_coord_check_hook(module, input, output): import gc del model, optimizer gc.collect() - torch.cuda.empty_cache() + with torch.no_grad(): + torch.cuda.empty_cache() return pd.DataFrame(df) diff --git 
a/megatron/training.py b/megatron/training.py index d8b5e779f..c5afd7a94 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -65,12 +65,14 @@ def plot_coord_data(df, graph_name_prefix, mup=True): def _plot_data(df, activation, graph_name_prefix): df = df.groupby(['step', 'width']).mean().reset_index() + sns.color_palette("magma") sns.lineplot( data=df, x="width", y=activation, hue="step", errorbar=None, style="step", marker="o", dashes=False, legend='full' ) plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0) + plt.tight_layout() plt.xlabel("Width") plt.ylabel("Activation with {}".format("muP" if mup else "SP")) plt.title(f"{activation}") From 84c5380c1989c1ef90ec9c3915cd17217fd1d895 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 26 Feb 2024 15:39:57 +0000 Subject: [PATCH 54/94] update files --- configs/coord_check.yml | 113 +++++++++++++++++++++++++++++++++++++ megatron/mup_substitute.py | 8 ++- 2 files changed, 119 insertions(+), 2 deletions(-) create mode 100644 configs/coord_check.yml diff --git a/configs/coord_check.yml b/configs/coord_check.yml new file mode 100644 index 000000000..299eab290 --- /dev/null +++ b/configs/coord_check.yml @@ -0,0 +1,113 @@ +{ + # parallelism settings + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 8, + "num_attention_heads": 8, + "seq_length": 128, + "max_position_embeddings": 128, + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + # "attention_config": [[["flash"], 8]], + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + + # init methods + "init_method": "normal", + "output_layer_init_method": "scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.006, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + # "min_lr": 0.006, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "gradient_accumulation_steps": 8, + "data_impl": "mmap", + "num_workers": 1, + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.0, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "precision": "fp32", + # "fp16": { + # "fp16": true, + # "enabled": true, + # "loss_scale": 0, + # "loss_scale_window": 1000, + # "initial_scale_power": 12, + # "hysteresis": 2, + # "min_loss_scale": 1, + # }, + + # misc. 
training settings + "train_iters": 300, + "lr_decay_iters": 300, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 300, + "eval_interval": 300, + "eval_iters": 10, + + # logging + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + "tokenizer_type": "HFTokenizer", + "vocab-file": "/weka/lintangsutawika/09-mup-neox/20B_tokenizer.json", + + "coord_check": true, + "use_mup": true, + # sigma_base + "init_method_std": 0.08, + # "mup_embedding_multiplier": 5, + # "mup_output_multiplier": 1, + # "mup_width_multiplier": 1, + "mup_d_model_base": 128, + "hidden_size": 128, + + "data-path": "/weka/lintangsutawika/09-mup-neox/data/enwik8/enwik8_text_document", + + # "launcher": "slurm", + # "deepspeed_slurm": true, + +} diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 7cd5cb1ea..a01a9bec4 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -2,6 +2,7 @@ Helper functions for performing coord check. """ import os +import gc from copy import copy from itertools import product @@ -30,7 +31,7 @@ def _get_coord_data( filter_module_by_name=None, fix_data=True, cuda=True, - nseeds=2, + nseeds=10, output_fdict=None, input_fdict=None, param_fdict=None, @@ -131,12 +132,15 @@ def output_logits_coord_check_hook(module, input, output): df["output_logits_act_abs_mean"].append(output_logits_act_abs_mean) df["width"].append(width) - import gc del model, optimizer gc.collect() with torch.no_grad(): torch.cuda.empty_cache() + gc.collect() + with torch.no_grad(): + torch.cuda.empty_cache() + return pd.DataFrame(df) From 42d4cdea9b1833df9c175dd6ac28d970bd0ec11f Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 26 Feb 2024 15:47:02 +0000 Subject: [PATCH 55/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 90 +++++++++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 23 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index e74993c0c..d79034a14 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -7,7 +7,7 @@ LR Scheduler Arguments -- **lr_decay_style**: typing.Literal['constant', 'linear', 'cosine', 'exponential'] +- **lr_decay_style**: Literal Default = linear @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 211e726 + Default = b2f1101 current git hash of repository @@ -253,7 +253,7 @@ Model Arguments -- **precision**: typing.Literal['fp16', 'fp32', 'bfloat16'] +- **precision**: Literal Default = None @@ -274,6 +274,17 @@ Model Arguments Default = None Transformer hidden size. + When using muP, this is d_model + + + +- **intermediate_size**: int + + Default = None + + Transformer intermediate size. Currently only used for "mlp_type": "llama". + + If not passed, will be set to a reasonable default. @@ -283,6 +294,22 @@ Model Arguments Number of transformer attention heads. + If num_kv_heads is set, will control only number of query heads. + + + +- **num_kv_heads**: int + + Default = None + + Number of transformer key/value attention heads. + + If set to None or the same value as num_attention_heads, will perform multi-head attention (MHA). + If set to < num_attention_heads but > 1, will perform grouped-query attention (GQA) (https://arxiv.org/pdf/2305.13245.pdf) + If set to 1, will perform multi-query attention. + + Must be < num_attention_heads and divide num_attention_heads evenly. 
+ - **seq_length**: int @@ -293,6 +320,14 @@ Model Arguments +- **sliding_window_width**: int + + Default = None + + Width of the attention sliding window. Only supported with Flash Attention 2. + + + - **max_position_embeddings**: int Default = None @@ -301,7 +336,7 @@ Model Arguments -- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm'] +- **norm**: Literal Default = layernorm @@ -349,7 +384,7 @@ Model Arguments -- **pos_emb**: typing.Literal['learned', 'rotary', 'sinusoidal', 'rpe', 'alibi', 'none'] +- **pos_emb**: Literal Default = learned @@ -463,7 +498,7 @@ Model Arguments -- **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish', 'silu'] +- **activation**: Literal Default = gelu @@ -568,7 +603,7 @@ Model Arguments -- **init_method**: typing.Literal['normal', 'scaled_normal', 'orthogonal', 'scaled_orthogonal', 'xavier_uniform', 'xavier_normal', 'wang_init', 'small_init'] +- **init_method**: Literal Default = normal @@ -577,7 +612,7 @@ Model Arguments -- **output_layer_init_method**: typing.Literal['normal', 'scaled_normal', 'orthogonal', 'scaled_orthogonal', 'xavier_uniform', 'xavier_normal', 'wang_init', 'small_init'] +- **output_layer_init_method**: Literal Default = scaled_normal @@ -660,7 +695,7 @@ Model Arguments -- **output_layer_parallelism**: typing.Literal['column'] +- **output_layer_parallelism**: Literal Default = column @@ -674,7 +709,7 @@ Optimizer Arguments -- **optimizer_type**: typing.Literal['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3', 'madgrad_wd', 'sgd', 'lion'] +- **optimizer_type**: Literal Default = adam @@ -691,7 +726,7 @@ Optimizer Arguments -- **zero_stage**: typing.Union[int, typing.List[int], typing.Literal['all']] +- **zero_stage**: Union Default = None @@ -736,7 +771,7 @@ Optimizer Arguments Default = None Max Learning rate during training - When using muP, this is the base lr + When using muP, this is the base learning rate @@ -1026,7 +1061,7 @@ Text Generation arguments - **prompt_end**: str - Default = + Default = a single prompt's end. Defaults to newline @@ -1068,7 +1103,7 @@ Text Generation arguments - **eval_results_prefix**: str - Default = + Default = prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json @@ -1090,7 +1125,7 @@ Tokenizer Arguments -- **tokenizer_type**: typing.Literal['GPT2BPETokenizer', 'HFTokenizer', 'HFGPT2Tokenizer', 'SPMTokenizer', 'CharLevelTokenizer', 'TiktokenTokenizer'] +- **tokenizer_type**: Literal Default = GPT2BPETokenizer @@ -1221,7 +1256,7 @@ Training Arguments -- **data_impl**: typing.Literal['infer', 'mmap', 'cached'] +- **data_impl**: Literal Default = infer @@ -1285,7 +1320,7 @@ Training Arguments -- **checkpoint_scale**: typing.Literal['linear', 'log'] +- **checkpoint_scale**: Literal Default = linear @@ -1617,17 +1652,25 @@ Training Arguments -- **mup_embedding_multiplier**: int +- **mup_embedding_multiplier**: float - Default = 1 + Default = 1.0 Embedding output multiplier -- **mup_width_multiplier**: int +- **mup_output_multiplier**: float - Default = 1 + Default = 1.0 + + Output logits multiplier + + + +- **mup_width_multiplier**: float + + Default = None Manually set the layer width multiplier (d_model/d_model,base) @@ -1788,7 +1831,7 @@ Args for deepspeed config Default = None - + @@ -2035,7 +2078,7 @@ Args for deepspeed runner (deepspeed.launcher.runner). 
-- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm'] +- **launcher**: Literal Default = pdsh @@ -2088,3 +2131,4 @@ Args for deepspeed runner (deepspeed.launcher.runner). Default = None Adds a `--account` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard. + From 4c477d51a4b563375b1a7567e853b641ff1fff21 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 02:50:27 +0000 Subject: [PATCH 56/94] init function, add input embedding different initialization --- megatron/model/gpt2_model.py | 4 ++-- megatron/model/init_functions.py | 26 ++++++++++++++++---------- megatron/mup_substitute.py | 4 +++- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index f3ccbdf6b..652004c8b 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -118,7 +118,7 @@ def __init__( self.parallel_output = parallel_output self.hidden_size = self.neox_args.hidden_size self.num_tokentypes = num_tokentypes - self.init_method, self.output_layer_init_method = get_init_methods( + self.init_method, self.input_embedding_init_method, self.output_layer_init_method = get_init_methods( self.neox_args ) self.__topology__ = topology @@ -188,7 +188,7 @@ def init_specs(self): self.neox_args.padded_vocab_size, self.neox_args.max_position_embeddings, self.neox_args.hidden_dropout, - self.init_method, + self.input_embedding_init_method, self.num_tokentypes, tied_weight_attr="word_embeddings_weight", ) diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index fc4d23ea6..7554a7b94 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -142,35 +142,41 @@ def init_(tensor, mup_width_multiplier=mup_width_multiplier): def get_init_methods(args): - def _get(name): + def _get(name, use_mup=False): if name == "normal": + sigma = args.init_method_std + if use_mup: + sigma = sigma/math.sqrt(args.mup_width_multiplier) return init_method_normal( - sigma=args.init_method_std/math.sqrt(args.mup_width_multiplier) + sigma=sigma, ) elif name == "scaled_normal": + sigma = args.init_method_std + if use_mup: + sigma = sigma/math.sqrt(args.mup_width_multiplier) return scaled_init_method_normal( - sigma=args.init_method_std/math.sqrt(args.mup_width_multiplier), + sigma=sigma, num_layers=args.num_layers ) elif name == "orthogonal": - return orthogonal_init_method(args.mup_width_multiplier) + return orthogonal_init_method(args.mup_width_multiplier if use_mup else 1.0) elif name == "scaled_orthogonal": return orthogonal_init_method( - args.num_layers, args.mup_width_multiplier + args.num_layers, args.mup_width_multiplier if use_mup else 1.0 ) elif name == "xavier_uniform": - return xavier_uniform_init_method(args.mup_width_multiplier) + return xavier_uniform_init_method(args.mup_width_multiplier if use_mup else 1.0) elif name == "xavier_normal": - return xavier_normal_init_method(args.mup_width_multiplier) + return xavier_normal_init_method(args.mup_width_multiplier if use_mup else 1.0) elif name == "wang_init": return wang_init_method( - args.num_layers, args.hidden_size, args.mup_width_multiplier + args.num_layers, args.hidden_size, args.mup_width_multiplier if use_mup else 1.0 ) elif name == "small_init": return small_init_init_method( - args.hidden_size, args.mup_width_multiplier + args.hidden_size, args.mup_width_multiplier if use_mup else 1.0 ) else: raise 
NotImplementedError(f"Unknown init method {name}") - return _get(args.init_method), _get(args.output_layer_init_method) + return _get(args.init_method, use_mup=args.use_mup), _get(args.init_method), _get(args.output_layer_init_method) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index a01a9bec4..520266b78 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -31,7 +31,7 @@ def _get_coord_data( filter_module_by_name=None, fix_data=True, cuda=True, - nseeds=10, + nseeds=2, output_fdict=None, input_fdict=None, param_fdict=None, @@ -47,6 +47,8 @@ def _get_coord_data( "output_logits_act_abs_mean": [], "width": [], } + with torch.no_grad(): + torch.cuda.empty_cache() for width, model_obj in models.items(): for i in range(nseeds): From 65c103e5676a58ff4f1d2d35122c0ef45e6bd740 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 15:16:34 +0000 Subject: [PATCH 57/94] changeoutput layer to normal --- configs/coord_check.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/coord_check.yml b/configs/coord_check.yml index 299eab290..05e7b6bb8 100644 --- a/configs/coord_check.yml +++ b/configs/coord_check.yml @@ -22,7 +22,7 @@ # init methods "init_method": "normal", - "output_layer_init_method": "scaled_normal", + "output_layer_init_method": "normal", # optimizer settings "optimizer": { @@ -48,8 +48,8 @@ }, # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "gradient_accumulation_steps": 8, + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 32, "data_impl": "mmap", "num_workers": 1, From 08b5d40a7c7bc115d5f45b3f3b97cc6b40b6f1cf Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 15:17:41 +0000 Subject: [PATCH 58/94] change from mean to std --- megatron/mup_substitute.py | 52 +++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 520266b78..87cff559d 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -31,7 +31,7 @@ def _get_coord_data( filter_module_by_name=None, fix_data=True, cuda=True, - nseeds=2, + nseeds=10, output_fdict=None, input_fdict=None, param_fdict=None, @@ -41,10 +41,10 @@ def _get_coord_data( df = { "seed": [], "step": [], - "word_embedding_act_abs_mean": [], - "attn_output_act_abs_mean": [], - "ffn_output_act_abs_mean": [], - "output_logits_act_abs_mean": [], + "word_embedding_act_abs_std": [], + "attn_output_act_abs_std": [], + "ffn_output_act_abs_std": [], + "output_logits_act_abs_std": [], "width": [], } with torch.no_grad(): @@ -53,7 +53,7 @@ def _get_coord_data( for width, model_obj in models.items(): for i in range(nseeds): torch.manual_seed(10**i) - print_rank_0(f">>> Running Model with width: {width} on seed: {i}") + print_rank_0(f">>> Running Model with width: {width} on seed: {i}\n") model = model_obj() model.train() neox_args.hidden_size = width @@ -61,29 +61,29 @@ def _get_coord_data( for step in range(nsteps + 1): - word_embedding_act_abs_mean_list = [] - attn_output_act_abs_mean_list = [] - ffn_output_act_abs_mean_list = [] - output_logits_act_abs_mean_list = [] + word_embedding_act_abs_std_list = [] + attn_output_act_abs_std_list = [] + ffn_output_act_abs_std_list = [] + output_logits_act_abs_std_list = [] remove_hooks = [] def word_embedding_coord_check_hook(module, input, output): with torch.no_grad(): - word_embedding_act_abs_mean_list.append(output.abs().mean().item()) + 
word_embedding_act_abs_std_list.append(output.abs().std().item()) def attn_output_coord_check_hook(module, input, output): with torch.no_grad(): - attn_output_act_abs_mean_list.append(output[0].abs().mean().item()) + attn_output_act_abs_std_list.append(output[0].abs().std().item()) def ffn_output_coord_check_hook(module, input, output): with torch.no_grad(): - ffn_output_act_abs_mean_list.append(output[0].abs().mean().item()) + ffn_output_act_abs_std_list.append(output[0].abs().std().item()) def output_logits_coord_check_hook(module, input, output): with torch.no_grad(): # print("output_logits_coord_check_hook") # print_rank_0(output.shape) - output_logits_act_abs_mean_list.append(output[0].abs().mean().item()) + output_logits_act_abs_std_list.append(output[0].abs().std().item()) for name, module in model.named_modules(): if name.endswith(".word_embeddings"): @@ -113,25 +113,25 @@ def output_logits_coord_check_hook(module, input, output): lr_scheduler=lr_scheduler, ) - word_embedding_act_abs_mean = None - attn_output_act_abs_mean = None - ffn_output_act_abs_mean = None - output_logits_act_abs_mean = None + word_embedding_act_abs_std = None + attn_output_act_abs_std = None + ffn_output_act_abs_std = None + output_logits_act_abs_std = None # remove hooks for handle in remove_hooks: handle.remove() - word_embedding_act_abs_mean = np.mean(word_embedding_act_abs_mean_list) - attn_output_act_abs_mean = np.mean(attn_output_act_abs_mean_list) - ffn_output_act_abs_mean = np.mean(ffn_output_act_abs_mean_list) - output_logits_act_abs_mean = np.mean(output_logits_act_abs_mean_list) + word_embedding_act_abs_std = np.mean(word_embedding_act_abs_std_list) + attn_output_act_abs_std = np.mean(attn_output_act_abs_std_list) + ffn_output_act_abs_std = np.mean(ffn_output_act_abs_std_list) + output_logits_act_abs_std = np.mean(output_logits_act_abs_std_list) df["seed"].append(i) df["step"].append(step) - df["word_embedding_act_abs_mean"].append(word_embedding_act_abs_mean) - df["attn_output_act_abs_mean"].append(attn_output_act_abs_mean) - df["ffn_output_act_abs_mean"].append(ffn_output_act_abs_mean) - df["output_logits_act_abs_mean"].append(output_logits_act_abs_mean) + df["word_embedding_act_abs_std"].append(word_embedding_act_abs_std) + df["attn_output_act_abs_std"].append(attn_output_act_abs_std) + df["ffn_output_act_abs_std"].append(ffn_output_act_abs_std) + df["output_logits_act_abs_std"].append(output_logits_act_abs_std) df["width"].append(width) del model, optimizer From 2ca94a8598b1a018d59d3c7b337dc8cff0b7e7d2 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 15:18:58 +0000 Subject: [PATCH 59/94] double attention head for every hidden size doubled --- megatron/training.py | 53 +++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 050d39fb5..0d4cfcab7 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -74,7 +74,7 @@ def _plot_data(df, activation, graph_name_prefix): marker="o", dashes=False, legend='full' ) plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0) - plt.tight_layout() + plt.tight_layout(pad=3.0) plt.xlabel("Width") plt.ylabel("Activation with {}".format("muP" if mup else "SP")) plt.title(f"{activation}") @@ -116,44 +116,49 @@ def has_method(o, name): def mup_coord_check(neox_args, timers, train_data_iterator): from megatron.mup_substitute import get_coord_data - def lazy_model(hidden_size): + def lazy_model(hidden_size, attention_head): 
def gen(): old_hidden_size = neox_args.hidden_size + old_num_attention_heads = neox_args.num_attention_heads neox_args.hidden_size = hidden_size + neox_args.num_attention_heads = attention_head neox_args.mup_width_multiplier = None - model, *_ = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) neox_args.hidden_size = old_hidden_size - + neox_args.num_attention_heads = old_num_attention_heads return model return gen models = {} # Hidden size needs to be divisible by num attention heads - for hidden_size in [2**p for p in range(7,11)]: - models[hidden_size] = lazy_model(hidden_size) + for idx, hidden_size in enumerate([2**p for p in range(7,12)]): + models[hidden_size] = lazy_model( + hidden_size, + neox_args.num_attention_heads*(2**idx) + ) + + # print_rank_0(">>> Coord Check for mu Parameterization") + # neox_args.use_mup = True + # df_mup = get_coord_data( + # neox_args, timers, None, models, train_data_iterator, mup=True, optimizer="adam" + # ) + # df_mup.to_csv("df_mup.csv", index=False) + # plot_coord_data(df_mup, graph_name_prefix=f"coord_check_mup", mup=True) - print_rank_0(">>> Coord Check for mu Parameterization") - neox_args.use_mup = True - df_mup = get_coord_data( - neox_args, timers, None, models, train_data_iterator, mup=True, optimizer="adam" - ) print_rank_0(">>> Coord Check for standard Parameterization") neox_args.use_mup = False df_sp = get_coord_data( neox_args, timers, None, models, train_data_iterator, mup=False, optimizer="adam" ) - - df_mup.to_csv("df_mup.csv", index=False) df_sp.to_csv("df_sp.csv", index=False) - plot_coord_data(df_mup, graph_name_prefix=f"coord_check_mup", mup=True) plot_coord_data(df_sp, graph_name_prefix=f"coord_check_sp", mup=False) - print_rank_0("Saved coord check plots... exiting") + print_rank_0("Saved coord check plots... exiting") + import sys; sys.exit() return df_mup, df_sp def pretrain(neox_args): @@ -190,7 +195,7 @@ def pretrain(neox_args): ) = build_train_valid_test_data_iterators(neox_args=neox_args) timers("train/valid/test data iterators").stop() - df_mup, df_sp = mup_coord_check(neox_args, timers, train_data_iterator) + mup_coord_check(neox_args, timers, train_data_iterator) sys.exit() # Model, optimizer, and learning rate. 
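The sweep in the hunk above doubles the attention-head count each time the hidden size doubles, so the per-head dimension stays fixed while width grows. A minimal sketch of the resulting shapes, assuming the 8-head base used in the coord-check configs (illustrative only, not part of the patch):

    # widths and head counts follow the loop above
    base_heads = 8
    for idx, hidden_size in enumerate(2**p for p in range(7, 12)):
        heads = base_heads * (2**idx)
        print((hidden_size, heads, hidden_size // heads))
    # (128, 8, 16) (256, 16, 16) (512, 32, 16) (1024, 64, 16) (2048, 128, 16)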
@@ -534,13 +539,15 @@ def get_optimizer(model, neox_args): # Use Adam if neox_args.use_mup: # try: - # from mup import MuAdam - - # adam_optimizer = MuAdam + # # from mup import MuAdam + # # adam_optimizer = MuAdam + # # except ModuleNotFoundError: + # # print("Please install mup https://github.com/microsoft/mup") + # # raise Exception + # from deepspeed.ops.adam import FusedAdam as Adam + # adam_optimizer = Adam # except ModuleNotFoundError: - # print("Please install mup https://github.com/microsoft/mup") - # raise Exception - # from deepspeed.ops.adam import FusedAdam as Adam + # from apex.optimizers import FusedAdam as Adam # adam_optimizer = Adam adam_optimizer = torch.optim.Adam else: @@ -642,7 +649,7 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): """Setup model and optimizer.""" if neox_args.mup_width_multiplier is None: neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base - print_rank_0(f"mup_width_multiplier set to {neox_args.mup_width_multiplier}") + print_rank_0(f">>> mup_width_multiplier set to {neox_args.mup_width_multiplier}") model = get_model(neox_args=neox_args, use_cache=use_cache) optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) From 497485ca059655392071c0ed3eb8077351341a4c Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 27 Feb 2024 15:21:42 +0000 Subject: [PATCH 60/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 40 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 684b58609..591bd9384 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -7,7 +7,7 @@ LR Scheduler Arguments -- **lr_decay_style**: Literal +- **lr_decay_style**: typing.Literal['constant', 'linear', 'cosine', 'exponential'] Default = linear @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 6bb4e62 + Default = 7483246 current git hash of repository @@ -253,7 +253,7 @@ Model Arguments -- **precision**: Literal +- **precision**: typing.Literal['fp16', 'fp32', 'bfloat16'] Default = None @@ -288,16 +288,6 @@ Model Arguments -- **intermediate_size**: int - - Default = None - - Transformer intermediate size. Currently only used for "mlp_type": "llama". - - If not passed, will be set to a reasonable default. 
- - - - **num_attention_heads**: int Default = None @@ -346,7 +336,7 @@ Model Arguments -- **norm**: Literal +- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm'] Default = layernorm @@ -394,7 +384,7 @@ Model Arguments -- **pos_emb**: Literal +- **pos_emb**: typing.Literal['learned', 'rotary', 'sinusoidal', 'rpe', 'alibi', 'none'] Default = learned @@ -508,7 +498,7 @@ Model Arguments -- **activation**: Literal +- **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish', 'silu'] Default = gelu @@ -613,7 +603,7 @@ Model Arguments -- **init_method**: Literal +- **init_method**: typing.Literal['normal', 'scaled_normal', 'orthogonal', 'scaled_orthogonal', 'xavier_uniform', 'xavier_normal', 'wang_init', 'small_init'] Default = normal @@ -622,7 +612,7 @@ Model Arguments -- **output_layer_init_method**: Literal +- **output_layer_init_method**: typing.Literal['normal', 'scaled_normal', 'orthogonal', 'scaled_orthogonal', 'xavier_uniform', 'xavier_normal', 'wang_init', 'small_init'] Default = scaled_normal @@ -705,7 +695,7 @@ Model Arguments -- **output_layer_parallelism**: Literal +- **output_layer_parallelism**: typing.Literal['column'] Default = column @@ -719,7 +709,7 @@ Optimizer Arguments -- **optimizer_type**: Literal +- **optimizer_type**: typing.Literal['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3', 'madgrad_wd', 'sgd', 'lion'] Default = adam @@ -736,7 +726,7 @@ Optimizer Arguments -- **zero_stage**: Union +- **zero_stage**: typing.Union[int, typing.List[int], typing.Literal['all']] Default = None @@ -1135,7 +1125,7 @@ Tokenizer Arguments -- **tokenizer_type**: Literal +- **tokenizer_type**: typing.Literal['GPT2BPETokenizer', 'HFTokenizer', 'HFGPT2Tokenizer', 'SPMTokenizer', 'CharLevelTokenizer', 'TiktokenTokenizer'] Default = GPT2BPETokenizer @@ -1266,7 +1256,7 @@ Training Arguments -- **data_impl**: Literal +- **data_impl**: typing.Literal['infer', 'mmap', 'cached'] Default = infer @@ -1330,7 +1320,7 @@ Training Arguments -- **checkpoint_scale**: Literal +- **checkpoint_scale**: typing.Literal['linear', 'log'] Default = linear @@ -2088,7 +2078,7 @@ Args for deepspeed runner (deepspeed.launcher.runner). -- **launcher**: Literal +- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm'] Default = pdsh From 34fb7ca9c23dab34a58a4dacecabfb1843878d5f Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 18:45:07 +0000 Subject: [PATCH 61/94] added args --- megatron/neox_arguments/neox_args.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 08075aa0f..c04d566de 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1103,6 +1103,16 @@ class NeoXArgsTraining(NeoXArgsTemplate): Whether to generate a "coord check" plot to verify mup's implementation in neox """ + coord_check_nsteps: int = 10 + """ + + """ + + coord_check_nseeds: int = 5 + """ + + """ + save_base_shapes: bool = False """ Whether to save base shapes for mup. This will save the shapes to the path specified in base-shapes-file. 
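The two arguments added above size the coord-check sweep: each (width, seed) pair is trained for coord_check_nsteps steps (the loop in mup_substitute.py records nsteps + 1 points), repeated over coord_check_nseeds seeds. A rough cost sketch, assuming the widths and config values used later in this series (illustrative only):

    widths = [2**p for p in range(8, 12)]   # 256..2048, as in the later training.py sweep
    nsteps, nseeds = 10, 3                  # values from configs/coord_check_{mup,sp}.yml
    runs = len(widths) * nseeds             # 12 short trainings per parameterization
    print(runs, "short trainings of", nsteps + 1, "recorded steps each")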
From 2d53f1f48f8eb10cdfc0cd7c210af14b985d083e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 18:45:28 +0000 Subject: [PATCH 62/94] simplify coordcheck --- megatron/mup_substitute.py | 189 +++++++------------------------------ 1 file changed, 34 insertions(+), 155 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 87cff559d..49eab375b 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -10,34 +10,20 @@ import pandas as pd import torch import torch.nn.functional as F - +import deepspeed from megatron import print_rank_0 from megatron.training import train_step -def _get_coord_data( +def get_coord_data( neox_args, timers, - lr_scheduler, models, dataloader, - optcls, nsteps=10, - dict_in_out=False, - flatten_input=False, - flatten_output=False, - output_name="loss", - lossfn="xent", - filter_module_by_name=None, - fix_data=True, - cuda=True, - nseeds=10, - output_fdict=None, - input_fdict=None, - param_fdict=None, - show_progress=True, - one_hot_target=False, + nseeds=2, ): + lr_scheduler = None df = { "seed": [], "step": [], @@ -54,10 +40,9 @@ def _get_coord_data( for i in range(nseeds): torch.manual_seed(10**i) print_rank_0(f">>> Running Model with width: {width} on seed: {i}\n") - model = model_obj() + model, optimizer = model_obj() model.train() neox_args.hidden_size = width - optimizer = optcls(model) for step in range(nsteps + 1): @@ -69,21 +54,19 @@ def _get_coord_data( def word_embedding_coord_check_hook(module, input, output): with torch.no_grad(): - word_embedding_act_abs_std_list.append(output.abs().std().item()) + word_embedding_act_abs_std_list.append(output.cpu().abs().std().item()) def attn_output_coord_check_hook(module, input, output): with torch.no_grad(): - attn_output_act_abs_std_list.append(output[0].abs().std().item()) + attn_output_act_abs_std_list.append(output[0].cpu().abs().std().item()) def ffn_output_coord_check_hook(module, input, output): with torch.no_grad(): - ffn_output_act_abs_std_list.append(output[0].abs().std().item()) + ffn_output_act_abs_std_list.append(output[0].cpu().abs().std().item()) def output_logits_coord_check_hook(module, input, output): with torch.no_grad(): - # print("output_logits_coord_check_hook") - # print_rank_0(output.shape) - output_logits_act_abs_std_list.append(output[0].abs().std().item()) + output_logits_act_abs_std_list.append(output[0].cpu().abs().std().item()) for name, module in model.named_modules(): if name.endswith(".word_embeddings"): @@ -134,136 +117,32 @@ def output_logits_coord_check_hook(module, input, output): df["output_logits_act_abs_std"].append(output_logits_act_abs_std) df["width"].append(width) - del model, optimizer + print_rank_0( + f">>> BEFORE Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 2}, reserved: {torch.cuda.memory_reserved() // 1024 ** 2}" + ) + def del_obj_attrs(obj): + attributes = [attr for attr in vars(obj) if not callable(getattr(obj, attr))] + for attr in attributes: + try: + delattr(obj,attr) + except: + pass + + def unlink_hp_params(lp_param_list): + for lp in lp_param_list: + lp._hp_mapping = None + return + + for i, _ in enumerate(optimizer.optimizer.param_groups): + unlink_hp_params(optimizer.bit16_groups[i]) + del_obj_attrs(optimizer) + model.destroy() + del optimizer gc.collect() - with torch.no_grad(): - torch.cuda.empty_cache() - - gc.collect() - with torch.no_grad(): torch.cuda.empty_cache() + deepspeed.runtime.utils.empty_cache() + print_rank_0( + f">>> AFTER Memory allocated: 
{torch.cuda.memory_allocated() / 1024 ** 2}, reserved: {torch.cuda.memory_reserved() // 1024 ** 2}" + ) return pd.DataFrame(df) - - -def get_coord_data( - neox_args, - timers, - lr_scheduler, - models, - dataloader, - optimizer="sgd", - lr=None, - mup=True, - filter_trainable_by_name=None, - **kwargs -): - """Get coord data for coord check. - Train the models in `models` with data from `dataloader` and optimizer - specified by `optimizer` and `lr` for `nsteps` steps, and record coordinate - statistics specified by `output_fdict`, `input_fdict`, `param_fdict`. By - default, only `l1` is computed for output activations of each module. - This function wraps around `_get_coord_data`, with the main difference being - user can specify common optimizers via a more convenient interface. - Inputs: - models: - a dict of lazy models, where the keys are numbers indicating width. - Each entry of `models` is a function that instantiates a model given - nothing. - dataloader: - an iterator whose elements are either Huggingface style dicts, if - `dict_in_out` is True, or (input, label). If `fix_data` is True - (which is the default), then only the first element of `dataloader` - is used in a loop and the rest of `dataloder` is ignored. - optimizer: - a string in `['sgd', 'adam', 'adamw']`, with default being `'sgd'`. - lr: - learning rate. By default is 0.1 for `'sgd'` and 1e-3 for others. - mup: - If True, then use the optimizer from `mup.optim`; otherwise, use the - one from `torch.optim`. - filter_trainable_by_name: - a function that returns a bool given module names (from - `model.named_modules()`), or None. If not None, then only modules - whose name yields True will be trained. - nsteps: - number of steps to train the model - dict_in_out: - whether the data loader contains Huggingface-style dict input and - output. Default: False - flatten_input: - if not `dict_in_out`, reshape the input to be - `input.view(input.shape[0], -1)`. Typically used for testing MLPs. - flatten_output: - if not `dict_in_out`, reshape the label to be `label.view(-1, - input.shape[-1])`. - output_name: - if `dict_in_out`, this is the key for the loss value if the output - is a dict. If the output is not a dict, then we assume the first - element of the output is the loss. - lossfn: - loss function to use if not `dict_in_out`. Can be either a string from - [`xent`, 'mse', 'nll', 'l1'] or a python `callable` such that - `lossfn(output, target)` returns the loss value. Examples of valid - `callable`s are `F.cross_entropy`, `F.mse_loss`, etc, where `F` is - `torch.nn.functional`. Default: 'xent' - filter_module_by_name: - a function that returns a bool given module names (from - `model.named_modules()`), or None. If not None, then only modules - whose name yields True will be recorded. - cuda: - whether to use cuda or not. Default: True - nseeds: - number of times to repeat the training, each with different seeds. - output_fdict, input_fdict, param_fdict: - function dicts to be used in `_record_coords`. By default, only `l1` - is computed for output activations of each module. - show_progress: - show progress using tqdm. Default: True - one_hot_target: - convert target label into a one-hot vector. This typically is only - used for `'mse'` or `'l1'` losses in classification tasks. - Default: False - Output: - a pandas DataFrame containing recorded results. The column names are - `'width', 'module', 't'` as well as names of statistics recorded, such - as `'l1'` (see `FDICT` for other premade statistics that can be - collected). 
- - Breaking Changes: - In v1.0.0, when `lossfn=='mse'`, the target is automatically converted - to a one hot vector before loss computation. Starting in v1.1.0, this - behavior is turned off, and the user needs to explicitly turn on this - behavior by setting `one_hot_target=True`. - """ - if lr is None: - lr = 0.1 if optimizer == "sgd" else 1e-3 - - from torch.optim import SGD, AdamW, Adam - # from deepspeed.ops.adam import FusedAdam as Adam - - def get_trainable(model): - params = model.parameters() - if filter_trainable_by_name is not None: - params = [] - for name, p in model.named_parameters(): - if filter_trainable_by_name(name): - params.append(p) - return params - - if optimizer == "sgd": - optcls = lambda model: SGD(get_trainable(model), lr=lr) - elif optimizer == "adam": - optcls = lambda model: Adam(get_trainable(model), lr=lr) - elif optimizer == "adamw": - optcls = lambda model: AdamW(get_trainable(model), lr=lr) - elif optimizer is None: - raise ValueError("optimizer should be sgd|adam|adamw or a custom function") - - neox_args.use_mup = mup - data = _get_coord_data( - neox_args, timers, lr_scheduler, models, dataloader, optcls, **kwargs - ) - # data["optimizer"] = optimizer - # data["lr"] = lr - return data From 789761017eb3494f777381700c9a6f8ceab4781e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 18:46:04 +0000 Subject: [PATCH 63/94] seperate sp and mup configs --- .../{coord_check.yml => coord_check_mup.yml} | 2 + configs/coord_check_sp.yml | 115 ++++++++++++++++++ 2 files changed, 117 insertions(+) rename configs/{coord_check.yml => coord_check_mup.yml} (98%) create mode 100644 configs/coord_check_sp.yml diff --git a/configs/coord_check.yml b/configs/coord_check_mup.yml similarity index 98% rename from configs/coord_check.yml rename to configs/coord_check_mup.yml index 05e7b6bb8..1a253c8ce 100644 --- a/configs/coord_check.yml +++ b/configs/coord_check_mup.yml @@ -96,6 +96,8 @@ "vocab-file": "/weka/lintangsutawika/09-mup-neox/20B_tokenizer.json", "coord_check": true, + "coord_check_nsteps": 10, + "coord_check_nseeds": 5, "use_mup": true, # sigma_base "init_method_std": 0.08, diff --git a/configs/coord_check_sp.yml b/configs/coord_check_sp.yml new file mode 100644 index 000000000..e878927df --- /dev/null +++ b/configs/coord_check_sp.yml @@ -0,0 +1,115 @@ +{ + # parallelism settings + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 8, + "num_attention_heads": 8, + "seq_length": 128, + "max_position_embeddings": 128, + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + # "attention_config": [[["flash"], 8]], + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + + # init methods + "init_method": "normal", + "output_layer_init_method": "normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.006, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + # "min_lr": 0.006, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + # batch / data settings + 
"train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 32, + "data_impl": "mmap", + "num_workers": 1, + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.0, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "precision": "fp32", + # "fp16": { + # "fp16": true, + # "enabled": true, + # "loss_scale": 0, + # "loss_scale_window": 1000, + # "initial_scale_power": 12, + # "hysteresis": 2, + # "min_loss_scale": 1, + # }, + + # misc. training settings + "train_iters": 300, + "lr_decay_iters": 300, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 300, + "eval_interval": 300, + "eval_iters": 10, + + # logging + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + "tokenizer_type": "HFTokenizer", + "vocab-file": "/weka/lintangsutawika/09-mup-neox/20B_tokenizer.json", + + "coord_check": true, + "coord_check_nsteps": 10, + "coord_check_nseeds": 5, + "use_mup": false, + # sigma_base + "init_method_std": 0.08, + # "mup_embedding_multiplier": 5, + # "mup_output_multiplier": 1, + # "mup_width_multiplier": 1, + "mup_d_model_base": 128, + "hidden_size": 128, + + "data-path": "/weka/lintangsutawika/09-mup-neox/data/enwik8/enwik8_text_document", + + # "launcher": "slurm", + # "deepspeed_slurm": true, + +} From 4f3920903bf6fd25efceabc386aec6818ace59cd Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 18:46:32 +0000 Subject: [PATCH 64/94] perform coordcheck for sp and mup seperately --- megatron/training.py | 50 ++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index ccdcedcbb..5f189fa59 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -24,6 +24,7 @@ import math import sys +import gc import torch import deepspeed @@ -84,10 +85,10 @@ def _plot_data(df, activation, graph_name_prefix): return 0 activation_list = [ - "word_embedding_act_abs_mean", - "attn_output_act_abs_mean", - "ffn_output_act_abs_mean", - "output_logits_act_abs_mean", + "word_embedding_act_abs_std", + "attn_output_act_abs_std", + "ffn_output_act_abs_std", + "output_logits_act_abs_std", ] """If distributed is initialized print only on rank 0.""" if torch.distributed.is_initialized(): @@ -113,7 +114,7 @@ def has_method(o, name): layer.mup_reinitialize_weights(neox_args) -def mup_coord_check(neox_args, timers, train_data_iterator): +def coord_check(neox_args, timers, train_data_iterator): from megatron.mup_substitute import get_coord_data def lazy_model(hidden_size, attention_head): @@ -123,43 +124,38 @@ def gen(): neox_args.hidden_size = hidden_size neox_args.num_attention_heads = attention_head neox_args.mup_width_multiplier = None - model, *_ = setup_model_and_optimizer( + model, optimizer, _ = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) neox_args.hidden_size = old_hidden_size neox_args.num_attention_heads = old_num_attention_heads - return model + return model, optimizer return gen models = {} # Hidden size needs to be divisible by num attention heads - for idx, hidden_size in enumerate([2**p for p in range(7,12)]): + for idx, hidden_size in enumerate([2**p for p in range(8,12)]): models[hidden_size] = lazy_model( hidden_size, neox_args.num_attention_heads*(2**idx) ) - # print_rank_0(">>> Coord 
Check for mu Parameterization") - # neox_args.use_mup = True - # df_mup = get_coord_data( - # neox_args, timers, None, models, train_data_iterator, mup=True, optimizer="adam" - # ) - # df_mup.to_csv("df_mup.csv", index=False) - # plot_coord_data(df_mup, graph_name_prefix=f"coord_check_mup", mup=True) - - print_rank_0(">>> Coord Check for standard Parameterization") - neox_args.use_mup = False - df_sp = get_coord_data( - neox_args, timers, None, models, train_data_iterator, mup=False, optimizer="adam" + df_mode = "mup" if neox_args.use_mup else "sp" + if neox_args.use_mup: + print_rank_0(">>> Coord Check for mu Parameterization") + else: + print_rank_0(">>> Coord Check for standard Parameterization") + + df = get_coord_data( + neox_args, timers, models, train_data_iterator, neox_args.coord_check_nsteps, neox_args.coord_check_nseeds, ) - df_sp.to_csv("df_sp.csv", index=False) - plot_coord_data(df_sp, graph_name_prefix=f"coord_check_sp", mup=False) + df.to_csv(f"df_{df_mode}.csv", index=False) + plot_coord_data(df, graph_name_prefix=f"coord_check_{df_mode}", mup=neox_args.use_mup) print_rank_0("Saved coord check plots... exiting") - import sys; sys.exit() - return df_mup, df_sp + return 0 def pretrain(neox_args): """Main training program. @@ -183,8 +179,8 @@ def pretrain(neox_args): # Initialize and get arguments, timers, and Tensorboard writer. initialize_megatron(neox_args=neox_args) - if neox_args.use_mup and neox_args.coord_check: - print_rank_0("---- Do muP Coord Check ----") + if neox_args.coord_check: + print_rank_0("---- Do Coord Check ----") # Data stuff neox_args.iteration = 0 timers("train/valid/test data iterators").start() @@ -195,7 +191,7 @@ def pretrain(neox_args): ) = build_train_valid_test_data_iterators(neox_args=neox_args) timers("train/valid/test data iterators").stop() - mup_coord_check(neox_args, timers, train_data_iterator) + coord_check(neox_args, timers, train_data_iterator) sys.exit() # Model, optimizer, and learning rate. From 5f84a3f8553d9aed74590ca3983c609013108030 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 27 Feb 2024 18:47:16 +0000 Subject: [PATCH 65/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 591bd9384..0a1ed33fc 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 7483246 + Default = 4f39209 current git hash of repository @@ -1636,6 +1636,22 @@ Training Arguments +- **coord_check_nsteps**: int + + Default = 10 + + + + + +- **coord_check_nseeds**: int + + Default = 5 + + + + + - **save_base_shapes**: bool Default = False From 479b854af593259f8fa8a32bb426e8bdacd8030c Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 28 Feb 2024 15:22:32 +0000 Subject: [PATCH 66/94] update --- configs/coord_check_mup.yml | 19 +++++-------------- configs/coord_check_sp.yml | 19 +++++-------------- 2 files changed, 10 insertions(+), 28 deletions(-) diff --git a/configs/coord_check_mup.yml b/configs/coord_check_mup.yml index 1a253c8ce..d16e900cd 100644 --- a/configs/coord_check_mup.yml +++ b/configs/coord_check_mup.yml @@ -78,26 +78,17 @@ # }, # misc. 
training settings - "train_iters": 300, - "lr_decay_iters": 300, + "train_iters": 10, + "lr_decay_iters": 10, + "log_interval": 1, "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 300, - "eval_interval": 300, - "eval_iters": 10, - - # logging - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - + "lr_decay_style": "constant", "tokenizer_type": "HFTokenizer", "vocab-file": "/weka/lintangsutawika/09-mup-neox/20B_tokenizer.json", "coord_check": true, "coord_check_nsteps": 10, - "coord_check_nseeds": 5, + "coord_check_nseeds": 3, "use_mup": true, # sigma_base "init_method_std": 0.08, diff --git a/configs/coord_check_sp.yml b/configs/coord_check_sp.yml index e878927df..12c8165e1 100644 --- a/configs/coord_check_sp.yml +++ b/configs/coord_check_sp.yml @@ -78,26 +78,17 @@ # }, # misc. training settings - "train_iters": 300, - "lr_decay_iters": 300, + "train_iters": 10, + "lr_decay_iters": 10, + "log_interval": 1, "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 300, - "eval_interval": 300, - "eval_iters": 10, - - # logging - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - + "lr_decay_style": "constant", "tokenizer_type": "HFTokenizer", "vocab-file": "/weka/lintangsutawika/09-mup-neox/20B_tokenizer.json", "coord_check": true, "coord_check_nsteps": 10, - "coord_check_nseeds": 5, + "coord_check_nseeds": 3, "use_mup": false, # sigma_base "init_method_std": 0.08, From 21a7e32739d78f1ca806dd5ea112c2ebdc742240 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 28 Feb 2024 15:22:56 +0000 Subject: [PATCH 67/94] update how params are sorted --- megatron/model/utils.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 9e70b6cce..ef36aac8e 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -30,7 +30,7 @@ def get_params_for_weight_decay_optimization(module, neox_args): Layernorms and biases will have no weight decay but the rest will. """ weight_decay_params = {"params": [], "lr_adjust": True} - no_weight_decay_params = {"params": [], "lr_adjust": False, "weight_decay": 0.0} + no_weight_decay_params = {"params": [], "lr_adjust": True, "weight_decay": 0.0} embedding_weight_decay_params = {"params": [], "lr_adjust": False} embedding_no_weight_decay_params = {"params": [], "lr_adjust": False, "weight_decay": 0.0} @@ -44,9 +44,14 @@ def get_params_for_weight_decay_optimization(module, neox_args): ) or ( neox_args.weight_decay == 0.0 ): # also include all parameters here if no weight decay is being done - no_weight_decay_params["params"].extend( - [p for p in list(module_._parameters.values()) if p is not None] - ) + if isinstance(module_, VocabParallelEmbedding): + embedding_no_weight_decay_params["params"].extend( + [p for p in list(module_._parameters.values()) if p is not None] + ) + else: + no_weight_decay_params["params"].extend( + [p for p in list(module_._parameters.values()) if p is not None] + ) else: if any( [ @@ -89,7 +94,8 @@ def get_params_for_weight_decay_optimization(module, neox_args): # only return a single param group # with onebitadam, we want to minimize the calls to compressed_allreduce. Every param group calls it once. # to avoid this, only use a single param group when weight decay is off. 
- return [no_weight_decay_params] + # return [no_weight_decay_params] + return no_weight_decay_params, embedding_no_weight_decay_params return weight_decay_params, no_weight_decay_params, embedding_weight_decay_params, embedding_no_weight_decay_params From bb2e0c99249618520552d00a64868b0c3a5e59ac Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 28 Feb 2024 15:24:40 +0000 Subject: [PATCH 68/94] remove unused comments --- megatron/mup_substitute.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 49eab375b..11d3aa503 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -23,7 +23,6 @@ def get_coord_data( nsteps=10, nseeds=2, ): - lr_scheduler = None df = { "seed": [], "step": [], @@ -40,7 +39,7 @@ def get_coord_data( for i in range(nseeds): torch.manual_seed(10**i) print_rank_0(f">>> Running Model with width: {width} on seed: {i}\n") - model, optimizer = model_obj() + model, optimizer, lr_scheduler = model_obj() model.train() neox_args.hidden_size = width @@ -117,9 +116,6 @@ def output_logits_coord_check_hook(module, input, output): df["output_logits_act_abs_std"].append(output_logits_act_abs_std) df["width"].append(width) - print_rank_0( - f">>> BEFORE Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 2}, reserved: {torch.cuda.memory_reserved() // 1024 ** 2}" - ) def del_obj_attrs(obj): attributes = [attr for attr in vars(obj) if not callable(getattr(obj, attr))] for attr in attributes: @@ -141,8 +137,5 @@ def unlink_hp_params(lp_param_list): gc.collect() torch.cuda.empty_cache() deepspeed.runtime.utils.empty_cache() - print_rank_0( - f">>> AFTER Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 2}, reserved: {torch.cuda.memory_reserved() // 1024 ** 2}" - ) return pd.DataFrame(df) From bf1ce068c35c79d3580b2b780fbc329629dc90f3 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 13:06:32 +0000 Subject: [PATCH 69/94] adjust --- megatron/model/init_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 7554a7b94..cae150464 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -179,4 +179,4 @@ def _get(name, use_mup=False): else: raise NotImplementedError(f"Unknown init method {name}") - return _get(args.init_method, use_mup=args.use_mup), _get(args.init_method), _get(args.output_layer_init_method) + return _get(args.init_method, use_mup=args.use_mup), _get(args.init_method), _get(args.output_layer_init_method, use_mup=args.use_mup) From 50a3dbadc092fda1b7a4372d96b1fc1a72f561f6 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 13:08:23 +0000 Subject: [PATCH 70/94] simplify --- megatron/learning_rates.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 4ae18d49b..9e9994049 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -98,9 +98,8 @@ def step(self, step_num=None): new_lr = self.get_lr() for group in self.optimizer.param_groups: if self.use_mup and ("lr_adjust" in group) and group["lr_adjust"] is True: - group["lr"] = new_lr / self.mup_width_multiplier - else: - group["lr"] = new_lr + new_lr = new_lr / self.mup_width_multiplier + group["lr"] = new_lr def state_dict(self): state_dict = { From c4c1660fbdf30b0f666e4d82ee4cc9cb78949a6f Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: 
Thu, 29 Feb 2024 13:41:26 +0000 Subject: [PATCH 71/94] fix mup embedding multiplier --- megatron/model/word_embeddings.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index 230bdedb3..04cf55c8d 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -50,8 +50,7 @@ def __init__( self.hidden_size = hidden_size self.init_method = init_method self.num_tokentypes = num_tokentypes - self.use_mup = neox_args.use_mup - self.mup_embedding_multiplier = float(neox_args.mup_embedding_multiplier) + self.mup_embedding_multiplier = float(neox_args.mup_embedding_multiplier) if neox_args.use_mup else 1 # Word embeddings (parallel). self.word_embeddings = mpu.VocabParallelEmbedding( @@ -152,11 +151,8 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): # Dropout. embeddings = self.embedding_dropout(embeddings) - # Y_emb = m_emb * embed(X) - if self.use_mup: - with torch.no_grad(): - embeddings = torch.mul(embeddings, self.mup_embedding_multiplier) + embeddings = torch.mul(embeddings, self.mup_embedding_multiplier) return embeddings From 1c359119c4ade191de263ecb4b8dd1f531b4ea53 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 13:43:15 +0000 Subject: [PATCH 72/94] embeddingpipe fix init --- megatron/model/gpt2_model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 652004c8b..5426f7749 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -113,7 +113,6 @@ def __init__( use_cache=False, ): self.neox_args = neox_args - self.use_cache = use_cache self.parallel_output = parallel_output self.hidden_size = self.neox_args.hidden_size @@ -202,7 +201,7 @@ def init_specs(self): self.neox_args.padded_vocab_size, self.neox_args.max_position_embeddings, self.neox_args.hidden_dropout, - self.init_method, + self.input_embedding_init_method, self.num_tokentypes, ) ) From 84be4d4f1877f5a5f1bd04e46d93ddaf394abf26 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 14:01:33 +0000 Subject: [PATCH 73/94] changed how manual seed is loaded --- megatron/model/transformer.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index a86f1b99f..8ba004336 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -232,6 +232,9 @@ def __init__( gather_output=not parallel_output, skip_bias_add=False, ) + + self.neox_args = neox_args + self.is_last_layer = is_last_layer # else: # print( @@ -250,7 +253,15 @@ def __init__( # ) def forward(self, hidden_states): - return self.final_linear(hidden_states) + logits = self.final_linear(hidden_states) + + if self.is_last_layer: + _logits, *_args = logits + if self.neox_args.use_mup: + _logits /= self.neox_args.mup_width_multiplier + _logits *= self.neox_args.mup_output_multiplier + logits = (_logits, *_args) + return logits class ParallelSelfAttention(nn.Module): @@ -348,12 +359,14 @@ def __init__( coeff = None if neox_args.use_mup: - self.norm_factor = self.hidden_size_per_attention_head + # self.norm_factor = self.hidden_size_per_attention_head + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) else: self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - if self.apply_query_key_layer_scaling: - coeff = max(1, self.layer_number) - self.norm_factor *= 
coeff + + if self.apply_query_key_layer_scaling: + coeff = max(1, self.layer_number) + self.norm_factor *= coeff self.rpe = rpe From fbb4daf3a73039d353f3a32154d95dc4e2626c42 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 14:02:33 +0000 Subject: [PATCH 74/94] removed musgd and other changces --- megatron/training.py | 98 ++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 68 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 5f189fa59..a1d234636 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -64,7 +64,7 @@ import matplotlib.pyplot as plt -def plot_coord_data(df, graph_name_prefix, mup=True): +def plot_coord_data(df, graph_name_prefix, use_mup=True): def _plot_data(df, activation, graph_name_prefix): df = df.groupby(['step', 'width']).mean().reset_index() @@ -77,7 +77,7 @@ def _plot_data(df, activation, graph_name_prefix): plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0) plt.tight_layout(pad=3.0) plt.xlabel("Width") - plt.ylabel("Activation with {}".format("muP" if mup else "SP")) + plt.ylabel("Activation with {}".format("muP" if use_mup else "SP")) plt.title(f"{activation}") plt.savefig(f"{graph_name_prefix}-{activation}.png") plt.close() @@ -101,18 +101,6 @@ def _plot_data(df, activation, graph_name_prefix): return 0 -def mup_weights_reinit(neox_args, model): - def has_method(o, name): - return callable(getattr(o, name, None)) - - for layer in model.modules(): - # This normally would happen in set_base_shapes if we actually were able to use the MuReadout class - if hasattr(layer, "mup_rescale_parameters") and layer.mup_rescale_parameters: - layer._rescale_parameters() - - if has_method(layer, "mup_reinitialize_weights"): - layer.mup_reinitialize_weights(neox_args) - def coord_check(neox_args, timers, train_data_iterator): from megatron.mup_substitute import get_coord_data @@ -124,13 +112,14 @@ def gen(): neox_args.hidden_size = hidden_size neox_args.num_attention_heads = attention_head neox_args.mup_width_multiplier = None - model, optimizer, _ = setup_model_and_optimizer( + neox_args.mup_d_model_base = 2**8 + model, optimizer, lr_scheduler = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) neox_args.hidden_size = old_hidden_size neox_args.num_attention_heads = old_num_attention_heads - return model, optimizer + return model, optimizer, lr_scheduler return gen @@ -152,8 +141,7 @@ def gen(): neox_args, timers, models, train_data_iterator, neox_args.coord_check_nsteps, neox_args.coord_check_nseeds, ) df.to_csv(f"df_{df_mode}.csv", index=False) - plot_coord_data(df, graph_name_prefix=f"coord_check_{df_mode}", mup=neox_args.use_mup) - + plot_coord_data(df, graph_name_prefix=f"coord_check_{df_mode}", use_mup=neox_args.use_mup) print_rank_0("Saved coord check plots... exiting") return 0 @@ -413,11 +401,6 @@ def get_model(neox_args, use_cache=False): # Build model on cpu. print_rank_0("building GPT2 model ...") - # Temporarily disable mup so that the base model does not use the mup init functions before set_base_shapes is called below. - # If mup isn't being used anyways, this has no effect. 
- # old_use_mup = neox_args.use_mup - # neox_args.use_mup = False - model = GPT2ModelPipe( neox_args=neox_args, num_tokentypes=0, @@ -450,8 +433,6 @@ def get_model(neox_args, use_cache=False): # Export PipeParallel model to nn.Sequential model to avoid the overhead of deepspeed's pipe parallel training model = model.to_sequential() - # neox_args.use_mup = old_use_mup - if neox_args.deepspeed: # DeepSpeed handles CUDA, FP16, and DDP components. return model @@ -532,55 +513,36 @@ def get_optimizer(model, neox_args): **neox_args.optimizer["params"], ) elif neox_args.optimizer_type.lower() == "adam": - # Use Adam - if neox_args.use_mup: - # try: - # # from mup import MuAdam - # # adam_optimizer = MuAdam - # # except ModuleNotFoundError: - # # print("Please install mup https://github.com/microsoft/mup") - # # raise Exception - # from deepspeed.ops.adam import FusedAdam as Adam - # adam_optimizer = Adam - # except ModuleNotFoundError: - # from apex.optimizers import FusedAdam as Adam - # adam_optimizer = Adam - adam_optimizer = torch.optim.Adam + if neox_args.use_bnb_optimizer: + try: + import bitsandbytes as bnb + + adam_optimizer = bnb.optim.Adam8bit + except ModuleNotFoundError: + print( + "Please install bitsandbytes following https://github.com/facebookresearch/bitsandbytes." + ) + raise Exception else: - if neox_args.use_bnb_optimizer: - try: - import bitsandbytes as bnb - - adam_optimizer = bnb.optim.Adam8bit - except ModuleNotFoundError: - print( - "Please install bitsandbytes following https://github.com/facebookresearch/bitsandbytes." - ) - raise Exception - else: - try: - # default to apex as it's slightly faster - from apex.optimizers import FusedAdam as Adam - except ImportError: - # if apex isn't installed, use deepspeed's FusedAdam - print( - "WARNING: APEX not installed - defaulting to deepspeed's fused adam" - ) - # from deepspeed.ops.adam import FusedAdam as Adam - from torch.optim import Adam - adam_optimizer = Adam + try: + # default to apex as it's slightly faster + from apex.optimizers import FusedAdam as Adam + except ImportError: + # if apex isn't installed, use deepspeed's FusedAdam + print( + "WARNING: APEX not installed - defaulting to deepspeed's fused adam" + ) + # from deepspeed.ops.adam import FusedAdam as Adam + from torch.optim import Adam + adam_optimizer = Adam optimizer = adam_optimizer( param_groups, weight_decay=neox_args.weight_decay, **neox_args.optimizer["params"], ) elif neox_args.optimizer_type.lower() == "sgd": - try: - from mup import MuSGD - except ModuleNotFoundError: - print("Please install mup https://github.com/microsoft/mup") - raise Exception - optimizer = MuSGD( + from torch.optim import SGD + optimizer = SGD( param_groups, weight_decay=neox_args.weight_decay, **neox_args.optimizer["params"], @@ -645,7 +607,7 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): """Setup model and optimizer.""" if neox_args.mup_width_multiplier is None: neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base - print_rank_0(f">>> mup_width_multiplier set to {neox_args.mup_width_multiplier}") + print_rank_0(f">>> mup_width_multiplier set to {neox_args.mup_width_multiplier}") model = get_model(neox_args=neox_args, use_cache=use_cache) optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) From fa142ffd1c9557b638e7d0320a67d06b83199b1c Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 14:03:11 +0000 Subject: [PATCH 75/94] update config --- configs/coord_check_mup.yml | 3 +-- 
configs/coord_check_sp.yml | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/configs/coord_check_mup.yml b/configs/coord_check_mup.yml index d16e900cd..77b333a46 100644 --- a/configs/coord_check_mup.yml +++ b/configs/coord_check_mup.yml @@ -22,7 +22,7 @@ # init methods "init_method": "normal", - "output_layer_init_method": "normal", + "output_layer_init_method": "scaled_normal", # optimizer settings "optimizer": { @@ -79,7 +79,6 @@ # misc. training settings "train_iters": 10, - "lr_decay_iters": 10, "log_interval": 1, "distributed_backend": "nccl", "lr_decay_style": "constant", diff --git a/configs/coord_check_sp.yml b/configs/coord_check_sp.yml index 12c8165e1..ad7ef2246 100644 --- a/configs/coord_check_sp.yml +++ b/configs/coord_check_sp.yml @@ -22,7 +22,7 @@ # init methods "init_method": "normal", - "output_layer_init_method": "normal", + "output_layer_init_method": "scaled_normal", # optimizer settings "optimizer": { @@ -79,7 +79,6 @@ # misc. training settings "train_iters": 10, - "lr_decay_iters": 10, "log_interval": 1, "distributed_backend": "nccl", "lr_decay_style": "constant", From ad2336f9c5dcb4a9eced62df58f112f1ebf7efb0 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 14:03:50 +0000 Subject: [PATCH 76/94] fixed how params are sorted --- megatron/model/utils.py | 81 ++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/megatron/model/utils.py b/megatron/model/utils.py index ef36aac8e..960c2d956 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -29,44 +29,60 @@ def get_params_for_weight_decay_optimization(module, neox_args): """Divide params into with-weight-decay and without-weight-decay groups. Layernorms and biases will have no weight decay but the rest will. 
""" - weight_decay_params = {"params": [], "lr_adjust": True} - no_weight_decay_params = {"params": [], "lr_adjust": True, "weight_decay": 0.0} - embedding_weight_decay_params = {"params": [], "lr_adjust": False} - embedding_no_weight_decay_params = {"params": [], "lr_adjust": False, "weight_decay": 0.0} + lr_adjust_weight_decay_params = {"params": [], "lr_adjust": True} + lr_adjust_no_weight_decay_params = {"params": [], "lr_adjust": True, "weight_decay": 0.0} + no_lr_adjust_weight_decay_params = {"params": [], "lr_adjust": False} + no_lr_adjust_no_weight_decay_params = {"params": [], "lr_adjust": False, "weight_decay": 0.0} for module_ in module.modules(): - if any( - [ - isinstance(module_, LayerNorm), - isinstance(module_, RMSNorm), - isinstance(module_, ScaleNorm), - ] - ) or ( - neox_args.weight_decay == 0.0 - ): # also include all parameters here if no weight decay is being done - if isinstance(module_, VocabParallelEmbedding): - embedding_no_weight_decay_params["params"].extend( - [p for p in list(module_._parameters.values()) if p is not None] - ) - else: - no_weight_decay_params["params"].extend( + if neox_args.weight_decay == 0.0: + if any( + [ + isinstance(module_, LayerNorm), + isinstance(module_, RMSNorm), + isinstance(module_, ScaleNorm), + isinstance(module_, VocabParallelEmbedding), + ] + ): + no_lr_adjust_no_weight_decay_params["params"].extend( [p for p in list(module_._parameters.values()) if p is not None] ) + else: + no_lr_adjust_no_weight_decay_params["params"].extend( + [ + p + for n, p in list(module_._parameters.items()) + if p is not None and n == "bias" + ] + ) + lr_adjust_no_weight_decay_params["params"].extend( + [ + p + for n, p in list(module_._parameters.items()) + if p is not None and n != "bias" + ] + ) else: if any( [ - isinstance(module_, VocabParallelEmbedding), + isinstance(module_, LayerNorm), + isinstance(module_, RMSNorm), + isinstance(module_, ScaleNorm), ] ): - - embedding_weight_decay_params["params"].extend( + no_lr_adjust_no_weight_decay_params["params"].extend( + [p for p in list(module_._parameters.values()) if p is not None] + ) + + elif isinstance(module_, VocabParallelEmbedding): + no_lr_adjust_weight_decay_params["params"].extend( [ p for n, p in list(module_._parameters.items()) if p is not None and n != "bias" ] ) - embedding_no_weight_decay_params["params"].extend( + no_lr_adjust_no_weight_decay_params["params"].extend( [ p for n, p in list(module_._parameters.items()) @@ -74,15 +90,14 @@ def get_params_for_weight_decay_optimization(module, neox_args): ] ) else: - - weight_decay_params["params"].extend( + lr_adjust_weight_decay_params["params"].extend( [ p for n, p in list(module_._parameters.items()) if p is not None and n != "bias" ] ) - no_weight_decay_params["params"].extend( + lr_adjust_no_weight_decay_params["params"].extend( [ p for n, p in list(module_._parameters.items()) @@ -94,9 +109,17 @@ def get_params_for_weight_decay_optimization(module, neox_args): # only return a single param group # with onebitadam, we want to minimize the calls to compressed_allreduce. Every param group calls it once. # to avoid this, only use a single param group when weight decay is off. 
- # return [no_weight_decay_params] - return no_weight_decay_params, embedding_no_weight_decay_params - return weight_decay_params, no_weight_decay_params, embedding_weight_decay_params, embedding_no_weight_decay_params + # return (lr_adjust_no_weight_decay_params, no_lr_adjust_no_weight_decay_params) + return ( + lr_adjust_no_weight_decay_params, + no_lr_adjust_no_weight_decay_params + ) + return ( + lr_adjust_weight_decay_params, + lr_adjust_no_weight_decay_params, + no_lr_adjust_weight_decay_params, + no_lr_adjust_no_weight_decay_params + ) def exists(x): From fe73bc39dec4bf165d259e2e7eebf00e3a63e1d3 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 14:04:34 +0000 Subject: [PATCH 77/94] update how seed is computed --- megatron/mup_substitute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 11d3aa503..cd7646be1 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -37,7 +37,7 @@ def get_coord_data( for width, model_obj in models.items(): for i in range(nseeds): - torch.manual_seed(10**i) + torch.manual_seed((i+1)*100000) print_rank_0(f">>> Running Model with width: {width} on seed: {i}\n") model, optimizer, lr_scheduler = model_obj() model.train() From a3bd44cb2943fdd57124bb0d67f8b4c05e60fefc Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 14:07:27 +0000 Subject: [PATCH 78/94] update to follow pre-commit format --- configs/neox_arguments.md | 7 +++--- megatron/model/gpt2_model.py | 15 +++++++---- megatron/model/init_functions.py | 30 +++++++++++++--------- megatron/model/transformer.py | 10 +++++--- megatron/model/utils.py | 23 ++++++++++------- megatron/model/word_embeddings.py | 4 ++- megatron/mup_substitute.py | 28 +++++++++++++++------ megatron/neox_arguments/neox_args.py | 4 +-- megatron/training.py | 37 +++++++++++++++++++--------- 9 files changed, 102 insertions(+), 56 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 591bd9384..ee78b8d0f 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -1061,7 +1061,7 @@ Text Generation arguments - **prompt_end**: str - Default = + Default = a single prompt's end. Defaults to newline @@ -1103,7 +1103,7 @@ Text Generation arguments - **eval_results_prefix**: str - Default = + Default = prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json @@ -1831,7 +1831,7 @@ Args for deepspeed config Default = None - + @@ -2131,4 +2131,3 @@ Args for deepspeed runner (deepspeed.launcher.runner). Default = None Adds a `--account` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard. 
- diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 5426f7749..17ad6905a 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -117,9 +117,11 @@ def __init__( self.parallel_output = parallel_output self.hidden_size = self.neox_args.hidden_size self.num_tokentypes = num_tokentypes - self.init_method, self.input_embedding_init_method, self.output_layer_init_method = get_init_methods( - self.neox_args - ) + ( + self.init_method, + self.input_embedding_init_method, + self.output_layer_init_method, + ) = get_init_methods(self.neox_args) self.__topology__ = topology self.specs = [] @@ -176,7 +178,7 @@ def init_specs(self): # Embedding layer # input will be (input_ids, position_ids, attention_mask) - # TODO Initilized weights here should not be divided by m_width + # TODO Initialized weights here should not be divided by m_width if weight_tying: self.specs.append( TiedLayerSpec( @@ -272,7 +274,10 @@ def _logits_helper(embedding, lm_output): """Just a wrapper to massage inputs/outputs from pipeline.""" logits = parallel_lm_logits( - lm_output, embedding.word_embeddings_weight, self.parallel_output, self.neox_args + lm_output, + embedding.word_embeddings_weight, + self.parallel_output, + self.neox_args, ) return logits diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index cae150464..2f85e4517 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -136,28 +136,24 @@ def init_(tensor, mup_width_multiplier=mup_width_multiplier): with torch.no_grad(): init_weight.div_(math.sqrt(mup_width_multiplier)) return init_weight - + return init_ def get_init_methods(args): - def _get(name, use_mup=False): if name == "normal": sigma = args.init_method_std if use_mup: - sigma = sigma/math.sqrt(args.mup_width_multiplier) + sigma = sigma / math.sqrt(args.mup_width_multiplier) return init_method_normal( sigma=sigma, ) elif name == "scaled_normal": sigma = args.init_method_std if use_mup: - sigma = sigma/math.sqrt(args.mup_width_multiplier) - return scaled_init_method_normal( - sigma=sigma, - num_layers=args.num_layers - ) + sigma = sigma / math.sqrt(args.mup_width_multiplier) + return scaled_init_method_normal(sigma=sigma, num_layers=args.num_layers) elif name == "orthogonal": return orthogonal_init_method(args.mup_width_multiplier if use_mup else 1.0) elif name == "scaled_orthogonal": @@ -165,12 +161,18 @@ def _get(name, use_mup=False): args.num_layers, args.mup_width_multiplier if use_mup else 1.0 ) elif name == "xavier_uniform": - return xavier_uniform_init_method(args.mup_width_multiplier if use_mup else 1.0) + return xavier_uniform_init_method( + args.mup_width_multiplier if use_mup else 1.0 + ) elif name == "xavier_normal": - return xavier_normal_init_method(args.mup_width_multiplier if use_mup else 1.0) + return xavier_normal_init_method( + args.mup_width_multiplier if use_mup else 1.0 + ) elif name == "wang_init": return wang_init_method( - args.num_layers, args.hidden_size, args.mup_width_multiplier if use_mup else 1.0 + args.num_layers, + args.hidden_size, + args.mup_width_multiplier if use_mup else 1.0, ) elif name == "small_init": return small_init_init_method( @@ -179,4 +181,8 @@ def _get(name, use_mup=False): else: raise NotImplementedError(f"Unknown init method {name}") - return _get(args.init_method, use_mup=args.use_mup), _get(args.init_method), _get(args.output_layer_init_method, use_mup=args.use_mup) + return ( + _get(args.init_method, use_mup=args.use_mup), + 
_get(args.init_method), + _get(args.output_layer_init_method, use_mup=args.use_mup), + ) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 8ba004336..79203eae3 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -232,7 +232,7 @@ def __init__( gather_output=not parallel_output, skip_bias_add=False, ) - + self.neox_args = neox_args self.is_last_layer = is_last_layer @@ -259,7 +259,7 @@ def forward(self, hidden_states): _logits, *_args = logits if self.neox_args.use_mup: _logits /= self.neox_args.mup_width_multiplier - _logits *= self.neox_args.mup_output_multiplier + _logits *= self.neox_args.mup_output_multiplier logits = (_logits, *_args) return logits @@ -1132,7 +1132,9 @@ def forward(self, args): return self.norm(args) -def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None, args=None): +def parallel_lm_logits( + input_, word_embeddings_weight, parallel_output, bias=None, args=None +): """LM logits using word embedding weights.""" # Parallel logits. input_parallel = mpu.copy_to_model_parallel_region(input_) @@ -1145,7 +1147,7 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non if args is not None and args.use_mup: logits_parallel /= args.mup_width_multiplier - logits_parallel *= args.mup_output_multiplier + logits_parallel *= args.mup_output_multiplier # Gather if needed. if parallel_output: diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 960c2d956..825edee99 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -30,9 +30,17 @@ def get_params_for_weight_decay_optimization(module, neox_args): Layernorms and biases will have no weight decay but the rest will. """ lr_adjust_weight_decay_params = {"params": [], "lr_adjust": True} - lr_adjust_no_weight_decay_params = {"params": [], "lr_adjust": True, "weight_decay": 0.0} + lr_adjust_no_weight_decay_params = { + "params": [], + "lr_adjust": True, + "weight_decay": 0.0, + } no_lr_adjust_weight_decay_params = {"params": [], "lr_adjust": False} - no_lr_adjust_no_weight_decay_params = {"params": [], "lr_adjust": False, "weight_decay": 0.0} + no_lr_adjust_no_weight_decay_params = { + "params": [], + "lr_adjust": False, + "weight_decay": 0.0, + } for module_ in module.modules(): if neox_args.weight_decay == 0.0: @@ -73,7 +81,7 @@ def get_params_for_weight_decay_optimization(module, neox_args): no_lr_adjust_no_weight_decay_params["params"].extend( [p for p in list(module_._parameters.values()) if p is not None] ) - + elif isinstance(module_, VocabParallelEmbedding): no_lr_adjust_weight_decay_params["params"].extend( [ @@ -110,16 +118,13 @@ def get_params_for_weight_decay_optimization(module, neox_args): # with onebitadam, we want to minimize the calls to compressed_allreduce. Every param group calls it once. # to avoid this, only use a single param group when weight decay is off. 
# return (lr_adjust_no_weight_decay_params, no_lr_adjust_no_weight_decay_params) - return ( - lr_adjust_no_weight_decay_params, - no_lr_adjust_no_weight_decay_params - ) + return (lr_adjust_no_weight_decay_params, no_lr_adjust_no_weight_decay_params) return ( lr_adjust_weight_decay_params, lr_adjust_no_weight_decay_params, no_lr_adjust_weight_decay_params, - no_lr_adjust_no_weight_decay_params - ) + no_lr_adjust_no_weight_decay_params, + ) def exists(x): diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index 04cf55c8d..517646546 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -50,7 +50,9 @@ def __init__( self.hidden_size = hidden_size self.init_method = init_method self.num_tokentypes = num_tokentypes - self.mup_embedding_multiplier = float(neox_args.mup_embedding_multiplier) if neox_args.use_mup else 1 + self.mup_embedding_multiplier = ( + float(neox_args.mup_embedding_multiplier) if neox_args.use_mup else 1 + ) # Word embeddings (parallel). self.word_embeddings = mpu.VocabParallelEmbedding( diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index cd7646be1..e45f3f82a 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -37,7 +37,7 @@ def get_coord_data( for width, model_obj in models.items(): for i in range(nseeds): - torch.manual_seed((i+1)*100000) + torch.manual_seed((i + 1) * 100000) print_rank_0(f">>> Running Model with width: {width} on seed: {i}\n") model, optimizer, lr_scheduler = model_obj() model.train() @@ -53,24 +53,34 @@ def get_coord_data( def word_embedding_coord_check_hook(module, input, output): with torch.no_grad(): - word_embedding_act_abs_std_list.append(output.cpu().abs().std().item()) + word_embedding_act_abs_std_list.append( + output.cpu().abs().std().item() + ) def attn_output_coord_check_hook(module, input, output): with torch.no_grad(): - attn_output_act_abs_std_list.append(output[0].cpu().abs().std().item()) + attn_output_act_abs_std_list.append( + output[0].cpu().abs().std().item() + ) def ffn_output_coord_check_hook(module, input, output): with torch.no_grad(): - ffn_output_act_abs_std_list.append(output[0].cpu().abs().std().item()) + ffn_output_act_abs_std_list.append( + output[0].cpu().abs().std().item() + ) def output_logits_coord_check_hook(module, input, output): with torch.no_grad(): - output_logits_act_abs_std_list.append(output[0].cpu().abs().std().item()) + output_logits_act_abs_std_list.append( + output[0].cpu().abs().std().item() + ) for name, module in model.named_modules(): if name.endswith(".word_embeddings"): remove_hooks.append( - module.register_forward_hook(word_embedding_coord_check_hook) + module.register_forward_hook( + word_embedding_coord_check_hook + ) ) elif name.endswith(".attention.dense"): remove_hooks.append( @@ -117,10 +127,12 @@ def output_logits_coord_check_hook(module, input, output): df["width"].append(width) def del_obj_attrs(obj): - attributes = [attr for attr in vars(obj) if not callable(getattr(obj, attr))] + attributes = [ + attr for attr in vars(obj) if not callable(getattr(obj, attr)) + ] for attr in attributes: try: - delattr(obj,attr) + delattr(obj, attr) except: pass diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index c04d566de..57613f0ab 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1105,12 +1105,12 @@ class NeoXArgsTraining(NeoXArgsTemplate): coord_check_nsteps: int = 10 """ - + Number of steps to do for the 
coordinate check """ coord_check_nseeds: int = 5 """ - + Number of repetition for each size in coordinate check """ save_base_shapes: bool = False diff --git a/megatron/training.py b/megatron/training.py index a1d234636..57b241cee 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -65,16 +65,21 @@ def plot_coord_data(df, graph_name_prefix, use_mup=True): - def _plot_data(df, activation, graph_name_prefix): - df = df.groupby(['step', 'width']).mean().reset_index() + df = df.groupby(["step", "width"]).mean().reset_index() sns.color_palette("magma") sns.lineplot( data=df, - x="width", y=activation, hue="step", errorbar=None, style="step", - marker="o", dashes=False, legend='full' + x="width", + y=activation, + hue="step", + errorbar=None, + style="step", + marker="o", + dashes=False, + legend="full", ) - plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0) + plt.legend(bbox_to_anchor=(1.02, 1), loc="upper left", borderaxespad=0) plt.tight_layout(pad=3.0) plt.xlabel("Width") plt.ylabel("Activation with {}".format("muP" if use_mup else "SP")) @@ -125,10 +130,9 @@ def gen(): models = {} # Hidden size needs to be divisible by num attention heads - for idx, hidden_size in enumerate([2**p for p in range(8,12)]): + for idx, hidden_size in enumerate([2**p for p in range(8, 12)]): models[hidden_size] = lazy_model( - hidden_size, - neox_args.num_attention_heads*(2**idx) + hidden_size, neox_args.num_attention_heads * (2**idx) ) df_mode = "mup" if neox_args.use_mup else "sp" @@ -138,13 +142,21 @@ def gen(): print_rank_0(">>> Coord Check for standard Parameterization") df = get_coord_data( - neox_args, timers, models, train_data_iterator, neox_args.coord_check_nsteps, neox_args.coord_check_nseeds, + neox_args, + timers, + models, + train_data_iterator, + neox_args.coord_check_nsteps, + neox_args.coord_check_nseeds, ) df.to_csv(f"df_{df_mode}.csv", index=False) - plot_coord_data(df, graph_name_prefix=f"coord_check_{df_mode}", use_mup=neox_args.use_mup) + plot_coord_data( + df, graph_name_prefix=f"coord_check_{df_mode}", use_mup=neox_args.use_mup + ) print_rank_0("Saved coord check plots... exiting") return 0 + def pretrain(neox_args): """Main training program. 
@@ -542,6 +554,7 @@ def get_optimizer(model, neox_args): ) elif neox_args.optimizer_type.lower() == "sgd": from torch.optim import SGD + optimizer = SGD( param_groups, weight_decay=neox_args.weight_decay, @@ -606,7 +619,9 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): """Setup model and optimizer.""" if neox_args.mup_width_multiplier is None: - neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base + neox_args.mup_width_multiplier = ( + neox_args.hidden_size / neox_args.mup_d_model_base + ) print_rank_0(f">>> mup_width_multiplier set to {neox_args.mup_width_multiplier}") model = get_model(neox_args=neox_args, use_cache=use_cache) From e8639a03b1f6cb351d9e6c347b108303581e07a7 Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 29 Feb 2024 14:09:55 +0000 Subject: [PATCH 79/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 0649b5436..948f399db 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 4f39209 + Default = 2365fd5 current git hash of repository @@ -1061,7 +1061,7 @@ Text Generation arguments - **prompt_end**: str - Default = + Default = a single prompt's end. Defaults to newline @@ -1103,7 +1103,7 @@ Text Generation arguments - **eval_results_prefix**: str - Default = + Default = prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json @@ -1640,7 +1640,7 @@ Training Arguments Default = 10 - + Number of steps to do for the coordinate check @@ -1648,7 +1648,7 @@ Training Arguments Default = 5 - + Number of repetition for each size in coordinate check @@ -1847,7 +1847,7 @@ Args for deepspeed config Default = None - + @@ -2147,3 +2147,4 @@ Args for deepspeed runner (deepspeed.launcher.runner). Default = None Adds a `--account` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard. 
+ From 47e14389bcc51ce1d2549723b9d2cccee861e95f Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 5 Mar 2024 05:13:34 +0000 Subject: [PATCH 80/94] fix lr weighting --- megatron/learning_rates.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 9e9994049..4ae18d49b 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -98,8 +98,9 @@ def step(self, step_num=None): new_lr = self.get_lr() for group in self.optimizer.param_groups: if self.use_mup and ("lr_adjust" in group) and group["lr_adjust"] is True: - new_lr = new_lr / self.mup_width_multiplier - group["lr"] = new_lr + group["lr"] = new_lr / self.mup_width_multiplier + else: + group["lr"] = new_lr def state_dict(self): state_dict = { From a064f9b98191aa8fd6578f9cc9ee4b3c7cbcc792 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 5 Mar 2024 05:14:44 +0000 Subject: [PATCH 81/94] hard set to 1.0 if neox_args.use_mup is false --- megatron/model/word_embeddings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index 517646546..22ea5989d 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -51,7 +51,7 @@ def __init__( self.init_method = init_method self.num_tokentypes = num_tokentypes self.mup_embedding_multiplier = ( - float(neox_args.mup_embedding_multiplier) if neox_args.use_mup else 1 + float(neox_args.mup_embedding_multiplier) if neox_args.use_mup else 1.0 ) # Word embeddings (parallel). From 6fe55f4b5c6c8ab1d0eacd7abecc254a17ff4c79 Mon Sep 17 00:00:00 2001 From: github-actions Date: Sun, 21 Apr 2024 22:59:13 +0000 Subject: [PATCH 82/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 0268da279..fb48f78aa 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 11a5537 + Default = b0da27a current git hash of repository From 8bf8bcde743931289b88870919802e32685d4d3d Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 04:01:06 +0000 Subject: [PATCH 83/94] add new parameters --- megatron/neox_arguments/neox_args.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 57613f0ab..bdb4c82b3 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1098,6 +1098,29 @@ class NeoXArgsTraining(NeoXArgsTemplate): Whether to use muP """ + mup_save: str = None + """ + Path to save results when using muP + """ + + mup_lr: float = None + """ + An alias parameter for lr, + if not None will override lr + """ + + mup_std: float = None + """ + An alias parameter for init_method_std, + if not None will override init_method_std + """ + + mup_hidden_size: int = None + """ + An alias parameter for hidden_size, + if not None will override hidden_size + """ + coord_check: bool = False """ Whether to generate a "coord check" plot to verify mup's implementation in neox From 7f0b03304ae45779c5637770333ba345057cc653 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 04:13:20 +0000 Subject: [PATCH 84/94] add parameter checks --- megatron/training.py | 63 +++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 15 deletions(-) 
diff --git a/megatron/training.py b/megatron/training.py index 57b241cee..2f94cdc96 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -64,7 +64,7 @@ import matplotlib.pyplot as plt -def plot_coord_data(df, graph_name_prefix, use_mup=True): +def plot_coord_data(df, graph_name_prefix, use_mup=True, save_path=None): def _plot_data(df, activation, graph_name_prefix): df = df.groupby(["step", "width"]).mean().reset_index() sns.color_palette("magma") @@ -84,7 +84,12 @@ def _plot_data(df, activation, graph_name_prefix): plt.xlabel("Width") plt.ylabel("Activation with {}".format("muP" if use_mup else "SP")) plt.title(f"{activation}") - plt.savefig(f"{graph_name_prefix}-{activation}.png") + + file_path = f"{graph_name_prefix}-{activation}.png" + if save_path is not None: + file_path = os.path.join(save_path, file_path) + + plt.savefig(file_path) plt.close() return 0 @@ -110,6 +115,12 @@ def _plot_data(df, activation, graph_name_prefix): def coord_check(neox_args, timers, train_data_iterator): from megatron.mup_substitute import get_coord_data + if neox_args.mup_save is None: + print_rank_0("Must set mup_save") + sys.exit() + else: + os.makedirs(neox_args.mup_save, exist_ok=True) + def lazy_model(hidden_size, attention_head): def gen(): old_hidden_size = neox_args.hidden_size @@ -129,8 +140,8 @@ def gen(): return gen models = {} - # Hidden size needs to be divisible by num attention heads - for idx, hidden_size in enumerate([2**p for p in range(8, 12)]): + # Hidden size needs to be divisible by num attention heads #14 + for idx, hidden_size in enumerate([2**p for p in range(8, 11)]): models[hidden_size] = lazy_model( hidden_size, neox_args.num_attention_heads * (2**idx) ) @@ -149,11 +160,13 @@ def gen(): neox_args.coord_check_nsteps, neox_args.coord_check_nseeds, ) - df.to_csv(f"df_{df_mode}.csv", index=False) - plot_coord_data( - df, graph_name_prefix=f"coord_check_{df_mode}", use_mup=neox_args.use_mup - ) - print_rank_0("Saved coord check plots... exiting") + + if neox_args.mup_save is not None: + plot_coord_data( + df, graph_name_prefix=f"coord_check_{df_mode}", use_mup=neox_args.use_mup, save_path=neox_args.mup_save + ) + print_rank_0("Saved coord check plots... exiting") + return 0 @@ -462,7 +475,13 @@ def get_optimizer(model, neox_args): f"ERROR: Optimizer is None. Either set the optimizer dict in your config (if training) or set no_load_optim in your config (if inference)" ) exit() - # Build parameter groups (weight decay and non-decay). + + if neox_args["lr"] is not None: + neox_args["optimizer"]["params"]["lr"] = neox_args["lr"] + + # Build parameter groups for parameters that + # are affected by weight decay and non-decay or + # have adjustable and non-adjustable learning rate. 
param_groups = get_params_for_weight_decay_optimization(model, neox_args) print_rank_0( f'Configuring Optimizer type: {neox_args.optimizer_type} with params: {neox_args.optimizer["params"]}' @@ -538,7 +557,8 @@ def get_optimizer(model, neox_args): else: try: # default to apex as it's slightly faster - from apex.optimizers import FusedAdam as Adam + # from apex.optimizers import FusedAdam as Adam + from torch.optim import Adam except ImportError: # if apex isn't installed, use deepspeed's FusedAdam print( @@ -618,10 +638,23 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): ) """Setup model and optimizer.""" - if neox_args.mup_width_multiplier is None: - neox_args.mup_width_multiplier = ( - neox_args.hidden_size / neox_args.mup_d_model_base - ) + if neox_args.use_mup: + if neox_args.mup_lr is not None: + neox_args.lr = neox_args.mup_lr + print_rank_0(f"Overriding neox_args.lr with neox_args.mup_lr: {neox_args.mup_lr}") + + if neox_args.mup_std is not None: + neox_args.init_method_std = neox_args.mup_std + print_rank_0(f"Overriding neox_args.init_method_std with neox_args.mup_std: {neox_args.mup_std}") + + if neox_args.mup_hidden_size is not None: + neox_args.hidden_size = neox_args.mup_hidden_size + print_rank_0(f"Overriding neox_args.hidden_size with neox_args.mup_hidden_size: {neox_args.mup_hidden_size}") + + if neox_args.mup_width_multiplier is None: + neox_args.mup_width_multiplier = ( + neox_args.hidden_size / neox_args.mup_d_model_base + ) print_rank_0(f">>> mup_width_multiplier set to {neox_args.mup_width_multiplier}") model = get_model(neox_args=neox_args, use_cache=use_cache) From f8028695b2d0211179c1f610a74bf31eeec96075 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 07:44:33 +0000 Subject: [PATCH 85/94] updates to argument processing for mup --- megatron/neox_arguments/arguments.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 7bca420cd..e9d23125e 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1134,6 +1134,27 @@ def validate_values(self): if not self.deepspeed: return False + if self.use_mup: + if self.mup_d_model_base is None: + logging.info("mup_d_model_base is required when use_mup is True") + return False + + if self.mup_lr is not None: + self.lr = self.mup_lr + logging.info(f"Overriding lr with mup_lr: {self.mup_lr}") + + if self.mup_std is not None: + self.init_method_std = self.mup_std + logging.info(f"Overriding init_method_std with mup_std: {self.mup_std}") + + if self.mup_hidden_size is not None: + self.hidden_size = self.mup_hidden_size + logging.info(f"Overriding hidden_size with mup_hidden_size: {self.mup_hidden_size}") + + if self.mup_width_multiplier is None: + self.mup_width_multiplier = self.hidden_size / self.mup_d_model_base + logging.info(f"Overriding mup_width_multiplier with hidden_size/mup_d_model_base: {self.mup_width_multiplier}") + # learning rate if self.lr is None: error_message = self.__class__.__name__ + ".validate_values() lr is None" From cc711049b807ac4cdc200dbbe748719e54fb4e18 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 07:46:34 +0000 Subject: [PATCH 86/94] add data save and descriptions being printed --- megatron/mup_substitute.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index e45f3f82a..1438e0dba 100644 --- 
a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -32,13 +32,26 @@ def get_coord_data( "output_logits_act_abs_std": [], "width": [], } + + df_mode = "mup" if neox_args.use_mup else "sp" + if neox_args.use_mup: + print_rank_0("muP Coord Check for mu Parameterization") + else: + print_rank_0("muP Coord Check for standard Parameterization") + + _df = None + df_path = os.path.join(neox_args.mup_save, f"df_{df_mode}.csv") + if (neox_args.mup_save is not None) and os.path.exists(df_path): + _df = pd.read_csv(df_path) + with torch.no_grad(): torch.cuda.empty_cache() for width, model_obj in models.items(): for i in range(nseeds): torch.manual_seed((i + 1) * 100000) - print_rank_0(f">>> Running Model with width: {width} on seed: {i}\n") + print_rank_0(f">>> muP Coord Check: mup_width_multiplier set to {neox_args.mup_width_multiplier}") + print_rank_0(f">>> muP Coord Check: Running Model with width: {width} on seed: {i}\n") model, optimizer, lr_scheduler = model_obj() model.train() neox_args.hidden_size = width @@ -150,4 +163,7 @@ def unlink_hp_params(lp_param_list): torch.cuda.empty_cache() deepspeed.runtime.utils.empty_cache() + temp_df = pd.DataFrame(df) + temp_df.to_csv(os.path.join(neox_args.mup_save, f"df_{df_mode}.csv"), index=False) + return pd.DataFrame(df) From c8feb39ed21fc4fb2faa355d1bf6c593d10ef0d6 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 07:54:34 +0000 Subject: [PATCH 87/94] update mup --- configs/coord_check_mup.yml | 51 +++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/configs/coord_check_mup.yml b/configs/coord_check_mup.yml index 77b333a46..1a14d8639 100644 --- a/configs/coord_check_mup.yml +++ b/configs/coord_check_mup.yml @@ -4,22 +4,24 @@ "model_parallel_size": 1, # model settings - "num_layers": 8, - "num_attention_heads": 8, - "seq_length": 128, - "max_position_embeddings": 128, + "num_layers": 2, + "num_attention_heads": 4, + "seq_length": 2048, + "max_position_embeddings": 2048, "pos_emb": "rotary", "rotary_pct": 0.25, "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - # "attention_config": [[["flash"], 8]], - # these should provide some speedup but takes a while to build, set to true if desired "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": true, + # # init methods + # "init_method": "small_init", + # "output_layer_init_method": "wang_init", + # init methods "init_method": "normal", "output_layer_init_method": "scaled_normal", @@ -28,12 +30,12 @@ "optimizer": { "type": "Adam", "params": { - "lr": 0.006, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, - # "min_lr": 0.006, + "lr_decay_style": constant, + "warmup": 0, # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training "zero_optimization": { @@ -48,8 +50,8 @@ }, # batch / data settings - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 32, + "train_micro_batch_size_per_gpu": 8, + "gradient_accumulation_steps": 1, "data_impl": "mmap", "num_workers": 1, @@ -61,7 +63,7 @@ # regularization "gradient_clipping": 1.0, - "weight_decay": 0.0, + "weight_decay": 0.1, "hidden_dropout": 0, "attention_dropout": 0, @@ -72,34 +74,29 @@ # "enabled": true, # "loss_scale": 0, # "loss_scale_window": 1000, - # "initial_scale_power": 12, # "hysteresis": 2, - # "min_loss_scale": 1, + # "min_loss_scale": 1 # }, # misc. 
training settings "train_iters": 10, "log_interval": 1, "distributed_backend": "nccl", - "lr_decay_style": "constant", - "tokenizer_type": "HFTokenizer", - "vocab-file": "/weka/lintangsutawika/09-mup-neox/20B_tokenizer.json", "coord_check": true, "coord_check_nsteps": 10, "coord_check_nseeds": 3, "use_mup": true, - # sigma_base - "init_method_std": 0.08, - # "mup_embedding_multiplier": 5, - # "mup_output_multiplier": 1, - # "mup_width_multiplier": 1, - "mup_d_model_base": 128, - "hidden_size": 128, + # base lr + "mup_lr": 0.01, + # base sigma + "mup_std": 0.06, + # base size + "mup_d_model_base": 256, - "data-path": "/weka/lintangsutawika/09-mup-neox/data/enwik8/enwik8_text_document", - - # "launcher": "slurm", - # "deepspeed_slurm": true, + "tokenizer_type": "HFTokenizer", + "vocab-file": "/mnt/ssd-1/lintang/09-mup-neox/20B_tokenizer.json", + "data-path": "/mnt/ssd-1/lintang/09-mup-neox/data/enwik8/enwik8_text_document", + "mup_save": "/mnt/ssd-1/lintang/09-mup-neox/mup_results", } From b6b3a02e52d9ed185574a0e7de85d9d6ce695db2 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 07:55:05 +0000 Subject: [PATCH 88/94] update seed --- megatron/mup_substitute.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 1438e0dba..ecac73fba 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -49,9 +49,10 @@ def get_coord_data( for width, model_obj in models.items(): for i in range(nseeds): - torch.manual_seed((i + 1) * 100000) + seed = (i + 1) * 100000 + torch.manual_seed(seed) print_rank_0(f">>> muP Coord Check: mup_width_multiplier set to {neox_args.mup_width_multiplier}") - print_rank_0(f">>> muP Coord Check: Running Model with width: {width} on seed: {i}\n") + print_rank_0(f">>> muP Coord Check: Running Model with width: {width} on seed: {seed}\n") model, optimizer, lr_scheduler = model_obj() model.train() neox_args.hidden_size = width From 847e8925197ce0842d65af0fa56c2018fcf6258a Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 07:55:41 +0000 Subject: [PATCH 89/94] remove print text --- megatron/training.py | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 2f94cdc96..be97de8cb 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -121,20 +121,26 @@ def coord_check(neox_args, timers, train_data_iterator): else: os.makedirs(neox_args.mup_save, exist_ok=True) - def lazy_model(hidden_size, attention_head): + def lazy_model(hidden_size, attention_head, d_model_base=2**8): def gen(): old_hidden_size = neox_args.hidden_size old_num_attention_heads = neox_args.num_attention_heads + old_mup_d_model_base = neox_args.mup_d_model_base + old_mup_width_multiplier = neox_args.mup_width_multiplier + neox_args.hidden_size = hidden_size neox_args.num_attention_heads = attention_head - neox_args.mup_width_multiplier = None - neox_args.mup_d_model_base = 2**8 + neox_args.mup_d_model_base = d_model_base + neox_args.mup_width_multiplier = hidden_size / neox_args.mup_d_model_base + model, optimizer, lr_scheduler = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) neox_args.hidden_size = old_hidden_size neox_args.num_attention_heads = old_num_attention_heads + neox_args.mup_d_model_base = old_mup_d_model_base + neox_args.mup_width_multiplier = old_mup_width_multiplier return model, optimizer, lr_scheduler return gen @@ -147,10 +153,6 @@ def gen(): ) df_mode = 
"mup" if neox_args.use_mup else "sp" - if neox_args.use_mup: - print_rank_0(">>> Coord Check for mu Parameterization") - else: - print_rank_0(">>> Coord Check for standard Parameterization") df = get_coord_data( neox_args, @@ -476,8 +478,8 @@ def get_optimizer(model, neox_args): ) exit() - if neox_args["lr"] is not None: - neox_args["optimizer"]["params"]["lr"] = neox_args["lr"] + if neox_args.lr is not None: + neox_args.optimizer["params"]["lr"] = neox_args.lr # Build parameter groups for parameters that # are affected by weight decay and non-decay or @@ -638,25 +640,6 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): ) """Setup model and optimizer.""" - if neox_args.use_mup: - if neox_args.mup_lr is not None: - neox_args.lr = neox_args.mup_lr - print_rank_0(f"Overriding neox_args.lr with neox_args.mup_lr: {neox_args.mup_lr}") - - if neox_args.mup_std is not None: - neox_args.init_method_std = neox_args.mup_std - print_rank_0(f"Overriding neox_args.init_method_std with neox_args.mup_std: {neox_args.mup_std}") - - if neox_args.mup_hidden_size is not None: - neox_args.hidden_size = neox_args.mup_hidden_size - print_rank_0(f"Overriding neox_args.hidden_size with neox_args.mup_hidden_size: {neox_args.mup_hidden_size}") - - if neox_args.mup_width_multiplier is None: - neox_args.mup_width_multiplier = ( - neox_args.hidden_size / neox_args.mup_d_model_base - ) - print_rank_0(f">>> mup_width_multiplier set to {neox_args.mup_width_multiplier}") - model = get_model(neox_args=neox_args, use_cache=use_cache) optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) From 1b0027cdc11bc4e11be77e783bd923090592d1af Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 13:35:30 +0000 Subject: [PATCH 90/94] fixed kv --- megatron/model/transformer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 79203eae3..5e5e149d4 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -359,8 +359,7 @@ def __init__( coeff = None if neox_args.use_mup: - # self.norm_factor = self.hidden_size_per_attention_head - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + self.norm_factor = self.hidden_size_per_attention_head else: self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) From 055596f414f367006405a90dfc11a0b1f9073bd6 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 16:28:32 +0000 Subject: [PATCH 91/94] update --- configs/coord_check_mup.yml | 7 ++--- configs/coord_check_sp.yml | 54 +++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 32 deletions(-) diff --git a/configs/coord_check_mup.yml b/configs/coord_check_mup.yml index 1a14d8639..a09090029 100644 --- a/configs/coord_check_mup.yml +++ b/configs/coord_check_mup.yml @@ -84,15 +84,16 @@ "distributed_backend": "nccl", "coord_check": true, - "coord_check_nsteps": 10, - "coord_check_nseeds": 3, + "coord_check_nsteps": 5, + "coord_check_nseeds": 1, "use_mup": true, # base lr "mup_lr": 0.01, # base sigma - "mup_std": 0.06, + "mup_std": 0.08, # base size "mup_d_model_base": 256, + "mup_hidden_size": 256, "tokenizer_type": "HFTokenizer", "vocab-file": "/mnt/ssd-1/lintang/09-mup-neox/20B_tokenizer.json", diff --git a/configs/coord_check_sp.yml b/configs/coord_check_sp.yml index ad7ef2246..66573892d 100644 --- a/configs/coord_check_sp.yml +++ b/configs/coord_check_sp.yml 
@@ -4,22 +4,24 @@ "model_parallel_size": 1, # model settings - "num_layers": 8, - "num_attention_heads": 8, - "seq_length": 128, - "max_position_embeddings": 128, + "num_layers": 2, + "num_attention_heads": 4, + "seq_length": 2048, + "max_position_embeddings": 2048, "pos_emb": "rotary", "rotary_pct": 0.25, "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - # "attention_config": [[["flash"], 8]], - # these should provide some speedup but takes a while to build, set to true if desired "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": true, + # # init methods + # "init_method": "small_init", + # "output_layer_init_method": "wang_init", + # init methods "init_method": "normal", "output_layer_init_method": "scaled_normal", @@ -28,12 +30,13 @@ "optimizer": { "type": "Adam", "params": { - "lr": 0.006, + "lr": 0.01, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, - # "min_lr": 0.006, + "lr_decay_style": constant, + "warmup": 0, # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training "zero_optimization": { @@ -48,8 +51,8 @@ }, # batch / data settings - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 32, + "train_micro_batch_size_per_gpu": 8, + "gradient_accumulation_steps": 1, "data_impl": "mmap", "num_workers": 1, @@ -61,7 +64,7 @@ # regularization "gradient_clipping": 1.0, - "weight_decay": 0.0, + "weight_decay": 0.1, "hidden_dropout": 0, "attention_dropout": 0, @@ -72,34 +75,27 @@ # "enabled": true, # "loss_scale": 0, # "loss_scale_window": 1000, - # "initial_scale_power": 12, # "hysteresis": 2, - # "min_loss_scale": 1, + # "min_loss_scale": 1 # }, # misc. training settings "train_iters": 10, "log_interval": 1, "distributed_backend": "nccl", - "lr_decay_style": "constant", - "tokenizer_type": "HFTokenizer", - "vocab-file": "/weka/lintangsutawika/09-mup-neox/20B_tokenizer.json", "coord_check": true, - "coord_check_nsteps": 10, - "coord_check_nseeds": 3, - "use_mup": false, - # sigma_base + "coord_check_nsteps": 5, + "coord_check_nseeds": 1, + # "use_mup": true, + # base sigma "init_method_std": 0.08, - # "mup_embedding_multiplier": 5, - # "mup_output_multiplier": 1, - # "mup_width_multiplier": 1, - "mup_d_model_base": 128, - "hidden_size": 128, + # base size + "hidden_size": 256, - "data-path": "/weka/lintangsutawika/09-mup-neox/data/enwik8/enwik8_text_document", - - # "launcher": "slurm", - # "deepspeed_slurm": true, + "tokenizer_type": "HFTokenizer", + "vocab-file": "/mnt/ssd-1/lintang/09-mup-neox/20B_tokenizer.json", + "data-path": "/mnt/ssd-1/lintang/09-mup-neox/data/enwik8/enwik8_text_document", + "mup_save": "/mnt/ssd-1/lintang/09-mup-neox/mup_results", } From fabb45ba1f686d89bf9ca8df64e7f7b94c4a46d8 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 16:35:04 +0000 Subject: [PATCH 92/94] update dewcriptions being printed --- megatron/mup_substitute.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index ecac73fba..9770c0765 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -51,12 +51,11 @@ def get_coord_data( for i in range(nseeds): seed = (i + 1) * 100000 torch.manual_seed(seed) - print_rank_0(f">>> muP Coord Check: mup_width_multiplier set to {neox_args.mup_width_multiplier}") - print_rank_0(f">>> muP Coord Check: Running Model with width: {width} on seed: {seed}\n") + model, optimizer, lr_scheduler = model_obj() model.train() - 
neox_args.hidden_size = width - + print_rank_0(f">>> muP Coord Check: Running Model with width: {width} on seed: {seed}") + print_rank_0(f">>> muP Coord Check: mup_width_multiplier set to {model.neox_args.mup_width_multiplier}") for step in range(nsteps + 1): word_embedding_act_abs_std_list = [] From 5ccf693092bd4354c97d4536d9d4019fd0db66a2 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 16:35:48 +0000 Subject: [PATCH 93/94] removed unused lines --- megatron/model/init_functions.py | 10 ++-------- megatron/training.py | 13 ++----------- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 2f85e4517..4387f8829 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -115,26 +115,20 @@ def init_(tensor, mup_width_multiplier=mup_width_multiplier): def small_init_init_method(dim, mup_width_multiplier=1.0): """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010), using a normal distribution.""" - std = math.sqrt(2 / (5 * dim)) + std = math.sqrt(2 / (5 * dim)) / math.sqrt(args.mup_width_multiplier) def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) - if mup_width_multiplier != 1: - with torch.no_grad(): - init_weight.div_(math.sqrt(mup_width_multiplier)) return init_weight return init_ def wang_init_method(n_layers, dim, mup_width_multiplier=1.0): - std = 2 / n_layers / math.sqrt(dim) + std = 2 / n_layers / math.sqrt(dim) / math.sqrt(args.mup_width_multiplier) def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) - if mup_width_multiplier != 1: - with torch.no_grad(): - init_weight.div_(math.sqrt(mup_width_multiplier)) return init_weight return init_ diff --git a/megatron/training.py b/megatron/training.py index be97de8cb..6ef9f1afb 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -123,24 +123,16 @@ def coord_check(neox_args, timers, train_data_iterator): def lazy_model(hidden_size, attention_head, d_model_base=2**8): def gen(): - old_hidden_size = neox_args.hidden_size - old_num_attention_heads = neox_args.num_attention_heads - old_mup_d_model_base = neox_args.mup_d_model_base - old_mup_width_multiplier = neox_args.mup_width_multiplier neox_args.hidden_size = hidden_size neox_args.num_attention_heads = attention_head neox_args.mup_d_model_base = d_model_base - neox_args.mup_width_multiplier = hidden_size / neox_args.mup_d_model_base + neox_args.mup_width_multiplier = hidden_size / d_model_base model, optimizer, lr_scheduler = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) - neox_args.hidden_size = old_hidden_size - neox_args.num_attention_heads = old_num_attention_heads - neox_args.mup_d_model_base = old_mup_d_model_base - neox_args.mup_width_multiplier = old_mup_width_multiplier return model, optimizer, lr_scheduler return gen @@ -559,8 +551,7 @@ def get_optimizer(model, neox_args): else: try: # default to apex as it's slightly faster - # from apex.optimizers import FusedAdam as Adam - from torch.optim import Adam + from apex.optimizers import FusedAdam as Adam except ImportError: # if apex isn't installed, use deepspeed's FusedAdam print( From 485cad4c320fe7eaddf992083ec3bbc15bf713b7 Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 2 May 2024 16:38:56 +0000 
Subject: [PATCH 94/94] Update NeoXArgs docs automatically

---
 configs/neox_arguments.md | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index df0c97096..0ebb1b063 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

 - **git_hash**: str

-    Default = 4e37645
+    Default = 6a8ad71

     current git hash of repository

@@ -1785,6 +1785,41 @@ Training Arguments



+- **mup_save**: str
+
+    Default = None
+
+    Path to save results when using muP
+
+
+
+- **mup_lr**: float
+
+    Default = None
+
+    An alias parameter for lr,
+    if not None will override lr
+
+
+
+- **mup_std**: float
+
+    Default = None
+
+    An alias parameter for init_method_std,
+    if not None will override init_method_std
+
+
+
+- **mup_hidden_size**: int
+
+    Default = None
+
+    An alias parameter for hidden_size,
+    if not None will override hidden_size
+
+
+
 - **coord_check**: bool

     Default = False
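
Note on the alias arguments documented above: they are resolved once during argument validation, before the model is built. A condensed sketch of that override logic, mirroring the NeoXArgs.validate_values() changes earlier in this series (here `args` is just a stand-in for the NeoXArgs instance, not the actual class):

def resolve_mup_aliases(args):
    # Sketch of the muP alias handling added to argument validation.
    if not args.use_mup:
        return args
    if args.mup_lr is not None:
        args.lr = args.mup_lr                    # base learning rate at the proxy width
    if args.mup_std is not None:
        args.init_method_std = args.mup_std      # base init sigma
    if args.mup_hidden_size is not None:
        args.hidden_size = args.mup_hidden_size  # override the model width
    if args.mup_width_multiplier is None:
        # m_width = d_model / d_model_base
        args.mup_width_multiplier = args.hidden_size / args.mup_d_model_base
    return args

The intent is that a muP sweep only needs to vary mup_lr, mup_std and mup_hidden_size while the rest of the config stays fixed.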
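
For reference, the muP rules these patches wire into the model reduce to: hidden-weight init std divided by sqrt(m_width), attention scores scaled by 1/d_head instead of 1/sqrt(d_head), output logits divided by m_width (times an optional m_output), an optional constant multiplier on the embedding output, and a learning rate of lr/m_width for parameter groups tagged lr_adjust=True. A minimal self-contained sketch of those rules follows; the helper names are illustrative and not the NeoX API:

import math

import torch
import torch.nn as nn


def mup_init_std(sigma_base: float, width_multiplier: float) -> float:
    # Hidden and output weights are drawn from N(0, sigma_base / sqrt(m_width)).
    return sigma_base / math.sqrt(width_multiplier)


def attention_scores(q: torch.Tensor, k: torch.Tensor, d_head: int, use_mup: bool) -> torch.Tensor:
    # muP replaces the usual 1/sqrt(d_head) softmax scaling with 1/d_head.
    norm_factor = d_head if use_mup else math.sqrt(d_head)
    return torch.matmul(q, k.transpose(-1, -2)) / norm_factor


def scaled_logits(hidden: torch.Tensor, unembed: nn.Linear,
                  width_multiplier: float, output_multiplier: float = 1.0) -> torch.Tensor:
    # Output logits are divided by m_width (and optionally rescaled by m_output).
    return unembed(hidden) * output_multiplier / width_multiplier


def apply_mup_lr(optimizer: torch.optim.Optimizer, base_lr: float, width_multiplier: float) -> None:
    # Param groups tagged lr_adjust=True (hidden weights) train at lr / m_width;
    # embeddings, biases and norms keep the base learning rate.
    for group in optimizer.param_groups:
        adjust = group.get("lr_adjust", False)
        group["lr"] = base_lr / width_multiplier if adjust else base_lr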
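
The coordinate check driven by coord_check()/get_coord_data() above follows the same recipe as this toy version: train models of increasing width for a few steps, record the absolute standard deviation of chosen activations through forward hooks, and compare the curves across widths; under muP they should stay roughly flat, while under SP they drift upward with width. The MLP, data and hyperparameters below are stand-ins, not the NeoX setup:

import pandas as pd
import torch
import torch.nn as nn


def toy_coord_check(widths=(256, 512, 1024), nsteps=5, nseeds=2,
                    use_mup=True, d_model_base=256, base_lr=1e-2):
    records = []
    for width in widths:
        m_width = width / d_model_base
        for seed in range(nseeds):
            torch.manual_seed((seed + 1) * 100000)
            model = nn.Sequential(nn.Linear(64, width), nn.GELU(), nn.Linear(width, 64))
            # Toy stand-in for the muP learning-rate rule: treat the whole model as "hidden".
            lr = base_lr / m_width if use_mup else base_lr
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)

            acts = []
            # Record the scale of the wide layer's output (fan_in grows with width).
            hook = model[2].register_forward_hook(
                lambda module, inputs, output: acts.append(output.detach().abs().std().item())
            )
            for step in range(nsteps):
                x = torch.randn(32, 64)
                loss = model(x).pow(2).mean()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                records.append({"width": width, "seed": seed, "step": step,
                                "output_act_abs_std": acts[-1]})
            hook.remove()
    return pd.DataFrame(records)


if __name__ == "__main__":
    df = toy_coord_check()
    # Rows that stay roughly constant across widths (per step) indicate a working muP setup.
    print(df.groupby(["step", "width"])["output_act_abs_std"].mean().unstack())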