From 0d921f79e5c1f73f5e4c9418afcf5de05196f1e4 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 03:02:02 +0000 Subject: [PATCH 01/94] changed ordering for setting up norm_factor --- megatron/model/transformer.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 63f4122e2..eeb141fa1 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -295,14 +295,14 @@ def __init__( bias=neox_args.use_bias_in_attn_linear, ) - coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - if self.apply_query_key_layer_scaling: - coeff = max(1, self.layer_number) - self.norm_factor *= coeff - if neox_args.use_mup: self.norm_factor = self.hidden_size_per_attention_head + else: + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = max(1, self.layer_number) + self.norm_factor *= coeff self.rpe = rpe @@ -956,6 +956,12 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non else: logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) + + # if self.neox_args.use_mup: + # # Since we're using pipeline parallelism, we can't directly use MuReadout. Instead, use this workaround that does the same thing as MuReadout. + # # https://github.com/microsoft/mup/issues/6#issuecomment-1082156274 + # logits_parallel /= self.tied_modules.embed.word_embeddings.weight.infshape.width_mult() + # Gather if needed. if parallel_output: return logits_parallel From abee54daef5a0ca7e27a7f143ca8d93111dea54c Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 1 Dec 2023 03:02:58 +0000 Subject: [PATCH 02/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index bc2e8fc57..aa7b72d29 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 2da1083 + Default = 0d921f7 current git hash of repository From a08c3efbf1688e9e46ea654b2f0a8195a0ae404e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 03:49:53 +0000 Subject: [PATCH 03/94] updated muP args to the minimum required --- megatron/model/gpt2_model.py | 12 +++----- megatron/model/init_functions.py | 43 ++++++---------------------- megatron/model/transformer.py | 9 ++---- megatron/neox_arguments/neox_args.py | 33 +++++++-------------- 4 files changed, 26 insertions(+), 71 deletions(-) diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 2725614cd..5fd70c49f 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -119,6 +119,9 @@ def __init__( self.init_method, self.output_layer_init_method = get_init_methods( self.neox_args ) + self.init_method, self.output_layer_init_method = get_init_methods( + self.neox_args + ) self.__topology__ = topology self.specs = [] @@ -268,16 +271,9 @@ def init_specs(self): def _logits_helper(embedding, lm_output): """Just a wrapper to massage inputs/outputs from pipeline.""" - if self.neox_args.use_mup: - # Since we're using pipeline parallelism, we can't directly use MuReadout. Instead, use this workaround that does the same thing as MuReadout. 
- # https://github.com/microsoft/mup/issues/6#issuecomment-1082156274 - lm_output = ( - lm_output - / self.tied_modules.embed.word_embeddings.weight.infshape.width_mult() - ) logits = parallel_lm_logits( - lm_output, embedding.word_embeddings_weight, self.parallel_output + lm_output, embedding.word_embeddings_weight, self.parallel_output, self.neox_args ) return logits diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 11bcdc310..ff4c36b53 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -16,41 +16,22 @@ import torch -try: - import mup -except ImportError: - pass - -def init_method_normal(sigma, use_mup_outer=False, mup_init_scale=1.0): +def init_method_normal(sigma): """Init method based on N(0, sigma).""" - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.normal_(tensor, mean=0.0, std=sigma) - with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) return init_ -def scaled_init_method_normal( - sigma, num_layers, use_mup_outer=False, mup_init_scale=1.0 -): +def scaled_init_method_normal(sigma, num_layers): """Init method based on N(0, sigma/sqrt(2*num_layers).""" std = sigma / math.sqrt(2.0 * num_layers) - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.normal_(tensor, mean=0.0, std=std) - with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.normal_(tensor, mean=0.0, std=std) + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) return init_ @@ -169,21 +150,15 @@ def init_(tensor, use_mup=use_mup_outer): def get_init_methods(args): - if args.use_mup: - try: - import mup - except ModuleNotFoundError: - print("Please install mup https://github.com/microsoft/mup") - raise Exception - def _get(name): if name == "normal": return init_method_normal( - args.init_method_std, args.use_mup, args.mup_init_scale + sigma=args.init_method_std*args.mup_init_scale ) elif name == "scaled_normal": return scaled_init_method_normal( - args.init_method_std, args.num_layers, args.use_mup, args.mup_init_scale + sigma=args.init_method_std*args.mup_init_scale, + num_layers=args.num_layers ) elif name == "orthogonal": return orthogonal_init_method(args.use_mup, args.mup_init_scale) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index eeb141fa1..0785561cb 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -945,7 +945,7 @@ def forward(self, args): return self.norm(args) -def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None): +def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None, args=None): """LM logits using word embedding weights.""" # Parallel logits. input_parallel = mpu.copy_to_model_parallel_region(input_) @@ -956,11 +956,8 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non else: logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) - - # if self.neox_args.use_mup: - # # Since we're using pipeline parallelism, we can't directly use MuReadout. Instead, use this workaround that does the same thing as MuReadout. 
- # # https://github.com/microsoft/mup/issues/6#issuecomment-1082156274 - # logits_parallel /= self.tied_modules.embed.word_embeddings.weight.infshape.width_mult() + if args is not None and args.use_mup: + logits_parallel *= args.mup_output_logit_multiplier # Gather if needed. if parallel_output: diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 957960832..58780881b 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -263,6 +263,7 @@ class NeoXArgsModel(NeoXArgsTemplate): init_method_std: float = 0.02 """ Standard deviation of the zero mean normal distribution used for weight initialization. + When using muP this is the base std """ apply_query_key_layer_scaling: bool = False @@ -427,6 +428,7 @@ class NeoXArgsOptimizer(NeoXArgsTemplate): lr: float = None """ Max Learning rate during training + When using muP, this is the base learning rate """ @@ -1015,7 +1017,7 @@ class NeoXArgsTraining(NeoXArgsTemplate): use_mup: bool = False """ - Whether to use Microsoft's Mup https://github.com/microsoft/mup + Whether to use muP """ coord_check: bool = False @@ -1033,35 +1035,20 @@ class NeoXArgsTraining(NeoXArgsTemplate): Path to the base shapes to save to/load from """ - mup_init_scale: float = 1.0 + mup_emb: int = 1 """ - Initialization scale: All the parameters are multiplied by this value + Embedding output multiplier """ - mup_attn_temp: float = 1.0 + mup_m_width: int = 1 """ - Attention temperature: Reciprocal of the multiplier applied to the input to attention softmax + Manually set the layer width multiplier (d_model/d_model,base) """ - mup_output_temp: float = 1.0 + mup_d_model_base: int = 64 """ - Output temperature: Reciprocal of the multiplier applied to the input to softmax that - produces the distribution over output tokens. 
- """ - - mup_embedding_mult: float = 1.0 - """ - Scalar by which we multiply the output of the embedding layer - """ - - mup_rp_embedding_mult: float = 1.0 - """ - Scalar by which we multiply vectors representing relative position - """ - - mup_width_scale: int = 2 - """ - What to scale width by when creating the delta model for mup + d_model,base + Proxy (base) model's layer width """ From c35e8309a6f5b1e73f8d1dd888c23c481011b818 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 03:55:29 +0000 Subject: [PATCH 04/94] calculate m_width --- megatron/training.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index ed9c0bcd0..0dea5ab17 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -439,11 +439,9 @@ def get_model(neox_args, use_cache=False): neox_args.use_mup = old_use_mup if neox_args.use_mup: - try: - import mup - except ModuleNotFoundError: - print("Please install mup https://github.com/microsoft/mup") - raise Exception + + if neox_args.mup_m_width == 1: + neox_args.mup_m_width = neox_args.hidden_size / neox_args.mup_d_model_base base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" From 81fdc4d1f7b7558aa55c97ad9adc04cd2e7bf693 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 1 Dec 2023 09:30:44 +0000 Subject: [PATCH 05/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 50 +++++++++++---------------------------- 1 file changed, 14 insertions(+), 36 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index aa7b72d29..93c0328fe 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 0d921f7 + Default = 2d127df current git hash of repository @@ -452,6 +452,7 @@ Model Arguments Default = 0.02 Standard deviation of the zero mean normal distribution used for weight initialization. + When using muP this is the base std @@ -663,6 +664,7 @@ Optimizer Arguments Default = None Max Learning rate during training + When using muP, this is the base learning rate @@ -1521,7 +1523,7 @@ Training Arguments Default = False - Whether to use Microsoft's Mup https://github.com/microsoft/mup + Whether to use muP @@ -1549,52 +1551,28 @@ Training Arguments -- **mup_init_scale**: float +- **mup_emb**: int - Default = 1.0 - - Initialization scale: All the parameters are multiplied by this value - - - -- **mup_attn_temp**: float - - Default = 1.0 - - Attention temperature: Reciprocal of the multiplier applied to the input to attention softmax - - - -- **mup_output_temp**: float - - Default = 1.0 - - Output temperature: Reciprocal of the multiplier applied to the input to softmax that - produces the distribution over output tokens. 
- - - -- **mup_embedding_mult**: float - - Default = 1.0 + Default = 1 - Scalar by which we multiply the output of the embedding layer + Embedding output multiplier -- **mup_rp_embedding_mult**: float +- **mup_m_width**: int - Default = 1.0 + Default = 1 - Scalar by which we multiply vectors representing relative position + Manually set the layer width multiplier (d_model/d_model,base) -- **mup_width_scale**: int +- **mup_d_model_base**: int - Default = 2 + Default = 64 - What to scale width by when creating the delta model for mup + d_model,base + Proxy (base) model's layer width From 7d6b2468b5634157a90550d85a55605728a80918 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 03:02:02 +0000 Subject: [PATCH 06/94] changed ordering for setting up norm_factor --- megatron/model/transformer.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 63f4122e2..eeb141fa1 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -295,14 +295,14 @@ def __init__( bias=neox_args.use_bias_in_attn_linear, ) - coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - if self.apply_query_key_layer_scaling: - coeff = max(1, self.layer_number) - self.norm_factor *= coeff - if neox_args.use_mup: self.norm_factor = self.hidden_size_per_attention_head + else: + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = max(1, self.layer_number) + self.norm_factor *= coeff self.rpe = rpe @@ -956,6 +956,12 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non else: logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) + + # if self.neox_args.use_mup: + # # Since we're using pipeline parallelism, we can't directly use MuReadout. Instead, use this workaround that does the same thing as MuReadout. + # # https://github.com/microsoft/mup/issues/6#issuecomment-1082156274 + # logits_parallel /= self.tied_modules.embed.word_embeddings.weight.infshape.width_mult() + # Gather if needed. if parallel_output: return logits_parallel From a0d1929aa99b070b3c3f6b5a7c921d469a2ad08a Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 03:49:53 +0000 Subject: [PATCH 07/94] updated muP args to the minimum required --- megatron/model/gpt2_model.py | 12 +++----- megatron/model/init_functions.py | 43 ++++++---------------------- megatron/model/transformer.py | 9 ++---- megatron/neox_arguments/neox_args.py | 33 +++++++-------------- 4 files changed, 26 insertions(+), 71 deletions(-) diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 2725614cd..5fd70c49f 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -119,6 +119,9 @@ def __init__( self.init_method, self.output_layer_init_method = get_init_methods( self.neox_args ) + self.init_method, self.output_layer_init_method = get_init_methods( + self.neox_args + ) self.__topology__ = topology self.specs = [] @@ -268,16 +271,9 @@ def init_specs(self): def _logits_helper(embedding, lm_output): """Just a wrapper to massage inputs/outputs from pipeline.""" - if self.neox_args.use_mup: - # Since we're using pipeline parallelism, we can't directly use MuReadout. Instead, use this workaround that does the same thing as MuReadout. 
- # https://github.com/microsoft/mup/issues/6#issuecomment-1082156274 - lm_output = ( - lm_output - / self.tied_modules.embed.word_embeddings.weight.infshape.width_mult() - ) logits = parallel_lm_logits( - lm_output, embedding.word_embeddings_weight, self.parallel_output + lm_output, embedding.word_embeddings_weight, self.parallel_output, self.neox_args ) return logits diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 11bcdc310..ff4c36b53 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -16,41 +16,22 @@ import torch -try: - import mup -except ImportError: - pass - -def init_method_normal(sigma, use_mup_outer=False, mup_init_scale=1.0): +def init_method_normal(sigma): """Init method based on N(0, sigma).""" - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.normal_(tensor, mean=0.0, std=sigma) - with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) return init_ -def scaled_init_method_normal( - sigma, num_layers, use_mup_outer=False, mup_init_scale=1.0 -): +def scaled_init_method_normal(sigma, num_layers): """Init method based on N(0, sigma/sqrt(2*num_layers).""" std = sigma / math.sqrt(2.0 * num_layers) - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.normal_(tensor, mean=0.0, std=std) - with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.normal_(tensor, mean=0.0, std=std) + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) return init_ @@ -169,21 +150,15 @@ def init_(tensor, use_mup=use_mup_outer): def get_init_methods(args): - if args.use_mup: - try: - import mup - except ModuleNotFoundError: - print("Please install mup https://github.com/microsoft/mup") - raise Exception - def _get(name): if name == "normal": return init_method_normal( - args.init_method_std, args.use_mup, args.mup_init_scale + sigma=args.init_method_std*args.mup_init_scale ) elif name == "scaled_normal": return scaled_init_method_normal( - args.init_method_std, args.num_layers, args.use_mup, args.mup_init_scale + sigma=args.init_method_std*args.mup_init_scale, + num_layers=args.num_layers ) elif name == "orthogonal": return orthogonal_init_method(args.use_mup, args.mup_init_scale) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index eeb141fa1..0785561cb 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -945,7 +945,7 @@ def forward(self, args): return self.norm(args) -def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None): +def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None, args=None): """LM logits using word embedding weights.""" # Parallel logits. input_parallel = mpu.copy_to_model_parallel_region(input_) @@ -956,11 +956,8 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non else: logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) - - # if self.neox_args.use_mup: - # # Since we're using pipeline parallelism, we can't directly use MuReadout. Instead, use this workaround that does the same thing as MuReadout. 
- # # https://github.com/microsoft/mup/issues/6#issuecomment-1082156274 - # logits_parallel /= self.tied_modules.embed.word_embeddings.weight.infshape.width_mult() + if args is not None and args.use_mup: + logits_parallel *= args.mup_output_logit_multiplier # Gather if needed. if parallel_output: diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 957960832..58780881b 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -263,6 +263,7 @@ class NeoXArgsModel(NeoXArgsTemplate): init_method_std: float = 0.02 """ Standard deviation of the zero mean normal distribution used for weight initialization. + When using muP this is the base std """ apply_query_key_layer_scaling: bool = False @@ -427,6 +428,7 @@ class NeoXArgsOptimizer(NeoXArgsTemplate): lr: float = None """ Max Learning rate during training + When using muP, this is the base learning rate """ @@ -1015,7 +1017,7 @@ class NeoXArgsTraining(NeoXArgsTemplate): use_mup: bool = False """ - Whether to use Microsoft's Mup https://github.com/microsoft/mup + Whether to use muP """ coord_check: bool = False @@ -1033,35 +1035,20 @@ class NeoXArgsTraining(NeoXArgsTemplate): Path to the base shapes to save to/load from """ - mup_init_scale: float = 1.0 + mup_emb: int = 1 """ - Initialization scale: All the parameters are multiplied by this value + Embedding output multiplier """ - mup_attn_temp: float = 1.0 + mup_m_width: int = 1 """ - Attention temperature: Reciprocal of the multiplier applied to the input to attention softmax + Manually set the layer width multiplier (d_model/d_model,base) """ - mup_output_temp: float = 1.0 + mup_d_model_base: int = 64 """ - Output temperature: Reciprocal of the multiplier applied to the input to softmax that - produces the distribution over output tokens. 
- """ - - mup_embedding_mult: float = 1.0 - """ - Scalar by which we multiply the output of the embedding layer - """ - - mup_rp_embedding_mult: float = 1.0 - """ - Scalar by which we multiply vectors representing relative position - """ - - mup_width_scale: int = 2 - """ - What to scale width by when creating the delta model for mup + d_model,base + Proxy (base) model's layer width """ From d63b3b85014b8dbdf72df5c4962400d62db947da Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 03:55:29 +0000 Subject: [PATCH 08/94] calculate m_width --- megatron/training.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index ed9c0bcd0..0dea5ab17 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -439,11 +439,9 @@ def get_model(neox_args, use_cache=False): neox_args.use_mup = old_use_mup if neox_args.use_mup: - try: - import mup - except ModuleNotFoundError: - print("Please install mup https://github.com/microsoft/mup") - raise Exception + + if neox_args.mup_m_width == 1: + neox_args.mup_m_width = neox_args.hidden_size / neox_args.mup_d_model_base base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" From 9be82fed4d761111091b78c261cd3a9ed8a25506 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 1 Dec 2023 03:02:58 +0000 Subject: [PATCH 09/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index bc2e8fc57..aa7b72d29 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 2da1083 + Default = 0d921f7 current git hash of repository From 66214d949b09e3dbc1557bd993715a5f49b4daaf Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 09:31:32 +0000 Subject: [PATCH 10/94] removed redundant line --- megatron/model/gpt2_model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 5fd70c49f..012711a62 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -119,9 +119,6 @@ def __init__( self.init_method, self.output_layer_init_method = get_init_methods( self.neox_args ) - self.init_method, self.output_layer_init_method = get_init_methods( - self.neox_args - ) self.__topology__ = topology self.specs = [] From a6bad07f39d3364f74c39da3d844d61d722ae729 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 1 Dec 2023 09:34:08 +0000 Subject: [PATCH 11/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 93c0328fe..236226fdb 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 2d127df + Default = 17b7183 current git hash of repository From 63984bdfcac7e296439046b3b80b9a6c8501766d Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 09:34:36 +0000 Subject: [PATCH 12/94] removed redundant lines --- megatron/model/gpt2_model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 5fd70c49f..012711a62 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -119,9 +119,6 @@ def __init__( self.init_method, self.output_layer_init_method = 
get_init_methods( self.neox_args ) - self.init_method, self.output_layer_init_method = get_init_methods( - self.neox_args - ) self.__topology__ = topology self.specs = [] From 11114e27958ed76b9d4b76969546b7325c126314 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 1 Dec 2023 09:35:05 +0000 Subject: [PATCH 13/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 236226fdb..0a30acdbe 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 17b7183 + Default = 02687a8 current git hash of repository From 05c4de35aeb1d51e0a12d64dd5b109aa8f5f031e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 13:29:25 +0000 Subject: [PATCH 14/94] modify init with mup --- megatron/model/init_functions.py | 78 ++++++++++++++------------------ 1 file changed, 35 insertions(+), 43 deletions(-) diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index ff4c36b53..44666b229 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -68,12 +68,12 @@ def _orthogonal(tensor, gain=1): return tensor -def orthogonal_init_method(n_layers=1, use_mup=False, mup_init_scale=1.0): +def orthogonal_init_method(n_layers=1, mup_m_width=1.0): """Fills the input Tensor with a (semi) orthogonal matrix, as described in Exact solutions to the nonlinear dynamics of learning in deep linear neural networks - Saxe, A. et al. (2013) Optionally scaling by number of layers possible, as introduced in OBST - Nestler et. al. (2021, to be released)""" - if use_mup: + if mup_m_width != 1: raise ValueError( "Orthogonal init needs to be patched to support mup. Disable mup or use a different init method to avoid this error" ) @@ -84,67 +84,59 @@ def init_(tensor): return init_ -def xavier_uniform_init_method(use_mup_outer=False, mup_init_scale=1.0): +def xavier_uniform_init_method(mup_m_width=1.0): """Fills the input Tensor with values according to the method described in Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a uniform distribution.""" - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.xavier_uniform_(tensor) + def init_(tensor, mup_m_width=mup_m_width): + init_weight = torch.nn.init.xavier_uniform_(tensor) + if mup_m_width != 1: with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.xavier_uniform_(tensor) + init_weight.div_(mup_m_width) + return init_weight return init_ -def xavier_normal_init_method(use_mup_outer=False, mup_init_scale=1.0): +def xavier_normal_init_method(mup_m_width=1.0): """Fills the input Tensor with values according to the method described in Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. 
(2010), using a normal distribution.""" - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.xavier_normal_(tensor) + def init_(tensor, mup_m_width=mup_m_width): + init_weight = torch.nn.init.xavier_normal_(tensor) + if mup_m_width != 1: with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.xavier_normal_(tensor) + init_weight.div_(mup_m_width) + return init_weight return init_ -def small_init_init_method(dim, use_mup_outer=False, mup_init_scale=1.0): +def small_init_init_method(dim, mup_m_width=1.0): """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010), using a normal distribution.""" std = math.sqrt(2 / (5 * dim)) - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.normal_(tensor, mean=0.0, std=std) + def init_(tensor, mup_m_width=mup_m_width): + init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) + if mup_m_width != 1: with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.normal_(tensor, mean=0.0, std=std) + init_weight.div_(mup_m_width) + return init_weight return init_ -def wang_init_method(n_layers, dim, use_mup_outer=False, mup_init_scale=1.0): +def wang_init_method(n_layers, dim, mup_m_width=1.0): std = 2 / n_layers / math.sqrt(dim) - def init_(tensor, use_mup=use_mup_outer): - if use_mup: - mup.init.normal_(tensor, mean=0.0, std=std) + def init_(tensor, mup_m_width=mup_m_width): + init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) + if mup_m_width != 1: with torch.no_grad(): - tensor.mul_(mup_init_scale) - return tensor - else: - return torch.nn.init.normal_(tensor, mean=0.0, std=std) - + init_weight.div_(mup_m_width) + return init_weight + return init_ @@ -153,30 +145,30 @@ def get_init_methods(args): def _get(name): if name == "normal": return init_method_normal( - sigma=args.init_method_std*args.mup_init_scale + sigma=args.init_method_std/math.sqrt(args.mup_m_width) ) elif name == "scaled_normal": return scaled_init_method_normal( - sigma=args.init_method_std*args.mup_init_scale, + sigma=args.init_method_std/math.sqrt(args.mup_m_width), num_layers=args.num_layers ) elif name == "orthogonal": - return orthogonal_init_method(args.use_mup, args.mup_init_scale) + return orthogonal_init_method(args.mup_m_width) elif name == "scaled_orthogonal": return orthogonal_init_method( - args.num_layers, args.use_mup, args.mup_init_scale + args.num_layers, args.mup_m_width ) elif name == "xavier_uniform": - return xavier_uniform_init_method(args.use_mup, args.mup_init_scale) + return xavier_uniform_init_method(args.mup_m_width) elif name == "xavier_normal": - return xavier_normal_init_method(args.use_mup, args.mup_init_scale) + return xavier_normal_init_method(args.mup_m_width) elif name == "wang_init": return wang_init_method( - args.num_layers, args.hidden_size, args.use_mup, args.mup_init_scale + args.num_layers, args.hidden_size, args.mup_m_width ) elif name == "small_init": return small_init_init_method( - args.hidden_size, args.use_mup, args.mup_init_scale + args.hidden_size, args.mup_m_width ) else: raise NotImplementedError(f"Unknown init method {name}") From 71a91e40455203c4cbab9ded0588094fc23920c9 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 13:29:47 +0000 Subject: [PATCH 15/94] divide logits by the m_width --- megatron/model/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 0785561cb..9f48f1342 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -957,7 +957,7 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) if args is not None and args.use_mup: - logits_parallel *= args.mup_output_logit_multiplier + logits_parallel /= args.mup_m_width # Gather if needed. if parallel_output: From 99c8ce05a792029b0f04f5c914fec35832c09dcf Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 13:52:30 +0000 Subject: [PATCH 16/94] moved position of mup parameters being processed --- megatron/training.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 0dea5ab17..4463134e7 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -402,8 +402,23 @@ def get_model(neox_args, use_cache=False): # Temporarily disable mup so that the base model does not use the mup init functions before set_base_shapes is called below. # If mup isn't being used anyways, this has no effect. - old_use_mup = neox_args.use_mup - neox_args.use_mup = False + # old_use_mup = neox_args.use_mup + # neox_args.use_mup = False + if neox_args.use_mup: + + if neox_args.mup_m_width == 1: + neox_args.mup_m_width = neox_args.hidden_size / neox_args.mup_d_model_base + + base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" + + if neox_args.save_base_shapes: + save_base_shapes(neox_args, base_shapes, use_cache) + + # mup.set_base_shapes(model, base_shapes) + + # Call the mup replacement init functions on the model now that set_base_shapes has given each weight a .infshape attribute + # mup_weights_reinit(neox_args, model) + model = GPT2ModelPipe( neox_args=neox_args, num_tokentypes=0, @@ -436,22 +451,7 @@ def get_model(neox_args, use_cache=False): # Export PipeParallel model to nn.Sequential model to avoid the overhead of deepspeed's pipe parallel training model = model.to_sequential() - neox_args.use_mup = old_use_mup - - if neox_args.use_mup: - - if neox_args.mup_m_width == 1: - neox_args.mup_m_width = neox_args.hidden_size / neox_args.mup_d_model_base - - base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" - - if neox_args.save_base_shapes: - save_base_shapes(neox_args, base_shapes, use_cache) - - mup.set_base_shapes(model, base_shapes) - - # Call the mup replacement init functions on the model now that set_base_shapes has given each weight a .infshape attribute - mup_weights_reinit(neox_args, model) + # neox_args.use_mup = old_use_mup if neox_args.deepspeed: # DeepSpeed handles CUDA, FP16, and DDP components. 
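Taken together, PATCH 14-16 pin down the core muP scaling used in the rest of the series: the width multiplier is the ratio of the target model's hidden size to the base (proxy) model's width, initialization standard deviations shrink by the square root of that multiplier, and the output logits are divided by it. The sketch below is an editor's illustration only — the helper function and the example sizes are not part of the diffs — but it shows the arithmetic these patches wire into NeoX:

import math

def mup_width_scaling(hidden_size, mup_d_model_base, init_method_std=0.02):
    # Illustrative only: mirrors the scaling introduced in PATCH 14-16.
    m_width = hidden_size / mup_d_model_base         # PATCH 16: neox_args.mup_m_width
    init_std = init_method_std / math.sqrt(m_width)  # PATCH 14: sigma passed to init_method_normal
    logit_divisor = m_width                          # PATCH 15: logits_parallel /= mup_m_width
    return m_width, init_std, logit_divisor

# Example: a 1024-wide model against a 64-wide base gives m_width = 16.0,
# init_std = 0.02 / 4 = 0.005, and logits divided by 16.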
From b253ab6a41e4f703f7c869148ac9c5896ccdb70c Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Dec 2023 13:53:04 +0000 Subject: [PATCH 17/94] add note --- megatron/model/gpt2_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 012711a62..6a44fc8ca 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -175,6 +175,7 @@ def init_specs(self): # Embedding layer # input will be (input_ids, position_ids, attention_mask) + # TODO Initilized weights here should not be divided by m_width if weight_tying: self.specs.append( TiedLayerSpec( From 1919499fa01a5526ee726e9b3565c5c667501caf Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 6 Dec 2023 04:29:41 +0000 Subject: [PATCH 18/94] made param groups to hold flag for mup scaling --- megatron/model/utils.py | 57 ++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 6beac5ca2..d505fdbb3 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -18,6 +18,7 @@ """Utilities for models.""" import torch +from megatron.mpu import VocabParallelEmbedding from megatron.model.norms import LayerNorm, RMSNorm, ScaleNorm from megatron.model.fused_softmax import SoftmaxFusionTypes from types import GeneratorType @@ -28,8 +29,11 @@ def get_params_for_weight_decay_optimization(module, neox_args): """Divide params into with-weight-decay and without-weight-decay groups. Layernorms and biases will have no weight decay but the rest will. """ - weight_decay_params = {"params": []} - no_weight_decay_params = {"params": [], "weight_decay": 0.0} + weight_decay_params = {"params": [], "lr_adjust": True} + no_weight_decay_params = {"params": [], "lr_adjust": False, "weight_decay": 0.0} + embedding_weight_decay_params = {"params": [], "lr_adjust": False} + embedding_no_weight_decay_params = {"params": [], "lr_adjust": False, "weight_decay": 0.0} + for module_ in module.modules(): if any( [ @@ -44,26 +48,49 @@ def get_params_for_weight_decay_optimization(module, neox_args): [p for p in list(module_._parameters.values()) if p is not None] ) else: - weight_decay_params["params"].extend( - [ - p - for n, p in list(module_._parameters.items()) - if p is not None and n != "bias" - ] - ) - no_weight_decay_params["params"].extend( + if any( [ - p - for n, p in list(module_._parameters.items()) - if p is not None and n == "bias" + isinstance(module_, VocabParallelEmbedding), ] - ) + ): + + embedding_weight_decay_params["params"].extend( + [ + p + for n, p in list(module_._parameters.items()) + if p is not None and n != "bias" + ] + ) + embedding_no_weight_decay_params["params"].extend( + [ + p + for n, p in list(module_._parameters.items()) + if p is not None and n == "bias" + ] + ) + else: + + weight_decay_params["params"].extend( + [ + p + for n, p in list(module_._parameters.items()) + if p is not None and n != "bias" + ] + ) + no_weight_decay_params["params"].extend( + [ + p + for n, p in list(module_._parameters.items()) + if p is not None and n == "bias" + ] + ) + if neox_args.weight_decay == 0.0: # only return a single param group # with onebitadam, we want to minimize the calls to compressed_allreduce. Every param group calls it once. # to avoid this, only use a single param group when weight decay is off. 
return [no_weight_decay_params] - return weight_decay_params, no_weight_decay_params + return weight_decay_params, no_weight_decay_params, embedding_weight_decay_params, embedding_no_weight_decay_params def exists(x): From 17678e01f445a76892ea6f8ac7127e8a30e7a8ae Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 6 Dec 2023 04:30:06 +0000 Subject: [PATCH 19/94] lr scale --- megatron/training.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 4463134e7..fad9655bc 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -535,13 +535,15 @@ def get_optimizer(model, neox_args): elif neox_args.optimizer_type.lower() == "adam": # Use Adam if neox_args.use_mup: - try: - from mup import MuAdam - - adam_optimizer = MuAdam - except ModuleNotFoundError: - print("Please install mup https://github.com/microsoft/mup") - raise Exception + # try: + # from mup import MuAdam + + # adam_optimizer = MuAdam + # except ModuleNotFoundError: + # print("Please install mup https://github.com/microsoft/mup") + # raise Exception + from deepspeed.ops.adam import FusedAdam as Adam + adam_optimizer = Adam else: if neox_args.use_bnb_optimizer: try: @@ -583,6 +585,12 @@ def get_optimizer(model, neox_args): else: raise ValueError(f"Optimizer type {neox_args.optimizer_type} not recognized") + # This is where the LR scaling is applied + if neox_args.use_mup: + for pg in optimizer.param_groups: + if ("lr_adjust" in pg) and pg["lr_adjust"] is True: + pg["lr"] /= neox_args.mup_m_width + if neox_args.deepspeed: # fp16 wrapper is not required for DeepSpeed. return optimizer, param_groups From 2bd5ae6d15a8de982daf628cb8f0a82f34a524a0 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 6 Dec 2023 04:30:22 +0000 Subject: [PATCH 20/94] update config --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 0a30acdbe..3d714a7c7 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -664,7 +664,7 @@ Optimizer Arguments Default = None Max Learning rate during training - When using muP, this is the base learning rate + When using muP, this is the base lr From 66422913280ebb2048e7764d27b04470f8479005 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 6 Dec 2023 04:30:47 +0000 Subject: [PATCH 21/94] adjust process of mup variables --- megatron/model/transformer.py | 2 +- megatron/model/word_embeddings.py | 8 ++++---- megatron/mpu/layers.py | 8 ++++++-- megatron/neox_arguments/neox_args.py | 7 ++++--- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 9f48f1342..437f0b38e 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -295,10 +295,10 @@ def __init__( bias=neox_args.use_bias_in_attn_linear, ) + coeff = None if neox_args.use_mup: self.norm_factor = self.hidden_size_per_attention_head else: - coeff = None self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) if self.apply_query_key_layer_scaling: coeff = max(1, self.layer_number) diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index 488baf042..29b20b320 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -51,8 +51,8 @@ def __init__( self.init_method = init_method self.num_tokentypes = num_tokentypes self.use_mup = neox_args.use_mup - self.mup_embedding_mult = 
neox_args.mup_embedding_mult - self.mup_rp_embedding_mult = neox_args.mup_rp_embedding_mult + self.mup_m_emb = float(neox_args.mup_m_emb) + # self.mup_rp_embedding_mult = neox_args.mup_rp_embedding_mult # Word embeddings (parallel). self.word_embeddings = mpu.VocabParallelEmbedding( @@ -142,7 +142,6 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): # OPT always adds 2 for some reason, according to the HF implementation position_ids = position_ids + self.opt_pos_emb_offset position_embeddings = self.position_embeddings(position_ids) - position_embeddings.mul_(self.mup_rp_embedding_mult) embeddings = words_embeddings + position_embeddings else: embeddings = words_embeddings @@ -155,9 +154,10 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): # Dropout. embeddings = self.embedding_dropout(embeddings) + # Y_emb = m_emb * embed(X) if self.use_mup: with torch.no_grad(): - embeddings.mul_(self.mup_embedding_mult) + embeddings = torch.mul(embeddings, self.mup_m_emb) return embeddings diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 92edbd6eb..859553551 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -429,6 +429,7 @@ def __init__( self.stride = stride self.mup_rescale_parameters = mup_rescale_parameters self.use_mup = neox_args.use_mup + self.m_width = neox_args.mup_m_width # Parameters. # Note: torch.nn.functional.linear performs XA^T + b and as a result @@ -547,8 +548,10 @@ def set_parallel_output(self, value: bool): ) # if gather_output is True, parallel output is False, so we set the opposite def forward(self, input_): + + # Y_logits = W_unembed * X / m_width if self.use_mup and self.mup_rescale_parameters: - input_ /= self.width_mult() + input_ /= self.m_width # Set up backprop all-reduce. input_parallel = copy_to_model_parallel_region(input_) # Matrix multiply. @@ -624,6 +627,7 @@ def __init__( self.keep_master_weight_for_test = keep_master_weight_for_test self.mup_rescale_parameters = mup_rescale_parameters self.use_mup = neox_args.use_mup + self.m_width = neox_args.mup_m_width # Parameters. # Note: torch.nn.functional.linear performs XA^T + b and as a result @@ -735,7 +739,7 @@ def set_parallel_output(self, parallel_output: bool): def forward(self, input_): if self.use_mup and self.mup_rescale_parameters: - input_ /= self.width_mult() + input_ /= self.m_width # Set up backprop all-reduce. if self.input_is_parallel: input_parallel = input_ diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 58780881b..de7169654 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -103,6 +103,7 @@ class NeoXArgsModel(NeoXArgsTemplate): hidden_size: int = None """ Transformer hidden size. 
+ When using muP, this is d_model """ num_attention_heads: int = None @@ -1035,17 +1036,17 @@ class NeoXArgsTraining(NeoXArgsTemplate): Path to the base shapes to save to/load from """ - mup_emb: int = 1 + mup_m_emb: float = 1.0 """ Embedding output multiplier """ - mup_m_width: int = 1 + mup_m_width: float = None """ Manually set the layer width multiplier (d_model/d_model,base) """ - mup_d_model_base: int = 64 + mup_d_model_base: int = 256 """ d_model,base Proxy (base) model's layer width From 8be6c66b2add4b704417b46644602b375fc240d3 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 18 Dec 2023 06:32:44 +0000 Subject: [PATCH 22/94] remove calling save_base_shapes --- megatron/training.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index fad9655bc..221fa6cd3 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -409,10 +409,10 @@ def get_model(neox_args, use_cache=False): if neox_args.mup_m_width == 1: neox_args.mup_m_width = neox_args.hidden_size / neox_args.mup_d_model_base - base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" + # base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" - if neox_args.save_base_shapes: - save_base_shapes(neox_args, base_shapes, use_cache) + # if neox_args.save_base_shapes: + # save_base_shapes(neox_args, base_shapes, use_cache) # mup.set_base_shapes(model, base_shapes) From c9fb18ba12b8974f9310e5094e75317c71666192 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 18 Dec 2023 07:48:47 +0000 Subject: [PATCH 23/94] lr adjustments is done in train_step to address lr being reset due to lr_scheduling --- megatron/training.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 221fa6cd3..994ff5fd9 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -585,12 +585,6 @@ def get_optimizer(model, neox_args): else: raise ValueError(f"Optimizer type {neox_args.optimizer_type} not recognized") - # This is where the LR scaling is applied - if neox_args.use_mup: - for pg in optimizer.param_groups: - if ("lr_adjust" in pg) and pg["lr_adjust"] is True: - pg["lr"] /= neox_args.mup_m_width - if neox_args.deepspeed: # fp16 wrapper is not required for DeepSpeed. return optimizer, param_groups @@ -729,6 +723,11 @@ def backward_step(neox_args, timers, optimizer, model, loss): def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler): """Single training step.""" + if neox_args.use_mup: + for pg in optimizer.param_groups: + if ("lr_adjust" in pg) and pg["lr_adjust"] is True: + pg["lr"] /= neox_args.mup_m_width + # Pipeline parallelism schedules forward/backward/step if neox_args.is_pipe_parallel: reduced_loss = train_step_pipe( From 795371c3525ea3537074d2c5f416c7f2ab7e4207 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 18 Dec 2023 09:45:26 +0000 Subject: [PATCH 24/94] lr scaling for mup is moved here instead --- megatron/learning_rates.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index d5d2640c9..424aee20c 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -37,6 +37,7 @@ def __init__( use_checkpoint_lr_scheduler=True, override_lr_scheduler=False, use_mup=False, + mup_m_width=1, ): # Class values. 
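# Editor's sketch (not part of this diff): the "lr_adjust" flag attached to the
# parameter groups in PATCH 18 is what the scheduler in this patch keys on --
# only the hidden-weight groups have their learning rate divided by the width
# multiplier, while the embedding and bias/norm groups keep the base learning
# rate. The numbers below are hypothetical examples, not values from the configs.
base_lr, mup_m_width = 6.0e-4, 16.0
hidden_weight_lr = base_lr / mup_m_width  # groups with lr_adjust=True  -> 3.75e-05
embedding_lr = base_lr                    # groups with lr_adjust=False -> 6.0e-04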
@@ -51,6 +52,7 @@ def __init__( self.override_lr_scheduler = override_lr_scheduler self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler self.use_mup = use_mup + self.mup_m_width = mup_m_width if self.override_lr_scheduler: assert not self.use_checkpoint_lr_scheduler, ( "both override and " "use-checkpoint are set." @@ -95,8 +97,8 @@ def step(self, step_num=None): self.num_iters = step_num new_lr = self.get_lr() for group in self.optimizer.param_groups: - if self.use_mup and "width_mult" in group: - group["lr"] = new_lr / group["width_mult"] + if self.use_mup and ("lr_adjust" in group) and group["lr_adjust"] is True: + group["lr"] = new_lr / self.mup_m_width else: group["lr"] = new_lr From 087beee884442ce3f98a6de34ed1aff533169e9a Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 3 Jan 2024 14:27:33 +0000 Subject: [PATCH 25/94] removed mup usage for coord check --- megatron/mup_substitute.py | 75 +++++++++++++++++++++++--------------- megatron/training.py | 20 +++++----- 2 files changed, 56 insertions(+), 39 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index e16a21589..78ddedff0 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -10,7 +10,7 @@ import torch import torch.nn.functional as F -from mup import coord_check as mup_coord_check +# from mup import coord_check as mup_coord_check from megatron.training import train_step @@ -39,31 +39,44 @@ def _get_coord_data( ): df = [] + def word_embedding_coord_check_hook(module, input, output): + with torch.no_grad(): + word_embedding_act_abs_mean_list.append(output.abs().mean().item()) + for i in range(nseeds): torch.manual_seed(i) for width, model in models.items(): model = model() model.train() - optimizer = optcls(model) + # optimizer = optcls(model) + optimizer, _ = optcls(model, neox_args) for step in range(nsteps + 1): + word_embedding_act_abs_mean_list = [] remove_hooks = [] # add hooks - for name, module in model.named_modules(): - if filter_module_by_name and not filter_module_by_name(name): - continue - remove_hooks.append( - module.register_forward_hook( - mup_coord_check._record_coords( - df, - width, - name, - step + 1, - output_fdict=output_fdict, - input_fdict=input_fdict, - param_fdict=param_fdict, - ) - ) - ) + # for name, module in model.named_modules(): + # if name.endswith(".embedding.word_embeddings"): + # print("yess") + # import sys; sys.exit + # remove_hook.append( + # module.register_forward_hook(word_embedding_coord_check_hook)) + + # # if filter_module_by_name and not filter_module_by_name(name): + # # continue + # # pass + # # remove_hooks.append( + # # module.register_forward_hook( + # # mup_coord_check._record_coords( + # # df, + # # width, + # # name, + # # step + 1, + # # output_fdict=output_fdict, + # # input_fdict=input_fdict, + # # param_fdict=param_fdict, + # # ) + # # ) + # # ) # train for a step loss_dict, skipped_iter = train_step( @@ -79,6 +92,8 @@ def _get_coord_data( for handle in remove_hooks: handle.remove() + print("word_embedding_act_abs_mean_list") + print(word_embedding_act_abs_mean_list) import gc del model @@ -180,9 +195,10 @@ def get_coord_data( if lr is None: lr = 0.1 if optimizer == "sgd" else 1e-3 if mup: - from mup.optim import MuAdam as Adam - from mup.optim import MuAdamW as AdamW - from mup.optim import MuSGD as SGD + # from mup.optim import MuAdam as Adam + # from mup.optim import MuAdamW as AdamW + # from mup.optim import MuSGD as SGD + from deepspeed.ops.adam import FusedAdam as Adam else: from torch.optim 
import SGD, Adam, AdamW @@ -195,14 +211,15 @@ def get_trainable(model): params.append(p) return params - if optimizer == "sgd": - optcls = lambda model: SGD(get_trainable(model), lr=lr) - elif optimizer == "adam": - optcls = lambda model: Adam(get_trainable(model), lr=lr) - elif optimizer == "adamw": - optcls = lambda model: AdamW(get_trainable(model), lr=lr) - elif optimizer is None: - raise ValueError("optimizer should be sgd|adam|adamw or a custom function") + # if optimizer == "sgd": + # optcls = lambda model: SGD(get_trainable(model), lr=lr) + # elif optimizer == "adam": + # optcls = lambda model: Adam(get_trainable(model), lr=lr) + # elif optimizer == "adamw": + # optcls = lambda model: AdamW(get_trainable(model), lr=lr) + # elif optimizer is None: + # raise ValueError("optimizer should be sgd|adam|adamw or a custom function") + optcls = optimizer data = _get_coord_data( neox_args, timers, lr_scheduler, models, dataloader, optcls, **kwargs diff --git a/megatron/training.py b/megatron/training.py index 994ff5fd9..999b857d3 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -126,7 +126,7 @@ def save_base_shapes(neox_args, base_shapes, use_cache): def mup_coord_check(neox_args, timers, lr_scheduler, train_data_iterator): from megatron.mup_substitute import get_coord_data - from mup.coord_check import plot_coord_data + # from mup.coord_check import plot_coord_data def lazy_model(hidden_size): def gen(): @@ -149,17 +149,19 @@ def gen(): for hidden_size in (neox_args.num_attention_heads * (2**p) for p in range(2, 9)): models[hidden_size] = lazy_model(hidden_size) + # optimizer, _ = get_optimizer(model, neox_args) + neox_args.use_mup = True df_up = get_coord_data( - neox_args, timers, lr_scheduler, models, train_data_iterator, mup=True + neox_args, timers, lr_scheduler, models, train_data_iterator, mup=True, optimizer=get_optimizer ) neox_args.use_mup = False df_sp = get_coord_data( - neox_args, timers, lr_scheduler, models, train_data_iterator, mup=False + neox_args, timers, lr_scheduler, models, train_data_iterator, mup=False, optimizer=get_optimizer ) - plot_coord_data(df_up, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") - plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") + # plot_coord_data(df_up, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") + # plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") print_rank_0("Saved coord check plots... exiting") sys.exit(1) @@ -204,6 +206,7 @@ def pretrain(neox_args): timers("train/valid/test data iterators").stop() if neox_args.use_mup and neox_args.coord_check: + print_rank_0("Do muP Coord Check") mup_coord_check(neox_args, timers, lr_scheduler, train_data_iterator) # Print setup timing. 
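# Editor's sketch (not part of this diff): what the coord check above is looking
# for. For each model width and a handful of training steps, a forward hook
# records the mean absolute activation of a module (here the word-embedding
# output); under muP these scales should stay roughly flat as width grows,
# whereas under standard parametrization they drift upward with width. The hook
# below paraphrases word_embedding_coord_check_hook in mup_substitute.py, with
# hypothetical names.
def make_coord_check_hook(records, width, step):
    def hook(module, inputs, output):
        records.append((width, step, output.abs().mean().item()))
    return hook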
@@ -408,6 +411,7 @@ def get_model(neox_args, use_cache=False): if neox_args.mup_m_width == 1: neox_args.mup_m_width = neox_args.hidden_size / neox_args.mup_d_model_base + print_rank_0(f"mup_m_width set to {neox_args.mup_m_width}") # base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" @@ -623,6 +627,7 @@ def get_learning_rate_scheduler(optimizer, neox_args): use_checkpoint_lr_scheduler=neox_args.use_checkpoint_lr_scheduler, override_lr_scheduler=neox_args.override_lr_scheduler, use_mup=neox_args.use_mup, + mup_m_width=neox_args.mup_m_width, ) return lr_scheduler @@ -723,11 +728,6 @@ def backward_step(neox_args, timers, optimizer, model, loss): def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler): """Single training step.""" - if neox_args.use_mup: - for pg in optimizer.param_groups: - if ("lr_adjust" in pg) and pg["lr_adjust"] is True: - pg["lr"] /= neox_args.mup_m_width - # Pipeline parallelism schedules forward/backward/step if neox_args.is_pipe_parallel: reduced_loss = train_step_pipe( From e7b7bf67a437f69a46ba6b90b1257519d5e48465 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 24 Jan 2024 15:52:21 +0000 Subject: [PATCH 26/94] latest update on coord check implementation --- megatron/mup_substitute.py | 89 ++++++++++++++++++++------------------ megatron/training.py | 33 +++++++++----- 2 files changed, 71 insertions(+), 51 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 78ddedff0..6b54d904f 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -10,6 +10,8 @@ import torch import torch.nn.functional as F +from megatron import print_rank_0 + # from mup import coord_check as mup_coord_check from megatron.training import train_step @@ -30,7 +32,7 @@ def _get_coord_data( filter_module_by_name=None, fix_data=True, cuda=True, - nseeds=1, + nseeds=3, output_fdict=None, input_fdict=None, param_fdict=None, @@ -43,40 +45,47 @@ def word_embedding_coord_check_hook(module, input, output): with torch.no_grad(): word_embedding_act_abs_mean_list.append(output.abs().mean().item()) + word_embedding_act_abs_mean_list = [] + _seeds = [] + _steps = [] + remove_hooks = [] + for i in range(nseeds): torch.manual_seed(i) for width, model in models.items(): model = model() model.train() - # optimizer = optcls(model) - optimizer, _ = optcls(model, neox_args) + optimizer = optcls(model) + # optimizer, _ = optcls(model, neox_args) + for step in range(nsteps + 1): - word_embedding_act_abs_mean_list = [] - remove_hooks = [] + # add hooks - # for name, module in model.named_modules(): - # if name.endswith(".embedding.word_embeddings"): - # print("yess") - # import sys; sys.exit - # remove_hook.append( - # module.register_forward_hook(word_embedding_coord_check_hook)) - - # # if filter_module_by_name and not filter_module_by_name(name): - # # continue - # # pass - # # remove_hooks.append( - # # module.register_forward_hook( - # # mup_coord_check._record_coords( - # # df, - # # width, - # # name, - # # step + 1, - # # output_fdict=output_fdict, - # # input_fdict=input_fdict, - # # param_fdict=param_fdict, - # # ) - # # ) - # # ) + for name, module in model.named_modules(): + if name.endswith(".word_embeddings"): + remove_hooks.append( + module.register_forward_hook(word_embedding_coord_check_hook)) + + _steps.append(step) + _seeds.append(i) + + + # if filter_module_by_name and not filter_module_by_name(name): + # continue + # pass + # remove_hooks.append( + # module.register_forward_hook( + # 
mup_coord_check._record_coords( + # df, + # width, + # name, + # step + 1, + # output_fdict=output_fdict, + # input_fdict=input_fdict, + # param_fdict=param_fdict, + # ) + # ) + # ) # train for a step loss_dict, skipped_iter = train_step( @@ -91,14 +100,13 @@ def word_embedding_coord_check_hook(module, input, output): # remove hooks for handle in remove_hooks: handle.remove() - - print("word_embedding_act_abs_mean_list") - print(word_embedding_act_abs_mean_list) import gc - del model gc.collect() + for _i,_j,_k in zip(_seeds, _steps, word_embedding_act_abs_mean_list): + print_rank_0(_i, _j, _k) + return pd.DataFrame(df) @@ -211,15 +219,14 @@ def get_trainable(model): params.append(p) return params - # if optimizer == "sgd": - # optcls = lambda model: SGD(get_trainable(model), lr=lr) - # elif optimizer == "adam": - # optcls = lambda model: Adam(get_trainable(model), lr=lr) - # elif optimizer == "adamw": - # optcls = lambda model: AdamW(get_trainable(model), lr=lr) - # elif optimizer is None: - # raise ValueError("optimizer should be sgd|adam|adamw or a custom function") - optcls = optimizer + if optimizer == "sgd": + optcls = lambda model: SGD(get_trainable(model), lr=lr) + elif optimizer == "adam": + optcls = lambda model: Adam(get_trainable(model), lr=lr) + elif optimizer == "adamw": + optcls = lambda model: AdamW(get_trainable(model), lr=lr) + elif optimizer is None: + raise ValueError("optimizer should be sgd|adam|adamw or a custom function") data = _get_coord_data( neox_args, timers, lr_scheduler, models, dataloader, optcls, **kwargs diff --git a/megatron/training.py b/megatron/training.py index 4825141ed..86e2d5fa3 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -55,7 +55,7 @@ CharCounter, ) from megatron.model.gpt2_model import cross_entropy -from eval_tasks import run_eval_harness +# from eval_tasks import run_eval_harness def mup_weights_reinit(neox_args, model): @@ -124,7 +124,7 @@ def save_base_shapes(neox_args, base_shapes, use_cache): sys.exit(1) -def mup_coord_check(neox_args, timers, lr_scheduler, train_data_iterator): +def mup_coord_check(neox_args, timers, train_data_iterator): from megatron.mup_substitute import get_coord_data # from mup.coord_check import plot_coord_data @@ -133,7 +133,7 @@ def gen(): old_hidden_size = neox_args.hidden_size neox_args.hidden_size = hidden_size - model, optimizer, _ = setup_model_and_optimizer( + model, optimizer, lr_scheduler = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) @@ -145,24 +145,35 @@ def gen(): models = {} - # Hidden size needs to be divisible by num attention heads - for hidden_size in (neox_args.num_attention_heads * (2**p) for p in range(2, 9)): - models[hidden_size] = lazy_model(hidden_size) + # # Hidden size needs to be divisible by num attention heads + # for hidden_size in (neox_args.num_attention_heads * (2**p) for p in range(2, 9)): + # models[hidden_size] = lazy_model(hidden_size) - # optimizer, _ = get_optimizer(model, neox_args) + # 128 + # 256 + # 512 + # 1024 + # 2048 + # 4096 + # 8192 + models[neox_args.hidden_size] = lazy_model(neox_args.hidden_size) + + print_rank_0("df_up") neox_args.use_mup = True df_up = get_coord_data( - neox_args, timers, lr_scheduler, models, train_data_iterator, mup=True, optimizer=get_optimizer + neox_args, timers, None, models, train_data_iterator, mup=True, optimizer="adam" ) + print_rank_0("df_sp") neox_args.use_mup = False df_sp = get_coord_data( - neox_args, timers, lr_scheduler, models, train_data_iterator, mup=False, optimizer=get_optimizer + 
neox_args, timers, None, models, train_data_iterator, mup=False, optimizer="adam" ) # plot_coord_data(df_up, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") # plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") + print_rank_0("Saved coord check plots... exiting") sys.exit(1) @@ -207,7 +218,9 @@ def pretrain(neox_args): if neox_args.use_mup and neox_args.coord_check: print_rank_0("Do muP Coord Check") - mup_coord_check(neox_args, timers, lr_scheduler, train_data_iterator) + mup_coord_check(neox_args, timers, train_data_iterator) + else: + pass # Print setup timing. print_rank_0("done with setups ...") From 8dea9cef9eed9e0b1e6434addde5d529a3a0e36b Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 06:02:20 +0000 Subject: [PATCH 27/94] fix merge conflict --- configs/neox_arguments.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 5d58932fa..3b95b7e67 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,11 +111,7 @@ Logging Arguments - **git_hash**: str -<<<<<<< HEAD - Default = 02687a8 -======= Default = 31cb364 ->>>>>>> e5a7ea71e96eeada636c9612036dc85e886d973d current git hash of repository From 3664ebab5a0eb1614a1883c313584a1d623f5256 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 06:04:41 +0000 Subject: [PATCH 28/94] changed `mup_m_width` to `mup_width_multiplier` --- configs/neox_arguments.md | 4 +-- megatron/learning_rates.py | 6 ++-- megatron/model/init_functions.py | 52 ++++++++++++++-------------- megatron/model/transformer.py | 2 +- megatron/mpu/layers.py | 4 +-- megatron/neox_arguments/neox_args.py | 2 +- megatron/training.py | 8 ++--- 7 files changed, 39 insertions(+), 39 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 3b95b7e67..409baab3d 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -1567,7 +1567,7 @@ Training Arguments -- **mup_m_width**: int +- **mup_width_multiplier**: int Default = 1 @@ -1577,7 +1577,7 @@ Training Arguments - **mup_d_model_base**: int - Default = 64 + Default = 256 d_model,base Proxy (base) model's layer width diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 424aee20c..da0d3a353 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -37,7 +37,7 @@ def __init__( use_checkpoint_lr_scheduler=True, override_lr_scheduler=False, use_mup=False, - mup_m_width=1, + mup_width_multiplier=1, ): # Class values. @@ -52,7 +52,7 @@ def __init__( self.override_lr_scheduler = override_lr_scheduler self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler self.use_mup = use_mup - self.mup_m_width = mup_m_width + self.mup_width_multiplier = mup_width_multiplier if self.override_lr_scheduler: assert not self.use_checkpoint_lr_scheduler, ( "both override and " "use-checkpoint are set." 
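# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): how the scheduler change above
# plays out. Under muP, optimizer parameter groups tagged "lr_adjust" have
# their learning rate divided by mup_width_multiplier, while untagged
# (vector-like) groups keep the base rate. The helper name and the toy tensors
# below are hypothetical; only the division mirrors the patched
# AnnealingLR.step() behaviour.
import torch

def apply_mup_lr(optimizer, new_lr, use_mup, mup_width_multiplier):
    for group in optimizer.param_groups:
        if use_mup and group.get("lr_adjust", False):
            group["lr"] = new_lr / mup_width_multiplier  # width-sensitive params
        else:
            group["lr"] = new_lr

# usage: hidden-to-hidden weights get the adjusted rate, biases do not
w = torch.nn.Parameter(torch.randn(256, 256))
b = torch.nn.Parameter(torch.zeros(256))
opt = torch.optim.Adam([{"params": [w], "lr_adjust": True}, {"params": [b]}], lr=1e-3)
apply_mup_lr(opt, new_lr=1e-3, use_mup=True, mup_width_multiplier=256 / 64)
# ---------------------------------------------------------------------------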
@@ -98,7 +98,7 @@ def step(self, step_num=None): new_lr = self.get_lr() for group in self.optimizer.param_groups: if self.use_mup and ("lr_adjust" in group) and group["lr_adjust"] is True: - group["lr"] = new_lr / self.mup_m_width + group["lr"] = new_lr / self.mup_width_multiplier else: group["lr"] = new_lr diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 44666b229..3eecd7308 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -68,12 +68,12 @@ def _orthogonal(tensor, gain=1): return tensor -def orthogonal_init_method(n_layers=1, mup_m_width=1.0): +def orthogonal_init_method(n_layers=1, mup_width_multiplier=1.0): """Fills the input Tensor with a (semi) orthogonal matrix, as described in Exact solutions to the nonlinear dynamics of learning in deep linear neural networks - Saxe, A. et al. (2013) Optionally scaling by number of layers possible, as introduced in OBST - Nestler et. al. (2021, to be released)""" - if mup_m_width != 1: + if mup_width_multiplier != 1: raise ValueError( "Orthogonal init needs to be patched to support mup. Disable mup or use a different init method to avoid this error" ) @@ -84,57 +84,57 @@ def init_(tensor): return init_ -def xavier_uniform_init_method(mup_m_width=1.0): +def xavier_uniform_init_method(mup_width_multiplier=1.0): """Fills the input Tensor with values according to the method described in Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a uniform distribution.""" - def init_(tensor, mup_m_width=mup_m_width): + def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.xavier_uniform_(tensor) - if mup_m_width != 1: + if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_m_width) + init_weight.div_(mup_width_multiplier) return init_weight return init_ -def xavier_normal_init_method(mup_m_width=1.0): +def xavier_normal_init_method(mup_width_multiplier=1.0): """Fills the input Tensor with values according to the method described in Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a normal distribution.""" - def init_(tensor, mup_m_width=mup_m_width): + def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.xavier_normal_(tensor) - if mup_m_width != 1: + if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_m_width) + init_weight.div_(mup_width_multiplier) return init_weight return init_ -def small_init_init_method(dim, mup_m_width=1.0): +def small_init_init_method(dim, mup_width_multiplier=1.0): """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving the Normalization of Self-Attention - Nguyen, T. & Salazar, J. 
(2010), using a normal distribution.""" std = math.sqrt(2 / (5 * dim)) - def init_(tensor, mup_m_width=mup_m_width): + def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) - if mup_m_width != 1: + if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_m_width) + init_weight.div_(mup_width_multiplier) return init_weight return init_ -def wang_init_method(n_layers, dim, mup_m_width=1.0): +def wang_init_method(n_layers, dim, mup_width_multiplier=1.0): std = 2 / n_layers / math.sqrt(dim) - def init_(tensor, mup_m_width=mup_m_width): + def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) - if mup_m_width != 1: + if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_m_width) + init_weight.div_(mup_width_multiplier) return init_weight return init_ @@ -145,30 +145,30 @@ def get_init_methods(args): def _get(name): if name == "normal": return init_method_normal( - sigma=args.init_method_std/math.sqrt(args.mup_m_width) + sigma=args.init_method_std/math.sqrt(args.mup_width_multiplier) ) elif name == "scaled_normal": return scaled_init_method_normal( - sigma=args.init_method_std/math.sqrt(args.mup_m_width), + sigma=args.init_method_std/math.sqrt(args.mup_width_multiplier), num_layers=args.num_layers ) elif name == "orthogonal": - return orthogonal_init_method(args.mup_m_width) + return orthogonal_init_method(args.mup_width_multiplier) elif name == "scaled_orthogonal": return orthogonal_init_method( - args.num_layers, args.mup_m_width + args.num_layers, args.mup_width_multiplier ) elif name == "xavier_uniform": - return xavier_uniform_init_method(args.mup_m_width) + return xavier_uniform_init_method(args.mup_width_multiplier) elif name == "xavier_normal": - return xavier_normal_init_method(args.mup_m_width) + return xavier_normal_init_method(args.mup_width_multiplier) elif name == "wang_init": return wang_init_method( - args.num_layers, args.hidden_size, args.mup_m_width + args.num_layers, args.hidden_size, args.mup_width_multiplier ) elif name == "small_init": return small_init_init_method( - args.hidden_size, args.mup_m_width + args.hidden_size, args.mup_width_multiplier ) else: raise NotImplementedError(f"Unknown init method {name}") diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 660a0ad6d..79424ca44 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -972,7 +972,7 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) if args is not None and args.use_mup: - logits_parallel /= args.mup_m_width + logits_parallel /= args.mup_width_multiplier # Gather if needed. if parallel_output: diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 859553551..db125375f 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -429,7 +429,7 @@ def __init__( self.stride = stride self.mup_rescale_parameters = mup_rescale_parameters self.use_mup = neox_args.use_mup - self.m_width = neox_args.mup_m_width + self.m_width = neox_args.mup_width_multiplier # Parameters. 
# Note: torch.nn.functional.linear performs XA^T + b and as a result @@ -627,7 +627,7 @@ def __init__( self.keep_master_weight_for_test = keep_master_weight_for_test self.mup_rescale_parameters = mup_rescale_parameters self.use_mup = neox_args.use_mup - self.m_width = neox_args.mup_m_width + self.m_width = neox_args.mup_width_multiplier # Parameters. # Note: torch.nn.functional.linear performs XA^T + b and as a result diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 19864b821..b7b0bcb8c 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1046,7 +1046,7 @@ class NeoXArgsTraining(NeoXArgsTemplate): Embedding output multiplier """ - mup_m_width: float = None + mup_width_multiplier: float = None """ Manually set the layer width multiplier (d_model/d_model,base) """ diff --git a/megatron/training.py b/megatron/training.py index 86e2d5fa3..d8461c1eb 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -422,9 +422,9 @@ def get_model(neox_args, use_cache=False): # neox_args.use_mup = False if neox_args.use_mup: - if neox_args.mup_m_width == 1: - neox_args.mup_m_width = neox_args.hidden_size / neox_args.mup_d_model_base - print_rank_0(f"mup_m_width set to {neox_args.mup_m_width}") + if neox_args.mup_width_multiplier == 1: + neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base + print_rank_0(f"mup_width_multiplier set to {neox_args.mup_width_multiplier}") # base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" @@ -640,7 +640,7 @@ def get_learning_rate_scheduler(optimizer, neox_args): use_checkpoint_lr_scheduler=neox_args.use_checkpoint_lr_scheduler, override_lr_scheduler=neox_args.override_lr_scheduler, use_mup=neox_args.use_mup, - mup_m_width=neox_args.mup_m_width, + mup_width_multiplier=neox_args.mup_width_multiplier, ) return lr_scheduler From 6a462476a0db172657accc13c1e12dc7b2f97bbc Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 06:14:53 +0000 Subject: [PATCH 29/94] fixed notations --- megatron/model/init_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 3eecd7308..57666e567 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -18,7 +18,7 @@ def init_method_normal(sigma): - """Init method based on N(0, sigma).""" + """Init method based on N(0, sigma^2).""" def init_(tensor): return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) @@ -27,7 +27,7 @@ def init_(tensor): def scaled_init_method_normal(sigma, num_layers): - """Init method based on N(0, sigma/sqrt(2*num_layers).""" + """Init method based on N(0, sigma^2/(2*num_layers).""" std = sigma / math.sqrt(2.0 * num_layers) def init_(tensor): From 7439f9a1cd8d63baefa2c29b5d523c7cc54fb91d Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 06:19:17 +0000 Subject: [PATCH 30/94] correct scale --- megatron/model/init_functions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 57666e567..bbf109413 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -92,7 +92,7 @@ def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.xavier_uniform_(tensor) if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_width_multiplier) + 
init_weight.div_(math.sqrt(mup_width_multiplier)) return init_weight return init_ @@ -106,7 +106,7 @@ def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.xavier_normal_(tensor) if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_width_multiplier) + init_weight.div_(math.sqrt(mup_width_multiplier)) return init_weight return init_ @@ -121,7 +121,7 @@ def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_width_multiplier) + init_weight.div_(math.sqrt(mup_width_multiplier)) return init_weight return init_ @@ -134,7 +134,7 @@ def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) if mup_width_multiplier != 1: with torch.no_grad(): - init_weight.div_(mup_width_multiplier) + init_weight.div_(math.sqrt(mup_width_multiplier)) return init_weight return init_ From 5b2d31c9f2fe88da0a03800bd999a159934f9518 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 06:54:18 +0000 Subject: [PATCH 31/94] m_emb * embed(X) --- megatron/model/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 79424ca44..42e3f1893 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -972,7 +972,7 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) if args is not None and args.use_mup: - logits_parallel /= args.mup_width_multiplier + logits_parallel *= args.mup_emb # Gather if needed. if parallel_output: From 98caa82f92b51b2897a3407dfc0103fa184138e7 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 06:59:14 +0000 Subject: [PATCH 32/94] removed mup rescale in the layers --- megatron/model/transformer.py | 1 - megatron/mpu/layers.py | 151 ---------------------------------- 2 files changed, 152 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 42e3f1893..0fe907569 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -220,7 +220,6 @@ def __init__( init_method=init_method, gather_output=not parallel_output, skip_bias_add=False, - mup_rescale_parameters=is_last_layer, # rescale params only called if neox_args.use_mup = True, despite it not being included here ) # else: diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index db125375f..20dd5dec3 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -162,25 +162,6 @@ def __init__( self.weight, init_method, partition_dim=0, stride=1 ) - def mup_reinitialize_weights(self, neox_args): - if neox_args.use_cpu_initialization: - _initialize_affine_weight_cpu( - neox_args, - self.weight, - self.num_embeddings, - self.embedding_dim, - self.num_embeddings_per_partition, - 0, - partial(self.init_method, use_mup=True), - ) - else: - _initialize_affine_weight_gpu( - self.weight, - partial(self.init_method, use_mup=True), - partition_dim=0, - stride=1, - ) - def forward(self, input_): if self.model_parallel_size > 1: # Build the mask. 
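# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the "correct scale" change above
# divides the initializer's standard deviation by sqrt(mup_width_multiplier),
# so the weight variance shrinks linearly with the width multiplier instead of
# quadratically. The function mirrors small_init_init_method; the shapes in
# the usage line are arbitrary.
import math
import torch

def small_init(dim, mup_width_multiplier=1.0):
    std = math.sqrt(2 / (5 * dim))
    def init_(tensor):
        torch.nn.init.normal_(tensor, mean=0.0, std=std)
        if mup_width_multiplier != 1:
            with torch.no_grad():
                tensor.div_(math.sqrt(mup_width_multiplier))
        return tensor
    return init_

weight = torch.empty(1024, 1024)
small_init(dim=1024, mup_width_multiplier=1024 / 256)(weight)  # effective std ~ sqrt(2/(5*1024)) / 2
# ---------------------------------------------------------------------------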
@@ -292,25 +273,6 @@ def __init__( self._k_len_cached = None self._rel_pos_bucket_cached = None - def mup_reinitialize_weights(self, neox_args): - if self.use_cpu_initialization: - _initialize_affine_weight_cpu( - neox_args, - self.weight, - self.num_buckets, - self.heads, - self.num_heads_per_partition, - partition_dim=1, - init_method=partial(self.init_method, use_mup=True), - ) - else: - _initialize_affine_weight_gpu( - self.weight, - partial(self.init_method, use_mup=True), - partition_dim=1, - stride=1, - ) - @staticmethod def get_heads_range(global_n_heads, rank, world_size): per_partition_n_heads = divide(global_n_heads, world_size) @@ -413,7 +375,6 @@ def __init__( stride=1, keep_master_weight_for_test=False, skip_bias_add=False, - mup_rescale_parameters=False, ): super(ColumnParallelLinear, self).__init__() @@ -427,9 +388,6 @@ def __init__( self.skip_bias_add = skip_bias_add self.init_method = init_method self.stride = stride - self.mup_rescale_parameters = mup_rescale_parameters - self.use_mup = neox_args.use_mup - self.m_width = neox_args.mup_width_multiplier # Parameters. # Note: torch.nn.functional.linear performs XA^T + b and as a result @@ -491,56 +449,6 @@ def __init__( else: self.register_parameter("bias", None) - # Copied from Mup - def width_mult(self): - assert hasattr(self.weight, "infshape"), ( - "Please call set_base_shapes(...). If using torch.nn.DataParallel, " - "switch to distributed training with " - "torch.nn.parallel.DistributedDataParallel instead" - ) - return self.weight.infshape.width_mult() - - # Copied from Mup - def _rescale_parameters(self): - """Rescale parameters to convert SP initialization to μP initialization. - Warning: This method is NOT idempotent and should be called only once - unless you know what you are doing. - """ - if hasattr(self, "_has_rescaled_params") and self._has_rescaled_params: - raise RuntimeError( - "`_rescale_parameters` has been called once before already. " - "Unless you know what you are doing, usually you should not be calling `_rescale_parameters` more than once.\n" - "If you called `set_base_shapes` on a model loaded from a checkpoint, " - "or just want to re-set the base shapes of an existing model, " - "make sure to set the flag `rescale_params=False`.\n" - "To bypass this error and *still rescale parameters*, set `self._has_rescaled_params=False` before this call." - ) - if self.bias is not None: - self.bias.data *= self.width_mult() ** 0.5 - self.weight.data *= self.width_mult() ** 0.5 - self._has_rescaled_params = True - - def mup_reinitialize_weights(self, neox_args): - if neox_args.use_cpu_initialization: - self.master_weight = _initialize_affine_weight_cpu( - neox_args, - self.weight, - self.output_size, - self.input_size, - self.output_size_per_partition, - 0, - partial(self.init_method, use_mup=True), - stride=self.stride, - return_master_weight=keep_master_weight_for_test, - ) - else: - _initialize_affine_weight_gpu( - self.weight, - partial(self.init_method, use_mup=True), - partition_dim=0, - stride=self.stride, - ) - def set_parallel_output(self, value: bool): assert isinstance(value, bool) self.gather_output = ( @@ -549,9 +457,6 @@ def set_parallel_output(self, value: bool): def forward(self, input_): - # Y_logits = W_unembed * X / m_width - if self.use_mup and self.mup_rescale_parameters: - input_ /= self.m_width # Set up backprop all-reduce. input_parallel = copy_to_model_parallel_region(input_) # Matrix multiply. 
@@ -609,7 +514,6 @@ def __init__( keep_master_weight_for_test=False, skip_bias_add=False, parallel_output=False, - mup_rescale_parameters=False, ): super(RowParallelLinear, self).__init__() @@ -625,9 +529,6 @@ def __init__( self.init_method = init_method self.stride = stride self.keep_master_weight_for_test = keep_master_weight_for_test - self.mup_rescale_parameters = mup_rescale_parameters - self.use_mup = neox_args.use_mup - self.m_width = neox_args.mup_width_multiplier # Parameters. # Note: torch.nn.functional.linear performs XA^T + b and as a result @@ -683,63 +584,11 @@ def __init__( else: self.register_parameter("bias", None) - # Copied from Mup - def width_mult(self): - assert hasattr(self.weight, "infshape"), ( - "Please call set_base_shapes(...). If using torch.nn.DataParallel, " - "switch to distributed training with " - "torch.nn.parallel.DistributedDataParallel instead" - ) - return self.weight.infshape.width_mult() - - # Copied from Mup - def _rescale_parameters(self): - """Rescale parameters to convert SP initialization to μP initialization. - Warning: This method is NOT idempotent and should be called only once - unless you know what you are doing. - """ - if hasattr(self, "_has_rescaled_params") and self._has_rescaled_params: - raise RuntimeError( - "`_rescale_parameters` has been called once before already. " - "Unless you know what you are doing, usually you should not be calling `_rescale_parameters` more than once.\n" - "If you called `set_base_shapes` on a model loaded from a checkpoint, " - "or just want to re-set the base shapes of an existing model, " - "make sure to set the flag `rescale_params=False`.\n" - "To bypass this error and *still rescale parameters*, set `self._has_rescaled_params=False` before this call." - ) - if self.bias is not None: - self.bias.data *= self.width_mult() ** 0.5 - self.weight.data *= self.width_mult() ** 0.5 - self._has_rescaled_params = True - - def mup_reinitialize_weights(self, neox_args): - if neox_args.use_cpu_initialization: - self.master_weight = _initialize_affine_weight_cpu( - neox_args, - self.weight, - self.output_size, - self.input_size, - self.input_size_per_partition, - 1, - partial(self.init_method, use_mup=True), - stride=self.stride, - return_master_weight=self.keep_master_weight_for_test, - ) - else: - _initialize_affine_weight_gpu( - self.weight, - partial(self.init_method, use_mup=True), - partition_dim=1, - stride=self.stride, - ) - def set_parallel_output(self, parallel_output: bool): assert isinstance(parallel_output, bool) self.parallel_output = parallel_output def forward(self, input_): - if self.use_mup and self.mup_rescale_parameters: - input_ /= self.m_width # Set up backprop all-reduce. 
if self.input_is_parallel: input_parallel = input_ From 5c99637c50bd2163bc43eb369c651099ec5e6aea Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 07:03:08 +0000 Subject: [PATCH 33/94] removed mup rescale in the layers --- megatron/model/transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 0fe907569..347126b6e 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -236,7 +236,6 @@ def __init__( # init_method=init_method, # parallel_output=parallel_output, # skip_bias_add=False, - # mup_rescale_parameters=is_last_layer, # only called if neox_args.use_mup = True, despite it not being included here # ) def forward(self, hidden_states): From a636f0610c23f4239ee753367ffb2bff5a4bbcbf Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 2 Feb 2024 07:16:59 +0000 Subject: [PATCH 34/94] adjust mup_m_emb to mup_embedding_multiplier --- configs/neox_arguments.md | 2 +- megatron/model/transformer.py | 2 +- megatron/model/word_embeddings.py | 5 ++--- megatron/neox_arguments/neox_args.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 409baab3d..7acbde83d 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -1559,7 +1559,7 @@ Training Arguments -- **mup_emb**: int +- **mup_embedding_multiplier**: int Default = 1 diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 347126b6e..b5cf1754d 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -970,7 +970,7 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) if args is not None and args.use_mup: - logits_parallel *= args.mup_emb + logits_parallel /= args.mup_width_multiplier # Gather if needed. if parallel_output: diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index 29b20b320..bac8d2808 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -51,8 +51,7 @@ def __init__( self.init_method = init_method self.num_tokentypes = num_tokentypes self.use_mup = neox_args.use_mup - self.mup_m_emb = float(neox_args.mup_m_emb) - # self.mup_rp_embedding_mult = neox_args.mup_rp_embedding_mult + self.mup_embedding_multiplier = float(neox_args.mup_embedding_multiplier) # Word embeddings (parallel). 
self.word_embeddings = mpu.VocabParallelEmbedding( @@ -157,7 +156,7 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): # Y_emb = m_emb * embed(X) if self.use_mup: with torch.no_grad(): - embeddings = torch.mul(embeddings, self.mup_m_emb) + embeddings = torch.mul(embeddings, self.mup_embedding_multiplier) return embeddings diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index b7b0bcb8c..50ae34156 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1041,7 +1041,7 @@ class NeoXArgsTraining(NeoXArgsTemplate): Path to the base shapes to save to/load from """ - mup_m_emb: float = 1.0 + mup_embedding_multiplier: float = 1.0 """ Embedding output multiplier """ From 39190c59308e3c1229de96b24831904a63b1c668 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 20 Feb 2024 15:00:58 +0000 Subject: [PATCH 35/94] add multiplier mup_output_multiplier --- megatron/model/transformer.py | 1 + megatron/neox_arguments/neox_args.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index b5cf1754d..2f9004e0e 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -971,6 +971,7 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non if args is not None and args.use_mup: logits_parallel /= args.mup_width_multiplier + logits_parallel *= args.mup_output_multiplier # Gather if needed. if parallel_output: diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 50ae34156..dc134f883 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1046,6 +1046,11 @@ class NeoXArgsTraining(NeoXArgsTemplate): Embedding output multiplier """ + mup_output_multiplier: float = 1.0 + """ + Output logits multiplier + """ + mup_width_multiplier: float = None """ Manually set the layer width multiplier (d_model/d_model,base) From 2489cc062507e652b72dac1edd373188801be534 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 20 Feb 2024 15:31:14 +0000 Subject: [PATCH 36/94] reorder model loading --- megatron/mup_substitute.py | 2 - megatron/training.py | 78 ++++---------------------------------- 2 files changed, 8 insertions(+), 72 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 6b54d904f..bdde503bd 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -11,8 +11,6 @@ import torch.nn.functional as F from megatron import print_rank_0 - -# from mup import coord_check as mup_coord_check from megatron.training import train_step diff --git a/megatron/training.py b/megatron/training.py index d8461c1eb..6807459f6 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -71,59 +71,6 @@ def has_method(o, name): layer.mup_reinitialize_weights(neox_args) -def save_base_shapes(neox_args, base_shapes, use_cache): - - # Instantiation of the base model fails in the init function (init_functions.py) because we haven't called set_base_shapes on it at this point, so disable it temporarily here - neox_args.use_mup = False - - base_model = GPT2ModelPipe( - neox_args=neox_args, - num_tokentypes=0, - parallel_output=True, - topology=mpu.get_topology(), - use_cache=use_cache, - ) - - if not neox_args.is_pipe_parallel: - base_model = base_model.to_sequential() - - try: - import mup - except ModuleNotFoundError: - print("Please install mup https://github.com/microsoft/mup") - raise Exception 
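# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the two forward-pass scalings
# introduced around here, shown on a toy tied-embedding model. The embedding
# output is multiplied by mup_embedding_multiplier and the tied output logits
# are divided by mup_width_multiplier. `TinyLM` and its sizes are
# hypothetical; only the two scalings mirror the patched word_embeddings.py
# and parallel_lm_logits code paths.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyLM(nn.Module):
    def __init__(self, vocab=100, hidden=256, m_emb=10.0, m_width=4.0):
        super().__init__()
        self.embed = nn.Embedding(vocab, hidden)
        self.m_emb, self.m_width = m_emb, m_width

    def forward(self, tokens):
        h = self.embed(tokens) * self.m_emb      # Y_emb = m_emb * embed(X)
        logits = F.linear(h, self.embed.weight)  # tied unembedding
        return logits / self.m_width             # logits scaled by 1 / m_width

logits = TinyLM()(torch.randint(0, 100, (2, 8)))  # shape (2, 8, 100)
# ---------------------------------------------------------------------------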
- - base_shapes = mup.get_shapes(base_model) - - del base_model - - old_hidden_size = neox_args.hidden_size - neox_args.hidden_size = neox_args.hidden_size * neox_args.mup_width_scale - - delta_model = GPT2ModelPipe( - neox_args=neox_args, - num_tokentypes=0, - parallel_output=True, - topology=mpu.get_topology(), - use_cache=use_cache, - ) - - if not neox_args.is_pipe_parallel: - delta_model = delta_model.to_sequential() - - delta_shapes = mup.get_shapes(delta_model) - - # change back - neox_args.use_mup = True - neox_args.hidden_size = old_hidden_size - - save_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" - print(f"saving base shapes at {save_shapes}") - mup.make_base_shapes(base_shapes, delta_shapes, savefile=save_shapes) - print(f"base shapes saved...exiting") - sys.exit(1) - - def mup_coord_check(neox_args, timers, train_data_iterator): from megatron.mup_substitute import get_coord_data # from mup.coord_check import plot_coord_data @@ -200,13 +147,6 @@ def pretrain(neox_args): # Initialize and get arguments, timers, and Tensorboard writer. initialize_megatron(neox_args=neox_args) - # Model, optimizer, and learning rate. - timers("model and optimizer").start() - model, optimizer, lr_scheduler = setup_model_and_optimizer( - neox_args=neox_args, use_cache=False, iteration=neox_args.iteration - ) - timers("model and optimizer").stop() - # Data stuff. timers("train/valid/test data iterators").start() ( @@ -219,9 +159,17 @@ def pretrain(neox_args): if neox_args.use_mup and neox_args.coord_check: print_rank_0("Do muP Coord Check") mup_coord_check(neox_args, timers, train_data_iterator) + sys.exit() else: pass + # Model, optimizer, and learning rate. + timers("model and optimizer").start() + model, optimizer, lr_scheduler = setup_model_and_optimizer( + neox_args=neox_args, use_cache=False, iteration=neox_args.iteration + ) + timers("model and optimizer").stop() + # Print setup timing. 
print_rank_0("done with setups ...") timers.log(["model and optimizer", "train/valid/test data iterators"]) @@ -426,16 +374,6 @@ def get_model(neox_args, use_cache=False): neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base print_rank_0(f"mup_width_multiplier set to {neox_args.mup_width_multiplier}") - # base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" - - # if neox_args.save_base_shapes: - # save_base_shapes(neox_args, base_shapes, use_cache) - - # mup.set_base_shapes(model, base_shapes) - - # Call the mup replacement init functions on the model now that set_base_shapes has given each weight a .infshape attribute - # mup_weights_reinit(neox_args, model) - model = GPT2ModelPipe( neox_args=neox_args, num_tokentypes=0, From 23b877670ac7e388894233ec72b2388c2cacc9ae Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 20 Feb 2024 15:46:25 +0000 Subject: [PATCH 37/94] removed comments --- megatron/training.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 6807459f6..4fa74e010 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -73,7 +73,6 @@ def has_method(o, name): def mup_coord_check(neox_args, timers, train_data_iterator): from megatron.mup_substitute import get_coord_data - # from mup.coord_check import plot_coord_data def lazy_model(hidden_size): def gen(): @@ -92,19 +91,9 @@ def gen(): models = {} - # # Hidden size needs to be divisible by num attention heads - # for hidden_size in (neox_args.num_attention_heads * (2**p) for p in range(2, 9)): - # models[hidden_size] = lazy_model(hidden_size) - - # 128 - # 256 - # 512 - # 1024 - # 2048 - # 4096 - # 8192 - - models[neox_args.hidden_size] = lazy_model(neox_args.hidden_size) + # Hidden size needs to be divisible by num attention heads + for hidden_size in [2**p for p in range(7,14)]: + models[hidden_size] = lazy_model(hidden_size) print_rank_0("df_up") neox_args.use_mup = True From 10e935e9919ed4b50d66d2ea51f76ee7ca5b01ba Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 20 Feb 2024 15:46:56 +0000 Subject: [PATCH 38/94] removed comments --- megatron/training.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 4fa74e010..4fbe9dcd2 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -108,8 +108,6 @@ def gen(): # plot_coord_data(df_up, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") # plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") - - print_rank_0("Saved coord check plots... 
exiting") sys.exit(1) From a0aca99adab69df3874d4c259e255f1290d3d3d6 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 20 Feb 2024 16:48:38 +0000 Subject: [PATCH 39/94] implement full process --- megatron/mup_substitute.py | 80 +++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index bdde503bd..7baa687b9 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -37,16 +37,14 @@ def _get_coord_data( show_progress=True, one_hot_target=False, ): - df = [] - - def word_embedding_coord_check_hook(module, input, output): - with torch.no_grad(): - word_embedding_act_abs_mean_list.append(output.abs().mean().item()) - - word_embedding_act_abs_mean_list = [] - _seeds = [] - _steps = [] - remove_hooks = [] + df = { + "seed": [], + "step": [], + "we_act": [], + "ao_act": [], + "fo_act": [], + "width": [], + } for i in range(nseeds): torch.manual_seed(i) @@ -54,36 +52,31 @@ def word_embedding_coord_check_hook(module, input, output): model = model() model.train() optimizer = optcls(model) - # optimizer, _ = optcls(model, neox_args) for step in range(nsteps + 1): - # add hooks + word_embedding_act_abs_mean_list = [] + attn_output_act_abs_mean_list = [] + ffn_output_act_abs_mean_list = [] + remove_hooks = [] + + def word_embedding_coord_check_hook(module, input, output): + with torch.no_grad(): + word_embedding_act_abs_mean_list.append(output.abs().mean().item()) + + def attn_output_coord_check_hook(module, input, output): + with torch.no_grad(): + attn_output_act_abs_mean_list.append(output[0].abs().mean().item()) + + def ffn_output_coord_check_hook(module, input, output): + with torch.no_grad(): + ffn_output_act_abs_mean_list.append(output[0].abs().mean().item()) + for name, module in model.named_modules(): if name.endswith(".word_embeddings"): remove_hooks.append( - module.register_forward_hook(word_embedding_coord_check_hook)) - - _steps.append(step) - _seeds.append(i) - - - # if filter_module_by_name and not filter_module_by_name(name): - # continue - # pass - # remove_hooks.append( - # module.register_forward_hook( - # mup_coord_check._record_coords( - # df, - # width, - # name, - # step + 1, - # output_fdict=output_fdict, - # input_fdict=input_fdict, - # param_fdict=param_fdict, - # ) - # ) - # ) + module.register_forward_hook(word_embedding_coord_check_hook) + ) # train for a step loss_dict, skipped_iter = train_step( @@ -95,15 +88,30 @@ def word_embedding_coord_check_hook(module, input, output): lr_scheduler=lr_scheduler, ) + word_embedding_act_abs_mean = None + attn_output_act_abs_mean = None + ffn_output_act_abs_mean = None + # remove hooks for handle in remove_hooks: handle.remove() + word_embedding_act_abs_mean = np.mean(word_embedding_act_abs_mean_list) + attn_output_act_abs_mean = np.mean(attn_output_act_abs_mean_list) + ffn_output_act_abs_mean = np.mean(ffn_output_act_abs_mean_list) + + df["seed"].append(i) + df["step"].append(step) + df["we_act"].append(word_embedding_act_abs_mean) + # df["ao_act"].append(attn_output_act_abs_mean) + # df["fo_act"].append(ffn_output_act_abs_mean) + df["width"].append(width) + import gc del model gc.collect() - for _i,_j,_k in zip(_seeds, _steps, word_embedding_act_abs_mean_list): - print_rank_0(_i, _j, _k) + # for _i,_j,_k in zip(_seeds, _steps, word_embedding_act_abs_mean_list): + # print_rank_0(_i, _j, _k) return pd.DataFrame(df) From 9472b35917524ef104c496002987a050dbea4c7e Mon Sep 17 00:00:00 2001 From: lintangsutawika 
Date: Wed, 21 Feb 2024 07:01:43 +0000 Subject: [PATCH 40/94] set neox_args.iteration to 0 for coord_check mode --- megatron/training.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 4fbe9dcd2..b3a0207b7 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -134,17 +134,18 @@ def pretrain(neox_args): # Initialize and get arguments, timers, and Tensorboard writer. initialize_megatron(neox_args=neox_args) - # Data stuff. - timers("train/valid/test data iterators").start() - ( - train_data_iterator, - valid_data_iterator, - test_data_iterator, - ) = build_train_valid_test_data_iterators(neox_args=neox_args) - timers("train/valid/test data iterators").stop() - if neox_args.use_mup and neox_args.coord_check: print_rank_0("Do muP Coord Check") + # Data stuff + neox_args.iteration = 0 + timers("train/valid/test data iterators").start() + ( + train_data_iterator, + valid_data_iterator, + test_data_iterator, + ) = build_train_valid_test_data_iterators(neox_args=neox_args) + timers("train/valid/test data iterators").stop() + mup_coord_check(neox_args, timers, train_data_iterator) sys.exit() else: @@ -157,6 +158,15 @@ def pretrain(neox_args): ) timers("model and optimizer").stop() + # Data stuff. + timers("train/valid/test data iterators").start() + ( + train_data_iterator, + valid_data_iterator, + test_data_iterator, + ) = build_train_valid_test_data_iterators(neox_args=neox_args) + timers("train/valid/test data iterators").stop() + # Print setup timing. print_rank_0("done with setups ...") timers.log(["model and optimizer", "train/valid/test data iterators"]) From 5c5f2df265328b3f510c25e489e5391f36e284e2 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 21 Feb 2024 07:20:18 +0000 Subject: [PATCH 41/94] move mup_width_multiplier init --- megatron/training.py | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index b3a0207b7..4ce2d5904 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -134,22 +134,26 @@ def pretrain(neox_args): # Initialize and get arguments, timers, and Tensorboard writer. initialize_megatron(neox_args=neox_args) - if neox_args.use_mup and neox_args.coord_check: - print_rank_0("Do muP Coord Check") - # Data stuff - neox_args.iteration = 0 - timers("train/valid/test data iterators").start() - ( - train_data_iterator, - valid_data_iterator, - test_data_iterator, - ) = build_train_valid_test_data_iterators(neox_args=neox_args) - timers("train/valid/test data iterators").stop() - - mup_coord_check(neox_args, timers, train_data_iterator) - sys.exit() - else: - pass + if neox_args.use_mup: + + if neox_args.mup_width_multiplier is None: + neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base + print_rank_0(f"mup_width_multiplier set to {neox_args.mup_width_multiplier}") + + if neox_args.coord_check: + print_rank_0("---- Do muP Coord Check ----") + # Data stuff + neox_args.iteration = 0 + timers("train/valid/test data iterators").start() + ( + train_data_iterator, + valid_data_iterator, + test_data_iterator, + ) = build_train_valid_test_data_iterators(neox_args=neox_args) + timers("train/valid/test data iterators").stop() + + mup_coord_check(neox_args, timers, train_data_iterator) + sys.exit() # Model, optimizer, and learning rate. 
timers("model and optimizer").start() @@ -365,11 +369,6 @@ def get_model(neox_args, use_cache=False): # If mup isn't being used anyways, this has no effect. # old_use_mup = neox_args.use_mup # neox_args.use_mup = False - if neox_args.use_mup: - - if neox_args.mup_width_multiplier == 1: - neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base - print_rank_0(f"mup_width_multiplier set to {neox_args.mup_width_multiplier}") model = GPT2ModelPipe( neox_args=neox_args, From 7eca3e7944bd47cbc043f55dc94fa2a318b1d12e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 21 Feb 2024 07:21:37 +0000 Subject: [PATCH 42/94] mup_coord_check returns 2 df --- megatron/training.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 4ce2d5904..c54bf0ef2 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -95,9 +95,9 @@ def gen(): for hidden_size in [2**p for p in range(7,14)]: models[hidden_size] = lazy_model(hidden_size) - print_rank_0("df_up") + print_rank_0("df_mup") neox_args.use_mup = True - df_up = get_coord_data( + df_mup = get_coord_data( neox_args, timers, None, models, train_data_iterator, mup=True, optimizer="adam" ) print_rank_0("df_sp") @@ -109,8 +109,7 @@ def gen(): # plot_coord_data(df_up, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") # plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") print_rank_0("Saved coord check plots... exiting") - sys.exit(1) - + return df_mup, df_sp def pretrain(neox_args): """Main training program. @@ -152,7 +151,7 @@ def pretrain(neox_args): ) = build_train_valid_test_data_iterators(neox_args=neox_args) timers("train/valid/test data iterators").stop() - mup_coord_check(neox_args, timers, train_data_iterator) + df_mup, df_sp = mup_coord_check(neox_args, timers, train_data_iterator) sys.exit() # Model, optimizer, and learning rate. 
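# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): how the width multiplier is
# resolved before model construction in the patches above. When the config
# leaves mup_width_multiplier unset, it defaults to hidden_size divided by the
# proxy model's base width (mup_d_model_base, default 256). The function name
# is hypothetical; the rule itself mirrors the patched setup code.
def resolve_mup_width_multiplier(hidden_size, mup_d_model_base=256,
                                 mup_width_multiplier=None):
    if mup_width_multiplier is None:
        mup_width_multiplier = hidden_size / mup_d_model_base
    return mup_width_multiplier

assert resolve_mup_width_multiplier(1024) == 4.0   # scaled-up model
assert resolve_mup_width_multiplier(256) == 1.0    # proxy (base) width
# ---------------------------------------------------------------------------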
From c9a3a6560f90b533a5683ea54d29366bd452c8d2 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 21 Feb 2024 16:36:05 +0000 Subject: [PATCH 43/94] can run --- megatron/mup_substitute.py | 11 ++++------- megatron/training.py | 21 +++++++++++++-------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 7baa687b9..55edecec1 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -51,6 +51,7 @@ def _get_coord_data( for width, model in models.items(): model = model() model.train() + neox_args.hidden_size = width optimizer = optcls(model) for step in range(nsteps + 1): @@ -208,13 +209,9 @@ def get_coord_data( """ if lr is None: lr = 0.1 if optimizer == "sgd" else 1e-3 - if mup: - # from mup.optim import MuAdam as Adam - # from mup.optim import MuAdamW as AdamW - # from mup.optim import MuSGD as SGD - from deepspeed.ops.adam import FusedAdam as Adam - else: - from torch.optim import SGD, Adam, AdamW + + from torch.optim import SGD, AdamW, Adam + # from deepspeed.ops.adam import FusedAdam as Adam def get_trainable(model): params = model.parameters() diff --git a/megatron/training.py b/megatron/training.py index c54bf0ef2..1396862d0 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -79,7 +79,7 @@ def gen(): old_hidden_size = neox_args.hidden_size neox_args.hidden_size = hidden_size - model, optimizer, lr_scheduler = setup_model_and_optimizer( + model, *_ = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) @@ -92,23 +92,26 @@ def gen(): models = {} # Hidden size needs to be divisible by num attention heads - for hidden_size in [2**p for p in range(7,14)]: + for hidden_size in [2**p for p in range(8,14)]: models[hidden_size] = lazy_model(hidden_size) - print_rank_0("df_mup") + print_rank_0(">>> Coord Check for mu Parameterization") neox_args.use_mup = True df_mup = get_coord_data( neox_args, timers, None, models, train_data_iterator, mup=True, optimizer="adam" ) - print_rank_0("df_sp") + print_rank_0(">>> Coord Check for standard Parameterization") neox_args.use_mup = False df_sp = get_coord_data( neox_args, timers, None, models, train_data_iterator, mup=False, optimizer="adam" ) - # plot_coord_data(df_up, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") + print_rank_0(df_mup) + # plot_coord_data(df_mup, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") # plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") print_rank_0("Saved coord check plots... 
exiting") + + import sys; sys.exit() return df_mup, df_sp def pretrain(neox_args): @@ -492,8 +495,9 @@ def get_optimizer(model, neox_args): # except ModuleNotFoundError: # print("Please install mup https://github.com/microsoft/mup") # raise Exception - from deepspeed.ops.adam import FusedAdam as Adam - adam_optimizer = Adam + # from deepspeed.ops.adam import FusedAdam as Adam + # adam_optimizer = Adam + adam_optimizer = torch.optim.Adam else: if neox_args.use_bnb_optimizer: try: @@ -514,7 +518,8 @@ def get_optimizer(model, neox_args): print( "WARNING: APEX not installed - defaulting to deepspeed's fused adam" ) - from deepspeed.ops.adam import FusedAdam as Adam + # from deepspeed.ops.adam import FusedAdam as Adam + from torch.optim import Adam adam_optimizer = Adam optimizer = adam_optimizer( param_groups, From a7877d4fb07699406f97de8a1cf7a68b351b9dc7 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 06:31:03 +0000 Subject: [PATCH 44/94] remove commehts --- megatron/mup_substitute.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 55edecec1..7c9e0534b 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -21,7 +21,7 @@ def _get_coord_data( models, dataloader, optcls, - nsteps=3, + nsteps=10, dict_in_out=False, flatten_input=False, flatten_output=False, @@ -30,7 +30,7 @@ def _get_coord_data( filter_module_by_name=None, fix_data=True, cuda=True, - nseeds=3, + nseeds=1, output_fdict=None, input_fdict=None, param_fdict=None, @@ -111,9 +111,6 @@ def ffn_output_coord_check_hook(module, input, output): del model gc.collect() - # for _i,_j,_k in zip(_seeds, _steps, word_embedding_act_abs_mean_list): - # print_rank_0(_i, _j, _k) - return pd.DataFrame(df) From bd9d399f61e25ac098fc2243e10b1b226e3d1ec7 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 06:45:02 +0000 Subject: [PATCH 45/94] add hooks --- megatron/mup_substitute.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 7c9e0534b..ebf98d64d 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -74,13 +74,22 @@ def ffn_output_coord_check_hook(module, input, output): ffn_output_act_abs_mean_list.append(output[0].abs().mean().item()) for name, module in model.named_modules(): + print_rank_0(name) if name.endswith(".word_embeddings"): remove_hooks.append( module.register_forward_hook(word_embedding_coord_check_hook) ) + elif name.endswith(".attention.dense"): + remove_hooks.append( + module.register_forward_hook(attn_output_coord_check_hook) + ) + elif name.endswith(".mlp.dense_4h_to_h"): + remove_hooks.append( + module.register_forward_hook(ffn_output_coord_check_hook) + ) # train for a step - loss_dict, skipped_iter = train_step( + train_step( neox_args=neox_args, timers=timers, data_iterator=dataloader, From fe180d3679053781e9a2c5b099407c4428ccddae Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 06:45:15 +0000 Subject: [PATCH 46/94] remove comments --- megatron/mup_substitute.py | 1 - 1 file changed, 1 deletion(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index ebf98d64d..44e3e1d66 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -74,7 +74,6 @@ def ffn_output_coord_check_hook(module, input, output): ffn_output_act_abs_mean_list.append(output[0].abs().mean().item()) for name, module in model.named_modules(): - 
print_rank_0(name) if name.endswith(".word_embeddings"): remove_hooks.append( module.register_forward_hook(word_embedding_coord_check_hook) From b240c19826825dfa7d1cff9f0760fe7bd1ba3acf Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 06:45:46 +0000 Subject: [PATCH 47/94] uncomment activation data --- megatron/mup_substitute.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 44e3e1d66..673ef40b1 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -111,8 +111,8 @@ def ffn_output_coord_check_hook(module, input, output): df["seed"].append(i) df["step"].append(step) df["we_act"].append(word_embedding_act_abs_mean) - # df["ao_act"].append(attn_output_act_abs_mean) - # df["fo_act"].append(ffn_output_act_abs_mean) + df["ao_act"].append(attn_output_act_abs_mean) + df["fo_act"].append(ffn_output_act_abs_mean) df["width"].append(width) import gc From 93b424165be4dd49731bf9507396ac23b9d3db2f Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 09:56:25 +0000 Subject: [PATCH 48/94] plot coords --- megatron/mup_substitute.py | 14 ++++++++------ megatron/training.py | 36 +++++++++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 673ef40b1..5c369ca29 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -30,7 +30,7 @@ def _get_coord_data( filter_module_by_name=None, fix_data=True, cuda=True, - nseeds=1, + nseeds=2, output_fdict=None, input_fdict=None, param_fdict=None, @@ -46,10 +46,11 @@ def _get_coord_data( "width": [], } - for i in range(nseeds): - torch.manual_seed(i) - for width, model in models.items(): - model = model() + for width, model_obj in models.items(): + for i in range(nseeds): + torch.manual_seed(10**i) + print_rank_0(f">>> Running Model with width: {width} on seed: {i}") + model = model_obj() model.train() neox_args.hidden_size = width optimizer = optcls(model) @@ -116,8 +117,9 @@ def ffn_output_coord_check_hook(module, input, output): df["width"].append(width) import gc - del model + del model, optimizer gc.collect() + torch.cuda.empty_cache() return pd.DataFrame(df) diff --git a/megatron/training.py b/megatron/training.py index 1396862d0..f3b5e0fc8 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -57,6 +57,31 @@ from megatron.model.gpt2_model import cross_entropy # from eval_tasks import run_eval_harness +import seaborn as sns +import matplotlib.pyplot as plt + + +def plot_coord_data(df, activation, graph_name): + + """If distributed is initialized print only on rank 0.""" + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + _plot_data(df, activation, graph_name) + else: + _plot_data(df, activation, graph_name) + + def _plot_data(df, activation, graph_name): + df = df.groupby(['step', 'width']).mean().reset_index() + sns.lineplot( + data=df, + x="width", y=activation, hue="step", errorbar=None, style="step", + marker="o", dashes=False, legend='full' + ) + plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0) + plt.savefig(f"{graph_name}.png") + return 0 + + return 0 def mup_weights_reinit(neox_args, model): def has_method(o, name): @@ -92,7 +117,7 @@ def gen(): models = {} # Hidden size needs to be divisible by num attention heads - for hidden_size in [2**p for p in range(8,14)]: + for hidden_size in [2**p for p in range(8,11)]: models[hidden_size] = 
lazy_model(hidden_size) print_rank_0(">>> Coord Check for mu Parameterization") @@ -106,12 +131,13 @@ def gen(): neox_args, timers, None, models, train_data_iterator, mup=False, optimizer="adam" ) - print_rank_0(df_mup) - # plot_coord_data(df_mup, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") - # plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") + df_mup.to_csv("df_mup.csv", index=False) + df_sp.to_csv("df_sp.csv", index=False) + for activation in ["we_act", "ao_act", "fo_act"]: + plot_coord_data(df_mup, activation, graph_name=f"coord_check_up.{activation}.jpg") + plot_coord_data(df_sp, activation, graph_name=f"coord_check_sp.{activation}.jpg") print_rank_0("Saved coord check plots... exiting") - import sys; sys.exit() return df_mup, df_sp def pretrain(neox_args): From d4899fc2d384ccb7d77f4ed2e96896baa400d2ce Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 13:14:42 +0000 Subject: [PATCH 49/94] removed variables, add way to plot only from rank 0 --- megatron/mup_substitute.py | 4 ++-- megatron/training.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 5c369ca29..a8534baae 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -241,6 +241,6 @@ def get_trainable(model): data = _get_coord_data( neox_args, timers, lr_scheduler, models, dataloader, optcls, **kwargs ) - data["optimizer"] = optimizer - data["lr"] = lr + # data["optimizer"] = optimizer + # data["lr"] = lr return data diff --git a/megatron/training.py b/megatron/training.py index f3b5e0fc8..b15d0b811 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -63,13 +63,6 @@ def plot_coord_data(df, activation, graph_name): - """If distributed is initialized print only on rank 0.""" - if torch.distributed.is_initialized(): - if torch.distributed.get_rank() == 0: - _plot_data(df, activation, graph_name) - else: - _plot_data(df, activation, graph_name) - def _plot_data(df, activation, graph_name): df = df.groupby(['step', 'width']).mean().reset_index() sns.lineplot( @@ -81,6 +74,13 @@ def _plot_data(df, activation, graph_name): plt.savefig(f"{graph_name}.png") return 0 + """If distributed is initialized print only on rank 0.""" + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + _plot_data(df, activation, graph_name) + else: + _plot_data(df, activation, graph_name) + return 0 def mup_weights_reinit(neox_args, model): @@ -134,8 +134,8 @@ def gen(): df_mup.to_csv("df_mup.csv", index=False) df_sp.to_csv("df_sp.csv", index=False) for activation in ["we_act", "ao_act", "fo_act"]: - plot_coord_data(df_mup, activation, graph_name=f"coord_check_up.{activation}.jpg") - plot_coord_data(df_sp, activation, graph_name=f"coord_check_sp.{activation}.jpg") + plot_coord_data(df_mup, activation, graph_name=f"coord_check_mup-{activation}") + plot_coord_data(df_sp, activation, graph_name=f"coord_check_sp-{activation}") print_rank_0("Saved coord check plots... 
exiting") return df_mup, df_sp From f589e29d6a3a7354c1761cf8d29e197719f0e53e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 13:48:27 +0000 Subject: [PATCH 50/94] changed key name in dict --- megatron/mup_substitute.py | 30 +++++++++++++++++++++++------- megatron/training.py | 29 ++++++++++++++++++++--------- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index a8534baae..fb9511d56 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -40,9 +40,10 @@ def _get_coord_data( df = { "seed": [], "step": [], - "we_act": [], - "ao_act": [], - "fo_act": [], + "word_embedding_act_abs_mean": [], + "attn_output_act_abs_mean": [], + "ffn_output_act_abs_mean": [], + "output_logits_act_abs_mean": [], "width": [], } @@ -60,6 +61,7 @@ def _get_coord_data( word_embedding_act_abs_mean_list = [] attn_output_act_abs_mean_list = [] ffn_output_act_abs_mean_list = [] + output_logits_act_abs_mean_list = [] remove_hooks = [] def word_embedding_coord_check_hook(module, input, output): @@ -74,7 +76,14 @@ def ffn_output_coord_check_hook(module, input, output): with torch.no_grad(): ffn_output_act_abs_mean_list.append(output[0].abs().mean().item()) + def output_logits_coord_check_hook(module, input, output): + with torch.no_grad(): + # print("output_logits_coord_check_hook") + # print_rank_0(output.shape) + output_logits_act_abs_mean_list.append(output[0].abs().mean().item()) + for name, module in model.named_modules(): + print_rank_0(name) if name.endswith(".word_embeddings"): remove_hooks.append( module.register_forward_hook(word_embedding_coord_check_hook) @@ -87,6 +96,10 @@ def ffn_output_coord_check_hook(module, input, output): remove_hooks.append( module.register_forward_hook(ffn_output_coord_check_hook) ) + elif name.endswith(".final_linear"): + remove_hooks.append( + module.register_forward_hook(output_logits_coord_check_hook) + ) # train for a step train_step( @@ -101,6 +114,7 @@ def ffn_output_coord_check_hook(module, input, output): word_embedding_act_abs_mean = None attn_output_act_abs_mean = None ffn_output_act_abs_mean = None + output_logits_act_abs_mean = None # remove hooks for handle in remove_hooks: @@ -108,12 +122,14 @@ def ffn_output_coord_check_hook(module, input, output): word_embedding_act_abs_mean = np.mean(word_embedding_act_abs_mean_list) attn_output_act_abs_mean = np.mean(attn_output_act_abs_mean_list) ffn_output_act_abs_mean = np.mean(ffn_output_act_abs_mean_list) + output_logits_act_abs_mean = np.mean(output_logits_act_abs_mean_list) df["seed"].append(i) - df["step"].append(step) - df["we_act"].append(word_embedding_act_abs_mean) - df["ao_act"].append(attn_output_act_abs_mean) - df["fo_act"].append(ffn_output_act_abs_mean) + df["step"].append(f"t={step}") + df["word_embedding_act_abs_mean"].append(word_embedding_act_abs_mean) + df["attn_output_act_abs_mean"].append(attn_output_act_abs_mean) + df["ffn_output_act_abs_mean"].append(ffn_output_act_abs_mean) + df["output_logits_act_abs_mean"].append(output_logits_act_abs_mean) df["width"].append(width) import gc diff --git a/megatron/training.py b/megatron/training.py index b15d0b811..15f16a56c 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -61,9 +61,9 @@ import matplotlib.pyplot as plt -def plot_coord_data(df, activation, graph_name): +def plot_coord_data(df, graph_name_prefix, mup=True): - def _plot_data(df, activation, graph_name): + def _plot_data(df, activation, graph_name_prefix): df = df.groupby(['step', 
'width']).mean().reset_index() sns.lineplot( data=df, @@ -71,15 +71,28 @@ def _plot_data(df, activation, graph_name): marker="o", dashes=False, legend='full' ) plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0) - plt.savefig(f"{graph_name}.png") + plt.xlabel("Width") + plt.ylabel("Activation with {}".format("muP" if mup else "SP")) + plt.title(f"{activation}") + plt.savefig(f"{graph_name_prefix}-{activation}.png") + plt.close() + return 0 + activation_list = [ + "word_embedding_act_abs_mean", + "attn_output_act_abs_mean", + "ffn_output_act_abs_mean", + "output_logits_act_abs_mean", + ] """If distributed is initialized print only on rank 0.""" if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: - _plot_data(df, activation, graph_name) + for activation in activation_list: + _plot_data(df, activation, graph_name_prefix) else: - _plot_data(df, activation, graph_name) + for activation in activation_list: + _plot_data(df, activation, graph_name_prefix) return 0 @@ -115,7 +128,6 @@ def gen(): return gen models = {} - # Hidden size needs to be divisible by num attention heads for hidden_size in [2**p for p in range(8,11)]: models[hidden_size] = lazy_model(hidden_size) @@ -133,9 +145,8 @@ def gen(): df_mup.to_csv("df_mup.csv", index=False) df_sp.to_csv("df_sp.csv", index=False) - for activation in ["we_act", "ao_act", "fo_act"]: - plot_coord_data(df_mup, activation, graph_name=f"coord_check_mup-{activation}") - plot_coord_data(df_sp, activation, graph_name=f"coord_check_sp-{activation}") + plot_coord_data(df_mup, graph_name_prefix=f"coord_check_mup", mup=True) + plot_coord_data(df_sp, graph_name_prefix=f"coord_check_sp", mup=False) print_rank_0("Saved coord check plots... exiting") return df_mup, df_sp From 8261e0dc35fd679676e360eb8504619ceae0f86a Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 13:50:51 +0000 Subject: [PATCH 51/94] remove print --- megatron/mup_substitute.py | 1 - 1 file changed, 1 deletion(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index fb9511d56..8cf29973d 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -83,7 +83,6 @@ def output_logits_coord_check_hook(module, input, output): output_logits_act_abs_mean_list.append(output[0].abs().mean().item()) for name, module in model.named_modules(): - print_rank_0(name) if name.endswith(".word_embeddings"): remove_hooks.append( module.register_forward_hook(word_embedding_coord_check_hook) From 25aa786c334384daec9bb3b4cada2f5bb26b615e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 14:49:48 +0000 Subject: [PATCH 52/94] fix how width_multiplier is applied --- megatron/mup_substitute.py | 1 + megatron/training.py | 41 +++++++++++++++++++------------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 8cf29973d..a8931e249 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -253,6 +253,7 @@ def get_trainable(model): elif optimizer is None: raise ValueError("optimizer should be sgd|adam|adamw or a custom function") + neox_args.use_mup = mup data = _get_coord_data( neox_args, timers, lr_scheduler, models, dataloader, optcls, **kwargs ) diff --git a/megatron/training.py b/megatron/training.py index 15f16a56c..d8b5e779f 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -116,6 +116,7 @@ def lazy_model(hidden_size): def gen(): old_hidden_size = neox_args.hidden_size neox_args.hidden_size = 
hidden_size + neox_args.mup_width_multiplier = None model, *_ = setup_model_and_optimizer( neox_args=neox_args, use_cache=False @@ -129,7 +130,7 @@ def gen(): models = {} # Hidden size needs to be divisible by num attention heads - for hidden_size in [2**p for p in range(8,11)]: + for hidden_size in [2**p for p in range(7,11)]: models[hidden_size] = lazy_model(hidden_size) print_rank_0(">>> Coord Check for mu Parameterization") @@ -173,26 +174,20 @@ def pretrain(neox_args): # Initialize and get arguments, timers, and Tensorboard writer. initialize_megatron(neox_args=neox_args) - if neox_args.use_mup: - - if neox_args.mup_width_multiplier is None: - neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base - print_rank_0(f"mup_width_multiplier set to {neox_args.mup_width_multiplier}") - - if neox_args.coord_check: - print_rank_0("---- Do muP Coord Check ----") - # Data stuff - neox_args.iteration = 0 - timers("train/valid/test data iterators").start() - ( - train_data_iterator, - valid_data_iterator, - test_data_iterator, - ) = build_train_valid_test_data_iterators(neox_args=neox_args) - timers("train/valid/test data iterators").stop() - - df_mup, df_sp = mup_coord_check(neox_args, timers, train_data_iterator) - sys.exit() + if neox_args.use_mup and neox_args.coord_check: + print_rank_0("---- Do muP Coord Check ----") + # Data stuff + neox_args.iteration = 0 + timers("train/valid/test data iterators").start() + ( + train_data_iterator, + valid_data_iterator, + test_data_iterator, + ) = build_train_valid_test_data_iterators(neox_args=neox_args) + timers("train/valid/test data iterators").stop() + + df_mup, df_sp = mup_coord_check(neox_args, timers, train_data_iterator) + sys.exit() # Model, optimizer, and learning rate. timers("model and optimizer").start() @@ -623,6 +618,10 @@ def get_learning_rate_scheduler(optimizer, neox_args): def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): """Setup model and optimizer.""" + if neox_args.mup_width_multiplier is None: + neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base + print_rank_0(f"mup_width_multiplier set to {neox_args.mup_width_multiplier}") + model = get_model(neox_args=neox_args, use_cache=use_cache) optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) From 4d246a15c1dc6fe504a9de5c50d9f9e99e0a3cac Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 22 Feb 2024 15:54:51 +0000 Subject: [PATCH 53/94] updated plot config --- megatron/mup_substitute.py | 5 +++-- megatron/training.py | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index a8931e249..7cd5cb1ea 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -124,7 +124,7 @@ def output_logits_coord_check_hook(module, input, output): output_logits_act_abs_mean = np.mean(output_logits_act_abs_mean_list) df["seed"].append(i) - df["step"].append(f"t={step}") + df["step"].append(step) df["word_embedding_act_abs_mean"].append(word_embedding_act_abs_mean) df["attn_output_act_abs_mean"].append(attn_output_act_abs_mean) df["ffn_output_act_abs_mean"].append(ffn_output_act_abs_mean) @@ -134,7 +134,8 @@ def output_logits_coord_check_hook(module, input, output): import gc del model, optimizer gc.collect() - torch.cuda.empty_cache() + with torch.no_grad(): + torch.cuda.empty_cache() return pd.DataFrame(df) diff --git 
a/megatron/training.py b/megatron/training.py index d8b5e779f..c5afd7a94 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -65,12 +65,14 @@ def plot_coord_data(df, graph_name_prefix, mup=True): def _plot_data(df, activation, graph_name_prefix): df = df.groupby(['step', 'width']).mean().reset_index() + sns.color_palette("magma") sns.lineplot( data=df, x="width", y=activation, hue="step", errorbar=None, style="step", marker="o", dashes=False, legend='full' ) plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0) + plt.tight_layout() plt.xlabel("Width") plt.ylabel("Activation with {}".format("muP" if mup else "SP")) plt.title(f"{activation}") From 84c5380c1989c1ef90ec9c3915cd17217fd1d895 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 26 Feb 2024 15:39:57 +0000 Subject: [PATCH 54/94] update files --- configs/coord_check.yml | 113 +++++++++++++++++++++++++++++++++++++ megatron/mup_substitute.py | 8 ++- 2 files changed, 119 insertions(+), 2 deletions(-) create mode 100644 configs/coord_check.yml diff --git a/configs/coord_check.yml b/configs/coord_check.yml new file mode 100644 index 000000000..299eab290 --- /dev/null +++ b/configs/coord_check.yml @@ -0,0 +1,113 @@ +{ + # parallelism settings + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 8, + "num_attention_heads": 8, + "seq_length": 128, + "max_position_embeddings": 128, + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + # "attention_config": [[["flash"], 8]], + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + + # init methods + "init_method": "normal", + "output_layer_init_method": "scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.006, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + # "min_lr": 0.006, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "gradient_accumulation_steps": 8, + "data_impl": "mmap", + "num_workers": 1, + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.0, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "precision": "fp32", + # "fp16": { + # "fp16": true, + # "enabled": true, + # "loss_scale": 0, + # "loss_scale_window": 1000, + # "initial_scale_power": 12, + # "hysteresis": 2, + # "min_loss_scale": 1, + # }, + + # misc. 
training settings + "train_iters": 300, + "lr_decay_iters": 300, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 300, + "eval_interval": 300, + "eval_iters": 10, + + # logging + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + "tokenizer_type": "HFTokenizer", + "vocab-file": "/weka/lintangsutawika/09-mup-neox/20B_tokenizer.json", + + "coord_check": true, + "use_mup": true, + # sigma_base + "init_method_std": 0.08, + # "mup_embedding_multiplier": 5, + # "mup_output_multiplier": 1, + # "mup_width_multiplier": 1, + "mup_d_model_base": 128, + "hidden_size": 128, + + "data-path": "/weka/lintangsutawika/09-mup-neox/data/enwik8/enwik8_text_document", + + # "launcher": "slurm", + # "deepspeed_slurm": true, + +} diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 7cd5cb1ea..a01a9bec4 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -2,6 +2,7 @@ Helper functions for performing coord check. """ import os +import gc from copy import copy from itertools import product @@ -30,7 +31,7 @@ def _get_coord_data( filter_module_by_name=None, fix_data=True, cuda=True, - nseeds=2, + nseeds=10, output_fdict=None, input_fdict=None, param_fdict=None, @@ -131,12 +132,15 @@ def output_logits_coord_check_hook(module, input, output): df["output_logits_act_abs_mean"].append(output_logits_act_abs_mean) df["width"].append(width) - import gc del model, optimizer gc.collect() with torch.no_grad(): torch.cuda.empty_cache() + gc.collect() + with torch.no_grad(): + torch.cuda.empty_cache() + return pd.DataFrame(df) From 42d4cdea9b1833df9c175dd6ac28d970bd0ec11f Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 26 Feb 2024 15:47:02 +0000 Subject: [PATCH 55/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 90 +++++++++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 23 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index e74993c0c..d79034a14 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -7,7 +7,7 @@ LR Scheduler Arguments -- **lr_decay_style**: typing.Literal['constant', 'linear', 'cosine', 'exponential'] +- **lr_decay_style**: Literal Default = linear @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 211e726 + Default = b2f1101 current git hash of repository @@ -253,7 +253,7 @@ Model Arguments -- **precision**: typing.Literal['fp16', 'fp32', 'bfloat16'] +- **precision**: Literal Default = None @@ -274,6 +274,17 @@ Model Arguments Default = None Transformer hidden size. + When using muP, this is d_model + + + +- **intermediate_size**: int + + Default = None + + Transformer intermediate size. Currently only used for "mlp_type": "llama". + + If not passed, will be set to a reasonable default. @@ -283,6 +294,22 @@ Model Arguments Number of transformer attention heads. + If num_kv_heads is set, will control only number of query heads. + + + +- **num_kv_heads**: int + + Default = None + + Number of transformer key/value attention heads. + + If set to None or the same value as num_attention_heads, will perform multi-head attention (MHA). + If set to < num_attention_heads but > 1, will perform grouped-query attention (GQA) (https://arxiv.org/pdf/2305.13245.pdf) + If set to 1, will perform multi-query attention. + + Must be < num_attention_heads and divide num_attention_heads evenly. 
+ - **seq_length**: int @@ -293,6 +320,14 @@ Model Arguments +- **sliding_window_width**: int + + Default = None + + Width of the attention sliding window. Only supported with Flash Attention 2. + + + - **max_position_embeddings**: int Default = None @@ -301,7 +336,7 @@ Model Arguments -- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm'] +- **norm**: Literal Default = layernorm @@ -349,7 +384,7 @@ Model Arguments -- **pos_emb**: typing.Literal['learned', 'rotary', 'sinusoidal', 'rpe', 'alibi', 'none'] +- **pos_emb**: Literal Default = learned @@ -463,7 +498,7 @@ Model Arguments -- **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish', 'silu'] +- **activation**: Literal Default = gelu @@ -568,7 +603,7 @@ Model Arguments -- **init_method**: typing.Literal['normal', 'scaled_normal', 'orthogonal', 'scaled_orthogonal', 'xavier_uniform', 'xavier_normal', 'wang_init', 'small_init'] +- **init_method**: Literal Default = normal @@ -577,7 +612,7 @@ Model Arguments -- **output_layer_init_method**: typing.Literal['normal', 'scaled_normal', 'orthogonal', 'scaled_orthogonal', 'xavier_uniform', 'xavier_normal', 'wang_init', 'small_init'] +- **output_layer_init_method**: Literal Default = scaled_normal @@ -660,7 +695,7 @@ Model Arguments -- **output_layer_parallelism**: typing.Literal['column'] +- **output_layer_parallelism**: Literal Default = column @@ -674,7 +709,7 @@ Optimizer Arguments -- **optimizer_type**: typing.Literal['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3', 'madgrad_wd', 'sgd', 'lion'] +- **optimizer_type**: Literal Default = adam @@ -691,7 +726,7 @@ Optimizer Arguments -- **zero_stage**: typing.Union[int, typing.List[int], typing.Literal['all']] +- **zero_stage**: Union Default = None @@ -736,7 +771,7 @@ Optimizer Arguments Default = None Max Learning rate during training - When using muP, this is the base lr + When using muP, this is the base learning rate @@ -1026,7 +1061,7 @@ Text Generation arguments - **prompt_end**: str - Default = + Default = a single prompt's end. Defaults to newline @@ -1068,7 +1103,7 @@ Text Generation arguments - **eval_results_prefix**: str - Default = + Default = prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json @@ -1090,7 +1125,7 @@ Tokenizer Arguments -- **tokenizer_type**: typing.Literal['GPT2BPETokenizer', 'HFTokenizer', 'HFGPT2Tokenizer', 'SPMTokenizer', 'CharLevelTokenizer', 'TiktokenTokenizer'] +- **tokenizer_type**: Literal Default = GPT2BPETokenizer @@ -1221,7 +1256,7 @@ Training Arguments -- **data_impl**: typing.Literal['infer', 'mmap', 'cached'] +- **data_impl**: Literal Default = infer @@ -1285,7 +1320,7 @@ Training Arguments -- **checkpoint_scale**: typing.Literal['linear', 'log'] +- **checkpoint_scale**: Literal Default = linear @@ -1617,17 +1652,25 @@ Training Arguments -- **mup_embedding_multiplier**: int +- **mup_embedding_multiplier**: float - Default = 1 + Default = 1.0 Embedding output multiplier -- **mup_width_multiplier**: int +- **mup_output_multiplier**: float - Default = 1 + Default = 1.0 + + Output logits multiplier + + + +- **mup_width_multiplier**: float + + Default = None Manually set the layer width multiplier (d_model/d_model,base) @@ -1788,7 +1831,7 @@ Args for deepspeed config Default = None - + @@ -2035,7 +2078,7 @@ Args for deepspeed runner (deepspeed.launcher.runner). 
-- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm'] +- **launcher**: Literal Default = pdsh @@ -2088,3 +2131,4 @@ Args for deepspeed runner (deepspeed.launcher.runner). Default = None Adds a `--account` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard. + From 4c477d51a4b563375b1a7567e853b641ff1fff21 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 02:50:27 +0000 Subject: [PATCH 56/94] init function, add input embedding different initialization --- megatron/model/gpt2_model.py | 4 ++-- megatron/model/init_functions.py | 26 ++++++++++++++++---------- megatron/mup_substitute.py | 4 +++- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index f3ccbdf6b..652004c8b 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -118,7 +118,7 @@ def __init__( self.parallel_output = parallel_output self.hidden_size = self.neox_args.hidden_size self.num_tokentypes = num_tokentypes - self.init_method, self.output_layer_init_method = get_init_methods( + self.init_method, self.input_embedding_init_method, self.output_layer_init_method = get_init_methods( self.neox_args ) self.__topology__ = topology @@ -188,7 +188,7 @@ def init_specs(self): self.neox_args.padded_vocab_size, self.neox_args.max_position_embeddings, self.neox_args.hidden_dropout, - self.init_method, + self.input_embedding_init_method, self.num_tokentypes, tied_weight_attr="word_embeddings_weight", ) diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index fc4d23ea6..7554a7b94 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -142,35 +142,41 @@ def init_(tensor, mup_width_multiplier=mup_width_multiplier): def get_init_methods(args): - def _get(name): + def _get(name, use_mup=False): if name == "normal": + sigma = args.init_method_std + if use_mup: + sigma = sigma/math.sqrt(args.mup_width_multiplier) return init_method_normal( - sigma=args.init_method_std/math.sqrt(args.mup_width_multiplier) + sigma=sigma, ) elif name == "scaled_normal": + sigma = args.init_method_std + if use_mup: + sigma = sigma/math.sqrt(args.mup_width_multiplier) return scaled_init_method_normal( - sigma=args.init_method_std/math.sqrt(args.mup_width_multiplier), + sigma=sigma, num_layers=args.num_layers ) elif name == "orthogonal": - return orthogonal_init_method(args.mup_width_multiplier) + return orthogonal_init_method(args.mup_width_multiplier if use_mup else 1.0) elif name == "scaled_orthogonal": return orthogonal_init_method( - args.num_layers, args.mup_width_multiplier + args.num_layers, args.mup_width_multiplier if use_mup else 1.0 ) elif name == "xavier_uniform": - return xavier_uniform_init_method(args.mup_width_multiplier) + return xavier_uniform_init_method(args.mup_width_multiplier if use_mup else 1.0) elif name == "xavier_normal": - return xavier_normal_init_method(args.mup_width_multiplier) + return xavier_normal_init_method(args.mup_width_multiplier if use_mup else 1.0) elif name == "wang_init": return wang_init_method( - args.num_layers, args.hidden_size, args.mup_width_multiplier + args.num_layers, args.hidden_size, args.mup_width_multiplier if use_mup else 1.0 ) elif name == "small_init": return small_init_init_method( - args.hidden_size, args.mup_width_multiplier + args.hidden_size, args.mup_width_multiplier if use_mup else 1.0 ) else: raise 
NotImplementedError(f"Unknown init method {name}") - return _get(args.init_method), _get(args.output_layer_init_method) + return _get(args.init_method, use_mup=args.use_mup), _get(args.init_method), _get(args.output_layer_init_method) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index a01a9bec4..520266b78 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -31,7 +31,7 @@ def _get_coord_data( filter_module_by_name=None, fix_data=True, cuda=True, - nseeds=10, + nseeds=2, output_fdict=None, input_fdict=None, param_fdict=None, @@ -47,6 +47,8 @@ def _get_coord_data( "output_logits_act_abs_mean": [], "width": [], } + with torch.no_grad(): + torch.cuda.empty_cache() for width, model_obj in models.items(): for i in range(nseeds): From 65c103e5676a58ff4f1d2d35122c0ef45e6bd740 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 15:16:34 +0000 Subject: [PATCH 57/94] changeoutput layer to normal --- configs/coord_check.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/coord_check.yml b/configs/coord_check.yml index 299eab290..05e7b6bb8 100644 --- a/configs/coord_check.yml +++ b/configs/coord_check.yml @@ -22,7 +22,7 @@ # init methods "init_method": "normal", - "output_layer_init_method": "scaled_normal", + "output_layer_init_method": "normal", # optimizer settings "optimizer": { @@ -48,8 +48,8 @@ }, # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "gradient_accumulation_steps": 8, + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 32, "data_impl": "mmap", "num_workers": 1, From 08b5d40a7c7bc115d5f45b3f3b97cc6b40b6f1cf Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 15:17:41 +0000 Subject: [PATCH 58/94] change from mean to std --- megatron/mup_substitute.py | 52 +++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 520266b78..87cff559d 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -31,7 +31,7 @@ def _get_coord_data( filter_module_by_name=None, fix_data=True, cuda=True, - nseeds=2, + nseeds=10, output_fdict=None, input_fdict=None, param_fdict=None, @@ -41,10 +41,10 @@ def _get_coord_data( df = { "seed": [], "step": [], - "word_embedding_act_abs_mean": [], - "attn_output_act_abs_mean": [], - "ffn_output_act_abs_mean": [], - "output_logits_act_abs_mean": [], + "word_embedding_act_abs_std": [], + "attn_output_act_abs_std": [], + "ffn_output_act_abs_std": [], + "output_logits_act_abs_std": [], "width": [], } with torch.no_grad(): @@ -53,7 +53,7 @@ def _get_coord_data( for width, model_obj in models.items(): for i in range(nseeds): torch.manual_seed(10**i) - print_rank_0(f">>> Running Model with width: {width} on seed: {i}") + print_rank_0(f">>> Running Model with width: {width} on seed: {i}\n") model = model_obj() model.train() neox_args.hidden_size = width @@ -61,29 +61,29 @@ def _get_coord_data( for step in range(nsteps + 1): - word_embedding_act_abs_mean_list = [] - attn_output_act_abs_mean_list = [] - ffn_output_act_abs_mean_list = [] - output_logits_act_abs_mean_list = [] + word_embedding_act_abs_std_list = [] + attn_output_act_abs_std_list = [] + ffn_output_act_abs_std_list = [] + output_logits_act_abs_std_list = [] remove_hooks = [] def word_embedding_coord_check_hook(module, input, output): with torch.no_grad(): - word_embedding_act_abs_mean_list.append(output.abs().mean().item()) + 
word_embedding_act_abs_std_list.append(output.abs().std().item()) def attn_output_coord_check_hook(module, input, output): with torch.no_grad(): - attn_output_act_abs_mean_list.append(output[0].abs().mean().item()) + attn_output_act_abs_std_list.append(output[0].abs().std().item()) def ffn_output_coord_check_hook(module, input, output): with torch.no_grad(): - ffn_output_act_abs_mean_list.append(output[0].abs().mean().item()) + ffn_output_act_abs_std_list.append(output[0].abs().std().item()) def output_logits_coord_check_hook(module, input, output): with torch.no_grad(): # print("output_logits_coord_check_hook") # print_rank_0(output.shape) - output_logits_act_abs_mean_list.append(output[0].abs().mean().item()) + output_logits_act_abs_std_list.append(output[0].abs().std().item()) for name, module in model.named_modules(): if name.endswith(".word_embeddings"): @@ -113,25 +113,25 @@ def output_logits_coord_check_hook(module, input, output): lr_scheduler=lr_scheduler, ) - word_embedding_act_abs_mean = None - attn_output_act_abs_mean = None - ffn_output_act_abs_mean = None - output_logits_act_abs_mean = None + word_embedding_act_abs_std = None + attn_output_act_abs_std = None + ffn_output_act_abs_std = None + output_logits_act_abs_std = None # remove hooks for handle in remove_hooks: handle.remove() - word_embedding_act_abs_mean = np.mean(word_embedding_act_abs_mean_list) - attn_output_act_abs_mean = np.mean(attn_output_act_abs_mean_list) - ffn_output_act_abs_mean = np.mean(ffn_output_act_abs_mean_list) - output_logits_act_abs_mean = np.mean(output_logits_act_abs_mean_list) + word_embedding_act_abs_std = np.mean(word_embedding_act_abs_std_list) + attn_output_act_abs_std = np.mean(attn_output_act_abs_std_list) + ffn_output_act_abs_std = np.mean(ffn_output_act_abs_std_list) + output_logits_act_abs_std = np.mean(output_logits_act_abs_std_list) df["seed"].append(i) df["step"].append(step) - df["word_embedding_act_abs_mean"].append(word_embedding_act_abs_mean) - df["attn_output_act_abs_mean"].append(attn_output_act_abs_mean) - df["ffn_output_act_abs_mean"].append(ffn_output_act_abs_mean) - df["output_logits_act_abs_mean"].append(output_logits_act_abs_mean) + df["word_embedding_act_abs_std"].append(word_embedding_act_abs_std) + df["attn_output_act_abs_std"].append(attn_output_act_abs_std) + df["ffn_output_act_abs_std"].append(ffn_output_act_abs_std) + df["output_logits_act_abs_std"].append(output_logits_act_abs_std) df["width"].append(width) del model, optimizer From 2ca94a8598b1a018d59d3c7b337dc8cff0b7e7d2 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 15:18:58 +0000 Subject: [PATCH 59/94] double attention head for every hidden size doubled --- megatron/training.py | 53 +++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 050d39fb5..0d4cfcab7 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -74,7 +74,7 @@ def _plot_data(df, activation, graph_name_prefix): marker="o", dashes=False, legend='full' ) plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0) - plt.tight_layout() + plt.tight_layout(pad=3.0) plt.xlabel("Width") plt.ylabel("Activation with {}".format("muP" if mup else "SP")) plt.title(f"{activation}") @@ -116,44 +116,49 @@ def has_method(o, name): def mup_coord_check(neox_args, timers, train_data_iterator): from megatron.mup_substitute import get_coord_data - def lazy_model(hidden_size): + def lazy_model(hidden_size, attention_head): 
def gen(): old_hidden_size = neox_args.hidden_size + old_num_attention_heads = neox_args.num_attention_heads neox_args.hidden_size = hidden_size + neox_args.num_attention_heads = attention_head neox_args.mup_width_multiplier = None - model, *_ = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) neox_args.hidden_size = old_hidden_size - + neox_args.num_attention_heads = old_num_attention_heads return model return gen models = {} # Hidden size needs to be divisible by num attention heads - for hidden_size in [2**p for p in range(7,11)]: - models[hidden_size] = lazy_model(hidden_size) + for idx, hidden_size in enumerate([2**p for p in range(7,12)]): + models[hidden_size] = lazy_model( + hidden_size, + neox_args.num_attention_heads*(2**idx) + ) + + # print_rank_0(">>> Coord Check for mu Parameterization") + # neox_args.use_mup = True + # df_mup = get_coord_data( + # neox_args, timers, None, models, train_data_iterator, mup=True, optimizer="adam" + # ) + # df_mup.to_csv("df_mup.csv", index=False) + # plot_coord_data(df_mup, graph_name_prefix=f"coord_check_mup", mup=True) - print_rank_0(">>> Coord Check for mu Parameterization") - neox_args.use_mup = True - df_mup = get_coord_data( - neox_args, timers, None, models, train_data_iterator, mup=True, optimizer="adam" - ) print_rank_0(">>> Coord Check for standard Parameterization") neox_args.use_mup = False df_sp = get_coord_data( neox_args, timers, None, models, train_data_iterator, mup=False, optimizer="adam" ) - - df_mup.to_csv("df_mup.csv", index=False) df_sp.to_csv("df_sp.csv", index=False) - plot_coord_data(df_mup, graph_name_prefix=f"coord_check_mup", mup=True) plot_coord_data(df_sp, graph_name_prefix=f"coord_check_sp", mup=False) - print_rank_0("Saved coord check plots... exiting") + print_rank_0("Saved coord check plots... exiting") + import sys; sys.exit() return df_mup, df_sp def pretrain(neox_args): @@ -190,7 +195,7 @@ def pretrain(neox_args): ) = build_train_valid_test_data_iterators(neox_args=neox_args) timers("train/valid/test data iterators").stop() - df_mup, df_sp = mup_coord_check(neox_args, timers, train_data_iterator) + mup_coord_check(neox_args, timers, train_data_iterator) sys.exit() # Model, optimizer, and learning rate. 
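The sweep in the hunk above doubles the attention-head count each time the hidden size doubles, so the per-head dimension stays fixed while width grows. A minimal sketch of the resulting shapes, assuming the 8-head base used in the coord-check configs (illustrative only, not part of the patch):

    # widths and head counts follow the loop above
    base_heads = 8
    for idx, hidden_size in enumerate(2**p for p in range(7, 12)):
        heads = base_heads * (2**idx)
        print((hidden_size, heads, hidden_size // heads))
    # (128, 8, 16) (256, 16, 16) (512, 32, 16) (1024, 64, 16) (2048, 128, 16)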
@@ -534,13 +539,15 @@ def get_optimizer(model, neox_args): # Use Adam if neox_args.use_mup: # try: - # from mup import MuAdam - - # adam_optimizer = MuAdam + # # from mup import MuAdam + # # adam_optimizer = MuAdam + # # except ModuleNotFoundError: + # # print("Please install mup https://github.com/microsoft/mup") + # # raise Exception + # from deepspeed.ops.adam import FusedAdam as Adam + # adam_optimizer = Adam # except ModuleNotFoundError: - # print("Please install mup https://github.com/microsoft/mup") - # raise Exception - # from deepspeed.ops.adam import FusedAdam as Adam + # from apex.optimizers import FusedAdam as Adam # adam_optimizer = Adam adam_optimizer = torch.optim.Adam else: @@ -642,7 +649,7 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): """Setup model and optimizer.""" if neox_args.mup_width_multiplier is None: neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base - print_rank_0(f"mup_width_multiplier set to {neox_args.mup_width_multiplier}") + print_rank_0(f">>> mup_width_multiplier set to {neox_args.mup_width_multiplier}") model = get_model(neox_args=neox_args, use_cache=use_cache) optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) From 497485ca059655392071c0ed3eb8077351341a4c Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 27 Feb 2024 15:21:42 +0000 Subject: [PATCH 60/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 40 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 684b58609..591bd9384 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -7,7 +7,7 @@ LR Scheduler Arguments -- **lr_decay_style**: Literal +- **lr_decay_style**: typing.Literal['constant', 'linear', 'cosine', 'exponential'] Default = linear @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 6bb4e62 + Default = 7483246 current git hash of repository @@ -253,7 +253,7 @@ Model Arguments -- **precision**: Literal +- **precision**: typing.Literal['fp16', 'fp32', 'bfloat16'] Default = None @@ -288,16 +288,6 @@ Model Arguments -- **intermediate_size**: int - - Default = None - - Transformer intermediate size. Currently only used for "mlp_type": "llama". - - If not passed, will be set to a reasonable default. 
- - - - **num_attention_heads**: int Default = None @@ -346,7 +336,7 @@ Model Arguments -- **norm**: Literal +- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm'] Default = layernorm @@ -394,7 +384,7 @@ Model Arguments -- **pos_emb**: Literal +- **pos_emb**: typing.Literal['learned', 'rotary', 'sinusoidal', 'rpe', 'alibi', 'none'] Default = learned @@ -508,7 +498,7 @@ Model Arguments -- **activation**: Literal +- **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish', 'silu'] Default = gelu @@ -613,7 +603,7 @@ Model Arguments -- **init_method**: Literal +- **init_method**: typing.Literal['normal', 'scaled_normal', 'orthogonal', 'scaled_orthogonal', 'xavier_uniform', 'xavier_normal', 'wang_init', 'small_init'] Default = normal @@ -622,7 +612,7 @@ Model Arguments -- **output_layer_init_method**: Literal +- **output_layer_init_method**: typing.Literal['normal', 'scaled_normal', 'orthogonal', 'scaled_orthogonal', 'xavier_uniform', 'xavier_normal', 'wang_init', 'small_init'] Default = scaled_normal @@ -705,7 +695,7 @@ Model Arguments -- **output_layer_parallelism**: Literal +- **output_layer_parallelism**: typing.Literal['column'] Default = column @@ -719,7 +709,7 @@ Optimizer Arguments -- **optimizer_type**: Literal +- **optimizer_type**: typing.Literal['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3', 'madgrad_wd', 'sgd', 'lion'] Default = adam @@ -736,7 +726,7 @@ Optimizer Arguments -- **zero_stage**: Union +- **zero_stage**: typing.Union[int, typing.List[int], typing.Literal['all']] Default = None @@ -1135,7 +1125,7 @@ Tokenizer Arguments -- **tokenizer_type**: Literal +- **tokenizer_type**: typing.Literal['GPT2BPETokenizer', 'HFTokenizer', 'HFGPT2Tokenizer', 'SPMTokenizer', 'CharLevelTokenizer', 'TiktokenTokenizer'] Default = GPT2BPETokenizer @@ -1266,7 +1256,7 @@ Training Arguments -- **data_impl**: Literal +- **data_impl**: typing.Literal['infer', 'mmap', 'cached'] Default = infer @@ -1330,7 +1320,7 @@ Training Arguments -- **checkpoint_scale**: Literal +- **checkpoint_scale**: typing.Literal['linear', 'log'] Default = linear @@ -2088,7 +2078,7 @@ Args for deepspeed runner (deepspeed.launcher.runner). -- **launcher**: Literal +- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm'] Default = pdsh From 34fb7ca9c23dab34a58a4dacecabfb1843878d5f Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 18:45:07 +0000 Subject: [PATCH 61/94] added args --- megatron/neox_arguments/neox_args.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 08075aa0f..c04d566de 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1103,6 +1103,16 @@ class NeoXArgsTraining(NeoXArgsTemplate): Whether to generate a "coord check" plot to verify mup's implementation in neox """ + coord_check_nsteps: int = 10 + """ + + """ + + coord_check_nseeds: int = 5 + """ + + """ + save_base_shapes: bool = False """ Whether to save base shapes for mup. This will save the shapes to the path specified in base-shapes-file. 
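The two arguments added above size the coord-check sweep: each (width, seed) pair is trained for coord_check_nsteps steps (the loop in mup_substitute.py records nsteps + 1 points), repeated over coord_check_nseeds seeds. A rough cost sketch, assuming the widths and config values used later in this series (illustrative only):

    widths = [2**p for p in range(8, 12)]   # 256..2048, as in the later training.py sweep
    nsteps, nseeds = 10, 3                  # values from configs/coord_check_{mup,sp}.yml
    runs = len(widths) * nseeds             # 12 short trainings per parameterization
    print(runs, "short trainings of", nsteps + 1, "recorded steps each")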
From 2d53f1f48f8eb10cdfc0cd7c210af14b985d083e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 18:45:28 +0000 Subject: [PATCH 62/94] simplify coordcheck --- megatron/mup_substitute.py | 189 +++++++------------------------------ 1 file changed, 34 insertions(+), 155 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 87cff559d..49eab375b 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -10,34 +10,20 @@ import pandas as pd import torch import torch.nn.functional as F - +import deepspeed from megatron import print_rank_0 from megatron.training import train_step -def _get_coord_data( +def get_coord_data( neox_args, timers, - lr_scheduler, models, dataloader, - optcls, nsteps=10, - dict_in_out=False, - flatten_input=False, - flatten_output=False, - output_name="loss", - lossfn="xent", - filter_module_by_name=None, - fix_data=True, - cuda=True, - nseeds=10, - output_fdict=None, - input_fdict=None, - param_fdict=None, - show_progress=True, - one_hot_target=False, + nseeds=2, ): + lr_scheduler = None df = { "seed": [], "step": [], @@ -54,10 +40,9 @@ def _get_coord_data( for i in range(nseeds): torch.manual_seed(10**i) print_rank_0(f">>> Running Model with width: {width} on seed: {i}\n") - model = model_obj() + model, optimizer = model_obj() model.train() neox_args.hidden_size = width - optimizer = optcls(model) for step in range(nsteps + 1): @@ -69,21 +54,19 @@ def _get_coord_data( def word_embedding_coord_check_hook(module, input, output): with torch.no_grad(): - word_embedding_act_abs_std_list.append(output.abs().std().item()) + word_embedding_act_abs_std_list.append(output.cpu().abs().std().item()) def attn_output_coord_check_hook(module, input, output): with torch.no_grad(): - attn_output_act_abs_std_list.append(output[0].abs().std().item()) + attn_output_act_abs_std_list.append(output[0].cpu().abs().std().item()) def ffn_output_coord_check_hook(module, input, output): with torch.no_grad(): - ffn_output_act_abs_std_list.append(output[0].abs().std().item()) + ffn_output_act_abs_std_list.append(output[0].cpu().abs().std().item()) def output_logits_coord_check_hook(module, input, output): with torch.no_grad(): - # print("output_logits_coord_check_hook") - # print_rank_0(output.shape) - output_logits_act_abs_std_list.append(output[0].abs().std().item()) + output_logits_act_abs_std_list.append(output[0].cpu().abs().std().item()) for name, module in model.named_modules(): if name.endswith(".word_embeddings"): @@ -134,136 +117,32 @@ def output_logits_coord_check_hook(module, input, output): df["output_logits_act_abs_std"].append(output_logits_act_abs_std) df["width"].append(width) - del model, optimizer + print_rank_0( + f">>> BEFORE Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 2}, reserved: {torch.cuda.memory_reserved() // 1024 ** 2}" + ) + def del_obj_attrs(obj): + attributes = [attr for attr in vars(obj) if not callable(getattr(obj, attr))] + for attr in attributes: + try: + delattr(obj,attr) + except: + pass + + def unlink_hp_params(lp_param_list): + for lp in lp_param_list: + lp._hp_mapping = None + return + + for i, _ in enumerate(optimizer.optimizer.param_groups): + unlink_hp_params(optimizer.bit16_groups[i]) + del_obj_attrs(optimizer) + model.destroy() + del optimizer gc.collect() - with torch.no_grad(): - torch.cuda.empty_cache() - - gc.collect() - with torch.no_grad(): torch.cuda.empty_cache() + deepspeed.runtime.utils.empty_cache() + print_rank_0( + f">>> AFTER Memory allocated: 
{torch.cuda.memory_allocated() / 1024 ** 2}, reserved: {torch.cuda.memory_reserved() // 1024 ** 2}" + ) return pd.DataFrame(df) - - -def get_coord_data( - neox_args, - timers, - lr_scheduler, - models, - dataloader, - optimizer="sgd", - lr=None, - mup=True, - filter_trainable_by_name=None, - **kwargs -): - """Get coord data for coord check. - Train the models in `models` with data from `dataloader` and optimizer - specified by `optimizer` and `lr` for `nsteps` steps, and record coordinate - statistics specified by `output_fdict`, `input_fdict`, `param_fdict`. By - default, only `l1` is computed for output activations of each module. - This function wraps around `_get_coord_data`, with the main difference being - user can specify common optimizers via a more convenient interface. - Inputs: - models: - a dict of lazy models, where the keys are numbers indicating width. - Each entry of `models` is a function that instantiates a model given - nothing. - dataloader: - an iterator whose elements are either Huggingface style dicts, if - `dict_in_out` is True, or (input, label). If `fix_data` is True - (which is the default), then only the first element of `dataloader` - is used in a loop and the rest of `dataloder` is ignored. - optimizer: - a string in `['sgd', 'adam', 'adamw']`, with default being `'sgd'`. - lr: - learning rate. By default is 0.1 for `'sgd'` and 1e-3 for others. - mup: - If True, then use the optimizer from `mup.optim`; otherwise, use the - one from `torch.optim`. - filter_trainable_by_name: - a function that returns a bool given module names (from - `model.named_modules()`), or None. If not None, then only modules - whose name yields True will be trained. - nsteps: - number of steps to train the model - dict_in_out: - whether the data loader contains Huggingface-style dict input and - output. Default: False - flatten_input: - if not `dict_in_out`, reshape the input to be - `input.view(input.shape[0], -1)`. Typically used for testing MLPs. - flatten_output: - if not `dict_in_out`, reshape the label to be `label.view(-1, - input.shape[-1])`. - output_name: - if `dict_in_out`, this is the key for the loss value if the output - is a dict. If the output is not a dict, then we assume the first - element of the output is the loss. - lossfn: - loss function to use if not `dict_in_out`. Can be either a string from - [`xent`, 'mse', 'nll', 'l1'] or a python `callable` such that - `lossfn(output, target)` returns the loss value. Examples of valid - `callable`s are `F.cross_entropy`, `F.mse_loss`, etc, where `F` is - `torch.nn.functional`. Default: 'xent' - filter_module_by_name: - a function that returns a bool given module names (from - `model.named_modules()`), or None. If not None, then only modules - whose name yields True will be recorded. - cuda: - whether to use cuda or not. Default: True - nseeds: - number of times to repeat the training, each with different seeds. - output_fdict, input_fdict, param_fdict: - function dicts to be used in `_record_coords`. By default, only `l1` - is computed for output activations of each module. - show_progress: - show progress using tqdm. Default: True - one_hot_target: - convert target label into a one-hot vector. This typically is only - used for `'mse'` or `'l1'` losses in classification tasks. - Default: False - Output: - a pandas DataFrame containing recorded results. The column names are - `'width', 'module', 't'` as well as names of statistics recorded, such - as `'l1'` (see `FDICT` for other premade statistics that can be - collected). 
- - Breaking Changes: - In v1.0.0, when `lossfn=='mse'`, the target is automatically converted - to a one hot vector before loss computation. Starting in v1.1.0, this - behavior is turned off, and the user needs to explicitly turn on this - behavior by setting `one_hot_target=True`. - """ - if lr is None: - lr = 0.1 if optimizer == "sgd" else 1e-3 - - from torch.optim import SGD, AdamW, Adam - # from deepspeed.ops.adam import FusedAdam as Adam - - def get_trainable(model): - params = model.parameters() - if filter_trainable_by_name is not None: - params = [] - for name, p in model.named_parameters(): - if filter_trainable_by_name(name): - params.append(p) - return params - - if optimizer == "sgd": - optcls = lambda model: SGD(get_trainable(model), lr=lr) - elif optimizer == "adam": - optcls = lambda model: Adam(get_trainable(model), lr=lr) - elif optimizer == "adamw": - optcls = lambda model: AdamW(get_trainable(model), lr=lr) - elif optimizer is None: - raise ValueError("optimizer should be sgd|adam|adamw or a custom function") - - neox_args.use_mup = mup - data = _get_coord_data( - neox_args, timers, lr_scheduler, models, dataloader, optcls, **kwargs - ) - # data["optimizer"] = optimizer - # data["lr"] = lr - return data From 789761017eb3494f777381700c9a6f8ceab4781e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 18:46:04 +0000 Subject: [PATCH 63/94] seperate sp and mup configs --- .../{coord_check.yml => coord_check_mup.yml} | 2 + configs/coord_check_sp.yml | 115 ++++++++++++++++++ 2 files changed, 117 insertions(+) rename configs/{coord_check.yml => coord_check_mup.yml} (98%) create mode 100644 configs/coord_check_sp.yml diff --git a/configs/coord_check.yml b/configs/coord_check_mup.yml similarity index 98% rename from configs/coord_check.yml rename to configs/coord_check_mup.yml index 05e7b6bb8..1a253c8ce 100644 --- a/configs/coord_check.yml +++ b/configs/coord_check_mup.yml @@ -96,6 +96,8 @@ "vocab-file": "/weka/lintangsutawika/09-mup-neox/20B_tokenizer.json", "coord_check": true, + "coord_check_nsteps": 10, + "coord_check_nseeds": 5, "use_mup": true, # sigma_base "init_method_std": 0.08, diff --git a/configs/coord_check_sp.yml b/configs/coord_check_sp.yml new file mode 100644 index 000000000..e878927df --- /dev/null +++ b/configs/coord_check_sp.yml @@ -0,0 +1,115 @@ +{ + # parallelism settings + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 8, + "num_attention_heads": 8, + "seq_length": 128, + "max_position_embeddings": 128, + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + # "attention_config": [[["flash"], 8]], + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + + # init methods + "init_method": "normal", + "output_layer_init_method": "normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.006, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + # "min_lr": 0.006, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + # batch / data settings + 
"train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 32, + "data_impl": "mmap", + "num_workers": 1, + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.0, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "precision": "fp32", + # "fp16": { + # "fp16": true, + # "enabled": true, + # "loss_scale": 0, + # "loss_scale_window": 1000, + # "initial_scale_power": 12, + # "hysteresis": 2, + # "min_loss_scale": 1, + # }, + + # misc. training settings + "train_iters": 300, + "lr_decay_iters": 300, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 300, + "eval_interval": 300, + "eval_iters": 10, + + # logging + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + "tokenizer_type": "HFTokenizer", + "vocab-file": "/weka/lintangsutawika/09-mup-neox/20B_tokenizer.json", + + "coord_check": true, + "coord_check_nsteps": 10, + "coord_check_nseeds": 5, + "use_mup": false, + # sigma_base + "init_method_std": 0.08, + # "mup_embedding_multiplier": 5, + # "mup_output_multiplier": 1, + # "mup_width_multiplier": 1, + "mup_d_model_base": 128, + "hidden_size": 128, + + "data-path": "/weka/lintangsutawika/09-mup-neox/data/enwik8/enwik8_text_document", + + # "launcher": "slurm", + # "deepspeed_slurm": true, + +} From 4f3920903bf6fd25efceabc386aec6818ace59cd Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 27 Feb 2024 18:46:32 +0000 Subject: [PATCH 64/94] perform coordcheck for sp and mup seperately --- megatron/training.py | 50 ++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index ccdcedcbb..5f189fa59 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -24,6 +24,7 @@ import math import sys +import gc import torch import deepspeed @@ -84,10 +85,10 @@ def _plot_data(df, activation, graph_name_prefix): return 0 activation_list = [ - "word_embedding_act_abs_mean", - "attn_output_act_abs_mean", - "ffn_output_act_abs_mean", - "output_logits_act_abs_mean", + "word_embedding_act_abs_std", + "attn_output_act_abs_std", + "ffn_output_act_abs_std", + "output_logits_act_abs_std", ] """If distributed is initialized print only on rank 0.""" if torch.distributed.is_initialized(): @@ -113,7 +114,7 @@ def has_method(o, name): layer.mup_reinitialize_weights(neox_args) -def mup_coord_check(neox_args, timers, train_data_iterator): +def coord_check(neox_args, timers, train_data_iterator): from megatron.mup_substitute import get_coord_data def lazy_model(hidden_size, attention_head): @@ -123,43 +124,38 @@ def gen(): neox_args.hidden_size = hidden_size neox_args.num_attention_heads = attention_head neox_args.mup_width_multiplier = None - model, *_ = setup_model_and_optimizer( + model, optimizer, _ = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) neox_args.hidden_size = old_hidden_size neox_args.num_attention_heads = old_num_attention_heads - return model + return model, optimizer return gen models = {} # Hidden size needs to be divisible by num attention heads - for idx, hidden_size in enumerate([2**p for p in range(7,12)]): + for idx, hidden_size in enumerate([2**p for p in range(8,12)]): models[hidden_size] = lazy_model( hidden_size, neox_args.num_attention_heads*(2**idx) ) - # print_rank_0(">>> Coord 
Check for mu Parameterization") - # neox_args.use_mup = True - # df_mup = get_coord_data( - # neox_args, timers, None, models, train_data_iterator, mup=True, optimizer="adam" - # ) - # df_mup.to_csv("df_mup.csv", index=False) - # plot_coord_data(df_mup, graph_name_prefix=f"coord_check_mup", mup=True) - - print_rank_0(">>> Coord Check for standard Parameterization") - neox_args.use_mup = False - df_sp = get_coord_data( - neox_args, timers, None, models, train_data_iterator, mup=False, optimizer="adam" + df_mode = "mup" if neox_args.use_mup else "sp" + if neox_args.use_mup: + print_rank_0(">>> Coord Check for mu Parameterization") + else: + print_rank_0(">>> Coord Check for standard Parameterization") + + df = get_coord_data( + neox_args, timers, models, train_data_iterator, neox_args.coord_check_nsteps, neox_args.coord_check_nseeds, ) - df_sp.to_csv("df_sp.csv", index=False) - plot_coord_data(df_sp, graph_name_prefix=f"coord_check_sp", mup=False) + df.to_csv(f"df_{df_mode}.csv", index=False) + plot_coord_data(df, graph_name_prefix=f"coord_check_{df_mode}", mup=neox_args.use_mup) print_rank_0("Saved coord check plots... exiting") - import sys; sys.exit() - return df_mup, df_sp + return 0 def pretrain(neox_args): """Main training program. @@ -183,8 +179,8 @@ def pretrain(neox_args): # Initialize and get arguments, timers, and Tensorboard writer. initialize_megatron(neox_args=neox_args) - if neox_args.use_mup and neox_args.coord_check: - print_rank_0("---- Do muP Coord Check ----") + if neox_args.coord_check: + print_rank_0("---- Do Coord Check ----") # Data stuff neox_args.iteration = 0 timers("train/valid/test data iterators").start() @@ -195,7 +191,7 @@ def pretrain(neox_args): ) = build_train_valid_test_data_iterators(neox_args=neox_args) timers("train/valid/test data iterators").stop() - mup_coord_check(neox_args, timers, train_data_iterator) + coord_check(neox_args, timers, train_data_iterator) sys.exit() # Model, optimizer, and learning rate. From 5f84a3f8553d9aed74590ca3983c609013108030 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 27 Feb 2024 18:47:16 +0000 Subject: [PATCH 65/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 591bd9384..0a1ed33fc 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 7483246 + Default = 4f39209 current git hash of repository @@ -1636,6 +1636,22 @@ Training Arguments +- **coord_check_nsteps**: int + + Default = 10 + + + + + +- **coord_check_nseeds**: int + + Default = 5 + + + + + - **save_base_shapes**: bool Default = False From 479b854af593259f8fa8a32bb426e8bdacd8030c Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 28 Feb 2024 15:22:32 +0000 Subject: [PATCH 66/94] update --- configs/coord_check_mup.yml | 19 +++++-------------- configs/coord_check_sp.yml | 19 +++++-------------- 2 files changed, 10 insertions(+), 28 deletions(-) diff --git a/configs/coord_check_mup.yml b/configs/coord_check_mup.yml index 1a253c8ce..d16e900cd 100644 --- a/configs/coord_check_mup.yml +++ b/configs/coord_check_mup.yml @@ -78,26 +78,17 @@ # }, # misc. 
training settings - "train_iters": 300, - "lr_decay_iters": 300, + "train_iters": 10, + "lr_decay_iters": 10, + "log_interval": 1, "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 300, - "eval_interval": 300, - "eval_iters": 10, - - # logging - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - + "lr_decay_style": "constant", "tokenizer_type": "HFTokenizer", "vocab-file": "/weka/lintangsutawika/09-mup-neox/20B_tokenizer.json", "coord_check": true, "coord_check_nsteps": 10, - "coord_check_nseeds": 5, + "coord_check_nseeds": 3, "use_mup": true, # sigma_base "init_method_std": 0.08, diff --git a/configs/coord_check_sp.yml b/configs/coord_check_sp.yml index e878927df..12c8165e1 100644 --- a/configs/coord_check_sp.yml +++ b/configs/coord_check_sp.yml @@ -78,26 +78,17 @@ # }, # misc. training settings - "train_iters": 300, - "lr_decay_iters": 300, + "train_iters": 10, + "lr_decay_iters": 10, + "log_interval": 1, "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 300, - "eval_interval": 300, - "eval_iters": 10, - - # logging - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - + "lr_decay_style": "constant", "tokenizer_type": "HFTokenizer", "vocab-file": "/weka/lintangsutawika/09-mup-neox/20B_tokenizer.json", "coord_check": true, "coord_check_nsteps": 10, - "coord_check_nseeds": 5, + "coord_check_nseeds": 3, "use_mup": false, # sigma_base "init_method_std": 0.08, From 21a7e32739d78f1ca806dd5ea112c2ebdc742240 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 28 Feb 2024 15:22:56 +0000 Subject: [PATCH 67/94] update how params are sorted --- megatron/model/utils.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 9e70b6cce..ef36aac8e 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -30,7 +30,7 @@ def get_params_for_weight_decay_optimization(module, neox_args): Layernorms and biases will have no weight decay but the rest will. """ weight_decay_params = {"params": [], "lr_adjust": True} - no_weight_decay_params = {"params": [], "lr_adjust": False, "weight_decay": 0.0} + no_weight_decay_params = {"params": [], "lr_adjust": True, "weight_decay": 0.0} embedding_weight_decay_params = {"params": [], "lr_adjust": False} embedding_no_weight_decay_params = {"params": [], "lr_adjust": False, "weight_decay": 0.0} @@ -44,9 +44,14 @@ def get_params_for_weight_decay_optimization(module, neox_args): ) or ( neox_args.weight_decay == 0.0 ): # also include all parameters here if no weight decay is being done - no_weight_decay_params["params"].extend( - [p for p in list(module_._parameters.values()) if p is not None] - ) + if isinstance(module_, VocabParallelEmbedding): + embedding_no_weight_decay_params["params"].extend( + [p for p in list(module_._parameters.values()) if p is not None] + ) + else: + no_weight_decay_params["params"].extend( + [p for p in list(module_._parameters.values()) if p is not None] + ) else: if any( [ @@ -89,7 +94,8 @@ def get_params_for_weight_decay_optimization(module, neox_args): # only return a single param group # with onebitadam, we want to minimize the calls to compressed_allreduce. Every param group calls it once. # to avoid this, only use a single param group when weight decay is off. 
- return [no_weight_decay_params] + # return [no_weight_decay_params] + return no_weight_decay_params, embedding_no_weight_decay_params return weight_decay_params, no_weight_decay_params, embedding_weight_decay_params, embedding_no_weight_decay_params From bb2e0c99249618520552d00a64868b0c3a5e59ac Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 28 Feb 2024 15:24:40 +0000 Subject: [PATCH 68/94] remove unused comments --- megatron/mup_substitute.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 49eab375b..11d3aa503 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -23,7 +23,6 @@ def get_coord_data( nsteps=10, nseeds=2, ): - lr_scheduler = None df = { "seed": [], "step": [], @@ -40,7 +39,7 @@ def get_coord_data( for i in range(nseeds): torch.manual_seed(10**i) print_rank_0(f">>> Running Model with width: {width} on seed: {i}\n") - model, optimizer = model_obj() + model, optimizer, lr_scheduler = model_obj() model.train() neox_args.hidden_size = width @@ -117,9 +116,6 @@ def output_logits_coord_check_hook(module, input, output): df["output_logits_act_abs_std"].append(output_logits_act_abs_std) df["width"].append(width) - print_rank_0( - f">>> BEFORE Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 2}, reserved: {torch.cuda.memory_reserved() // 1024 ** 2}" - ) def del_obj_attrs(obj): attributes = [attr for attr in vars(obj) if not callable(getattr(obj, attr))] for attr in attributes: @@ -141,8 +137,5 @@ def unlink_hp_params(lp_param_list): gc.collect() torch.cuda.empty_cache() deepspeed.runtime.utils.empty_cache() - print_rank_0( - f">>> AFTER Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 2}, reserved: {torch.cuda.memory_reserved() // 1024 ** 2}" - ) return pd.DataFrame(df) From bf1ce068c35c79d3580b2b780fbc329629dc90f3 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 13:06:32 +0000 Subject: [PATCH 69/94] adjust --- megatron/model/init_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 7554a7b94..cae150464 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -179,4 +179,4 @@ def _get(name, use_mup=False): else: raise NotImplementedError(f"Unknown init method {name}") - return _get(args.init_method, use_mup=args.use_mup), _get(args.init_method), _get(args.output_layer_init_method) + return _get(args.init_method, use_mup=args.use_mup), _get(args.init_method), _get(args.output_layer_init_method, use_mup=args.use_mup) From 50a3dbadc092fda1b7a4372d96b1fc1a72f561f6 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 13:08:23 +0000 Subject: [PATCH 70/94] simplify --- megatron/learning_rates.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 4ae18d49b..9e9994049 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -98,9 +98,8 @@ def step(self, step_num=None): new_lr = self.get_lr() for group in self.optimizer.param_groups: if self.use_mup and ("lr_adjust" in group) and group["lr_adjust"] is True: - group["lr"] = new_lr / self.mup_width_multiplier - else: - group["lr"] = new_lr + new_lr = new_lr / self.mup_width_multiplier + group["lr"] = new_lr def state_dict(self): state_dict = { From c4c1660fbdf30b0f666e4d82ee4cc9cb78949a6f Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: 
Thu, 29 Feb 2024 13:41:26 +0000 Subject: [PATCH 71/94] fix mup embedding multiplier --- megatron/model/word_embeddings.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index 230bdedb3..04cf55c8d 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -50,8 +50,7 @@ def __init__( self.hidden_size = hidden_size self.init_method = init_method self.num_tokentypes = num_tokentypes - self.use_mup = neox_args.use_mup - self.mup_embedding_multiplier = float(neox_args.mup_embedding_multiplier) + self.mup_embedding_multiplier = float(neox_args.mup_embedding_multiplier) if neox_args.use_mup else 1 # Word embeddings (parallel). self.word_embeddings = mpu.VocabParallelEmbedding( @@ -152,11 +151,8 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): # Dropout. embeddings = self.embedding_dropout(embeddings) - # Y_emb = m_emb * embed(X) - if self.use_mup: - with torch.no_grad(): - embeddings = torch.mul(embeddings, self.mup_embedding_multiplier) + embeddings = torch.mul(embeddings, self.mup_embedding_multiplier) return embeddings From 1c359119c4ade191de263ecb4b8dd1f531b4ea53 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 13:43:15 +0000 Subject: [PATCH 72/94] embeddingpipe fix init --- megatron/model/gpt2_model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 652004c8b..5426f7749 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -113,7 +113,6 @@ def __init__( use_cache=False, ): self.neox_args = neox_args - self.use_cache = use_cache self.parallel_output = parallel_output self.hidden_size = self.neox_args.hidden_size @@ -202,7 +201,7 @@ def init_specs(self): self.neox_args.padded_vocab_size, self.neox_args.max_position_embeddings, self.neox_args.hidden_dropout, - self.init_method, + self.input_embedding_init_method, self.num_tokentypes, ) ) From 84be4d4f1877f5a5f1bd04e46d93ddaf394abf26 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 14:01:33 +0000 Subject: [PATCH 73/94] changed how manual seed is loaded --- megatron/model/transformer.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index a86f1b99f..8ba004336 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -232,6 +232,9 @@ def __init__( gather_output=not parallel_output, skip_bias_add=False, ) + + self.neox_args = neox_args + self.is_last_layer = is_last_layer # else: # print( @@ -250,7 +253,15 @@ def __init__( # ) def forward(self, hidden_states): - return self.final_linear(hidden_states) + logits = self.final_linear(hidden_states) + + if self.is_last_layer: + _logits, *_args = logits + if self.neox_args.use_mup: + _logits /= self.neox_args.mup_width_multiplier + _logits *= self.neox_args.mup_output_multiplier + logits = (_logits, *_args) + return logits class ParallelSelfAttention(nn.Module): @@ -348,12 +359,14 @@ def __init__( coeff = None if neox_args.use_mup: - self.norm_factor = self.hidden_size_per_attention_head + # self.norm_factor = self.hidden_size_per_attention_head + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) else: self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - if self.apply_query_key_layer_scaling: - coeff = max(1, self.layer_number) - self.norm_factor *= 
coeff + + if self.apply_query_key_layer_scaling: + coeff = max(1, self.layer_number) + self.norm_factor *= coeff self.rpe = rpe From fbb4daf3a73039d353f3a32154d95dc4e2626c42 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 14:02:33 +0000 Subject: [PATCH 74/94] removed musgd and other changces --- megatron/training.py | 98 ++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 68 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 5f189fa59..a1d234636 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -64,7 +64,7 @@ import matplotlib.pyplot as plt -def plot_coord_data(df, graph_name_prefix, mup=True): +def plot_coord_data(df, graph_name_prefix, use_mup=True): def _plot_data(df, activation, graph_name_prefix): df = df.groupby(['step', 'width']).mean().reset_index() @@ -77,7 +77,7 @@ def _plot_data(df, activation, graph_name_prefix): plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0) plt.tight_layout(pad=3.0) plt.xlabel("Width") - plt.ylabel("Activation with {}".format("muP" if mup else "SP")) + plt.ylabel("Activation with {}".format("muP" if use_mup else "SP")) plt.title(f"{activation}") plt.savefig(f"{graph_name_prefix}-{activation}.png") plt.close() @@ -101,18 +101,6 @@ def _plot_data(df, activation, graph_name_prefix): return 0 -def mup_weights_reinit(neox_args, model): - def has_method(o, name): - return callable(getattr(o, name, None)) - - for layer in model.modules(): - # This normally would happen in set_base_shapes if we actually were able to use the MuReadout class - if hasattr(layer, "mup_rescale_parameters") and layer.mup_rescale_parameters: - layer._rescale_parameters() - - if has_method(layer, "mup_reinitialize_weights"): - layer.mup_reinitialize_weights(neox_args) - def coord_check(neox_args, timers, train_data_iterator): from megatron.mup_substitute import get_coord_data @@ -124,13 +112,14 @@ def gen(): neox_args.hidden_size = hidden_size neox_args.num_attention_heads = attention_head neox_args.mup_width_multiplier = None - model, optimizer, _ = setup_model_and_optimizer( + neox_args.mup_d_model_base = 2**8 + model, optimizer, lr_scheduler = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) neox_args.hidden_size = old_hidden_size neox_args.num_attention_heads = old_num_attention_heads - return model, optimizer + return model, optimizer, lr_scheduler return gen @@ -152,8 +141,7 @@ def gen(): neox_args, timers, models, train_data_iterator, neox_args.coord_check_nsteps, neox_args.coord_check_nseeds, ) df.to_csv(f"df_{df_mode}.csv", index=False) - plot_coord_data(df, graph_name_prefix=f"coord_check_{df_mode}", mup=neox_args.use_mup) - + plot_coord_data(df, graph_name_prefix=f"coord_check_{df_mode}", use_mup=neox_args.use_mup) print_rank_0("Saved coord check plots... exiting") return 0 @@ -413,11 +401,6 @@ def get_model(neox_args, use_cache=False): # Build model on cpu. print_rank_0("building GPT2 model ...") - # Temporarily disable mup so that the base model does not use the mup init functions before set_base_shapes is called below. - # If mup isn't being used anyways, this has no effect. 
- # old_use_mup = neox_args.use_mup - # neox_args.use_mup = False - model = GPT2ModelPipe( neox_args=neox_args, num_tokentypes=0, @@ -450,8 +433,6 @@ def get_model(neox_args, use_cache=False): # Export PipeParallel model to nn.Sequential model to avoid the overhead of deepspeed's pipe parallel training model = model.to_sequential() - # neox_args.use_mup = old_use_mup - if neox_args.deepspeed: # DeepSpeed handles CUDA, FP16, and DDP components. return model @@ -532,55 +513,36 @@ def get_optimizer(model, neox_args): **neox_args.optimizer["params"], ) elif neox_args.optimizer_type.lower() == "adam": - # Use Adam - if neox_args.use_mup: - # try: - # # from mup import MuAdam - # # adam_optimizer = MuAdam - # # except ModuleNotFoundError: - # # print("Please install mup https://github.com/microsoft/mup") - # # raise Exception - # from deepspeed.ops.adam import FusedAdam as Adam - # adam_optimizer = Adam - # except ModuleNotFoundError: - # from apex.optimizers import FusedAdam as Adam - # adam_optimizer = Adam - adam_optimizer = torch.optim.Adam + if neox_args.use_bnb_optimizer: + try: + import bitsandbytes as bnb + + adam_optimizer = bnb.optim.Adam8bit + except ModuleNotFoundError: + print( + "Please install bitsandbytes following https://github.com/facebookresearch/bitsandbytes." + ) + raise Exception else: - if neox_args.use_bnb_optimizer: - try: - import bitsandbytes as bnb - - adam_optimizer = bnb.optim.Adam8bit - except ModuleNotFoundError: - print( - "Please install bitsandbytes following https://github.com/facebookresearch/bitsandbytes." - ) - raise Exception - else: - try: - # default to apex as it's slightly faster - from apex.optimizers import FusedAdam as Adam - except ImportError: - # if apex isn't installed, use deepspeed's FusedAdam - print( - "WARNING: APEX not installed - defaulting to deepspeed's fused adam" - ) - # from deepspeed.ops.adam import FusedAdam as Adam - from torch.optim import Adam - adam_optimizer = Adam + try: + # default to apex as it's slightly faster + from apex.optimizers import FusedAdam as Adam + except ImportError: + # if apex isn't installed, use deepspeed's FusedAdam + print( + "WARNING: APEX not installed - defaulting to deepspeed's fused adam" + ) + # from deepspeed.ops.adam import FusedAdam as Adam + from torch.optim import Adam + adam_optimizer = Adam optimizer = adam_optimizer( param_groups, weight_decay=neox_args.weight_decay, **neox_args.optimizer["params"], ) elif neox_args.optimizer_type.lower() == "sgd": - try: - from mup import MuSGD - except ModuleNotFoundError: - print("Please install mup https://github.com/microsoft/mup") - raise Exception - optimizer = MuSGD( + from torch.optim import SGD + optimizer = SGD( param_groups, weight_decay=neox_args.weight_decay, **neox_args.optimizer["params"], @@ -645,7 +607,7 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): """Setup model and optimizer.""" if neox_args.mup_width_multiplier is None: neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base - print_rank_0(f">>> mup_width_multiplier set to {neox_args.mup_width_multiplier}") + print_rank_0(f">>> mup_width_multiplier set to {neox_args.mup_width_multiplier}") model = get_model(neox_args=neox_args, use_cache=use_cache) optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) From fa142ffd1c9557b638e7d0320a67d06b83199b1c Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 14:03:11 +0000 Subject: [PATCH 75/94] update config --- configs/coord_check_mup.yml | 3 +-- 
configs/coord_check_sp.yml | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/configs/coord_check_mup.yml b/configs/coord_check_mup.yml index d16e900cd..77b333a46 100644 --- a/configs/coord_check_mup.yml +++ b/configs/coord_check_mup.yml @@ -22,7 +22,7 @@ # init methods "init_method": "normal", - "output_layer_init_method": "normal", + "output_layer_init_method": "scaled_normal", # optimizer settings "optimizer": { @@ -79,7 +79,6 @@ # misc. training settings "train_iters": 10, - "lr_decay_iters": 10, "log_interval": 1, "distributed_backend": "nccl", "lr_decay_style": "constant", diff --git a/configs/coord_check_sp.yml b/configs/coord_check_sp.yml index 12c8165e1..ad7ef2246 100644 --- a/configs/coord_check_sp.yml +++ b/configs/coord_check_sp.yml @@ -22,7 +22,7 @@ # init methods "init_method": "normal", - "output_layer_init_method": "normal", + "output_layer_init_method": "scaled_normal", # optimizer settings "optimizer": { @@ -79,7 +79,6 @@ # misc. training settings "train_iters": 10, - "lr_decay_iters": 10, "log_interval": 1, "distributed_backend": "nccl", "lr_decay_style": "constant", From ad2336f9c5dcb4a9eced62df58f112f1ebf7efb0 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 14:03:50 +0000 Subject: [PATCH 76/94] fixed how params are sorted --- megatron/model/utils.py | 81 ++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/megatron/model/utils.py b/megatron/model/utils.py index ef36aac8e..960c2d956 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -29,44 +29,60 @@ def get_params_for_weight_decay_optimization(module, neox_args): """Divide params into with-weight-decay and without-weight-decay groups. Layernorms and biases will have no weight decay but the rest will. 
""" - weight_decay_params = {"params": [], "lr_adjust": True} - no_weight_decay_params = {"params": [], "lr_adjust": True, "weight_decay": 0.0} - embedding_weight_decay_params = {"params": [], "lr_adjust": False} - embedding_no_weight_decay_params = {"params": [], "lr_adjust": False, "weight_decay": 0.0} + lr_adjust_weight_decay_params = {"params": [], "lr_adjust": True} + lr_adjust_no_weight_decay_params = {"params": [], "lr_adjust": True, "weight_decay": 0.0} + no_lr_adjust_weight_decay_params = {"params": [], "lr_adjust": False} + no_lr_adjust_no_weight_decay_params = {"params": [], "lr_adjust": False, "weight_decay": 0.0} for module_ in module.modules(): - if any( - [ - isinstance(module_, LayerNorm), - isinstance(module_, RMSNorm), - isinstance(module_, ScaleNorm), - ] - ) or ( - neox_args.weight_decay == 0.0 - ): # also include all parameters here if no weight decay is being done - if isinstance(module_, VocabParallelEmbedding): - embedding_no_weight_decay_params["params"].extend( - [p for p in list(module_._parameters.values()) if p is not None] - ) - else: - no_weight_decay_params["params"].extend( + if neox_args.weight_decay == 0.0: + if any( + [ + isinstance(module_, LayerNorm), + isinstance(module_, RMSNorm), + isinstance(module_, ScaleNorm), + isinstance(module_, VocabParallelEmbedding), + ] + ): + no_lr_adjust_no_weight_decay_params["params"].extend( [p for p in list(module_._parameters.values()) if p is not None] ) + else: + no_lr_adjust_no_weight_decay_params["params"].extend( + [ + p + for n, p in list(module_._parameters.items()) + if p is not None and n == "bias" + ] + ) + lr_adjust_no_weight_decay_params["params"].extend( + [ + p + for n, p in list(module_._parameters.items()) + if p is not None and n != "bias" + ] + ) else: if any( [ - isinstance(module_, VocabParallelEmbedding), + isinstance(module_, LayerNorm), + isinstance(module_, RMSNorm), + isinstance(module_, ScaleNorm), ] ): - - embedding_weight_decay_params["params"].extend( + no_lr_adjust_no_weight_decay_params["params"].extend( + [p for p in list(module_._parameters.values()) if p is not None] + ) + + elif isinstance(module_, VocabParallelEmbedding): + no_lr_adjust_weight_decay_params["params"].extend( [ p for n, p in list(module_._parameters.items()) if p is not None and n != "bias" ] ) - embedding_no_weight_decay_params["params"].extend( + no_lr_adjust_no_weight_decay_params["params"].extend( [ p for n, p in list(module_._parameters.items()) @@ -74,15 +90,14 @@ def get_params_for_weight_decay_optimization(module, neox_args): ] ) else: - - weight_decay_params["params"].extend( + lr_adjust_weight_decay_params["params"].extend( [ p for n, p in list(module_._parameters.items()) if p is not None and n != "bias" ] ) - no_weight_decay_params["params"].extend( + lr_adjust_no_weight_decay_params["params"].extend( [ p for n, p in list(module_._parameters.items()) @@ -94,9 +109,17 @@ def get_params_for_weight_decay_optimization(module, neox_args): # only return a single param group # with onebitadam, we want to minimize the calls to compressed_allreduce. Every param group calls it once. # to avoid this, only use a single param group when weight decay is off. 
- # return [no_weight_decay_params] - return no_weight_decay_params, embedding_no_weight_decay_params - return weight_decay_params, no_weight_decay_params, embedding_weight_decay_params, embedding_no_weight_decay_params + # return (lr_adjust_no_weight_decay_params, no_lr_adjust_no_weight_decay_params) + return ( + lr_adjust_no_weight_decay_params, + no_lr_adjust_no_weight_decay_params + ) + return ( + lr_adjust_weight_decay_params, + lr_adjust_no_weight_decay_params, + no_lr_adjust_weight_decay_params, + no_lr_adjust_no_weight_decay_params + ) def exists(x): From fe73bc39dec4bf165d259e2e7eebf00e3a63e1d3 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 14:04:34 +0000 Subject: [PATCH 77/94] update how seed is computed --- megatron/mup_substitute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 11d3aa503..cd7646be1 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -37,7 +37,7 @@ def get_coord_data( for width, model_obj in models.items(): for i in range(nseeds): - torch.manual_seed(10**i) + torch.manual_seed((i+1)*100000) print_rank_0(f">>> Running Model with width: {width} on seed: {i}\n") model, optimizer, lr_scheduler = model_obj() model.train() From a3bd44cb2943fdd57124bb0d67f8b4c05e60fefc Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 29 Feb 2024 14:07:27 +0000 Subject: [PATCH 78/94] update to follow pre-commit format --- configs/neox_arguments.md | 7 +++--- megatron/model/gpt2_model.py | 15 +++++++---- megatron/model/init_functions.py | 30 +++++++++++++--------- megatron/model/transformer.py | 10 +++++--- megatron/model/utils.py | 23 ++++++++++------- megatron/model/word_embeddings.py | 4 ++- megatron/mup_substitute.py | 28 +++++++++++++++------ megatron/neox_arguments/neox_args.py | 4 +-- megatron/training.py | 37 +++++++++++++++++++--------- 9 files changed, 102 insertions(+), 56 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 591bd9384..ee78b8d0f 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -1061,7 +1061,7 @@ Text Generation arguments - **prompt_end**: str - Default = + Default = a single prompt's end. Defaults to newline @@ -1103,7 +1103,7 @@ Text Generation arguments - **eval_results_prefix**: str - Default = + Default = prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json @@ -1831,7 +1831,7 @@ Args for deepspeed config Default = None - + @@ -2131,4 +2131,3 @@ Args for deepspeed runner (deepspeed.launcher.runner). Default = None Adds a `--account` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard. 
- diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 5426f7749..17ad6905a 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -117,9 +117,11 @@ def __init__( self.parallel_output = parallel_output self.hidden_size = self.neox_args.hidden_size self.num_tokentypes = num_tokentypes - self.init_method, self.input_embedding_init_method, self.output_layer_init_method = get_init_methods( - self.neox_args - ) + ( + self.init_method, + self.input_embedding_init_method, + self.output_layer_init_method, + ) = get_init_methods(self.neox_args) self.__topology__ = topology self.specs = [] @@ -176,7 +178,7 @@ def init_specs(self): # Embedding layer # input will be (input_ids, position_ids, attention_mask) - # TODO Initilized weights here should not be divided by m_width + # TODO Initialized weights here should not be divided by m_width if weight_tying: self.specs.append( TiedLayerSpec( @@ -272,7 +274,10 @@ def _logits_helper(embedding, lm_output): """Just a wrapper to massage inputs/outputs from pipeline.""" logits = parallel_lm_logits( - lm_output, embedding.word_embeddings_weight, self.parallel_output, self.neox_args + lm_output, + embedding.word_embeddings_weight, + self.parallel_output, + self.neox_args, ) return logits diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index cae150464..2f85e4517 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -136,28 +136,24 @@ def init_(tensor, mup_width_multiplier=mup_width_multiplier): with torch.no_grad(): init_weight.div_(math.sqrt(mup_width_multiplier)) return init_weight - + return init_ def get_init_methods(args): - def _get(name, use_mup=False): if name == "normal": sigma = args.init_method_std if use_mup: - sigma = sigma/math.sqrt(args.mup_width_multiplier) + sigma = sigma / math.sqrt(args.mup_width_multiplier) return init_method_normal( sigma=sigma, ) elif name == "scaled_normal": sigma = args.init_method_std if use_mup: - sigma = sigma/math.sqrt(args.mup_width_multiplier) - return scaled_init_method_normal( - sigma=sigma, - num_layers=args.num_layers - ) + sigma = sigma / math.sqrt(args.mup_width_multiplier) + return scaled_init_method_normal(sigma=sigma, num_layers=args.num_layers) elif name == "orthogonal": return orthogonal_init_method(args.mup_width_multiplier if use_mup else 1.0) elif name == "scaled_orthogonal": @@ -165,12 +161,18 @@ def _get(name, use_mup=False): args.num_layers, args.mup_width_multiplier if use_mup else 1.0 ) elif name == "xavier_uniform": - return xavier_uniform_init_method(args.mup_width_multiplier if use_mup else 1.0) + return xavier_uniform_init_method( + args.mup_width_multiplier if use_mup else 1.0 + ) elif name == "xavier_normal": - return xavier_normal_init_method(args.mup_width_multiplier if use_mup else 1.0) + return xavier_normal_init_method( + args.mup_width_multiplier if use_mup else 1.0 + ) elif name == "wang_init": return wang_init_method( - args.num_layers, args.hidden_size, args.mup_width_multiplier if use_mup else 1.0 + args.num_layers, + args.hidden_size, + args.mup_width_multiplier if use_mup else 1.0, ) elif name == "small_init": return small_init_init_method( @@ -179,4 +181,8 @@ def _get(name, use_mup=False): else: raise NotImplementedError(f"Unknown init method {name}") - return _get(args.init_method, use_mup=args.use_mup), _get(args.init_method), _get(args.output_layer_init_method, use_mup=args.use_mup) + return ( + _get(args.init_method, use_mup=args.use_mup), + 
_get(args.init_method), + _get(args.output_layer_init_method, use_mup=args.use_mup), + ) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 8ba004336..79203eae3 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -232,7 +232,7 @@ def __init__( gather_output=not parallel_output, skip_bias_add=False, ) - + self.neox_args = neox_args self.is_last_layer = is_last_layer @@ -259,7 +259,7 @@ def forward(self, hidden_states): _logits, *_args = logits if self.neox_args.use_mup: _logits /= self.neox_args.mup_width_multiplier - _logits *= self.neox_args.mup_output_multiplier + _logits *= self.neox_args.mup_output_multiplier logits = (_logits, *_args) return logits @@ -1132,7 +1132,9 @@ def forward(self, args): return self.norm(args) -def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None, args=None): +def parallel_lm_logits( + input_, word_embeddings_weight, parallel_output, bias=None, args=None +): """LM logits using word embedding weights.""" # Parallel logits. input_parallel = mpu.copy_to_model_parallel_region(input_) @@ -1145,7 +1147,7 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non if args is not None and args.use_mup: logits_parallel /= args.mup_width_multiplier - logits_parallel *= args.mup_output_multiplier + logits_parallel *= args.mup_output_multiplier # Gather if needed. if parallel_output: diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 960c2d956..825edee99 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -30,9 +30,17 @@ def get_params_for_weight_decay_optimization(module, neox_args): Layernorms and biases will have no weight decay but the rest will. """ lr_adjust_weight_decay_params = {"params": [], "lr_adjust": True} - lr_adjust_no_weight_decay_params = {"params": [], "lr_adjust": True, "weight_decay": 0.0} + lr_adjust_no_weight_decay_params = { + "params": [], + "lr_adjust": True, + "weight_decay": 0.0, + } no_lr_adjust_weight_decay_params = {"params": [], "lr_adjust": False} - no_lr_adjust_no_weight_decay_params = {"params": [], "lr_adjust": False, "weight_decay": 0.0} + no_lr_adjust_no_weight_decay_params = { + "params": [], + "lr_adjust": False, + "weight_decay": 0.0, + } for module_ in module.modules(): if neox_args.weight_decay == 0.0: @@ -73,7 +81,7 @@ def get_params_for_weight_decay_optimization(module, neox_args): no_lr_adjust_no_weight_decay_params["params"].extend( [p for p in list(module_._parameters.values()) if p is not None] ) - + elif isinstance(module_, VocabParallelEmbedding): no_lr_adjust_weight_decay_params["params"].extend( [ @@ -110,16 +118,13 @@ def get_params_for_weight_decay_optimization(module, neox_args): # with onebitadam, we want to minimize the calls to compressed_allreduce. Every param group calls it once. # to avoid this, only use a single param group when weight decay is off. 
# return (lr_adjust_no_weight_decay_params, no_lr_adjust_no_weight_decay_params) - return ( - lr_adjust_no_weight_decay_params, - no_lr_adjust_no_weight_decay_params - ) + return (lr_adjust_no_weight_decay_params, no_lr_adjust_no_weight_decay_params) return ( lr_adjust_weight_decay_params, lr_adjust_no_weight_decay_params, no_lr_adjust_weight_decay_params, - no_lr_adjust_no_weight_decay_params - ) + no_lr_adjust_no_weight_decay_params, + ) def exists(x): diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index 04cf55c8d..517646546 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -50,7 +50,9 @@ def __init__( self.hidden_size = hidden_size self.init_method = init_method self.num_tokentypes = num_tokentypes - self.mup_embedding_multiplier = float(neox_args.mup_embedding_multiplier) if neox_args.use_mup else 1 + self.mup_embedding_multiplier = ( + float(neox_args.mup_embedding_multiplier) if neox_args.use_mup else 1 + ) # Word embeddings (parallel). self.word_embeddings = mpu.VocabParallelEmbedding( diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index cd7646be1..e45f3f82a 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -37,7 +37,7 @@ def get_coord_data( for width, model_obj in models.items(): for i in range(nseeds): - torch.manual_seed((i+1)*100000) + torch.manual_seed((i + 1) * 100000) print_rank_0(f">>> Running Model with width: {width} on seed: {i}\n") model, optimizer, lr_scheduler = model_obj() model.train() @@ -53,24 +53,34 @@ def get_coord_data( def word_embedding_coord_check_hook(module, input, output): with torch.no_grad(): - word_embedding_act_abs_std_list.append(output.cpu().abs().std().item()) + word_embedding_act_abs_std_list.append( + output.cpu().abs().std().item() + ) def attn_output_coord_check_hook(module, input, output): with torch.no_grad(): - attn_output_act_abs_std_list.append(output[0].cpu().abs().std().item()) + attn_output_act_abs_std_list.append( + output[0].cpu().abs().std().item() + ) def ffn_output_coord_check_hook(module, input, output): with torch.no_grad(): - ffn_output_act_abs_std_list.append(output[0].cpu().abs().std().item()) + ffn_output_act_abs_std_list.append( + output[0].cpu().abs().std().item() + ) def output_logits_coord_check_hook(module, input, output): with torch.no_grad(): - output_logits_act_abs_std_list.append(output[0].cpu().abs().std().item()) + output_logits_act_abs_std_list.append( + output[0].cpu().abs().std().item() + ) for name, module in model.named_modules(): if name.endswith(".word_embeddings"): remove_hooks.append( - module.register_forward_hook(word_embedding_coord_check_hook) + module.register_forward_hook( + word_embedding_coord_check_hook + ) ) elif name.endswith(".attention.dense"): remove_hooks.append( @@ -117,10 +127,12 @@ def output_logits_coord_check_hook(module, input, output): df["width"].append(width) def del_obj_attrs(obj): - attributes = [attr for attr in vars(obj) if not callable(getattr(obj, attr))] + attributes = [ + attr for attr in vars(obj) if not callable(getattr(obj, attr)) + ] for attr in attributes: try: - delattr(obj,attr) + delattr(obj, attr) except: pass diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index c04d566de..57613f0ab 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1105,12 +1105,12 @@ class NeoXArgsTraining(NeoXArgsTemplate): coord_check_nsteps: int = 10 """ - + Number of steps to do for the 
coordinate check """ coord_check_nseeds: int = 5 """ - + Number of repetition for each size in coordinate check """ save_base_shapes: bool = False diff --git a/megatron/training.py b/megatron/training.py index a1d234636..57b241cee 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -65,16 +65,21 @@ def plot_coord_data(df, graph_name_prefix, use_mup=True): - def _plot_data(df, activation, graph_name_prefix): - df = df.groupby(['step', 'width']).mean().reset_index() + df = df.groupby(["step", "width"]).mean().reset_index() sns.color_palette("magma") sns.lineplot( data=df, - x="width", y=activation, hue="step", errorbar=None, style="step", - marker="o", dashes=False, legend='full' + x="width", + y=activation, + hue="step", + errorbar=None, + style="step", + marker="o", + dashes=False, + legend="full", ) - plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0) + plt.legend(bbox_to_anchor=(1.02, 1), loc="upper left", borderaxespad=0) plt.tight_layout(pad=3.0) plt.xlabel("Width") plt.ylabel("Activation with {}".format("muP" if use_mup else "SP")) @@ -125,10 +130,9 @@ def gen(): models = {} # Hidden size needs to be divisible by num attention heads - for idx, hidden_size in enumerate([2**p for p in range(8,12)]): + for idx, hidden_size in enumerate([2**p for p in range(8, 12)]): models[hidden_size] = lazy_model( - hidden_size, - neox_args.num_attention_heads*(2**idx) + hidden_size, neox_args.num_attention_heads * (2**idx) ) df_mode = "mup" if neox_args.use_mup else "sp" @@ -138,13 +142,21 @@ def gen(): print_rank_0(">>> Coord Check for standard Parameterization") df = get_coord_data( - neox_args, timers, models, train_data_iterator, neox_args.coord_check_nsteps, neox_args.coord_check_nseeds, + neox_args, + timers, + models, + train_data_iterator, + neox_args.coord_check_nsteps, + neox_args.coord_check_nseeds, ) df.to_csv(f"df_{df_mode}.csv", index=False) - plot_coord_data(df, graph_name_prefix=f"coord_check_{df_mode}", use_mup=neox_args.use_mup) + plot_coord_data( + df, graph_name_prefix=f"coord_check_{df_mode}", use_mup=neox_args.use_mup + ) print_rank_0("Saved coord check plots... exiting") return 0 + def pretrain(neox_args): """Main training program. 
@@ -542,6 +554,7 @@ def get_optimizer(model, neox_args): ) elif neox_args.optimizer_type.lower() == "sgd": from torch.optim import SGD + optimizer = SGD( param_groups, weight_decay=neox_args.weight_decay, @@ -606,7 +619,9 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): """Setup model and optimizer.""" if neox_args.mup_width_multiplier is None: - neox_args.mup_width_multiplier = neox_args.hidden_size / neox_args.mup_d_model_base + neox_args.mup_width_multiplier = ( + neox_args.hidden_size / neox_args.mup_d_model_base + ) print_rank_0(f">>> mup_width_multiplier set to {neox_args.mup_width_multiplier}") model = get_model(neox_args=neox_args, use_cache=use_cache) From e8639a03b1f6cb351d9e6c347b108303581e07a7 Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 29 Feb 2024 14:09:55 +0000 Subject: [PATCH 79/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 0649b5436..948f399db 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 4f39209 + Default = 2365fd5 current git hash of repository @@ -1061,7 +1061,7 @@ Text Generation arguments - **prompt_end**: str - Default = + Default = a single prompt's end. Defaults to newline @@ -1103,7 +1103,7 @@ Text Generation arguments - **eval_results_prefix**: str - Default = + Default = prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json @@ -1640,7 +1640,7 @@ Training Arguments Default = 10 - + Number of steps to do for the coordinate check @@ -1648,7 +1648,7 @@ Training Arguments Default = 5 - + Number of repetition for each size in coordinate check @@ -1847,7 +1847,7 @@ Args for deepspeed config Default = None - + @@ -2147,3 +2147,4 @@ Args for deepspeed runner (deepspeed.launcher.runner). Default = None Adds a `--account` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard. 
+ From 47e14389bcc51ce1d2549723b9d2cccee861e95f Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 5 Mar 2024 05:13:34 +0000 Subject: [PATCH 80/94] fix lr weighting --- megatron/learning_rates.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 9e9994049..4ae18d49b 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -98,8 +98,9 @@ def step(self, step_num=None): new_lr = self.get_lr() for group in self.optimizer.param_groups: if self.use_mup and ("lr_adjust" in group) and group["lr_adjust"] is True: - new_lr = new_lr / self.mup_width_multiplier - group["lr"] = new_lr + group["lr"] = new_lr / self.mup_width_multiplier + else: + group["lr"] = new_lr def state_dict(self): state_dict = { From a064f9b98191aa8fd6578f9cc9ee4b3c7cbcc792 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 5 Mar 2024 05:14:44 +0000 Subject: [PATCH 81/94] hard set to 1.0 if neox_args.use_mup is false --- megatron/model/word_embeddings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index 517646546..22ea5989d 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -51,7 +51,7 @@ def __init__( self.init_method = init_method self.num_tokentypes = num_tokentypes self.mup_embedding_multiplier = ( - float(neox_args.mup_embedding_multiplier) if neox_args.use_mup else 1 + float(neox_args.mup_embedding_multiplier) if neox_args.use_mup else 1.0 ) # Word embeddings (parallel). From 6fe55f4b5c6c8ab1d0eacd7abecc254a17ff4c79 Mon Sep 17 00:00:00 2001 From: github-actions Date: Sun, 21 Apr 2024 22:59:13 +0000 Subject: [PATCH 82/94] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 0268da279..fb48f78aa 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 11a5537 + Default = b0da27a current git hash of repository From 8bf8bcde743931289b88870919802e32685d4d3d Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 04:01:06 +0000 Subject: [PATCH 83/94] add new parameters --- megatron/neox_arguments/neox_args.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 57613f0ab..bdb4c82b3 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1098,6 +1098,29 @@ class NeoXArgsTraining(NeoXArgsTemplate): Whether to use muP """ + mup_save: str = None + """ + Path to save results when using muP + """ + + mup_lr: float = None + """ + An alias parameter for lr, + if not None will override lr + """ + + mup_std: float = None + """ + An alias parameter for init_method_std, + if not None will override init_method_std + """ + + mup_hidden_size: int = None + """ + An alias parameter for hidden_size, + if not None will override hidden_size + """ + coord_check: bool = False """ Whether to generate a "coord check" plot to verify mup's implementation in neox From 7f0b03304ae45779c5637770333ba345057cc653 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 04:13:20 +0000 Subject: [PATCH 84/94] add parameter checks --- megatron/training.py | 63 +++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 15 deletions(-) 
diff --git a/megatron/training.py b/megatron/training.py index 57b241cee..2f94cdc96 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -64,7 +64,7 @@ import matplotlib.pyplot as plt -def plot_coord_data(df, graph_name_prefix, use_mup=True): +def plot_coord_data(df, graph_name_prefix, use_mup=True, save_path=None): def _plot_data(df, activation, graph_name_prefix): df = df.groupby(["step", "width"]).mean().reset_index() sns.color_palette("magma") @@ -84,7 +84,12 @@ def _plot_data(df, activation, graph_name_prefix): plt.xlabel("Width") plt.ylabel("Activation with {}".format("muP" if use_mup else "SP")) plt.title(f"{activation}") - plt.savefig(f"{graph_name_prefix}-{activation}.png") + + file_path = f"{graph_name_prefix}-{activation}.png" + if save_path is not None: + file_path = os.path.join(save_path, file_path) + + plt.savefig(file_path) plt.close() return 0 @@ -110,6 +115,12 @@ def _plot_data(df, activation, graph_name_prefix): def coord_check(neox_args, timers, train_data_iterator): from megatron.mup_substitute import get_coord_data + if neox_args.mup_save is None: + print_rank_0("Must set mup_save") + sys.exit() + else: + os.makedirs(neox_args.mup_save, exist_ok=True) + def lazy_model(hidden_size, attention_head): def gen(): old_hidden_size = neox_args.hidden_size @@ -129,8 +140,8 @@ def gen(): return gen models = {} - # Hidden size needs to be divisible by num attention heads - for idx, hidden_size in enumerate([2**p for p in range(8, 12)]): + # Hidden size needs to be divisible by num attention heads #14 + for idx, hidden_size in enumerate([2**p for p in range(8, 11)]): models[hidden_size] = lazy_model( hidden_size, neox_args.num_attention_heads * (2**idx) ) @@ -149,11 +160,13 @@ def gen(): neox_args.coord_check_nsteps, neox_args.coord_check_nseeds, ) - df.to_csv(f"df_{df_mode}.csv", index=False) - plot_coord_data( - df, graph_name_prefix=f"coord_check_{df_mode}", use_mup=neox_args.use_mup - ) - print_rank_0("Saved coord check plots... exiting") + + if neox_args.mup_save is not None: + plot_coord_data( + df, graph_name_prefix=f"coord_check_{df_mode}", use_mup=neox_args.use_mup, save_path=neox_args.mup_save + ) + print_rank_0("Saved coord check plots... exiting") + return 0 @@ -462,7 +475,13 @@ def get_optimizer(model, neox_args): f"ERROR: Optimizer is None. Either set the optimizer dict in your config (if training) or set no_load_optim in your config (if inference)" ) exit() - # Build parameter groups (weight decay and non-decay). + + if neox_args["lr"] is not None: + neox_args["optimizer"]["params"]["lr"] = neox_args["lr"] + + # Build parameter groups for parameters that + # are affected by weight decay and non-decay or + # have adjustable and non-adjustable learning rate. 
param_groups = get_params_for_weight_decay_optimization(model, neox_args) print_rank_0( f'Configuring Optimizer type: {neox_args.optimizer_type} with params: {neox_args.optimizer["params"]}' @@ -538,7 +557,8 @@ def get_optimizer(model, neox_args): else: try: # default to apex as it's slightly faster - from apex.optimizers import FusedAdam as Adam + # from apex.optimizers import FusedAdam as Adam + from torch.optim import Adam except ImportError: # if apex isn't installed, use deepspeed's FusedAdam print( @@ -618,10 +638,23 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): ) """Setup model and optimizer.""" - if neox_args.mup_width_multiplier is None: - neox_args.mup_width_multiplier = ( - neox_args.hidden_size / neox_args.mup_d_model_base - ) + if neox_args.use_mup: + if neox_args.mup_lr is not None: + neox_args.lr = neox_args.mup_lr + print_rank_0(f"Overriding neox_args.lr with neox_args.mup_lr: {neox_args.mup_lr}") + + if neox_args.mup_std is not None: + neox_args.init_method_std = neox_args.mup_std + print_rank_0(f"Overriding neox_args.init_method_std with neox_args.mup_std: {neox_args.mup_std}") + + if neox_args.mup_hidden_size is not None: + neox_args.hidden_size = neox_args.mup_hidden_size + print_rank_0(f"Overriding neox_args.hidden_size with neox_args.mup_hidden_size: {neox_args.mup_hidden_size}") + + if neox_args.mup_width_multiplier is None: + neox_args.mup_width_multiplier = ( + neox_args.hidden_size / neox_args.mup_d_model_base + ) print_rank_0(f">>> mup_width_multiplier set to {neox_args.mup_width_multiplier}") model = get_model(neox_args=neox_args, use_cache=use_cache) From f8028695b2d0211179c1f610a74bf31eeec96075 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 07:44:33 +0000 Subject: [PATCH 85/94] updates to argument processing for mup --- megatron/neox_arguments/arguments.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 7bca420cd..e9d23125e 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1134,6 +1134,27 @@ def validate_values(self): if not self.deepspeed: return False + if self.use_mup: + if self.mup_d_model_base is None: + logging.info("mup_d_model_base is required when use_mup is True") + return False + + if self.mup_lr is not None: + self.lr = self.mup_lr + logging.info(f"Overriding lr with mup_lr: {self.mup_lr}") + + if self.mup_std is not None: + self.init_method_std = self.mup_std + logging.info(f"Overriding init_method_std with mup_std: {self.mup_std}") + + if self.mup_hidden_size is not None: + self.hidden_size = self.mup_hidden_size + logging.info(f"Overriding hidden_size with mup_hidden_size: {self.mup_hidden_size}") + + if self.mup_width_multiplier is None: + self.mup_width_multiplier = self.hidden_size / self.mup_d_model_base + logging.info(f"Overriding mup_width_multiplier with hidden_size/mup_d_model_base: {self.mup_width_multiplier}") + # learning rate if self.lr is None: error_message = self.__class__.__name__ + ".validate_values() lr is None" From cc711049b807ac4cdc200dbbe748719e54fb4e18 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 07:46:34 +0000 Subject: [PATCH 86/94] add data save and descriptions being printed --- megatron/mup_substitute.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index e45f3f82a..1438e0dba 100644 --- 
a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -32,13 +32,26 @@ def get_coord_data( "output_logits_act_abs_std": [], "width": [], } + + df_mode = "mup" if neox_args.use_mup else "sp" + if neox_args.use_mup: + print_rank_0("muP Coord Check for mu Parameterization") + else: + print_rank_0("muP Coord Check for standard Parameterization") + + _df = None + df_path = os.path.join(neox_args.mup_save, f"df_{df_mode}.csv") + if (neox_args.mup_save is not None) and os.path.exists(df_path): + _df = pd.read_csv(df_path) + with torch.no_grad(): torch.cuda.empty_cache() for width, model_obj in models.items(): for i in range(nseeds): torch.manual_seed((i + 1) * 100000) - print_rank_0(f">>> Running Model with width: {width} on seed: {i}\n") + print_rank_0(f">>> muP Coord Check: mup_width_multiplier set to {neox_args.mup_width_multiplier}") + print_rank_0(f">>> muP Coord Check: Running Model with width: {width} on seed: {i}\n") model, optimizer, lr_scheduler = model_obj() model.train() neox_args.hidden_size = width @@ -150,4 +163,7 @@ def unlink_hp_params(lp_param_list): torch.cuda.empty_cache() deepspeed.runtime.utils.empty_cache() + temp_df = pd.DataFrame(df) + temp_df.to_csv(os.path.join(neox_args.mup_save, f"df_{df_mode}.csv"), index=False) + return pd.DataFrame(df) From c8feb39ed21fc4fb2faa355d1bf6c593d10ef0d6 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 07:54:34 +0000 Subject: [PATCH 87/94] update mup --- configs/coord_check_mup.yml | 51 +++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/configs/coord_check_mup.yml b/configs/coord_check_mup.yml index 77b333a46..1a14d8639 100644 --- a/configs/coord_check_mup.yml +++ b/configs/coord_check_mup.yml @@ -4,22 +4,24 @@ "model_parallel_size": 1, # model settings - "num_layers": 8, - "num_attention_heads": 8, - "seq_length": 128, - "max_position_embeddings": 128, + "num_layers": 2, + "num_attention_heads": 4, + "seq_length": 2048, + "max_position_embeddings": 2048, "pos_emb": "rotary", "rotary_pct": 0.25, "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - # "attention_config": [[["flash"], 8]], - # these should provide some speedup but takes a while to build, set to true if desired "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": true, + # # init methods + # "init_method": "small_init", + # "output_layer_init_method": "wang_init", + # init methods "init_method": "normal", "output_layer_init_method": "scaled_normal", @@ -28,12 +30,12 @@ "optimizer": { "type": "Adam", "params": { - "lr": 0.006, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, - # "min_lr": 0.006, + "lr_decay_style": constant, + "warmup": 0, # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training "zero_optimization": { @@ -48,8 +50,8 @@ }, # batch / data settings - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 32, + "train_micro_batch_size_per_gpu": 8, + "gradient_accumulation_steps": 1, "data_impl": "mmap", "num_workers": 1, @@ -61,7 +63,7 @@ # regularization "gradient_clipping": 1.0, - "weight_decay": 0.0, + "weight_decay": 0.1, "hidden_dropout": 0, "attention_dropout": 0, @@ -72,34 +74,29 @@ # "enabled": true, # "loss_scale": 0, # "loss_scale_window": 1000, - # "initial_scale_power": 12, # "hysteresis": 2, - # "min_loss_scale": 1, + # "min_loss_scale": 1 # }, # misc. 
training settings "train_iters": 10, "log_interval": 1, "distributed_backend": "nccl", - "lr_decay_style": "constant", - "tokenizer_type": "HFTokenizer", - "vocab-file": "/weka/lintangsutawika/09-mup-neox/20B_tokenizer.json", "coord_check": true, "coord_check_nsteps": 10, "coord_check_nseeds": 3, "use_mup": true, - # sigma_base - "init_method_std": 0.08, - # "mup_embedding_multiplier": 5, - # "mup_output_multiplier": 1, - # "mup_width_multiplier": 1, - "mup_d_model_base": 128, - "hidden_size": 128, + # base lr + "mup_lr": 0.01, + # base sigma + "mup_std": 0.06, + # base size + "mup_d_model_base": 256, - "data-path": "/weka/lintangsutawika/09-mup-neox/data/enwik8/enwik8_text_document", - - # "launcher": "slurm", - # "deepspeed_slurm": true, + "tokenizer_type": "HFTokenizer", + "vocab-file": "/mnt/ssd-1/lintang/09-mup-neox/20B_tokenizer.json", + "data-path": "/mnt/ssd-1/lintang/09-mup-neox/data/enwik8/enwik8_text_document", + "mup_save": "/mnt/ssd-1/lintang/09-mup-neox/mup_results", } From b6b3a02e52d9ed185574a0e7de85d9d6ce695db2 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 07:55:05 +0000 Subject: [PATCH 88/94] update seed --- megatron/mup_substitute.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index 1438e0dba..ecac73fba 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -49,9 +49,10 @@ def get_coord_data( for width, model_obj in models.items(): for i in range(nseeds): - torch.manual_seed((i + 1) * 100000) + seed = (i + 1) * 100000 + torch.manual_seed(seed) print_rank_0(f">>> muP Coord Check: mup_width_multiplier set to {neox_args.mup_width_multiplier}") - print_rank_0(f">>> muP Coord Check: Running Model with width: {width} on seed: {i}\n") + print_rank_0(f">>> muP Coord Check: Running Model with width: {width} on seed: {seed}\n") model, optimizer, lr_scheduler = model_obj() model.train() neox_args.hidden_size = width From 847e8925197ce0842d65af0fa56c2018fcf6258a Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 07:55:41 +0000 Subject: [PATCH 89/94] remove print text --- megatron/training.py | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 2f94cdc96..be97de8cb 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -121,20 +121,26 @@ def coord_check(neox_args, timers, train_data_iterator): else: os.makedirs(neox_args.mup_save, exist_ok=True) - def lazy_model(hidden_size, attention_head): + def lazy_model(hidden_size, attention_head, d_model_base=2**8): def gen(): old_hidden_size = neox_args.hidden_size old_num_attention_heads = neox_args.num_attention_heads + old_mup_d_model_base = neox_args.mup_d_model_base + old_mup_width_multiplier = neox_args.mup_width_multiplier + neox_args.hidden_size = hidden_size neox_args.num_attention_heads = attention_head - neox_args.mup_width_multiplier = None - neox_args.mup_d_model_base = 2**8 + neox_args.mup_d_model_base = d_model_base + neox_args.mup_width_multiplier = hidden_size / neox_args.mup_d_model_base + model, optimizer, lr_scheduler = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) neox_args.hidden_size = old_hidden_size neox_args.num_attention_heads = old_num_attention_heads + neox_args.mup_d_model_base = old_mup_d_model_base + neox_args.mup_width_multiplier = old_mup_width_multiplier return model, optimizer, lr_scheduler return gen @@ -147,10 +153,6 @@ def gen(): ) df_mode = 
"mup" if neox_args.use_mup else "sp" - if neox_args.use_mup: - print_rank_0(">>> Coord Check for mu Parameterization") - else: - print_rank_0(">>> Coord Check for standard Parameterization") df = get_coord_data( neox_args, @@ -476,8 +478,8 @@ def get_optimizer(model, neox_args): ) exit() - if neox_args["lr"] is not None: - neox_args["optimizer"]["params"]["lr"] = neox_args["lr"] + if neox_args.lr is not None: + neox_args.optimizer["params"]["lr"] = neox_args.lr # Build parameter groups for parameters that # are affected by weight decay and non-decay or @@ -638,25 +640,6 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): ) """Setup model and optimizer.""" - if neox_args.use_mup: - if neox_args.mup_lr is not None: - neox_args.lr = neox_args.mup_lr - print_rank_0(f"Overriding neox_args.lr with neox_args.mup_lr: {neox_args.mup_lr}") - - if neox_args.mup_std is not None: - neox_args.init_method_std = neox_args.mup_std - print_rank_0(f"Overriding neox_args.init_method_std with neox_args.mup_std: {neox_args.mup_std}") - - if neox_args.mup_hidden_size is not None: - neox_args.hidden_size = neox_args.mup_hidden_size - print_rank_0(f"Overriding neox_args.hidden_size with neox_args.mup_hidden_size: {neox_args.mup_hidden_size}") - - if neox_args.mup_width_multiplier is None: - neox_args.mup_width_multiplier = ( - neox_args.hidden_size / neox_args.mup_d_model_base - ) - print_rank_0(f">>> mup_width_multiplier set to {neox_args.mup_width_multiplier}") - model = get_model(neox_args=neox_args, use_cache=use_cache) optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) From 1b0027cdc11bc4e11be77e783bd923090592d1af Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 13:35:30 +0000 Subject: [PATCH 90/94] fixed kv --- megatron/model/transformer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 79203eae3..5e5e149d4 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -359,8 +359,7 @@ def __init__( coeff = None if neox_args.use_mup: - # self.norm_factor = self.hidden_size_per_attention_head - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + self.norm_factor = self.hidden_size_per_attention_head else: self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) From 055596f414f367006405a90dfc11a0b1f9073bd6 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 16:28:32 +0000 Subject: [PATCH 91/94] update --- configs/coord_check_mup.yml | 7 ++--- configs/coord_check_sp.yml | 54 +++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 32 deletions(-) diff --git a/configs/coord_check_mup.yml b/configs/coord_check_mup.yml index 1a14d8639..a09090029 100644 --- a/configs/coord_check_mup.yml +++ b/configs/coord_check_mup.yml @@ -84,15 +84,16 @@ "distributed_backend": "nccl", "coord_check": true, - "coord_check_nsteps": 10, - "coord_check_nseeds": 3, + "coord_check_nsteps": 5, + "coord_check_nseeds": 1, "use_mup": true, # base lr "mup_lr": 0.01, # base sigma - "mup_std": 0.06, + "mup_std": 0.08, # base size "mup_d_model_base": 256, + "mup_hidden_size": 256, "tokenizer_type": "HFTokenizer", "vocab-file": "/mnt/ssd-1/lintang/09-mup-neox/20B_tokenizer.json", diff --git a/configs/coord_check_sp.yml b/configs/coord_check_sp.yml index ad7ef2246..66573892d 100644 --- a/configs/coord_check_sp.yml +++ b/configs/coord_check_sp.yml 
@@ -4,22 +4,24 @@ "model_parallel_size": 1, # model settings - "num_layers": 8, - "num_attention_heads": 8, - "seq_length": 128, - "max_position_embeddings": 128, + "num_layers": 2, + "num_attention_heads": 4, + "seq_length": 2048, + "max_position_embeddings": 2048, "pos_emb": "rotary", "rotary_pct": 0.25, "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - # "attention_config": [[["flash"], 8]], - # these should provide some speedup but takes a while to build, set to true if desired "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": true, + # # init methods + # "init_method": "small_init", + # "output_layer_init_method": "wang_init", + # init methods "init_method": "normal", "output_layer_init_method": "scaled_normal", @@ -28,12 +30,13 @@ "optimizer": { "type": "Adam", "params": { - "lr": 0.006, + "lr": 0.01, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, - # "min_lr": 0.006, + "lr_decay_style": constant, + "warmup": 0, # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training "zero_optimization": { @@ -48,8 +51,8 @@ }, # batch / data settings - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 32, + "train_micro_batch_size_per_gpu": 8, + "gradient_accumulation_steps": 1, "data_impl": "mmap", "num_workers": 1, @@ -61,7 +64,7 @@ # regularization "gradient_clipping": 1.0, - "weight_decay": 0.0, + "weight_decay": 0.1, "hidden_dropout": 0, "attention_dropout": 0, @@ -72,34 +75,27 @@ # "enabled": true, # "loss_scale": 0, # "loss_scale_window": 1000, - # "initial_scale_power": 12, # "hysteresis": 2, - # "min_loss_scale": 1, + # "min_loss_scale": 1 # }, # misc. training settings "train_iters": 10, "log_interval": 1, "distributed_backend": "nccl", - "lr_decay_style": "constant", - "tokenizer_type": "HFTokenizer", - "vocab-file": "/weka/lintangsutawika/09-mup-neox/20B_tokenizer.json", "coord_check": true, - "coord_check_nsteps": 10, - "coord_check_nseeds": 3, - "use_mup": false, - # sigma_base + "coord_check_nsteps": 5, + "coord_check_nseeds": 1, + # "use_mup": true, + # base sigma "init_method_std": 0.08, - # "mup_embedding_multiplier": 5, - # "mup_output_multiplier": 1, - # "mup_width_multiplier": 1, - "mup_d_model_base": 128, - "hidden_size": 128, + # base size + "hidden_size": 256, - "data-path": "/weka/lintangsutawika/09-mup-neox/data/enwik8/enwik8_text_document", - - # "launcher": "slurm", - # "deepspeed_slurm": true, + "tokenizer_type": "HFTokenizer", + "vocab-file": "/mnt/ssd-1/lintang/09-mup-neox/20B_tokenizer.json", + "data-path": "/mnt/ssd-1/lintang/09-mup-neox/data/enwik8/enwik8_text_document", + "mup_save": "/mnt/ssd-1/lintang/09-mup-neox/mup_results", } From fabb45ba1f686d89bf9ca8df64e7f7b94c4a46d8 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 16:35:04 +0000 Subject: [PATCH 92/94] update dewcriptions being printed --- megatron/mup_substitute.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/megatron/mup_substitute.py b/megatron/mup_substitute.py index ecac73fba..9770c0765 100644 --- a/megatron/mup_substitute.py +++ b/megatron/mup_substitute.py @@ -51,12 +51,11 @@ def get_coord_data( for i in range(nseeds): seed = (i + 1) * 100000 torch.manual_seed(seed) - print_rank_0(f">>> muP Coord Check: mup_width_multiplier set to {neox_args.mup_width_multiplier}") - print_rank_0(f">>> muP Coord Check: Running Model with width: {width} on seed: {seed}\n") + model, optimizer, lr_scheduler = model_obj() model.train() - 
neox_args.hidden_size = width - + print_rank_0(f">>> muP Coord Check: Running Model with width: {width} on seed: {seed}") + print_rank_0(f">>> muP Coord Check: mup_width_multiplier set to {model.neox_args.mup_width_multiplier}") for step in range(nsteps + 1): word_embedding_act_abs_std_list = [] From 5ccf693092bd4354c97d4536d9d4019fd0db66a2 Mon Sep 17 00:00:00 2001 From: lintang Date: Thu, 2 May 2024 16:35:48 +0000 Subject: [PATCH 93/94] removed unused lines --- megatron/model/init_functions.py | 10 ++-------- megatron/training.py | 13 ++----------- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 2f85e4517..4387f8829 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -115,26 +115,20 @@ def init_(tensor, mup_width_multiplier=mup_width_multiplier): def small_init_init_method(dim, mup_width_multiplier=1.0): """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010), using a normal distribution.""" - std = math.sqrt(2 / (5 * dim)) + std = math.sqrt(2 / (5 * dim)) / math.sqrt(args.mup_width_multiplier) def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) - if mup_width_multiplier != 1: - with torch.no_grad(): - init_weight.div_(math.sqrt(mup_width_multiplier)) return init_weight return init_ def wang_init_method(n_layers, dim, mup_width_multiplier=1.0): - std = 2 / n_layers / math.sqrt(dim) + std = 2 / n_layers / math.sqrt(dim) / math.sqrt(args.mup_width_multiplier) def init_(tensor, mup_width_multiplier=mup_width_multiplier): init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std) - if mup_width_multiplier != 1: - with torch.no_grad(): - init_weight.div_(math.sqrt(mup_width_multiplier)) return init_weight return init_ diff --git a/megatron/training.py b/megatron/training.py index be97de8cb..6ef9f1afb 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -123,24 +123,16 @@ def coord_check(neox_args, timers, train_data_iterator): def lazy_model(hidden_size, attention_head, d_model_base=2**8): def gen(): - old_hidden_size = neox_args.hidden_size - old_num_attention_heads = neox_args.num_attention_heads - old_mup_d_model_base = neox_args.mup_d_model_base - old_mup_width_multiplier = neox_args.mup_width_multiplier neox_args.hidden_size = hidden_size neox_args.num_attention_heads = attention_head neox_args.mup_d_model_base = d_model_base - neox_args.mup_width_multiplier = hidden_size / neox_args.mup_d_model_base + neox_args.mup_width_multiplier = hidden_size / d_model_base model, optimizer, lr_scheduler = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) - neox_args.hidden_size = old_hidden_size - neox_args.num_attention_heads = old_num_attention_heads - neox_args.mup_d_model_base = old_mup_d_model_base - neox_args.mup_width_multiplier = old_mup_width_multiplier return model, optimizer, lr_scheduler return gen @@ -559,8 +551,7 @@ def get_optimizer(model, neox_args): else: try: # default to apex as it's slightly faster - # from apex.optimizers import FusedAdam as Adam - from torch.optim import Adam + from apex.optimizers import FusedAdam as Adam except ImportError: # if apex isn't installed, use deepspeed's FusedAdam print( From 485cad4c320fe7eaddf992083ec3bbc15bf713b7 Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 2 May 2024 16:38:56 +0000 
Subject: [PATCH 94/94] Update NeoXArgs docs automatically

---
 configs/neox_arguments.md | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index df0c97096..0ebb1b063 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

 - **git_hash**: str

-    Default = 4e37645
+    Default = 6a8ad71

     current git hash of repository

@@ -1785,6 +1785,41 @@ Training Arguments



+- **mup_save**: str
+
+    Default = None
+
+    Path to save results when using muP
+
+
+
+- **mup_lr**: float
+
+    Default = None
+
+    An alias parameter for lr,
+    if not None will override lr
+
+
+
+- **mup_std**: float
+
+    Default = None
+
+    An alias parameter for init_method_std,
+    if not None will override init_method_std
+
+
+
+- **mup_hidden_size**: int
+
+    Default = None
+
+    An alias parameter for hidden_size,
+    if not None will override hidden_size
+
+
+
 - **coord_check**: bool

     Default = False
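
Note on the alias arguments documented above: they are resolved once during argument validation, before the model is built. A condensed sketch of that override logic, mirroring the NeoXArgs.validate_values() changes earlier in this series (here `args` is just a stand-in for the NeoXArgs instance, not the actual class):

def resolve_mup_aliases(args):
    # Sketch of the muP alias handling added to argument validation.
    if not args.use_mup:
        return args
    if args.mup_lr is not None:
        args.lr = args.mup_lr                    # base learning rate at the proxy width
    if args.mup_std is not None:
        args.init_method_std = args.mup_std      # base init sigma
    if args.mup_hidden_size is not None:
        args.hidden_size = args.mup_hidden_size  # override the model width
    if args.mup_width_multiplier is None:
        # m_width = d_model / d_model_base
        args.mup_width_multiplier = args.hidden_size / args.mup_d_model_base
    return args

The intent is that a muP sweep only needs to vary mup_lr, mup_std and mup_hidden_size while the rest of the config stays fixed.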
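
For reference, the muP rules these patches wire into the model reduce to: hidden-weight init std divided by sqrt(m_width), attention scores scaled by 1/d_head instead of 1/sqrt(d_head), output logits divided by m_width (times an optional m_output), an optional constant multiplier on the embedding output, and a learning rate of lr/m_width for parameter groups tagged lr_adjust=True. A minimal self-contained sketch of those rules follows; the helper names are illustrative and not the NeoX API:

import math

import torch
import torch.nn as nn


def mup_init_std(sigma_base: float, width_multiplier: float) -> float:
    # Hidden and output weights are drawn from N(0, sigma_base / sqrt(m_width)).
    return sigma_base / math.sqrt(width_multiplier)


def attention_scores(q: torch.Tensor, k: torch.Tensor, d_head: int, use_mup: bool) -> torch.Tensor:
    # muP replaces the usual 1/sqrt(d_head) softmax scaling with 1/d_head.
    norm_factor = d_head if use_mup else math.sqrt(d_head)
    return torch.matmul(q, k.transpose(-1, -2)) / norm_factor


def scaled_logits(hidden: torch.Tensor, unembed: nn.Linear,
                  width_multiplier: float, output_multiplier: float = 1.0) -> torch.Tensor:
    # Output logits are divided by m_width (and optionally rescaled by m_output).
    return unembed(hidden) * output_multiplier / width_multiplier


def apply_mup_lr(optimizer: torch.optim.Optimizer, base_lr: float, width_multiplier: float) -> None:
    # Param groups tagged lr_adjust=True (hidden weights) train at lr / m_width;
    # embeddings, biases and norms keep the base learning rate.
    for group in optimizer.param_groups:
        adjust = group.get("lr_adjust", False)
        group["lr"] = base_lr / width_multiplier if adjust else base_lr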
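
The coordinate check driven by coord_check()/get_coord_data() above follows the same recipe as this toy version: train models of increasing width for a few steps, record the absolute standard deviation of chosen activations through forward hooks, and compare the curves across widths; under muP they should stay roughly flat, while under SP they drift upward with width. The MLP, data and hyperparameters below are stand-ins, not the NeoX setup:

import pandas as pd
import torch
import torch.nn as nn


def toy_coord_check(widths=(256, 512, 1024), nsteps=5, nseeds=2,
                    use_mup=True, d_model_base=256, base_lr=1e-2):
    records = []
    for width in widths:
        m_width = width / d_model_base
        for seed in range(nseeds):
            torch.manual_seed((seed + 1) * 100000)
            model = nn.Sequential(nn.Linear(64, width), nn.GELU(), nn.Linear(width, 64))
            # Toy stand-in for the muP learning-rate rule: treat the whole model as "hidden".
            lr = base_lr / m_width if use_mup else base_lr
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)

            acts = []
            # Record the scale of the wide layer's output (fan_in grows with width).
            hook = model[2].register_forward_hook(
                lambda module, inputs, output: acts.append(output.detach().abs().std().item())
            )
            for step in range(nsteps):
                x = torch.randn(32, 64)
                loss = model(x).pow(2).mean()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                records.append({"width": width, "seed": seed, "step": step,
                                "output_act_abs_std": acts[-1]})
            hook.remove()
    return pd.DataFrame(records)


if __name__ == "__main__":
    df = toy_coord_check()
    # Rows that stay roughly constant across widths (per step) indicate a working muP setup.
    print(df.groupby(["step", "width"])["output_act_abs_std"].mean().unstack())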