[muP] Rework #1087

Draft · wants to merge 107 commits into base: main
Changes from 31 commits (107 commits total)
0d921f7
changed ordering for setting up norm_factor
lintangsutawika Dec 1, 2023
abee54d
Update NeoXArgs docs automatically
invalid-email-address Dec 1, 2023
a08c3ef
updated muP args to the minimum required
lintangsutawika Dec 1, 2023
c35e830
calculate m_width
lintangsutawika Dec 1, 2023
2807e52
Merge branch 'main' of https://github.com/EleutherAI/gpt-neox into re…
lintangsutawika Dec 1, 2023
2d127df
Merge branch 'rework-mup' of https://github.com/EleutherAI/gpt-neox i…
lintangsutawika Dec 1, 2023
81fdc4d
Update NeoXArgs docs automatically
invalid-email-address Dec 1, 2023
7d6b246
changed ordering for setting up norm_factor
lintangsutawika Dec 1, 2023
a0d1929
updated muP args to the minimum required
lintangsutawika Dec 1, 2023
d63b3b8
calculate m_width
lintangsutawika Dec 1, 2023
9be82fe
Update NeoXArgs docs automatically
invalid-email-address Dec 1, 2023
66214d9
removed redundant line
lintangsutawika Dec 1, 2023
17b7183
removed redundant lines
lintangsutawika Dec 1, 2023
a6bad07
Update NeoXArgs docs automatically
invalid-email-address Dec 1, 2023
63984bd
removed redundant lines
lintangsutawika Dec 1, 2023
02687a8
Merge branch 'rework-mup' of https://github.com/EleutherAI/gpt-neox i…
lintangsutawika Dec 1, 2023
11114e2
Update NeoXArgs docs automatically
invalid-email-address Dec 1, 2023
05c4de3
modify init with mup
lintangsutawika Dec 1, 2023
71a91e4
divide logits by the m_width
lintangsutawika Dec 1, 2023
99c8ce0
moved position of mup parameters being processed
lintangsutawika Dec 1, 2023
b253ab6
add note
lintangsutawika Dec 1, 2023
1919499
made param groups to hold flag for mup scaling
lintangsutawika Dec 6, 2023
17678e0
lr scale
lintangsutawika Dec 6, 2023
2bd5ae6
update config
lintangsutawika Dec 6, 2023
6642291
adjust process of mup variables
lintangsutawika Dec 6, 2023
8be6c66
remove calling save_base_shapes
lintangsutawika Dec 18, 2023
c9fb18b
lr adjustments is done in train_step to address lr being reset due to…
lintangsutawika Dec 18, 2023
795371c
lr scaling for mup is moved here instead
lintangsutawika Dec 18, 2023
087beee
removed mup usage for coord check
lintangsutawika Jan 3, 2024
16d04b1
merged with main
lintangsutawika Jan 3, 2024
e7b7bf6
latest update on coord check implementation
lintangsutawika Jan 24, 2024
8dea9ce
fix merge conflict
lintangsutawika Feb 2, 2024
3664eba
changed `mup_m_width` to `mup_width_multiplier`
lintangsutawika Feb 2, 2024
6a46247
fixed notations
lintangsutawika Feb 2, 2024
7439f9a
correct scale
lintangsutawika Feb 2, 2024
5b2d31c
m_emb * embed(X)
lintangsutawika Feb 2, 2024
98caa82
removed mup rescale in the layers
lintangsutawika Feb 2, 2024
5c99637
removed mup rescale in the layers
lintangsutawika Feb 2, 2024
a636f06
adjust mup_m_emb to mup_embedding_multiplier
lintangsutawika Feb 2, 2024
39190c5
add multiplier mup_output_multiplier
lintangsutawika Feb 20, 2024
2489cc0
reorder model loading
lintangsutawika Feb 20, 2024
23b8776
removed comments
lintangsutawika Feb 20, 2024
10e935e
removed comments
lintangsutawika Feb 20, 2024
a0aca99
implement full process
lintangsutawika Feb 20, 2024
9472b35
set neox_args.iteration to 0 for coord_check mode
lintangsutawika Feb 21, 2024
5c5f2df
move mup_width_multiplier init
lintangsutawika Feb 21, 2024
7eca3e7
mup_coord_check returns 2 df
lintangsutawika Feb 21, 2024
c9a3a65
can run
lintangsutawika Feb 21, 2024
a7877d4
remove comments
lintangsutawika Feb 22, 2024
bd9d399
add hooks
lintangsutawika Feb 22, 2024
fe180d3
remove comments
lintangsutawika Feb 22, 2024
b240c19
uncomment activation data
lintangsutawika Feb 22, 2024
93b4241
plot coords
lintangsutawika Feb 22, 2024
d4899fc
removed variables, add way to plot only from rank 0
lintangsutawika Feb 22, 2024
f589e29
changed key name in dict
lintangsutawika Feb 22, 2024
8261e0d
remove print
lintangsutawika Feb 22, 2024
25aa786
fix how width_multiplier is applied
lintangsutawika Feb 22, 2024
4d246a1
updated plot config
lintangsutawika Feb 22, 2024
84c5380
update files
lintangsutawika Feb 26, 2024
b2f1101
Merge branch 'main' into rework-mup
lintangsutawika Feb 26, 2024
42d4cde
Update NeoXArgs docs automatically
invalid-email-address Feb 26, 2024
4c477d5
init function, add input embedding different initialization
lintangsutawika Feb 27, 2024
64dc4c5
Merge branch 'rework-mup' of https://github.com/EleutherAI/gpt-neox i…
lintangsutawika Feb 27, 2024
65c103e
change output layer to normal
lintangsutawika Feb 27, 2024
08b5d40
change from mean to std
lintangsutawika Feb 27, 2024
2ca94a8
double attention head for every hidden size doubled
lintangsutawika Feb 27, 2024
7483246
Merge branch 'main' into rework-mup
lintangsutawika Feb 27, 2024
497485c
Update NeoXArgs docs automatically
invalid-email-address Feb 27, 2024
34fb7ca
added args
lintangsutawika Feb 27, 2024
2d53f1f
simplify coordcheck
lintangsutawika Feb 27, 2024
7897610
separate sp and mup configs
lintangsutawika Feb 27, 2024
4f39209
perform coordcheck for sp and mup separately
lintangsutawika Feb 27, 2024
5f84a3f
Update NeoXArgs docs automatically
invalid-email-address Feb 27, 2024
479b854
update
lintangsutawika Feb 28, 2024
21a7e32
update how params are sorted
lintangsutawika Feb 28, 2024
bb2e0c9
remove unused comments
lintangsutawika Feb 28, 2024
bf1ce06
adjust
lintangsutawika Feb 29, 2024
50a3dba
simplify
lintangsutawika Feb 29, 2024
c4c1660
fix mup embedding multiplier
lintangsutawika Feb 29, 2024
1c35911
embeddingpipe fix init
lintangsutawika Feb 29, 2024
84be4d4
changed how manual seed is loaded
lintangsutawika Feb 29, 2024
fbb4daf
removed musgd and other changes
lintangsutawika Feb 29, 2024
fa142ff
update config
lintangsutawika Feb 29, 2024
ad2336f
fixed how params are sorted
lintangsutawika Feb 29, 2024
fe73bc3
update how seed is computed
lintangsutawika Feb 29, 2024
a3bd44c
update to follow pre-commit format
lintangsutawika Feb 29, 2024
56b6c9b
update from main
lintangsutawika Feb 29, 2024
2365fd5
update
lintangsutawika Feb 29, 2024
e8639a0
Update NeoXArgs docs automatically
invalid-email-address Feb 29, 2024
47e1438
fix lr weighting
lintangsutawika Mar 5, 2024
a064f9b
hard set to 1.0 if neox_args.use_mup is false
lintangsutawika Mar 5, 2024
b0da27a
Merge branch 'main' into rework-mup
Quentin-Anthony Apr 21, 2024
6fe55f4
Update NeoXArgs docs automatically
invalid-email-address Apr 21, 2024
8bf8bcd
add new parameters
lintangsutawika May 2, 2024
7f0b033
add parameter checks
lintangsutawika May 2, 2024
f802869
updates to argument processing for mup
lintangsutawika May 2, 2024
cc71104
add data save and descriptions being printed
lintangsutawika May 2, 2024
c8feb39
update mup
lintangsutawika May 2, 2024
b6b3a02
update seed
lintangsutawika May 2, 2024
847e892
remove print text
lintangsutawika May 2, 2024
1b0027c
fixed kv
lintangsutawika May 2, 2024
055596f
update
lintangsutawika May 2, 2024
fabb45b
update descriptions being printed
lintangsutawika May 2, 2024
5ccf693
removed unused lines
lintangsutawika May 2, 2024
9dd583b
Merge branch 'rework-mup' of https://github.com/EleutherAI/gpt-neox i…
lintangsutawika May 2, 2024
6a8ad71
Merge branch 'main' into rework-mup
lintangsutawika May 2, 2024
485cad4
Update NeoXArgs docs automatically
invalid-email-address May 2, 2024
52 changes: 17 additions & 35 deletions configs/neox_arguments.md
@@ -111,7 +111,11 @@ Logging Arguments

- **git_hash**: str

<<<<<<< HEAD
Default = 02687a8
=======
Default = 31cb364
>>>>>>> e5a7ea71e96eeada636c9612036dc85e886d973d

current git hash of repository

@@ -460,6 +464,7 @@ Model Arguments
Default = 0.02

Standard deviation of the zero mean normal distribution used for weight initialization.
When using muP this is the base std



@@ -671,6 +676,7 @@ Optimizer Arguments
Default = None

Max Learning rate during training
When using muP, this is the base lr



@@ -1529,7 +1535,7 @@ Training Arguments

Default = False

Whether to use Microsoft's Mup https://github.com/microsoft/mup
Whether to use muP



@@ -1557,52 +1563,28 @@ Training Arguments



- **mup_init_scale**: float
- **mup_emb**: int

Default = 1.0

Initialization scale: All the parameters are multiplied by this value



- **mup_attn_temp**: float

Default = 1.0

Attention temperature: Reciprocal of the multiplier applied to the input to attention softmax



- **mup_output_temp**: float

Default = 1.0

Output temperature: Reciprocal of the multiplier applied to the input to softmax that
produces the distribution over output tokens.



- **mup_embedding_mult**: float

Default = 1.0
Default = 1

Scalar by which we multiply the output of the embedding layer
Embedding output multiplier



- **mup_rp_embedding_mult**: float
- **mup_m_width**: int

Default = 1.0
Default = 1

Scalar by which we multiply vectors representing relative position
Manually set the layer width multiplier (d_model/d_model,base)



- **mup_width_scale**: int
- **mup_d_model_base**: int

Default = 2
Default = 64

What to scale width by when creating the delta model for mup
d_model,base
Proxy (base) model's layer width
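
To make these arguments concrete, here is a minimal sketch (not part of the PR; all values are hypothetical) of how the width multiplier would be derived from `hidden_size` and `mup_d_model_base`, and how the base `init_method_std` and `lr` are then adjusted, matching the init and learning-rate changes later in this diff:

```python
import math

# Hypothetical values, for illustration only.
mup_d_model_base = 64    # width of the small proxy model (d_model,base)
hidden_size = 1024       # d_model of the target model
init_method_std = 0.02   # base std (see the note above)
lr = 6.0e-4              # base lr (see the note above)

# Width multiplier, as described for mup_m_width: d_model / d_model,base
mup_m_width = hidden_size / mup_d_model_base  # -> 16.0

# Adjustments implied elsewhere in this diff:
hidden_init_std = init_method_std / math.sqrt(mup_m_width)  # init_functions.py
hidden_lr = lr / mup_m_width                                 # learning_rates.py

print(mup_m_width, hidden_init_std, hidden_lr)
```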



6 changes: 4 additions & 2 deletions megatron/learning_rates.py
@@ -37,6 +37,7 @@ def __init__(
use_checkpoint_lr_scheduler=True,
override_lr_scheduler=False,
use_mup=False,
mup_m_width=1,
):

# Class values.
@@ -51,6 +52,7 @@ def __init__(
self.override_lr_scheduler = override_lr_scheduler
self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler
self.use_mup = use_mup
self.mup_m_width = mup_m_width
if self.override_lr_scheduler:
assert not self.use_checkpoint_lr_scheduler, (
"both override and " "use-checkpoint are set."
@@ -95,8 +97,8 @@ def step(self, step_num=None):
self.num_iters = step_num
new_lr = self.get_lr()
for group in self.optimizer.param_groups:
if self.use_mup and "width_mult" in group:
group["lr"] = new_lr / group["width_mult"]
if self.use_mup and ("lr_adjust" in group) and group["lr_adjust"] is True:
group["lr"] = new_lr / self.mup_m_width
else:
group["lr"] = new_lr
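
As a rough illustration of what the `lr_adjust` flag does here (the flag name comes from this hunk; the param-group construction and values below are illustrative, not the PR's actual code), groups flagged for adjustment have their learning rate divided by the width multiplier while the rest keep the base rate:

```python
import torch

# Toy stand-in for the real param-group construction elsewhere in this PR.
model = torch.nn.Sequential(torch.nn.Embedding(100, 64), torch.nn.Linear(64, 64))
param_groups = [
    {"params": list(model[0].parameters()), "lr_adjust": False},  # embeddings keep the base LR
    {"params": list(model[1].parameters()), "lr_adjust": True},   # hidden weights get LR / m_width
]
optimizer = torch.optim.Adam(param_groups, lr=6.0e-4)

# Mirrors the scheduler step above.
use_mup, mup_m_width, new_lr = True, 16.0, 6.0e-4
for group in optimizer.param_groups:
    if use_mup and group.get("lr_adjust", False):
        group["lr"] = new_lr / mup_m_width
    else:
        group["lr"] = new_lr

print([g["lr"] for g in optimizer.param_groups])  # [0.0006, 3.75e-05]
```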

10 changes: 2 additions & 8 deletions megatron/model/gpt2_model.py
@@ -175,6 +175,7 @@ def init_specs(self):
# Embedding layer
# input will be (input_ids, position_ids, attention_mask)

# TODO Initialized weights here should not be divided by m_width
if weight_tying:
self.specs.append(
TiedLayerSpec(
@@ -268,16 +269,9 @@ def init_specs(self):

def _logits_helper(embedding, lm_output):
"""Just a wrapper to massage inputs/outputs from pipeline."""
if self.neox_args.use_mup:
# Since we're using pipeline parallelism, we can't directly use MuReadout. Instead, use this workaround that does the same thing as MuReadout.
# https://github.com/microsoft/mup/issues/6#issuecomment-1082156274
lm_output = (
lm_output
/ self.tied_modules.embed.word_embeddings.weight.infshape.width_mult()
)

logits = parallel_lm_logits(
lm_output, embedding.word_embeddings_weight, self.parallel_output
lm_output, embedding.word_embeddings_weight, self.parallel_output, self.neox_args
)
return logits
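
A quick illustrative check (hypothetical shapes) of why the old MuReadout-style division of `lm_output` can be replaced by dividing the logits inside `parallel_lm_logits`: the projection is linear, so the two orderings give the same result up to floating-point rounding.

```python
import torch

torch.manual_seed(0)
mup_m_width = 16.0
lm_output = torch.randn(4, 64)                  # [tokens, hidden]
word_embeddings_weight = torch.randn(1000, 64)  # [vocab, hidden]

# Old style: scale the hidden states before the tied-embedding projection.
old_style = (lm_output / mup_m_width) @ word_embeddings_weight.t()
# New style: project first, then divide the logits (as parallel_lm_logits now does).
new_style = (lm_output @ word_embeddings_weight.t()) / mup_m_width

print(torch.allclose(old_style, new_style, atol=1e-6))  # True
```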

117 changes: 42 additions & 75 deletions megatron/model/init_functions.py
@@ -16,41 +16,22 @@

import torch

try:
import mup
except ImportError:
pass


def init_method_normal(sigma, use_mup_outer=False, mup_init_scale=1.0):
def init_method_normal(sigma):
"""Init method based on N(0, sigma)."""

def init_(tensor, use_mup=use_mup_outer):
if use_mup:
mup.init.normal_(tensor, mean=0.0, std=sigma)
with torch.no_grad():
tensor.mul_(mup_init_scale)
return tensor
else:
return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
def init_(tensor):
return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)

return init_


def scaled_init_method_normal(
sigma, num_layers, use_mup_outer=False, mup_init_scale=1.0
):
def scaled_init_method_normal(sigma, num_layers):
"""Init method based on N(0, sigma/sqrt(2*num_layers)."""
std = sigma / math.sqrt(2.0 * num_layers)

def init_(tensor, use_mup=use_mup_outer):
if use_mup:
mup.init.normal_(tensor, mean=0.0, std=std)
with torch.no_grad():
tensor.mul_(mup_init_scale)
return tensor
else:
return torch.nn.init.normal_(tensor, mean=0.0, std=std)
def init_(tensor):
return torch.nn.init.normal_(tensor, mean=0.0, std=std)

return init_

@@ -87,12 +68,12 @@ def _orthogonal(tensor, gain=1):
return tensor


def orthogonal_init_method(n_layers=1, use_mup=False, mup_init_scale=1.0):
def orthogonal_init_method(n_layers=1, mup_m_width=1.0):
"""Fills the input Tensor with a (semi) orthogonal matrix, as described in
Exact solutions to the nonlinear dynamics of learning in deep linear neural networks - Saxe, A. et al. (2013)
Optionally scaling by number of layers possible, as introduced in OBST - Nestler et. al. (2021, to be released)"""

if use_mup:
if mup_m_width != 1:
raise ValueError(
"Orthogonal init needs to be patched to support mup. Disable mup or use a different init method to avoid this error"
)
@@ -103,105 +84,91 @@ def init_(tensor):
return init_


def xavier_uniform_init_method(use_mup_outer=False, mup_init_scale=1.0):
def xavier_uniform_init_method(mup_m_width=1.0):
"""Fills the input Tensor with values according to the method described in Understanding the difficulty of
training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a uniform distribution."""

def init_(tensor, use_mup=use_mup_outer):
if use_mup:
mup.init.xavier_uniform_(tensor)
def init_(tensor, mup_m_width=mup_m_width):
init_weight = torch.nn.init.xavier_uniform_(tensor)
if mup_m_width != 1:
with torch.no_grad():
tensor.mul_(mup_init_scale)
return tensor
else:
return torch.nn.init.xavier_uniform_(tensor)
init_weight.div_(mup_m_width)
return init_weight

return init_


def xavier_normal_init_method(use_mup_outer=False, mup_init_scale=1.0):
def xavier_normal_init_method(mup_m_width=1.0):
"""Fills the input Tensor with values according to the method described in Understanding the difficulty of
training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a normal distribution."""

def init_(tensor, use_mup=use_mup_outer):
if use_mup:
mup.init.xavier_normal_(tensor)
def init_(tensor, mup_m_width=mup_m_width):
init_weight = torch.nn.init.xavier_normal_(tensor)
if mup_m_width != 1:
with torch.no_grad():
tensor.mul_(mup_init_scale)
return tensor
else:
return torch.nn.init.xavier_normal_(tensor)
init_weight.div_(mup_m_width)
return init_weight

return init_


def small_init_init_method(dim, use_mup_outer=False, mup_init_scale=1.0):
def small_init_init_method(dim, mup_m_width=1.0):
"""Fills the input Tensor with values according to the method described in Transformers without Tears: Improving
the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010), using a normal distribution."""
std = math.sqrt(2 / (5 * dim))

def init_(tensor, use_mup=use_mup_outer):
if use_mup:
mup.init.normal_(tensor, mean=0.0, std=std)
def init_(tensor, mup_m_width=mup_m_width):
init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std)
if mup_m_width != 1:
with torch.no_grad():
tensor.mul_(mup_init_scale)
return tensor
else:
return torch.nn.init.normal_(tensor, mean=0.0, std=std)
init_weight.div_(mup_m_width)
return init_weight

return init_


def wang_init_method(n_layers, dim, use_mup_outer=False, mup_init_scale=1.0):
def wang_init_method(n_layers, dim, mup_m_width=1.0):
std = 2 / n_layers / math.sqrt(dim)

def init_(tensor, use_mup=use_mup_outer):
if use_mup:
mup.init.normal_(tensor, mean=0.0, std=std)
def init_(tensor, mup_m_width=mup_m_width):
init_weight = torch.nn.init.normal_(tensor, mean=0.0, std=std)
if mup_m_width != 1:
with torch.no_grad():
tensor.mul_(mup_init_scale)
return tensor
else:
return torch.nn.init.normal_(tensor, mean=0.0, std=std)

init_weight.div_(mup_m_width)
return init_weight

return init_


def get_init_methods(args):

if args.use_mup:
try:
import mup
except ModuleNotFoundError:
print("Please install mup https://github.com/microsoft/mup")
raise Exception

def _get(name):
if name == "normal":
return init_method_normal(
args.init_method_std, args.use_mup, args.mup_init_scale
sigma=args.init_method_std/math.sqrt(args.mup_m_width)
Review comment: if you are going to define the muP init_std adjustment here, then you need to remove all the init_weight.div_ calls so you don't double adjust.
Reply (lintangsutawika, author): I'm split on what the best interface would be. For init_method_normal and scaled_init_method_normal I opted to not scale in the function, but for the other init functions I did.

)
elif name == "scaled_normal":
return scaled_init_method_normal(
args.init_method_std, args.num_layers, args.use_mup, args.mup_init_scale
sigma=args.init_method_std/math.sqrt(args.mup_m_width),
num_layers=args.num_layers
)
elif name == "orthogonal":
return orthogonal_init_method(args.use_mup, args.mup_init_scale)
return orthogonal_init_method(args.mup_m_width)
elif name == "scaled_orthogonal":
return orthogonal_init_method(
args.num_layers, args.use_mup, args.mup_init_scale
args.num_layers, args.mup_m_width
)
elif name == "xavier_uniform":
return xavier_uniform_init_method(args.use_mup, args.mup_init_scale)
return xavier_uniform_init_method(args.mup_m_width)
elif name == "xavier_normal":
return xavier_normal_init_method(args.use_mup, args.mup_init_scale)
return xavier_normal_init_method(args.mup_m_width)
elif name == "wang_init":
return wang_init_method(
args.num_layers, args.hidden_size, args.use_mup, args.mup_init_scale
args.num_layers, args.hidden_size, args.mup_m_width
)
elif name == "small_init":
return small_init_init_method(
args.hidden_size, args.use_mup, args.mup_init_scale
args.hidden_size, args.mup_m_width
)
else:
raise NotImplementedError(f"Unknown init method {name}")
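
To make the interface question above concrete, here is a small sketch (hypothetical `sigma` and width multiplier; a plain normal init is used for both cases to isolate the difference) contrasting the two adjustment styles this file currently mixes: scaling the standard deviation before sampling versus dividing the sampled weights afterwards. The two differ by a factor of `sqrt(mup_m_width)`, which is the double-adjustment concern raised in the review.

```python
import math
import torch

sigma, mup_m_width = 0.02, 16.0
w_a = torch.empty(512, 512)
w_b = torch.empty(512, 512)

# Style A (init_method_normal / scaled_init_method_normal above):
# fold the adjustment into the std before sampling.
torch.nn.init.normal_(w_a, mean=0.0, std=sigma / math.sqrt(mup_m_width))

# Style B (the xavier/small_init/wang branches above):
# sample at the base std, then divide the weights in place.
torch.nn.init.normal_(w_b, mean=0.0, std=sigma)
with torch.no_grad():
    w_b.div_(mup_m_width)

print(w_a.std().item(), w_b.std().item())  # ~0.005 vs ~0.00125
```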
15 changes: 9 additions & 6 deletions megatron/model/transformer.py
@@ -306,13 +306,13 @@ def __init__(
)

coeff = None
self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
if self.apply_query_key_layer_scaling:
coeff = max(1, self.layer_number)
self.norm_factor *= coeff

if neox_args.use_mup:
self.norm_factor = self.hidden_size_per_attention_head
else:
self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
if self.apply_query_key_layer_scaling:
coeff = max(1, self.layer_number)
self.norm_factor *= coeff

self.rpe = rpe
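
A compact sketch (illustrative shapes only) of the attention-score scaling this hunk switches between: the standard parameterization divides the scores by `sqrt(d_head)`, while muP divides by `d_head`, matching the `norm_factor` branch above.

```python
import math
import torch

d_head, seq_len = 64, 8
q = torch.randn(seq_len, d_head)
k = torch.randn(seq_len, d_head)

use_mup = True
norm_factor = d_head if use_mup else math.sqrt(d_head)  # mirrors the branch above

scores = (q @ k.t()) / norm_factor
probs = torch.softmax(scores, dim=-1)
print(probs.shape)  # torch.Size([8, 8])
```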

@@ -960,7 +960,7 @@ def forward(self, args):
return self.norm(args)


def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None):
def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None, args=None):
"""LM logits using word embedding weights."""
# Parallel logits.
input_parallel = mpu.copy_to_model_parallel_region(input_)
@@ -971,6 +971,9 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=Non
else:
logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias)

if args is not None and args.use_mup:
logits_parallel /= args.mup_m_width
Review comment: you may also want to multiply by some tunable scalar value here (like we did in BLTM).
Reply (lintangsutawika, author): I thought that mup_m_width was the tunable scalar for the logits here?
Reply (lintangsutawika, author): Is this the part where Y_logits = W_unemb X / m_width?


# Gather if needed.
if parallel_output:
return logits_parallel
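
A minimal sketch of the readout scaling in this hunk: the logits are divided by the width multiplier. The extra tunable scalar suggested in the review is shown as a hypothetical `output_mult`; it is not defined anywhere in this diff.

```python
import torch
import torch.nn.functional as F

hidden = torch.randn(4, 64)                     # [tokens, hidden]
word_embeddings_weight = torch.randn(1000, 64)  # [vocab, hidden]
use_mup, mup_m_width, output_mult = True, 16.0, 1.0  # output_mult is hypothetical

logits = F.linear(hidden, word_embeddings_weight)
if use_mup:
    logits = logits * output_mult / mup_m_width  # matches logits_parallel /= mup_m_width above
print(logits.shape)  # torch.Size([4, 1000])
```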