More formatting: bump black to 22.3.0, reformat Python sources, strip trailing whitespace, and sort requirements

EleutherAI · Quentin-Anthony · Sep 2, 2022 · Sep 2, 2022 · Sep 19, 2022 · Sep 19, 2022
commit 276221c4b213493a273b15fbd97a7117ccf5bb20
@@ -23,10 +23,10 @@ repos:
  args: []
 
  - repo: https://github.com/psf/black
- rev: 21.8b0
+ rev: 22.3.0
  hooks:
  - id: black
- language_version: python3.8
+ language_version: python3
  - repo: https://github.com/codespell-project/codespell
  rev: v2.1.0
  hooks:

@@ -328,7 +328,7 @@ We also support using TensorBoard via the <code><var>tensorboard-dir</var></code
 
 # Running on multi-node
 
-If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher, you can set the environment variable `DLTS_HOSTFILE` to point to the hostfile. 
+If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher, you can set the environment variable `DLTS_HOSTFILE` to point to the hostfile.
 
 # Administrative Notes
 

@@ -922,7 +922,7 @@ Text Generation arguments
 
 - **eval_results_prefix**: str
 
- Default = 
+ Default =
 
  prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json
 
@@ -1510,7 +1510,7 @@ Args for deepspeed config
 
  Default = None
 
- 
+
 
 
 
@@ -1636,4 +1636,3 @@ Args for deepspeed runner (deepspeed.launcher.runner).
  Default = False
 
  If true, autodetects nvlink pairs and remaps cuda visible devices to place them next to each other. This is an Eleuther addition to deepspeed, and should speed up model parallel training on setups with nvlink pairs when mp=2.
-
@@ -356,7 +356,7 @@ def run_eval(
  description_dict=None,
  use_cache=True,
  name="neox",
- limit=None
+ limit=None,
  ):
  was_training = self.model.training
  self.model.eval()
@@ -389,7 +389,7 @@ def run_eval(
  if use_cache:
  # TODO(jon-tow): Append a subset of `neox_args` to the cache database
  # name arg to distinguish model runs that use different configurations.
- lm = base.CachingLM(lm, 'lm_cache/' + name + '.db')
+ lm = base.CachingLM(lm, "lm_cache/" + name + ".db")
 
  results = evaluator.evaluate(
  lm=lm,
@@ -409,7 +409,7 @@ def run_eval(
  "no_cache": not use_cache,
  "limit": limit,
  "bootstrap_iters": bootstrap_iters,
- "description_dict": description_dict
+ "description_dict": description_dict,
  }
 
  if was_training:

@@ -1,11 +1,11 @@
+deepspeed
 einops==0.3.0
 ftfy==6.0.1
 lm_dataformat==0.0.20
 lm_eval==0.2.0
+mpi4py==3.0.3
 numpy==1.22.0
 pybind11==2.6.2
-deepspeed
-mpi4py==3.0.3
 regex
 sentencepiece
 six

@@ -70,57 +70,77 @@ def merge_model_weights(input_checkpoint_path, output_checkpoint_path):
  merged = {}
 
  # RowParallelLinear
- merged["mlp.dense_4h_to_h.weight"] = torch.cat([
- loaded_tp1["mlp.dense_4h_to_h.weight"],
- loaded_tp2["mlp.dense_4h_to_h.weight"]
- ], dim=1)
- merged["attention.dense.weight"] = torch.cat([
- loaded_tp1["attention.dense.weight"],
- loaded_tp2["attention.dense.weight"]
- ], dim=1)
+ merged["mlp.dense_4h_to_h.weight"] = torch.cat(
+ [
+ loaded_tp1["mlp.dense_4h_to_h.weight"],
+ loaded_tp2["mlp.dense_4h_to_h.weight"],
+ ],
+ dim=1,
+ )
+ merged["attention.dense.weight"] = torch.cat(
+ [
+ loaded_tp1["attention.dense.weight"],
+ loaded_tp2["attention.dense.weight"],
+ ],
+ dim=1,
+ )
  merged["mlp.dense_4h_to_h.bias"] = (
- loaded_tp1["mlp.dense_4h_to_h.bias"]
- + loaded_tp2["mlp.dense_4h_to_h.bias"]
+ loaded_tp1["mlp.dense_4h_to_h.bias"] + loaded_tp2["mlp.dense_4h_to_h.bias"]
  )
  merged["attention.dense.bias"] = (
- loaded_tp1["attention.dense.bias"]
- + loaded_tp2["attention.dense.bias"]
+ loaded_tp1["attention.dense.bias"] + loaded_tp2["attention.dense.bias"]
  )
 
  # Layer Norms
  merged["input_layernorm.weight"] = (
- loaded_tp1["input_layernorm.weight"]
-  + loaded_tp2["input_layernorm.weight"]) / 2
+ loaded_tp1["input_layernorm.weight"] + loaded_tp2["input_layernorm.weight"]
+ ) / 2
  merged["input_layernorm.bias"] = (
- loaded_tp1["input_layernorm.bias"]
-  + loaded_tp2["input_layernorm.bias"]) / 2
+ loaded_tp1["input_layernorm.bias"] + loaded_tp2["input_layernorm.bias"]
+ ) / 2
  merged["post_attention_layernorm.weight"] = (
  loaded_tp1["post_attention_layernorm.weight"]
- + loaded_tp2["post_attention_layernorm.weight"]) / 2
+ + loaded_tp2["post_attention_layernorm.weight"]
+ ) / 2
  merged["post_attention_layernorm.bias"] = (
  loaded_tp1["post_attention_layernorm.bias"]
- + loaded_tp2["post_attention_layernorm.bias"]) / 2
+ + loaded_tp2["post_attention_layernorm.bias"]
+ ) / 2
 
  # ColumnParallelLinear
- merged["mlp.dense_h_to_4h.weight"] = torch.cat([
- loaded_tp1["mlp.dense_h_to_4h.weight"],
- loaded_tp2["mlp.dense_h_to_4h.weight"],
- ], dim=0)
- merged["mlp.dense_h_to_4h.bias"] = torch.cat([
- loaded_tp1["mlp.dense_h_to_4h.bias"],
- loaded_tp2["mlp.dense_h_to_4h.bias"],
- ], dim=0)
- merged["attention.query_key_value.weight"] = torch.cat([
- loaded_tp1["attention.query_key_value.weight"],
- loaded_tp2["attention.query_key_value.weight"],
- ], dim=0)
- merged["attention.query_key_value.bias"] = torch.cat([
- loaded_tp1["attention.query_key_value.bias"],
- loaded_tp2["attention.query_key_value.bias"],
- ], dim=0)
+ merged["mlp.dense_h_to_4h.weight"] = torch.cat(
+ [
+ loaded_tp1["mlp.dense_h_to_4h.weight"],
+ loaded_tp2["mlp.dense_h_to_4h.weight"],
+ ],
+ dim=0,
+ )
+ merged["mlp.dense_h_to_4h.bias"] = torch.cat(
+ [
+ loaded_tp1["mlp.dense_h_to_4h.bias"],
+ loaded_tp2["mlp.dense_h_to_4h.bias"],
+ ],
+ dim=0,
+ )
+ merged["attention.query_key_value.weight"] = torch.cat(
+ [
+ loaded_tp1["attention.query_key_value.weight"],
+ loaded_tp2["attention.query_key_value.weight"],
+ ],
+ dim=0,
+ )
+ merged["attention.query_key_value.bias"] = torch.cat(
+ [
+ loaded_tp1["attention.query_key_value.bias"],
+ loaded_tp2["attention.query_key_value.bias"],
+ ],
+ dim=0,
+ )
 
  # Just take one
- merged["attention.rotary_emb.inv_freq"] = loaded_tp1["attention.rotary_emb.inv_freq"]
+ merged["attention.rotary_emb.inv_freq"] = loaded_tp1[
+ "attention.rotary_emb.inv_freq"
+ ]
 
  torch.save(merged, os.path.join(output_checkpoint_path, filename_tp1))
  del loaded_tp1
@@ -129,41 +149,70 @@ def merge_model_weights(input_checkpoint_path, output_checkpoint_path):
 
  # Load input embedding
  pbar.set_description(f"Merging input embedding")
- loaded_tp1 = torch.load(os.path.join(input_checkpoint_path, "layer_00-model_00-model_states.pt"))
- loaded_tp2 = torch.load(os.path.join(input_checkpoint_path, "layer_00-model_01-model_states.pt"))
- merged = {"word_embeddings.weight": torch.cat([
- loaded_tp1["word_embeddings.weight"],
- loaded_tp2["word_embeddings.weight"],
- ], dim=0)}
- torch.save(merged, os.path.join(output_checkpoint_path, "layer_00-model_00-model_states.pt"))
+ loaded_tp1 = torch.load(
+ os.path.join(input_checkpoint_path, "layer_00-model_00-model_states.pt")
+ )
+ loaded_tp2 = torch.load(
+ os.path.join(input_checkpoint_path, "layer_00-model_01-model_states.pt")
+ )
+ merged = {
+ "word_embeddings.weight": torch.cat(
+ [
+ loaded_tp1["word_embeddings.weight"],
+ loaded_tp2["word_embeddings.weight"],
+ ],
+ dim=0,
+ )
+ }
+ torch.save(
+ merged,
+ os.path.join(output_checkpoint_path, "layer_00-model_00-model_states.pt"),
+ )
  del loaded_tp1
  del loaded_tp2
  pbar.update(1)
 
  # Load final layer norm
  pbar.set_description(f"Merging final layer norm")
- loaded_tp1 = torch.load(os.path.join(input_checkpoint_path, "layer_47-model_00-model_states.pt"))
- loaded_tp2 = torch.load(os.path.join(input_checkpoint_path, "layer_47-model_01-model_states.pt"))
+ loaded_tp1 = torch.load(
+ os.path.join(input_checkpoint_path, "layer_47-model_00-model_states.pt")
+ )
+ loaded_tp2 = torch.load(
+ os.path.join(input_checkpoint_path, "layer_47-model_01-model_states.pt")
+ )
  merged = {
  "norm.weight": (loaded_tp1["norm.weight"] + loaded_tp2["norm.weight"]) / 2,
  "norm.bias": (loaded_tp1["norm.bias"] + loaded_tp2["norm.bias"]) / 2,
  }
- torch.save(merged, os.path.join(output_checkpoint_path, "layer_47-model_00-model_states.pt"))
+ torch.save(
+ merged,
+ os.path.join(output_checkpoint_path, "layer_47-model_00-model_states.pt"),
+ )
  del loaded_tp1
  del loaded_tp2
  pbar.update(1)
 
  # Load output embedding
  pbar.set_description(f"Merging output embedding")
- loaded_tp1 = torch.load(os.path.join(input_checkpoint_path, "layer_48-model_00-model_states.pt"))
- loaded_tp2 = torch.load(os.path.join(input_checkpoint_path, "layer_48-model_01-model_states.pt"))
+ loaded_tp1 = torch.load(
+ os.path.join(input_checkpoint_path, "layer_48-model_00-model_states.pt")
+ )
+ loaded_tp2 = torch.load(
+ os.path.join(input_checkpoint_path, "layer_48-model_01-model_states.pt")
+ )
  merged = {
- "final_linear.weight": torch.cat([
- loaded_tp1["final_linear.weight"],
- loaded_tp2["final_linear.weight"],
- ], dim=0),
+ "final_linear.weight": torch.cat(
+ [
+ loaded_tp1["final_linear.weight"],
+ loaded_tp2["final_linear.weight"],
+ ],
+ dim=0,
+ ),
  }
- torch.save(merged, os.path.join(output_checkpoint_path, "layer_48-model_00-model_states.pt"))
+ torch.save(
+ merged,
+ os.path.join(output_checkpoint_path, "layer_48-model_00-model_states.pt"),
+ )
  del loaded_tp1
  del loaded_tp2
  pbar.update(1)
@@ -178,9 +227,11 @@ def merge(input_dir, output_dir):
  for i in range(8):
  modify_model_states(
  input_model_state_path=os.path.join(
- input_checkpoint_path, f"mp_rank_{i:02d}_model_states.pt"),
+ input_checkpoint_path, f"mp_rank_{i:02d}_model_states.pt"
+ ),
  output_model_state_path=os.path.join(
- output_checkpoint_path, f"mp_rank_{i:02d}_model_states.pt"),
+ output_checkpoint_path, f"mp_rank_{i:02d}_model_states.pt"
+ ),
  )
  modify_config(
  input_config_path=os.path.join(input_dir, "configs", "20B.yml"),
@@ -200,14 +251,18 @@ def merge(input_dir, output_dir):
 
 
 def main():
- parser = argparse.ArgumentParser(description='Merge 20B checkpoint.')
- parser.add_argument('--input_dir', type=str,
- help='Checkpoint dir, which should contain (e.g. a folder named "global_step150000")')
- parser.add_argument('--output_dir', type=str,
- help='Output dir, to save the 1-GPU weights configs')
+ parser = argparse.ArgumentParser(description="Merge 20B checkpoint.")
+ parser.add_argument(
+ "--input_dir",
+ type=str,
+ help='Checkpoint dir, which should contain (e.g. a folder named "global_step150000")',
+ )
+ parser.add_argument(
+ "--output_dir", type=str, help="Output dir, to save the 1-GPU weights configs"
+ )
  args = parser.parse_args()
  merge(args.input_dir, args.output_dir)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
  main()