Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MoE Support #677

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
More formatting
  • Loading branch information
Quentin-Anthony committed Sep 19, 2022
commit 276221c4b213493a273b15fbd97a7117ccf5bb20
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ repos:
args: []

- repo: https://github.com/psf/black
rev: 21.8b0
rev: 22.3.0
hooks:
- id: black
language_version: python3.8
language_version: python3
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
hooks:
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ We also support using TensorBoard via the <code><var>tensorboard-dir</var></code

# Running on multi-node

If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher, you can set the environment variable `DLTS_HOSTFILE` to point to the hostfile.
If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher, you can set the environment variable `DLTS_HOSTFILE` to point to the hostfile.

# Administrative Notes

Expand Down
5 changes: 2 additions & 3 deletions configs/neox_arguments.md
Original file line number Diff line number Diff line change
Expand Up @@ -922,7 +922,7 @@ Text Generation arguments

- **eval_results_prefix**: str

Default =
Default =

Prefix under which evaluation results are saved - the final file path will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json

Expand Down Expand Up @@ -1510,7 +1510,7 @@ Args for deepspeed config

Default = None





Expand Down Expand Up @@ -1636,4 +1636,3 @@ Args for deepspeed runner (deepspeed.launcher.runner).
Default = False

If true, autodetects NVLink pairs and remaps CUDA visible devices to place them next to each other. This is an EleutherAI addition to DeepSpeed and should speed up model-parallel training on setups with NVLink pairs when mp=2.

6 changes: 3 additions & 3 deletions eval_tasks/eval_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ def run_eval(
description_dict=None,
use_cache=True,
name="neox",
limit=None
limit=None,
):
was_training = self.model.training
self.model.eval()
Expand Down Expand Up @@ -389,7 +389,7 @@ def run_eval(
if use_cache:
# TODO(jon-tow): Append a subset of `neox_args` to the cache database
# name arg to distinguish model runs that use different configurations.
lm = base.CachingLM(lm, 'lm_cache/' + name + '.db')
lm = base.CachingLM(lm, "lm_cache/" + name + ".db")

results = evaluator.evaluate(
lm=lm,
Expand All @@ -409,7 +409,7 @@ def run_eval(
"no_cache": not use_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters,
"description_dict": description_dict
"description_dict": description_dict,
}

if was_training:
Expand Down
4 changes: 2 additions & 2 deletions requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
deepspeed
einops==0.3.0
ftfy==6.0.1
lm_dataformat==0.0.20
lm_eval==0.2.0
mpi4py==3.0.3
numpy==1.22.0
pybind11==2.6.2
deepspeed
mpi4py==3.0.3
regex
sentencepiece
six
Expand Down
175 changes: 115 additions & 60 deletions tools/merge20b.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,57 +70,77 @@ def merge_model_weights(input_checkpoint_path, output_checkpoint_path):
merged = {}

# RowParallelLinear
merged["mlp.dense_4h_to_h.weight"] = torch.cat([
loaded_tp1["mlp.dense_4h_to_h.weight"],
loaded_tp2["mlp.dense_4h_to_h.weight"]
], dim=1)
merged["attention.dense.weight"] = torch.cat([
loaded_tp1["attention.dense.weight"],
loaded_tp2["attention.dense.weight"]
], dim=1)
merged["mlp.dense_4h_to_h.weight"] = torch.cat(
[
loaded_tp1["mlp.dense_4h_to_h.weight"],
loaded_tp2["mlp.dense_4h_to_h.weight"],
],
dim=1,
)
merged["attention.dense.weight"] = torch.cat(
[
loaded_tp1["attention.dense.weight"],
loaded_tp2["attention.dense.weight"],
],
dim=1,
)
merged["mlp.dense_4h_to_h.bias"] = (
loaded_tp1["mlp.dense_4h_to_h.bias"]
+ loaded_tp2["mlp.dense_4h_to_h.bias"]
loaded_tp1["mlp.dense_4h_to_h.bias"] + loaded_tp2["mlp.dense_4h_to_h.bias"]
)
merged["attention.dense.bias"] = (
loaded_tp1["attention.dense.bias"]
+ loaded_tp2["attention.dense.bias"]
loaded_tp1["attention.dense.bias"] + loaded_tp2["attention.dense.bias"]
)

# Layer Norms
merged["input_layernorm.weight"] = (
loaded_tp1["input_layernorm.weight"]
+ loaded_tp2["input_layernorm.weight"]) / 2
loaded_tp1["input_layernorm.weight"] + loaded_tp2["input_layernorm.weight"]
) / 2
merged["input_layernorm.bias"] = (
loaded_tp1["input_layernorm.bias"]
+ loaded_tp2["input_layernorm.bias"]) / 2
loaded_tp1["input_layernorm.bias"] + loaded_tp2["input_layernorm.bias"]
) / 2
merged["post_attention_layernorm.weight"] = (
loaded_tp1["post_attention_layernorm.weight"]
+ loaded_tp2["post_attention_layernorm.weight"]) / 2
+ loaded_tp2["post_attention_layernorm.weight"]
) / 2
merged["post_attention_layernorm.bias"] = (
loaded_tp1["post_attention_layernorm.bias"]
+ loaded_tp2["post_attention_layernorm.bias"]) / 2
+ loaded_tp2["post_attention_layernorm.bias"]
) / 2

# ColumnParallelLinear
merged["mlp.dense_h_to_4h.weight"] = torch.cat([
loaded_tp1["mlp.dense_h_to_4h.weight"],
loaded_tp2["mlp.dense_h_to_4h.weight"],
], dim=0)
merged["mlp.dense_h_to_4h.bias"] = torch.cat([
loaded_tp1["mlp.dense_h_to_4h.bias"],
loaded_tp2["mlp.dense_h_to_4h.bias"],
], dim=0)
merged["attention.query_key_value.weight"] = torch.cat([
loaded_tp1["attention.query_key_value.weight"],
loaded_tp2["attention.query_key_value.weight"],
], dim=0)
merged["attention.query_key_value.bias"] = torch.cat([
loaded_tp1["attention.query_key_value.bias"],
loaded_tp2["attention.query_key_value.bias"],
], dim=0)
merged["mlp.dense_h_to_4h.weight"] = torch.cat(
[
loaded_tp1["mlp.dense_h_to_4h.weight"],
loaded_tp2["mlp.dense_h_to_4h.weight"],
],
dim=0,
)
merged["mlp.dense_h_to_4h.bias"] = torch.cat(
[
loaded_tp1["mlp.dense_h_to_4h.bias"],
loaded_tp2["mlp.dense_h_to_4h.bias"],
],
dim=0,
)
merged["attention.query_key_value.weight"] = torch.cat(
[
loaded_tp1["attention.query_key_value.weight"],
loaded_tp2["attention.query_key_value.weight"],
],
dim=0,
)
merged["attention.query_key_value.bias"] = torch.cat(
[
loaded_tp1["attention.query_key_value.bias"],
loaded_tp2["attention.query_key_value.bias"],
],
dim=0,
)

# Just take one
merged["attention.rotary_emb.inv_freq"] = loaded_tp1["attention.rotary_emb.inv_freq"]
merged["attention.rotary_emb.inv_freq"] = loaded_tp1[
"attention.rotary_emb.inv_freq"
]

torch.save(merged, os.path.join(output_checkpoint_path, filename_tp1))
del loaded_tp1
Expand All @@ -129,41 +149,70 @@ def merge_model_weights(input_checkpoint_path, output_checkpoint_path):

# Load input embedding
pbar.set_description(f"Merging input embedding")
loaded_tp1 = torch.load(os.path.join(input_checkpoint_path, "layer_00-model_00-model_states.pt"))
loaded_tp2 = torch.load(os.path.join(input_checkpoint_path, "layer_00-model_01-model_states.pt"))
merged = {"word_embeddings.weight": torch.cat([
loaded_tp1["word_embeddings.weight"],
loaded_tp2["word_embeddings.weight"],
], dim=0)}
torch.save(merged, os.path.join(output_checkpoint_path, "layer_00-model_00-model_states.pt"))
loaded_tp1 = torch.load(
os.path.join(input_checkpoint_path, "layer_00-model_00-model_states.pt")
)
loaded_tp2 = torch.load(
os.path.join(input_checkpoint_path, "layer_00-model_01-model_states.pt")
)
merged = {
"word_embeddings.weight": torch.cat(
[
loaded_tp1["word_embeddings.weight"],
loaded_tp2["word_embeddings.weight"],
],
dim=0,
)
}
torch.save(
merged,
os.path.join(output_checkpoint_path, "layer_00-model_00-model_states.pt"),
)
del loaded_tp1
del loaded_tp2
pbar.update(1)

# Load final layer norm
pbar.set_description(f"Merging final layer norm")
loaded_tp1 = torch.load(os.path.join(input_checkpoint_path, "layer_47-model_00-model_states.pt"))
loaded_tp2 = torch.load(os.path.join(input_checkpoint_path, "layer_47-model_01-model_states.pt"))
loaded_tp1 = torch.load(
os.path.join(input_checkpoint_path, "layer_47-model_00-model_states.pt")
)
loaded_tp2 = torch.load(
os.path.join(input_checkpoint_path, "layer_47-model_01-model_states.pt")
)
merged = {
"norm.weight": (loaded_tp1["norm.weight"] + loaded_tp2["norm.weight"]) / 2,
"norm.bias": (loaded_tp1["norm.bias"] + loaded_tp2["norm.bias"]) / 2,
}
torch.save(merged, os.path.join(output_checkpoint_path, "layer_47-model_00-model_states.pt"))
torch.save(
merged,
os.path.join(output_checkpoint_path, "layer_47-model_00-model_states.pt"),
)
del loaded_tp1
del loaded_tp2
pbar.update(1)

# Load output embedding
pbar.set_description(f"Merging output embedding")
loaded_tp1 = torch.load(os.path.join(input_checkpoint_path, "layer_48-model_00-model_states.pt"))
loaded_tp2 = torch.load(os.path.join(input_checkpoint_path, "layer_48-model_01-model_states.pt"))
loaded_tp1 = torch.load(
os.path.join(input_checkpoint_path, "layer_48-model_00-model_states.pt")
)
loaded_tp2 = torch.load(
os.path.join(input_checkpoint_path, "layer_48-model_01-model_states.pt")
)
merged = {
"final_linear.weight": torch.cat([
loaded_tp1["final_linear.weight"],
loaded_tp2["final_linear.weight"],
], dim=0),
"final_linear.weight": torch.cat(
[
loaded_tp1["final_linear.weight"],
loaded_tp2["final_linear.weight"],
],
dim=0,
),
}
torch.save(merged, os.path.join(output_checkpoint_path, "layer_48-model_00-model_states.pt"))
torch.save(
merged,
os.path.join(output_checkpoint_path, "layer_48-model_00-model_states.pt"),
)
del loaded_tp1
del loaded_tp2
pbar.update(1)
Expand All @@ -178,9 +227,11 @@ def merge(input_dir, output_dir):
for i in range(8):
modify_model_states(
input_model_state_path=os.path.join(
input_checkpoint_path, f"mp_rank_{i:02d}_model_states.pt"),
input_checkpoint_path, f"mp_rank_{i:02d}_model_states.pt"
),
output_model_state_path=os.path.join(
output_checkpoint_path, f"mp_rank_{i:02d}_model_states.pt"),
output_checkpoint_path, f"mp_rank_{i:02d}_model_states.pt"
),
)
modify_config(
input_config_path=os.path.join(input_dir, "configs", "20B.yml"),
Expand All @@ -200,14 +251,18 @@ def merge(input_dir, output_dir):


def main():
    """Parse the command-line arguments and run the checkpoint merge.

    Reads ``--input_dir`` and ``--output_dir`` from the command line and
    forwards both values, unchanged, to ``merge``.
    """
    parser = argparse.ArgumentParser(description="Merge 20B checkpoint.")
    parser.add_argument(
        "--input_dir",
        type=str,
        help='Checkpoint dir, which should contain (e.g. a folder named "global_step150000")',
    )
    parser.add_argument(
        "--output_dir", type=str, help="Output dir, to save the 1-GPU weights configs"
    )
    args = parser.parse_args()
    merge(args.input_dir, args.output_dir)


# Only run the merge when executed as a script, not when imported.
if __name__ == "__main__":
    main()