llama: implement YaRN RoPE scaling #2268

Merged Nov 1, 2023 (36 commits)

Commits
8dec38c
llama: implement NTK-By-Parts (NTKv2) RoPE scaling
cebtenzzre Jul 18, 2023
6aeb46b
CUDA implementation
cebtenzzre Jul 19, 2023
9348aa4
Metal implementation
cebtenzzre Jul 21, 2023
a30ae20
implement new YaRN algorithm
cebtenzzre Sep 5, 2023
b5ced4f
Merge branch 'master' of https://github.com/ggerganov/llama.cpp into …
cebtenzzre Sep 5, 2023
826269a
ggml : increase GGML_MAX_OP_PARAMS
cebtenzzre Sep 5, 2023
cf731d5
YaRN : avoid NaN if unused betas are zero
cebtenzzre Sep 5, 2023
dcb058c
YaRN : fix missing parameter in CUDA impl
cebtenzzre Sep 5, 2023
281b26e
convert : reduce unnecessary variables in Params
cebtenzzre Sep 6, 2023
a06c729
Merge branch 'master' of https://github.com/ggerganov/llama.cpp into …
cebtenzzre Sep 21, 2023
dc26a0d
llama : simplify use of context params
cebtenzzre Sep 21, 2023
904d4ed
llama : store YaRN parameters in GGUF
cebtenzzre Sep 14, 2023
56abb9a
fix convert scripts
cebtenzzre Sep 21, 2023
43eaf06
llama : fix C compatibility
cebtenzzre Sep 21, 2023
fe788c4
don't hardcode max_pos_emb
cebtenzzre Sep 21, 2023
e0b120c
address review comments
cebtenzzre Sep 21, 2023
19bb74e
restore backwards compatiblity with *.rope.scale_linear
cebtenzzre Sep 21, 2023
4d5fe73
better option descriptions in help
cebtenzzre Sep 21, 2023
7466415
gguf : store scaling type as a string instead of an int
cebtenzzre Oct 7, 2023
4f4e948
improve printing of YaRN parameters
cebtenzzre Oct 7, 2023
5d7a3a5
allow forcing ext_factor to zero if scaling type is YaRN
cebtenzzre Oct 7, 2023
9bd050f
Merge branch 'master' of https://github.com/ggerganov/llama.cpp into …
cebtenzzre Oct 7, 2023
babf0e0
fix rope_cuda parameter order
cebtenzzre Oct 8, 2023
0050e1e
default n_yarn_orig_ctx to n_ctx_train
cebtenzzre Oct 8, 2023
09c3102
fix uninitialized cparams
cebtenzzre Oct 8, 2023
57c3442
make printed param formatting more consistent
cebtenzzre Oct 8, 2023
a20b3e6
fix missing import
cebtenzzre Oct 11, 2023
9ef91b1
Merge branch 'master' of https://github.com/ggerganov/llama.cpp into …
cebtenzzre Oct 13, 2023
9ae10b3
Fix YaRN inverted scaling and add "rope.scaling.type" to GGUF (#1)
jquesnelle Oct 20, 2023
14cf93b
fix YaRN ramp, make mscale conditional, add --yarn-orig-ctx (#2)
jquesnelle Oct 20, 2023
237f1e7
Merge branch 'master' of https://github.com/ggerganov/llama.cpp into …
cebtenzzre Oct 22, 2023
bc8395d
Merge branch 'master' of https://github.com/ggerganov/llama.cpp into …
cebtenzzre Oct 23, 2023
4d5ed83
Merge branch 'master' of https://github.com/ggerganov/llama.cpp into …
cebtenzzre Oct 24, 2023
9fc8238
fix loading rope.scaling.original_context_length from GGUF (#3)
jquesnelle Oct 30, 2023
15f26ef
implement YaRN for GPT-NeoX RoPE
cebtenzzre Nov 1, 2023
081f738
Merge branch 'master' of https://github.com/ggerganov/llama.cpp into …
cebtenzzre Nov 1, 2023
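
The commit messages above reference the two YaRN-specific pieces this PR wires into ggml: a per-dimension ramp that blends interpolated and extrapolated rotary frequencies (a30ae20, 14cf93b), and an attention scale, mscale, that is only applied when context scaling is active. The following is a minimal Python sketch of those formulas as described in the YaRN paper; the function and variable names are illustrative and are not the identifiers used in the ggml/CUDA/Metal implementations in this PR.

import math

def yarn_ramp(i: int, low: float, high: float) -> float:
    # Blend weight for rotary dimension pair i: 1.0 for low-index (high-frequency)
    # dimensions that are left extrapolated as-is, 0.0 for high-index (low-frequency)
    # dimensions that are fully interpolated, linear in between. `low` and `high`
    # are the correction dimensions derived from the beta_fast/beta_slow parameters.
    if high <= low:
        high = low + 0.001  # avoid a division by zero when the betas coincide
    return 1.0 - min(1.0, max(0.0, (i - low) / (high - low)))

def yarn_theta(theta_extrap: float, scale: float, ramp: float, ext_factor: float = 1.0) -> float:
    # Mix plain position interpolation (theta / scale) with extrapolation (theta).
    # ext_factor = 0 disables the mix entirely and degenerates to linear scaling.
    theta_interp = theta_extrap / scale
    mix = ramp * ext_factor
    return theta_interp * (1.0 - mix) + theta_extrap * mix

def yarn_mscale(scale: float) -> float:
    # Attention magnitude scaling, applied only when the context is actually
    # extended ("make mscale conditional", commit 14cf93b).
    return 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
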
Changes from commit 56abb9a406ef34a995c56be838ebe6529cd50438: fix convert scripts
cebtenzzre committed Sep 21, 2023
convert-baichuan-hf-to-gguf.py (2 additions, 1 deletion)

@@ -154,7 +154,8 @@ def parse_args() -> argparse.Namespace:
 if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
     if "type" in hparams["rope_scaling"]:
         if hparams["rope_scaling"]["type"] == "linear":
-            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+            gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])


 # TOKENIZATION
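
For reference, this is roughly what the Hugging Face config read by both convert scripts looks like. The values are illustrative and not taken from a specific checkpoint: the linear case carries only type and factor (handled by the Baichuan script above), while YaRN-finetuned checkpoints also carry the original_max_position_embeddings and finetuned keys that the convert.py hunk below picks up.

import json

# Illustrative rope_scaling block from a config.json (values are made up):
hparams = json.loads("""
{
    "max_position_embeddings": 16384,
    "rope_scaling": {
        "type": "yarn",
        "factor": 4.0,
        "original_max_position_embeddings": 4096,
        "finetuned": true
    }
}
""")

rope_scaling = hparams.get("rope_scaling")
if rope_scaling is not None and "factor" in rope_scaling:
    print(rope_scaling.get("type"), rope_scaling["factor"])  # -> yarn 4.0
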
convert.py (13 additions, 12 deletions)

@@ -205,13 +205,13 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
         rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
         rope_scaling = config.get("rope_scaling")

-        if rope_scaling is not None and typ := rope_scaling.get("type"):
+        if rope_scaling is not None and (typ := rope_scaling.get("type")):
             rope_factor = rope_scaling.get("factor")
             f_rope_scale = rope_factor
             if typ == "linear":
-                rope_scaling_type = RopeScalingType.LINEAR
+                rope_scaling_type = gguf.RopeScalingType.LINEAR
             elif typ == "yarn":
-                rope_scaling_type = RopeScalingType.YARN
+                rope_scaling_type = gguf.RopeScalingType.YARN
                 n_orig_ctx = rope_scaling['original_max_position_embeddings']
                 rope_finetuned = rope_scaling['finetuned']
         else:
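
The first change in this hunk is purely syntactic: an assignment expression used as an operand of `and` must be parenthesized, so the deleted line did not even parse. A quick stand-alone illustration (dictionary contents made up):

rope_scaling = {"type": "yarn", "factor": 4.0}

# The unparenthesized form is a SyntaxError (':=' binds looser than 'and'):
#   if rope_scaling is not None and typ := rope_scaling.get("type"):

# Parenthesized, as in the fixed line: binds `typ`, then tests its truthiness.
if rope_scaling is not None and (typ := rope_scaling.get("type")):
    print(typ)  # -> yarn
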
@@ -231,10 +231,10 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
             n_layer = config["num_hidden_layers"],
             n_ctx = n_ctx,
             n_ff = config["intermediate_size"],
-            n_head = config["num_attention_heads"],
-            n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head,
+            n_head = (n_head := config["num_attention_heads"]),
+            n_head_kv = config.get("num_key_value_heads", n_head),
             f_norm_eps = config["rms_norm_eps"],
-            f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None,
+            f_rope_freq_base = config.get("rope_theta"),
             f_rope_scale = f_rope_scale,
             n_orig_ctx = n_orig_ctx,
             rope_finetuned = rope_finetuned,
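
The other pattern introduced here relies on Python evaluating call arguments left to right: `(n_head := ...)` binds n_head while the earlier keyword argument is built, so the later config.get("num_key_value_heads", n_head) default can reuse it without an extra assignment statement. A self-contained sketch with a made-up dataclass standing in for Params:

from dataclasses import dataclass

@dataclass
class Heads:  # hypothetical stand-in for the real Params dataclass
    n_head: int
    n_head_kv: int

config = {"num_attention_heads": 32}  # no "num_key_value_heads" key present

heads = Heads(
    n_head    = (n_head := config["num_attention_heads"]),
    n_head_kv = config.get("num_key_value_heads", n_head),  # falls back to n_head
)
print(heads)  # Heads(n_head=32, n_head_kv=32)
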
@@ -247,7 +247,7 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
         config = json.load(open(config_path))

         # hack to determine LLaMA v1 vs v2 vs CodeLlama
-        if f_rope_freq_base == 1000000:
+        if config.get("rope_theta") == 1000000:
             # CodeLlama
             n_ctx = 16384
         elif config["norm_eps"] == 1e-05:
@@ -263,10 +263,10 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
             n_layer = config["n_layers"],
             n_ctx = n_ctx,
             n_ff = model["layers.0.feed_forward.w1.weight"].shape[0],
-            n_head = config["n_heads"],
-            n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head,
+            n_head = (n_head := config["n_heads"]),
+            n_head_kv = config.get("n_kv_heads", n_head),
             f_norm_eps = config["norm_eps"],
-            f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None,
+            f_rope_freq_base = config.get("rope_theta"),
         )

     @staticmethod
@@ -834,14 +834,15 @@ def add_meta_arch(self, params: Params) -> None:
             self.gguf.add_rope_freq_base(params.f_rope_freq_base)

         if params.rope_scaling_type:
+            assert params.f_rope_scale is not None
             self.gguf.add_rope_scaling_type(params.rope_scaling_type)
             self.gguf.add_rope_scaling_factor(params.f_rope_scale)

         if params.n_orig_ctx is not None:
-            self.gguf.add_rope_original_context_length(params.n_orig_ctx)
+            self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)

         if params.rope_finetuned is not None:
-            self.gguf.add_rope_finetuned(params.rope_finetuned)
+            self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)

         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)
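
For reference, a minimal sketch of how the renamed writer calls above might be exercised directly with the gguf Python package. The constructor arguments, file name, and the exact KV key strings in the comments are assumptions based on the method names in this diff and on the commit messages (rope.scaling.type, rope.scaling.original_context_length), not something the diff shows verbatim.

import gguf

# Assumed GGUFWriter signature: (path, architecture name); adjust to your gguf version.
writer = gguf.GGUFWriter("yarn-llama.gguf", "llama")

writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)  # rope.scaling.type = "yarn"
writer.add_rope_scaling_factor(4.0)                      # rope.scaling.factor
writer.add_rope_scaling_orig_ctx_len(4096)               # rope.scaling.original_context_length
writer.add_rope_scaling_finetuned(True)                  # presumably rope.scaling.finetuned

# Write the metadata-only file (no tensors in this sketch) and close it.
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()
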