From 8dec38c35ca6682bcbdde1b55438b1527020a389 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Mon, 17 Jul 2023 20:07:15 -0400 Subject: [PATCH 01/28] llama: implement NTK-By-Parts (NTKv2) RoPE scaling --- examples/common.cpp | 16 ++++ examples/common.h | 2 + examples/server/server.cpp | 18 +++++ ggml.c | 155 +++++++++++++++++++++++++++---------- ggml.h | 8 +- llama.cpp | 38 ++++++--- llama.h | 2 + 7 files changed, 189 insertions(+), 50 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 21f4a0357d422..957022d0cb360 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -194,6 +194,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--rope-ntk-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_ntk_factor = std::stof(argv[i]); + } else if (arg == "--rope-ext-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_ext_factor = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -566,6 +578,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); + fprintf(stdout, " --rope-ntk-factor N RoPE NTK mix factor (default: %.1f)\n", params.rope_ntk_factor); + fprintf(stdout, " --rope-ext-factor N RoPE extrapolation mix factor (default: %.1f)\n", params.rope_ext_factor); fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); fprintf(stdout, " --no-penalize-nl do not penalize newline token\n"); fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); @@ -657,6 +671,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param lparams.embedding = params.embedding; lparams.rope_freq_base = params.rope_freq_base; lparams.rope_freq_scale = params.rope_freq_scale; + lparams.rope_ntk_factor = params.rope_ntk_factor; + lparams.rope_ext_factor = params.rope_ext_factor; return lparams; } diff --git a/examples/common.h b/examples/common.h index 375bc0a3db416..677676ad131b5 100644 --- a/examples/common.h +++ b/examples/common.h @@ -32,6 +32,8 @@ struct gpt_params { float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon float rope_freq_base = 10000.0f; // RoPE base frequency float rope_freq_scale = 1.0f; // RoPE frequency scaling factor + float rope_ntk_factor = 0.0f; // RoPE NTK mix factor + float rope_ext_factor = 0.0f; // RoPE extrapolation mix factor // sampling parameters std::unordered_map logit_bias; // logit bias for specific tokens diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6f7a66da108c8..49d2dd0508e0f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -612,6 +612,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! 
use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps); fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); + fprintf(stdout, " --rope-ntk-factor N RoPE NTK mix factor (default: %.1f)\n", params.rope_ntk_factor); + fprintf(stdout, " --rope-ext-factor N RoPE extrapolation mix factor (default: %.1f)\n", params.rope_ext_factor); fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); @@ -764,6 +766,22 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.rope_freq_scale = std::stof(argv[i]); } + else if (arg == "--rope-ntk-factor") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_ntk_factor = std::stof(argv[i]); + } + else if (arg == "--rope-ext-factor") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_ext_factor = std::stof(argv[i]); + } else if (arg == "--memory-f32" || arg == "--memory_f32") { params.memory_f16 = false; diff --git a/ggml.c b/ggml.c index beb7f464167d5..8c5f7ac2641ef 100644 --- a/ggml.c +++ b/ggml.c @@ -1,5 +1,6 @@ #define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows +#define _USE_MATH_DEFINES // For M_PI on MSVC #include "ggml.h" @@ -6711,6 +6712,8 @@ static struct ggml_tensor * ggml_rope_impl( int n_ctx, float freq_base, float freq_scale, + float ntk_factor, + float ext_factor, bool inplace) { GGML_ASSERT(n_past >= 0); bool is_node = false; @@ -6721,9 +6724,11 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[6] = { n_past, n_dims, mode, n_ctx }; + int32_t params[8] = { n_past, n_dims, mode, n_ctx }; memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float)); + memcpy(params + 6, &ntk_factor, sizeof(float)); + memcpy(params + 7, &ext_factor, sizeof(float)); ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE; @@ -6740,7 +6745,7 @@ struct ggml_tensor * ggml_rope( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 0.0f, false); } struct ggml_tensor * ggml_rope_inplace( @@ -6750,7 +6755,7 @@ struct ggml_tensor * ggml_rope_inplace( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 0.0f, true); } struct ggml_tensor * ggml_rope_custom( @@ -6761,8 +6766,10 @@ struct ggml_tensor * ggml_rope_custom( int mode, int n_ctx, float freq_base, - float freq_scale) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false); + float freq_scale, + float ntk_factor, + float ext_factor) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ntk_factor, ext_factor, false); } struct ggml_tensor * ggml_rope_custom_inplace( @@ -6773,8 +6780,10 @@ struct ggml_tensor * ggml_rope_custom_inplace( int mode, int n_ctx, float freq_base, - float freq_scale) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true); + float freq_scale, + float ntk_factor, + float ext_factor) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ntk_factor, ext_factor, true); } // ggml_rope_back @@ -12003,6 +12012,52 @@ static void ggml_compute_forward_clamp( // ggml_compute_forward_rope +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +#define NTKV2_MAX_POS_EMB 2048 +#define NTKV2_CORRECTION_FACTOR(n_rot) (__builtin_logf(NTKV2_MAX_POS_EMB / ((n_rot) * 2 * (float)M_PI)) / 2) + +static inline float rope_ntkv2_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / MIN(0.001f, high - low); + return 1 - MIN(1, MAX(0, y)); +} + +// NTKv2 algorithm based on LlamaPartNTKScaledRotaryEmbedding.py from https://github.com/jquesnelle/scaled-rope +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static float rope_ntkv2( + const float theta_base, + const float theta_ntk, + const float dims_over_base, + const float freq_scale, + const int64_t i0, + const float ntk_factor, + const float ext_factor, + const int n_dims) { + // Interpolation constants found experimentally for LLaMA (might not be totally optimal though) + // Do not change unless there is a good reason for doing so! 
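// Worked example (illustrative values, assuming a LLaMA-style head size): with n_dims = 128,
// base = 10000 and max_pos_emb = 2048, the formula above gives corr_fac(1.75) ~ 36.3,
// corr_fac(1.25) ~ 38.7, corr_fac(16) ~ 20.9 and corr_fac(2) ~ 35.4. After the floor/ceil
// clamping below, the NTK ramp therefore covers rotary pairs roughly 36..39 and the
// extrapolation ramp roughly 20..36 (both ramps are indexed by i0/2).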
+ static const float BETA_0 = 1.75f; + static const float BETA_1 = 1.25f; + static const float GAMMA_0 = 16.0f; + static const float GAMMA_1 = 2.0f; + + static const float low_1p = NTKV2_CORRECTION_FACTOR(BETA_0); + static const float high_1p = NTKV2_CORRECTION_FACTOR(BETA_1); + static const float low_2p = NTKV2_CORRECTION_FACTOR(GAMMA_0); + static const float high_2p = NTKV2_CORRECTION_FACTOR(GAMMA_1); + + // start and end correction factors + const float low_1 = MAX(0, floorf(low_1p * dims_over_base)); + const float high_1 = MIN(n_dims - 1, ceilf(high_1p * dims_over_base)); + const float low_2 = MAX(0, floorf(low_2p * dims_over_base)); + const float high_2 = MIN(n_dims - 1, ceilf(high_2p * dims_over_base)); + + const float theta_linear = freq_scale * theta_base; + const float ramp_mix = rope_ntkv2_ramp(low_1, high_1, i0) * ntk_factor; + const float theta_mix = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; + const float ramp_final = rope_ntkv2_ramp(low_2, high_2, i0) * ext_factor; + return theta_mix * (1 - ramp_final) + theta_base * ramp_final; +} + static void ggml_compute_forward_rope_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -12014,6 +12069,8 @@ static void ggml_compute_forward_rope_f32( float freq_base; float freq_scale; + float ntk_factor; + float ext_factor; const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; @@ -12021,6 +12078,8 @@ static void ggml_compute_forward_rope_f32( const int n_ctx = ((int32_t *) dst->op_params)[3]; memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); assert(n_past >= 0); @@ -12050,6 +12109,8 @@ static void ggml_compute_forward_rope_f32( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); + const float dims_over_base = n_dims / logf(freq_base); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12061,18 +12122,19 @@ static void ggml_compute_forward_rope_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = (float)p; + float theta_ntk = theta_base; if (is_glm) { - theta = MIN(p, n_ctx - 2); + theta_base = MIN(p, n_ctx - 2); float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); const float cos_block_theta = cosf(block_theta); const float sin_block_theta = sinf(block_theta); - theta *= theta_scale; + theta_base *= theta_scale; block_theta *= theta_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); @@ -12090,10 +12152,13 @@ static void ggml_compute_forward_rope_f32( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float theta = rope_ntkv2(theta_base, theta_ntk, dims_over_base, + freq_scale, i0, ntk_factor, ext_factor, n_dims); const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); - theta *= theta_scale; + theta_base *= theta_scale; + theta_ntk *= theta_ntk_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + 
i0*nb00); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -12107,12 +12172,13 @@ static void ggml_compute_forward_rope_f32( } else { // TODO: this is probably wrong, but I can't figure it out .. // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -12143,6 +12209,8 @@ static void ggml_compute_forward_rope_f16( float freq_base; float freq_scale; + float ntk_factor; + float ext_factor; const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; @@ -12150,6 +12218,8 @@ static void ggml_compute_forward_rope_f16( const int n_ctx = ((int32_t *) dst->op_params)[3]; memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); assert(n_past >= 0); @@ -12179,6 +12249,8 @@ static void ggml_compute_forward_rope_f16( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); + const float dims_over_base = n_dims / logf(freq_base); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12190,18 +12262,19 @@ static void ggml_compute_forward_rope_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = (float)p; + float theta_ntk = theta_base; if (is_glm) { - theta = MIN(p, n_ctx - 2); + theta_base = MIN(p, n_ctx - 2); float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); const float cos_block_theta = cosf(block_theta); const float sin_block_theta = sinf(block_theta); - theta *= theta_scale; + theta_base *= theta_scale; block_theta *= theta_scale; const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); @@ -12219,10 +12292,13 @@ static void ggml_compute_forward_rope_f16( } } if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float theta = rope_ntkv2(theta_base, theta_ntk, dims_over_base, + freq_scale, i0, ntk_factor, ext_factor, n_dims); const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); - theta *= theta_scale; + theta_base *= theta_scale; + theta_ntk *= theta_ntk_scale; const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -12236,12 +12312,13 @@ static void ggml_compute_forward_rope_f16( } else { // TODO: this is probably wrong, but I can't figure it out .. 
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -12335,14 +12412,14 @@ static void ggml_compute_forward_rope_back_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta_base = (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -12356,10 +12433,10 @@ static void ggml_compute_forward_rope_back_f32( } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -12431,14 +12508,14 @@ static void ggml_compute_forward_rope_back_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta_base = (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -12452,10 +12529,10 @@ static void ggml_compute_forward_rope_back_f16( } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; diff --git a/ggml.h b/ggml.h index bdbd128004332..459d217df8068 100644 --- a/ggml.h +++ b/ggml.h @@ -1194,7 +1194,9 @@ extern "C" { int mode, int n_ctx, float freq_base, - float freq_scale); + float freq_scale, + float ntk_factor, + float ext_factor); // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_rope_custom_inplace( @@ -1205,7 +1207,9 @@ extern "C" { int mode, int n_ctx, float freq_base, - float freq_scale); + float freq_scale, + float ntk_factor, + float ext_factor); // rotary position embedding backward, i.e compute dx from dy // a - dy diff --git a/llama.cpp b/llama.cpp index 39aefd499dd0c..945215de8a31e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -196,6 +196,8 @@ struct llama_hparams { float rope_freq_base = 10000.0f; float rope_freq_scale = 1.0f; + float rope_ntk_factor = 0.0f; + float rope_ext_factor = 0.0f; enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; @@ 
-898,6 +900,8 @@ struct llama_context_params llama_context_default_params() { /*.tensor_split =*/ nullptr, /*.rope_freq_base =*/ 10000.0f, /*.rope_freq_scale =*/ 1.0f, + /*.rope_ntk_factor =*/ 0.0f, + /*.rope_ext_factor =*/ 0.0f, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, @@ -1032,6 +1036,8 @@ static void llama_model_load_internal( const bool mul_mat_q, float rope_freq_base, float rope_freq_scale, + float rope_ntk_factor, + float rope_ext_factor, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -1083,6 +1089,8 @@ static void llama_model_load_internal( hparams.rope_freq_base = rope_freq_base; hparams.rope_freq_scale = rope_freq_scale; + hparams.rope_ntk_factor = rope_ntk_factor; + hparams.rope_ext_factor = rope_ext_factor; } // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199 @@ -1106,6 +1114,8 @@ static void llama_model_load_internal( fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); + fprintf(stderr, "%s: ntk_factor = %g\n", __func__, hparams.rope_ntk_factor); + fprintf(stderr, "%s: ext_factor = %g\n", __func__, hparams.rope_ext_factor); fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } @@ -1374,6 +1384,8 @@ static bool llama_model_load( const bool mul_mat_q, float rope_freq_base, float rope_freq_scale, + float rope_ntk_factor, + float rope_ext_factor, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -1382,9 +1394,10 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, - main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type, - use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); + llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, + tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, rope_ntk_factor, + rope_ext_factor, low_vram, memory_type, use_mmap, use_mlock, vocab_only, + progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { fprintf(stderr, "error loading model: %s\n", err.what()); @@ -1422,6 +1435,8 @@ static struct ggml_cgraph * llama_build_graph( const float freq_base = hparams.rope_freq_base; const float freq_scale = hparams.rope_freq_scale; + const float ntk_factor = hparams.rope_ntk_factor; + const float ext_factor = hparams.rope_ext_factor; const float rms_norm_eps = hparams.f_rms_norm_eps; const int n_gpu_layers = model.n_gpu_layers; @@ -1551,11 +1566,15 @@ static struct ggml_cgraph * llama_build_graph( offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Kcur = ggml_rope_custom_inplace( + ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, + freq_scale, ntk_factor, ext_factor); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = 
ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Qcur = ggml_rope_custom_inplace( + ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, + freq_scale, ntk_factor, ext_factor); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -3197,10 +3216,11 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers, - params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram, - memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, - params.progress_callback_user_data)) { + if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, + params.rms_norm_eps, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.mul_mat_q, + params.rope_freq_base, params.rope_freq_scale, params.rope_ntk_factor, params.rope_ext_factor, + params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, + params.progress_callback, params.progress_callback_user_data)) { delete model; fprintf(stderr, "%s: failed to load model\n", __func__); return nullptr; diff --git a/llama.h b/llama.h index fa1977f2d9492..25bb3952a73f8 100644 --- a/llama.h +++ b/llama.h @@ -100,6 +100,8 @@ extern "C" { // ref: https://github.com/ggerganov/llama.cpp/pull/2054 float rope_freq_base; // RoPE base frequency float rope_freq_scale; // RoPE frequency scaling factor + float rope_ntk_factor; // RoPE NTK mix factor + float rope_ext_factor; // RoPE extrapolation mix factor // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; From 6aeb46b343ee15f28583e381fcf5b5f687fb09aa Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Tue, 18 Jul 2023 22:28:27 -0400 Subject: [PATCH 02/28] CUDA implementation --- ggml-cuda.cu | 84 +++++++++++++++++++++++++++++++++++++++++++++------- ggml.c | 70 +++++++++++++++++++++++-------------------- ggml.h | 3 ++ 3 files changed, 115 insertions(+), 42 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 9d42efb0d0b03..91a6edca60011 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -3558,9 +3558,49 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, cpy_1(cx + x_offset, cdst + dst_offset); } +static __device__ float rope_ntkv2_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / min(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} + +struct rope_corr_factors { + float v[4]; +}; + +// NTKv2 algorithm based on LlamaPartNTKScaledRotaryEmbedding.py from https://github.com/jquesnelle/scaled-rope +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. 
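// In brief, the device function below blends three candidate angles per dimension: theta_linear
// (the position-interpolated angle) is mixed with theta_ntk (the NTK-by-parts angle) by
// rope_ntkv2_ramp(..) * ntk_factor, and the result is then mixed with the unscaled theta_base by
// a second ramp scaled by ext_factor, so ntk_factor = ext_factor = 0 reduces to plain linear
// position interpolation.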
+static __device__ float rope_ntkv2( + const float theta_base, + const float theta_linear, + const float theta_ntk, + const rope_corr_factors corr_factors, + const int64_t i0, + const float ntk_factor, + const float ext_factor) { + float ramp_mix; + float theta; + + ramp_mix = rope_ntkv2_ramp(corr_factors.v[0], corr_factors.v[1], i0) * ntk_factor; + theta = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; + + ramp_mix = rope_ntkv2_ramp(corr_factors.v[2], corr_factors.v[3], i0) * ext_factor; + theta = theta * (1 - ramp_mix) + theta_base * ramp_mix; + return theta; +} + // rope == RoPE == rotary positional embedding -static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale) { +static __global__ void rope_f32( + const float * x, + float * dst, + const int ncols, + const float freq_scale, + const float ntk_factor, + const float ext_factor, + const float theta_scale, + const float theta_ntk_scale, + const float p0, + const int p_delta_rows, + const rope_corr_factors corr_factors) { const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x); if (col >= ncols) { @@ -3570,7 +3610,11 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c const int row = blockDim.y*blockIdx.y + threadIdx.y; const int i = row*ncols + col; - const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2); + const float p = p0 + row / p_delta_rows; + const float theta_base = p*powf(theta_scale, col/2); + const float theta_linear = freq_scale * theta_base; + const float theta_ntk = p*powf(theta_ntk_scale, col/2); + const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, col, ntk_factor, ext_factor); const float sin_theta = sinf(theta); const float cos_theta = cosf(theta); @@ -4234,13 +4278,26 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons scale_f32<<>>(x, dst, scale, k); } -static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { +static void rope_f32_cuda( + const float * x, + float * dst, + const int ncols, + const int nrows, + const float freq_scale, + const float ntk_factor, + const float ext_factor, + const float theta_scale, + const float theta_ntk_scale, + const float p0, + const int p_delta_rows, + const rope_corr_factors corr_factors, + cudaStream_t stream) { GGML_ASSERT(nrows % 2 == 0); const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(num_blocks_x, nrows, 1); - rope_f32<<>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); + rope_f32<<>>(x, dst, ncols, freq_scale, ntk_factor, ext_factor, theta_scale, + theta_ntk_scale, p0, p_delta_rows, corr_factors); } static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) { @@ -4941,11 +4998,13 @@ inline void ggml_cuda_op_rope( const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; - // RoPE alteration for extended context - float freq_base, freq_scale; + // RoPE alteration for extended context + float freq_base, freq_scale, ntk_factor, ext_factor; memcpy(&freq_base, (int32_t *) 
dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); const float theta_scale = powf(freq_base, -2.0f/n_dims); @@ -4958,8 +5017,13 @@ inline void ggml_cuda_op_rope( const float block_p = max(p - (n_ctx - 2.f), 0.f); rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main); } else { - const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale; - rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main); + const float p0 = (mode & 1) == 0 ? n_past : 0; + const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); + rope_corr_factors corr_factors; + ggml_rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors.v); + + rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, freq_scale, ntk_factor, ext_factor, theta_scale, + theta_ntk_scale, p0, ne01, corr_factors, cudaStream_main); } (void) src1; diff --git a/ggml.c b/ggml.c index 8c5f7ac2641ef..8a57391811dcc 100644 --- a/ggml.c +++ b/ggml.c @@ -12012,11 +12012,6 @@ static void ggml_compute_forward_clamp( // ggml_compute_forward_rope -// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get -// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -#define NTKV2_MAX_POS_EMB 2048 -#define NTKV2_CORRECTION_FACTOR(n_rot) (__builtin_logf(NTKV2_MAX_POS_EMB / ((n_rot) * 2 * (float)M_PI)) / 2) - static inline float rope_ntkv2_ramp(const float low, const float high, const int i0) { const float y = (i0 / 2 - low) / MIN(0.001f, high - low); return 1 - MIN(1, MAX(0, y)); @@ -12026,36 +12021,43 @@ static inline float rope_ntkv2_ramp(const float low, const float high, const int // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. static float rope_ntkv2( const float theta_base, + const float theta_linear, const float theta_ntk, - const float dims_over_base, - const float freq_scale, + const float corr_factors[4], const int64_t i0, const float ntk_factor, - const float ext_factor, - const int n_dims) { + const float ext_factor) { + float ramp_mix; + float theta; + + ramp_mix = rope_ntkv2_ramp(corr_factors[0], corr_factors[1], i0) * ntk_factor; + theta = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; + + ramp_mix = rope_ntkv2_ramp(corr_factors[2], corr_factors[3], i0) * ext_factor; + theta = theta * (1 - ramp_mix) + theta_base * ramp_mix; + return theta; +} + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float ggml_rope_ntkv2_corr_factor(const int n_dims, const float n_rot, const float base) { + static const float max_pos_emb = 2048; + return n_dims * logf(max_pos_emb / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +} + +void ggml_rope_ntkv2_corr_factors(int n_dims, const float freq_base, float factors[4]) { // Interpolation constants found experimentally for LLaMA (might not be totally optimal though) // Do not change unless there is a good reason for doing so! 
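// Usage note: the CPU and CUDA paths in this patch call this helper as
//   float corr_factors[4];
//   ggml_rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors);
// where entries 0..1 bound the NTK ramp and entries 2..3 bound the extrapolation ramp,
// both expressed in units of i0/2.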
- static const float BETA_0 = 1.75f; - static const float BETA_1 = 1.25f; + static const float BETA_0 = 1.75f; + static const float BETA_1 = 1.25f; static const float GAMMA_0 = 16.0f; static const float GAMMA_1 = 2.0f; - static const float low_1p = NTKV2_CORRECTION_FACTOR(BETA_0); - static const float high_1p = NTKV2_CORRECTION_FACTOR(BETA_1); - static const float low_2p = NTKV2_CORRECTION_FACTOR(GAMMA_0); - static const float high_2p = NTKV2_CORRECTION_FACTOR(GAMMA_1); - // start and end correction factors - const float low_1 = MAX(0, floorf(low_1p * dims_over_base)); - const float high_1 = MIN(n_dims - 1, ceilf(high_1p * dims_over_base)); - const float low_2 = MAX(0, floorf(low_2p * dims_over_base)); - const float high_2 = MIN(n_dims - 1, ceilf(high_2p * dims_over_base)); - - const float theta_linear = freq_scale * theta_base; - const float ramp_mix = rope_ntkv2_ramp(low_1, high_1, i0) * ntk_factor; - const float theta_mix = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; - const float ramp_final = rope_ntkv2_ramp(low_2, high_2, i0) * ext_factor; - return theta_mix * (1 - ramp_final) + theta_base * ramp_final; + factors[0] = MAX(0, floorf(ggml_rope_ntkv2_corr_factor(n_dims, BETA_0, freq_base))); + factors[1] = MIN(n_dims - 1, ceilf(ggml_rope_ntkv2_corr_factor(n_dims, BETA_1, freq_base))); + factors[2] = MAX(0, floorf(ggml_rope_ntkv2_corr_factor(n_dims, GAMMA_0, freq_base))); + factors[3] = MIN(n_dims - 1, ceilf(ggml_rope_ntkv2_corr_factor(n_dims, GAMMA_1, freq_base))); } static void ggml_compute_forward_rope_f32( @@ -12110,7 +12112,8 @@ static void ggml_compute_forward_rope_f32( const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); - const float dims_over_base = n_dims / logf(freq_base); + float corr_factors[4]; + ggml_rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12152,8 +12155,9 @@ static void ggml_compute_forward_rope_f32( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float theta = rope_ntkv2(theta_base, theta_ntk, dims_over_base, - freq_scale, i0, ntk_factor, ext_factor, n_dims); + const float theta_linear = freq_scale * theta_base; + const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, + i0, ntk_factor, ext_factor); const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); @@ -12250,7 +12254,8 @@ static void ggml_compute_forward_rope_f16( const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); - const float dims_over_base = n_dims / logf(freq_base); + float corr_factors[4]; + ggml_rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12292,8 +12297,9 @@ static void ggml_compute_forward_rope_f16( } } if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float theta = rope_ntkv2(theta_base, theta_ntk, dims_over_base, - freq_scale, i0, ntk_factor, ext_factor, n_dims); + const float theta_linear = freq_scale * theta_base; + const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, + i0, ntk_factor, ext_factor); const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); diff --git a/ggml.h b/ggml.h index 459d217df8068..c2c6b7b1d376c 100644 --- a/ggml.h +++ b/ggml.h @@ -1211,6 +1211,9 @@ extern "C" { 
float ntk_factor, float ext_factor); + // compute correction factors for NTKv2 RoPE scaling + void ggml_rope_ntkv2_corr_factors(int n_dims, const float freq_base, float factors[4]); + // rotary position embedding backward, i.e compute dx from dy // a - dy GGML_API struct ggml_tensor * ggml_rope_back( From 9348aa4df9889707f8dd4189e544cf4b816ff798 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Fri, 21 Jul 2023 17:10:57 -0400 Subject: [PATCH 03/28] Metal implementation --- ggml-metal.m | 7 +++-- ggml-metal.metal | 67 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index b47a98e214b61..372d3e696e9c3 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1035,10 +1035,11 @@ void ggml_metal_graph_compute( const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; - float freq_base; - float freq_scale; + float freq_base, freq_scale, ntk_factor, ext_factor; memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1064,6 +1065,8 @@ void ggml_metal_graph_compute( [encoder setBytes:&mode length:sizeof( int) atIndex:20]; [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; + [encoder setBytes:&ntk_factor length:sizeof(float) atIndex:23]; + [encoder setBytes:&ext_factor length:sizeof(float) atIndex:24]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 8d26b5ec2dfa4..347fd17ac093e 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -597,6 +597,55 @@ kernel void kernel_alibi_f32( } } +static float rope_ntkv2_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / min(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} + +// NTKv2 algorithm based on LlamaPartNTKScaledRotaryEmbedding.py from https://github.com/jquesnelle/scaled-rope +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static float rope_ntkv2( + const float theta_base, + const float theta_linear, + const float theta_ntk, + const float corr_factors[4], + const int64_t i0, + const float ntk_factor, + const float ext_factor) { + float ramp_mix; + float theta; + + ramp_mix = rope_ntkv2_ramp(corr_factors[0], corr_factors[1], i0) * ntk_factor; + theta = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; + + ramp_mix = rope_ntkv2_ramp(corr_factors[2], corr_factors[3], i0) * ext_factor; + theta = theta * (1 - ramp_mix) + theta_base * ramp_mix; + return theta; +} + +// Interpolation constants found experimentally for LLaMA (might not be totally optimal though) +// Do not change unless there is a good reason for doing so! 
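// Note on the Metal path: kernel_rope is dispatched with one thread per row (MTLSizeMake(ne01,
// ne02, ne03) threadgroups of size 1x1x1 in ggml-metal.m above), and each thread walks the head
// dimension two elements at a time, mirroring the CPU implementation; the neox branch is still
// left as a TODO here.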
+constant float BETA_0 = 1.75f; +constant float BETA_1 = 1.25f; +constant float GAMMA_0 = 16.0f; +constant float GAMMA_1 = 2.0f; + +constant float max_pos_emb = 2048; + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float rope_ntkv2_corr_factor(const int n_dims, const float n_rot, const float base) { + return n_dims * log(max_pos_emb / (n_rot * 2 * M_PI_F)) / (2 * log(base)); +} + +static void rope_ntkv2_corr_factors(int n_dims, const float freq_base, float factors[4]) { + // start and end correction factors + factors[0] = max(0.0f, floor(rope_ntkv2_corr_factor(n_dims, BETA_0, freq_base))); + factors[1] = min(n_dims - 1.0f, ceil(rope_ntkv2_corr_factor(n_dims, BETA_1, freq_base))); + factors[2] = max(0.0f, floor(rope_ntkv2_corr_factor(n_dims, GAMMA_0, freq_base))); + factors[3] = min(n_dims - 1.0f, ceil(rope_ntkv2_corr_factor(n_dims, GAMMA_1, freq_base))); +} + kernel void kernel_rope( device const void * src0, device float * dst, @@ -621,24 +670,33 @@ kernel void kernel_rope( constant int & mode, constant float & freq_base, constant float & freq_scale, + constant float & ntk_factor, + constant float & ext_factor, uint3 tpig[[thread_position_in_grid]]) { const int64_t i3 = tpig[2]; const int64_t i2 = tpig[1]; const int64_t i1 = tpig[0]; - const bool is_neox = mode & 2; const float theta_scale = pow(freq_base, -2.0f/n_dims); + const float theta_ntk_scale = pow(freq_base * pow(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); + float corr_factors[4]; + rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors); - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + float theta_base = (mode & 1) == 0 ? n_past + i2 : i2; + float theta_ntk = theta_base; - float theta = freq_scale * (float)p; + const bool is_neox = mode & 2; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float theta_linear = freq_scale * theta_base; + const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, + i0, ntk_factor, ext_factor); const float cos_theta = cos(theta); const float sin_theta = sin(theta); - theta *= theta_scale; + theta_base *= theta_scale; + theta_ntk *= theta_ntk_scale; device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -650,6 +708,7 @@ kernel void kernel_rope( dst_data[1] = x0*sin_theta + x1*cos_theta; } } else { + theta_base *= freq_scale; // TODO: implement } } From a30ae2095c91adbb9e3d626c1bae234fcb43e669 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Mon, 4 Sep 2023 20:08:17 -0400 Subject: [PATCH 04/28] implement new YaRN algorithm --- examples/common.cpp | 64 +++++++++------ examples/common.h | 4 +- examples/server/server.cpp | 28 +++++-- ggml-cuda.cu | 81 +++++++++---------- ggml-metal.m | 22 +++--- ggml-metal.metal | 72 +++++++---------- ggml.c | 158 +++++++++++++++++-------------------- ggml.h | 16 ++-- llama.cpp | 111 +++++++++++++++----------- llama.h | 10 ++- 10 files changed, 300 insertions(+), 266 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 957022d0cb360..ef08c403c2eb8 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -194,18 +194,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = std::stof(argv[i]); - } else if (arg == "--rope-ntk-factor") { + } 
else if (arg == "--rope-ext-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_ntk_factor = std::stof(argv[i]); - } else if (arg == "--rope-ext-factor") { + params.rope_ext_factor = std::stof(argv[i]); + } else if (arg == "--rope-attn-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_ext_factor = std::stof(argv[i]); + params.rope_attn_factor = std::stof(argv[i]); + } else if (arg == "--rope-beta-fast") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_beta_fast = std::stof(argv[i]); + } else if (arg == "--rope-beta-slow") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_beta_slow = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -578,8 +590,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); - fprintf(stdout, " --rope-ntk-factor N RoPE NTK mix factor (default: %.1f)\n", params.rope_ntk_factor); fprintf(stdout, " --rope-ext-factor N RoPE extrapolation mix factor (default: %.1f)\n", params.rope_ext_factor); + fprintf(stdout, " --rope-attn-factor N RoPE magnitude scaling factor (default: %.1f)\n", params.rope_attn_factor); + fprintf(stdout, " --rope-beta-fast N RoPE low correction dim (default: %.1f)\n", params.rope_beta_fast); + fprintf(stdout, " --rope-beta-slow N RoPE high correction dim (default: %.1f)\n", params.rope_beta_slow); fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); fprintf(stdout, " --no-penalize-nl do not penalize newline token\n"); fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); @@ -654,25 +668,27 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto lparams = llama_context_default_params(); - lparams.n_ctx = params.n_ctx; - lparams.n_batch = params.n_batch; - lparams.n_gqa = params.n_gqa; - lparams.rms_norm_eps = params.rms_norm_eps; - lparams.n_gpu_layers = params.n_gpu_layers; - lparams.main_gpu = params.main_gpu; - lparams.tensor_split = params.tensor_split; - lparams.low_vram = params.low_vram; - lparams.mul_mat_q = params.mul_mat_q; - lparams.seed = params.seed; - lparams.f16_kv = params.memory_f16; - lparams.use_mmap = params.use_mmap; - lparams.use_mlock = params.use_mlock; - lparams.logits_all = params.perplexity; - lparams.embedding = params.embedding; - lparams.rope_freq_base = params.rope_freq_base; - lparams.rope_freq_scale = params.rope_freq_scale; - lparams.rope_ntk_factor = params.rope_ntk_factor; - lparams.rope_ext_factor = params.rope_ext_factor; + lparams.n_ctx = params.n_ctx; + lparams.n_batch = params.n_batch; + lparams.n_gqa = params.n_gqa; + lparams.rms_norm_eps = params.rms_norm_eps; + lparams.n_gpu_layers = params.n_gpu_layers; + lparams.main_gpu = params.main_gpu; + lparams.tensor_split = params.tensor_split; + lparams.low_vram = params.low_vram; + lparams.mul_mat_q = params.mul_mat_q; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.use_mmap = params.use_mmap; + lparams.use_mlock = 
params.use_mlock; + lparams.logits_all = params.perplexity; + lparams.embedding = params.embedding; + lparams.rope_freq_base = params.rope_freq_base; + lparams.rope_freq_scale = params.rope_freq_scale; + lparams.rope_ext_factor = params.rope_ext_factor; + lparams.rope_attn_factor = params.rope_attn_factor; + lparams.rope_beta_fast = params.rope_beta_fast; + lparams.rope_beta_slow = params.rope_beta_slow; return lparams; } diff --git a/examples/common.h b/examples/common.h index 677676ad131b5..8410b38a5f48b 100644 --- a/examples/common.h +++ b/examples/common.h @@ -32,8 +32,10 @@ struct gpt_params { float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon float rope_freq_base = 10000.0f; // RoPE base frequency float rope_freq_scale = 1.0f; // RoPE frequency scaling factor - float rope_ntk_factor = 0.0f; // RoPE NTK mix factor float rope_ext_factor = 0.0f; // RoPE extrapolation mix factor + float rope_attn_factor = 1.0f; // RoPE magnitude scaling factor + float rope_beta_fast = 32.0f; // RoPE low correction dim + float rope_beta_slow = 1.0f; // RoPE high correction dim // sampling parameters std::unordered_map logit_bias; // logit bias for specific tokens diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 49d2dd0508e0f..9721f269233be 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -612,8 +612,10 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps); fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); - fprintf(stdout, " --rope-ntk-factor N RoPE NTK mix factor (default: %.1f)\n", params.rope_ntk_factor); fprintf(stdout, " --rope-ext-factor N RoPE extrapolation mix factor (default: %.1f)\n", params.rope_ext_factor); + fprintf(stdout, " --rope-attn-factor N RoPE magnitude scaling factor (default: %.1f)\n", params.rope_attn_factor); + fprintf(stdout, " --rope-beta-fast N RoPE low correction dim (default: %.1f)\n", params.rope_beta_fast); + fprintf(stdout, " --rope-beta-slow N RoPE high correction dim (default: %.1f)\n", params.rope_beta_slow); fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); @@ -766,21 +768,37 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.rope_freq_scale = std::stof(argv[i]); } - else if (arg == "--rope-ntk-factor") + else if (arg == "--rope-ext-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_ntk_factor = std::stof(argv[i]); + params.rope_ext_factor = std::stof(argv[i]); } - else if (arg == "--rope-ext-factor") + else if (arg == "--rope-attn-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_ext_factor = std::stof(argv[i]); + params.rope_attn_factor = std::stof(argv[i]); + } + else if (arg == "--rope-beta-fast") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_beta_fast = std::stof(argv[i]); + } + else if (arg == "--rope-beta-slow") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_beta_slow = 
std::stof(argv[i]); } else if (arg == "--memory-f32" || arg == "--memory_f32") { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 91a6edca60011..dedb87efd2ffa 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -3558,34 +3558,31 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, cpy_1(cx + x_offset, cdst + dst_offset); } -static __device__ float rope_ntkv2_ramp(const float low, const float high, const int i0) { +static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) { const float y = (i0 / 2 - low) / min(0.001f, high - low); return 1.0f - min(1.0f, max(0.0f, y)); } -struct rope_corr_factors { +struct rope_corr_dims { float v[4]; }; -// NTKv2 algorithm based on LlamaPartNTKScaledRotaryEmbedding.py from https://github.com/jquesnelle/scaled-rope +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. -static __device__ float rope_ntkv2( - const float theta_base, - const float theta_linear, - const float theta_ntk, - const rope_corr_factors corr_factors, - const int64_t i0, - const float ntk_factor, - const float ext_factor) { - float ramp_mix; - float theta; - - ramp_mix = rope_ntkv2_ramp(corr_factors.v[0], corr_factors.v[1], i0) * ntk_factor; - theta = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; - - ramp_mix = rope_ntkv2_ramp(corr_factors.v[2], corr_factors.v[3], i0) * ext_factor; - theta = theta * (1 - ramp_mix) + theta_base * ramp_mix; - return theta; +static __device__ void rope_yarn( + float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; + float theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + if (freq_scale > 1.0f) + mscale *= 1.0f + 0.1f * logf(freq_scale); + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; } // rope == RoPE == rotary positional embedding @@ -3594,13 +3591,11 @@ static __global__ void rope_f32( float * dst, const int ncols, const float freq_scale, - const float ntk_factor, const float ext_factor, const float theta_scale, - const float theta_ntk_scale, const float p0, const int p_delta_rows, - const rope_corr_factors corr_factors) { + const rope_corr_dims corr_dims) { const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x); if (col >= ncols) { @@ -3612,11 +3607,9 @@ static __global__ void rope_f32( const float p = p0 + row / p_delta_rows; const float theta_base = p*powf(theta_scale, col/2); - const float theta_linear = freq_scale * theta_base; - const float theta_ntk = p*powf(theta_ntk_scale, col/2); - const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, col, ntk_factor, ext_factor); - const float sin_theta = sinf(theta); - const float cos_theta = cosf(theta); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); const float x0 = x[i + 0]; const float x1 = x[i + 1]; @@ -4284,20 +4277,19 @@ static void rope_f32_cuda( const int ncols, const int nrows, const float freq_scale, - const float ntk_factor, const float ext_factor, const float theta_scale, - const float theta_ntk_scale, const float p0, 
const int p_delta_rows, - const rope_corr_factors corr_factors, + const rope_corr_dims corr_dims, cudaStream_t stream) { GGML_ASSERT(nrows % 2 == 0); const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(num_blocks_x, nrows, 1); - rope_f32<<>>(x, dst, ncols, freq_scale, ntk_factor, ext_factor, theta_scale, - theta_ntk_scale, p0, p_delta_rows, corr_factors); + rope_f32<<>>( + x, dst, ncols, freq_scale, ext_factor, theta_scale, p0, p_delta_rows, corr_dims + ); } static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) { @@ -5000,11 +4992,13 @@ inline void ggml_cuda_op_rope( const int n_ctx = ((int32_t *) dst->op_params)[3]; // RoPE alteration for extended context - float freq_base, freq_scale, ntk_factor, ext_factor; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); const float theta_scale = powf(freq_base, -2.0f/n_dims); @@ -5018,12 +5012,13 @@ inline void ggml_cuda_op_rope( rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main); } else { const float p0 = (mode & 1) == 0 ? 
n_past : 0; - const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); - rope_corr_factors corr_factors; - ggml_rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors.v); + rope_corr_dims corr_dims; + ggml_rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims.v); - rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, freq_scale, ntk_factor, ext_factor, theta_scale, - theta_ntk_scale, p0, ne01, corr_factors, cudaStream_main); + rope_f32_cuda( + src0_ddf_i, dst_ddf_i, ne00, i01_diff, freq_scale, ext_factor, theta_scale, p0, ne01, corr_dims, + cudaStream_main + ); } (void) src1; diff --git a/ggml-metal.m b/ggml-metal.m index 372d3e696e9c3..0e8b0f9a958b3 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1035,11 +1035,13 @@ void ggml_metal_graph_compute( const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; - float freq_base, freq_scale, ntk_factor, ext_factor; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1063,10 +1065,12 @@ void ggml_metal_graph_compute( [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; [encoder setBytes:&mode length:sizeof( int) atIndex:20]; - [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; - [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; - [encoder setBytes:&ntk_factor length:sizeof(float) atIndex:23]; - [encoder setBytes:&ext_factor length:sizeof(float) atIndex:24]; + [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; + [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; + [encoder setBytes:&ext_factor length:sizeof(float) atIndex:23]; + [encoder setBytes:&attn_factor length:sizeof(float) atIndex:24]; + [encoder setBytes:&beta_fast length:sizeof(float) atIndex:25]; + [encoder setBytes:&beta_slow length:sizeof(float) atIndex:26]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 347fd17ac093e..f5a98d09278e3 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -597,53 +597,41 @@ kernel void kernel_alibi_f32( } } -static float rope_ntkv2_ramp(const float low, const float high, const int i0) { +static float rope_yarn_ramp(const float low, const float high, const int i0) { const float y = (i0 / 2 - low) / min(0.001f, high - low); return 1.0f - min(1.0f, max(0.0f, y)); } -// NTKv2 algorithm based on LlamaPartNTKScaledRotaryEmbedding.py from https://github.com/jquesnelle/scaled-rope +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn // MIT 
licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. -static float rope_ntkv2( - const float theta_base, - const float theta_linear, - const float theta_ntk, - const float corr_factors[4], - const int64_t i0, - const float ntk_factor, - const float ext_factor) { - float ramp_mix; - float theta; - - ramp_mix = rope_ntkv2_ramp(corr_factors[0], corr_factors[1], i0) * ntk_factor; - theta = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; - - ramp_mix = rope_ntkv2_ramp(corr_factors[2], corr_factors[3], i0) * ext_factor; - theta = theta * (1 - ramp_mix) + theta_base * ramp_mix; - return theta; +static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + float theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + if (freq_scale > 1.0f) + mscale *= 1.0f + 0.1f * logf(freq_scale); + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; } -// Interpolation constants found experimentally for LLaMA (might not be totally optimal though) -// Do not change unless there is a good reason for doing so! -constant float BETA_0 = 1.75f; -constant float BETA_1 = 1.25f; -constant float GAMMA_0 = 16.0f; -constant float GAMMA_1 = 2.0f; - constant float max_pos_emb = 2048; // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get // `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -static float rope_ntkv2_corr_factor(const int n_dims, const float n_rot, const float base) { +static float rope_yarn_corr_factor(const int n_dims, const float n_rot, const float base) { return n_dims * log(max_pos_emb / (n_rot * 2 * M_PI_F)) / (2 * log(base)); } -static void rope_ntkv2_corr_factors(int n_dims, const float freq_base, float factors[4]) { - // start and end correction factors - factors[0] = max(0.0f, floor(rope_ntkv2_corr_factor(n_dims, BETA_0, freq_base))); - factors[1] = min(n_dims - 1.0f, ceil(rope_ntkv2_corr_factor(n_dims, BETA_1, freq_base))); - factors[2] = max(0.0f, floor(rope_ntkv2_corr_factor(n_dims, GAMMA_0, freq_base))); - factors[3] = min(n_dims - 1.0f, ceil(rope_ntkv2_corr_factor(n_dims, GAMMA_1, freq_base))); +static void rope_yarn_corr_dims(int n_dims, const float freq_base, float beta_fast, float beta_slow, float dims[2]) { + // start and end correction dims + dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, beta_fast, freq_base))); + dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, beta_slow, freq_base))); } kernel void kernel_rope( @@ -670,33 +658,29 @@ kernel void kernel_rope( constant int & mode, constant float & freq_base, constant float & freq_scale, - constant float & ntk_factor, constant float & ext_factor, + constant float & attn_factor, + constant float & beta_fast, + constant float & beta_slow, uint3 tpig[[thread_position_in_grid]]) { const int64_t i3 = tpig[2]; const int64_t i2 = tpig[1]; const int64_t i1 = tpig[0]; const float theta_scale = pow(freq_base, -2.0f/n_dims); - const float theta_ntk_scale = pow(freq_base * pow(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); - float corr_factors[4]; - rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors); + float corr_dims[2]; + rope_yarn_corr_dims(n_dims, 
freq_base, beta_fast, beta_slow, corr_dims); float theta_base = (mode & 1) == 0 ? n_past + i2 : i2; - float theta_ntk = theta_base; const bool is_neox = mode & 2; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float theta_linear = freq_scale * theta_base; - const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, - i0, ntk_factor, ext_factor); - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); theta_base *= theta_scale; - theta_ntk *= theta_ntk_scale; device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); diff --git a/ggml.c b/ggml.c index 8a57391811dcc..8bf7f52e223c5 100644 --- a/ggml.c +++ b/ggml.c @@ -6712,8 +6712,10 @@ static struct ggml_tensor * ggml_rope_impl( int n_ctx, float freq_base, float freq_scale, - float ntk_factor, float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, bool inplace) { GGML_ASSERT(n_past >= 0); bool is_node = false; @@ -6724,11 +6726,13 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[8] = { n_past, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); - memcpy(params + 5, &freq_scale, sizeof(float)); - memcpy(params + 6, &ntk_factor, sizeof(float)); - memcpy(params + 7, &ext_factor, sizeof(float)); + int32_t params[10] = { n_past, n_dims, mode, n_ctx }; + memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 5, &freq_scale, sizeof(float)); + memcpy(params + 6, &ext_factor, sizeof(float)); + memcpy(params + 7, &attn_factor, sizeof(float)); + memcpy(params + 8, &beta_fast, sizeof(float)); + memcpy(params + 9, &beta_slow, sizeof(float)); ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE; @@ -6745,7 +6749,7 @@ struct ggml_tensor * ggml_rope( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 0.0f, false); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false); } struct ggml_tensor * ggml_rope_inplace( @@ -6755,7 +6759,7 @@ struct ggml_tensor * ggml_rope_inplace( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 0.0f, true); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true); } struct ggml_tensor * ggml_rope_custom( @@ -6767,9 +6771,13 @@ struct ggml_tensor * ggml_rope_custom( int n_ctx, float freq_base, float freq_scale, - float ntk_factor, - float ext_factor) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ntk_factor, ext_factor, false); + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return ggml_rope_impl( + ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, false + ); } struct ggml_tensor * ggml_rope_custom_inplace( @@ -6781,9 +6789,13 @@ struct ggml_tensor * ggml_rope_custom_inplace( int n_ctx, float freq_base, float freq_scale, - float ntk_factor, - float ext_factor) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 
ntk_factor, ext_factor, true); + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return ggml_rope_impl( + ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, true + ); } // ggml_rope_back @@ -12012,52 +12024,40 @@ static void ggml_compute_forward_clamp( // ggml_compute_forward_rope -static inline float rope_ntkv2_ramp(const float low, const float high, const int i0) { +static inline float rope_yarn_ramp(const float low, const float high, const int i0) { const float y = (i0 / 2 - low) / MIN(0.001f, high - low); return 1 - MIN(1, MAX(0, y)); } -// NTKv2 algorithm based on LlamaPartNTKScaledRotaryEmbedding.py from https://github.com/jquesnelle/scaled-rope +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. -static float rope_ntkv2( - const float theta_base, - const float theta_linear, - const float theta_ntk, - const float corr_factors[4], - const int64_t i0, - const float ntk_factor, - const float ext_factor) { - float ramp_mix; - float theta; - - ramp_mix = rope_ntkv2_ramp(corr_factors[0], corr_factors[1], i0) * ntk_factor; - theta = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; - - ramp_mix = rope_ntkv2_ramp(corr_factors[2], corr_factors[3], i0) * ext_factor; - theta = theta * (1 - ramp_mix) + theta_base * ramp_mix; - return theta; +static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + float theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + if (freq_scale > 1.0f) + mscale *= 1.0f + 0.1f * logf(freq_scale); + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; } // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get -// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -static float ggml_rope_ntkv2_corr_factor(const int n_dims, const float n_rot, const float base) { +// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float ggml_rope_yarn_corr_dim(const int n_dims, const float n_rot, const float base) { static const float max_pos_emb = 2048; return n_dims * logf(max_pos_emb / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); } -void ggml_rope_ntkv2_corr_factors(int n_dims, const float freq_base, float factors[4]) { - // Interpolation constants found experimentally for LLaMA (might not be totally optimal though) - // Do not change unless there is a good reason for doing so! 
- static const float BETA_0 = 1.75f; - static const float BETA_1 = 1.25f; - static const float GAMMA_0 = 16.0f; - static const float GAMMA_1 = 2.0f; - - // start and end correction factors - factors[0] = MAX(0, floorf(ggml_rope_ntkv2_corr_factor(n_dims, BETA_0, freq_base))); - factors[1] = MIN(n_dims - 1, ceilf(ggml_rope_ntkv2_corr_factor(n_dims, BETA_1, freq_base))); - factors[2] = MAX(0, floorf(ggml_rope_ntkv2_corr_factor(n_dims, GAMMA_0, freq_base))); - factors[3] = MIN(n_dims - 1, ceilf(ggml_rope_ntkv2_corr_factor(n_dims, GAMMA_1, freq_base))); +void ggml_rope_yarn_corr_dims(int n_dims, const float freq_base, float beta_fast, float beta_slow, float dims[2]) { + // start and end correction dims + dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, beta_fast, freq_base))); + dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, beta_slow, freq_base))); } static void ggml_compute_forward_rope_f32( @@ -12069,19 +12069,18 @@ static void ggml_compute_forward_rope_f32( return; } - float freq_base; - float freq_scale; - float ntk_factor; - float ext_factor; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); assert(n_past >= 0); @@ -12111,9 +12110,8 @@ static void ggml_compute_forward_rope_f32( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); - const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); - float corr_factors[4]; - ggml_rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors); + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12126,7 +12124,6 @@ static void ggml_compute_forward_rope_f32( if (ir > ir1) break; float theta_base = (float)p; - float theta_ntk = theta_base; if (is_glm) { theta_base = MIN(p, n_ctx - 2); @@ -12155,14 +12152,12 @@ static void ggml_compute_forward_rope_f32( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float theta_linear = freq_scale * theta_base; - const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, - i0, ntk_factor, ext_factor); - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta + ); theta_base *= theta_scale; - theta_ntk *= theta_ntk_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -12211,19 
+12206,18 @@ static void ggml_compute_forward_rope_f16( return; } - float freq_base; - float freq_scale; - float ntk_factor; - float ext_factor; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); assert(n_past >= 0); @@ -12253,9 +12247,8 @@ static void ggml_compute_forward_rope_f16( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); - const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); - float corr_factors[4]; - ggml_rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors); + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12268,7 +12261,6 @@ static void ggml_compute_forward_rope_f16( if (ir > ir1) break; float theta_base = (float)p; - float theta_ntk = theta_base; if (is_glm) { theta_base = MIN(p, n_ctx - 2); @@ -12297,14 +12289,12 @@ static void ggml_compute_forward_rope_f16( } } if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float theta_linear = freq_scale * theta_base; - const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, - i0, ntk_factor, ext_factor); - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta + ); theta_base *= theta_scale; - theta_ntk *= theta_ntk_scale; const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); diff --git a/ggml.h b/ggml.h index c2c6b7b1d376c..06e1cbf7b7312 100644 --- a/ggml.h +++ b/ggml.h @@ -1195,8 +1195,10 @@ extern "C" { int n_ctx, float freq_base, float freq_scale, - float ntk_factor, - float ext_factor); + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_rope_custom_inplace( @@ -1208,11 +1210,13 @@ extern "C" { int n_ctx, float freq_base, float freq_scale, - float ntk_factor, - float ext_factor); + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); - // compute correction factors for NTKv2 RoPE scaling - void ggml_rope_ntkv2_corr_factors(int n_dims, const float freq_base, float factors[4]); + // compute correction dims for YaRN RoPE scaling + void ggml_rope_yarn_corr_dims(int n_dims, const float freq_base, float beta_fast, float beta_slow, float dims[2]); // rotary position embedding 
backward, i.e compute dx from dy // a - dy diff --git a/llama.cpp b/llama.cpp index 945215de8a31e..3ea0650827357 100644 --- a/llama.cpp +++ b/llama.cpp @@ -194,10 +194,12 @@ struct llama_hparams { float f_ffn_mult = 1.0f; float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; - float rope_freq_base = 10000.0f; - float rope_freq_scale = 1.0f; - float rope_ntk_factor = 0.0f; - float rope_ext_factor = 0.0f; + float rope_freq_base = 10000.0f; + float rope_freq_scale = 1.0f; + float rope_ext_factor = 0.0f; + float rope_attn_factor = 1.0f; + float rope_beta_fast = 0.0f; + float rope_beta_slow = 0.0f; enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; @@ -900,8 +902,10 @@ struct llama_context_params llama_context_default_params() { /*.tensor_split =*/ nullptr, /*.rope_freq_base =*/ 10000.0f, /*.rope_freq_scale =*/ 1.0f, - /*.rope_ntk_factor =*/ 0.0f, /*.rope_ext_factor =*/ 0.0f, + /*.rope_attn_factor =*/ 1.0f, + /*.rope_beta_fast =*/ 32.0f, + /*.rope_beta_slow =*/ 1.0f, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, @@ -1036,8 +1040,10 @@ static void llama_model_load_internal( const bool mul_mat_q, float rope_freq_base, float rope_freq_scale, - float rope_ntk_factor, float rope_ext_factor, + float rope_attn_factor, + float rope_beta_fast, + float rope_beta_slow, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -1087,10 +1093,12 @@ static void llama_model_load_internal( hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model } - hparams.rope_freq_base = rope_freq_base; - hparams.rope_freq_scale = rope_freq_scale; - hparams.rope_ntk_factor = rope_ntk_factor; - hparams.rope_ext_factor = rope_ext_factor; + hparams.rope_freq_base = rope_freq_base; + hparams.rope_freq_scale = rope_freq_scale; + hparams.rope_ext_factor = rope_ext_factor; + hparams.rope_attn_factor = rope_attn_factor; + hparams.rope_beta_fast = rope_beta_fast; + hparams.rope_beta_slow = rope_beta_slow; } // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199 @@ -1100,24 +1108,26 @@ static void llama_model_load_internal( //const uint32_t n_ff = 28672; { - fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); - fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); - fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); - fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); - fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); - fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); - fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); - fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); - fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. 
n_embd_head, n_head_dim - fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa()); - fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps); - fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); - fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); - fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); - fprintf(stderr, "%s: ntk_factor = %g\n", __func__, hparams.rope_ntk_factor); - fprintf(stderr, "%s: ext_factor = %g\n", __func__, hparams.rope_ext_factor); - fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); - fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); + fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); + fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); + fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); + fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); + fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); + fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); + fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim + fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa()); + fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps); + fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); + fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); + fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); + fprintf(stderr, "%s: ext_factor = %g\n", __func__, hparams.rope_ext_factor); + fprintf(stderr, "%s: attn_factor = %g\n", __func__, hparams.rope_attn_factor); + fprintf(stderr, "%s: beta_fast = %g\n", __func__, hparams.rope_beta_fast); + fprintf(stderr, "%s: beta_slow = %g\n", __func__, hparams.rope_beta_slow); + fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); + fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } if (file_version < LLAMA_FILE_VERSION_GGJT_V2) { @@ -1384,8 +1394,10 @@ static bool llama_model_load( const bool mul_mat_q, float rope_freq_base, float rope_freq_scale, - float rope_ntk_factor, float rope_ext_factor, + float rope_attn_factor, + float rope_beta_fast, + float rope_beta_slow, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -1394,10 +1406,11 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, - tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, rope_ntk_factor, - rope_ext_factor, low_vram, memory_type, use_mmap, use_mlock, vocab_only, - progress_callback, progress_callback_user_data); + llama_model_load_internal( + fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, mul_mat_q, + rope_freq_base, rope_freq_scale, rope_ext_factor, rope_attn_factor, rope_beta_fast, rope_beta_slow, + low_vram, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data + ); return true; } catch (const std::exception & err) { fprintf(stderr, "error loading model: %s\n", err.what()); @@ -1433,10 +1446,12 @@ static struct 
ggml_cgraph * llama_build_graph( LLAMA_ASSERT(n_embd_head == hparams.n_rot); - const float freq_base = hparams.rope_freq_base; - const float freq_scale = hparams.rope_freq_scale; - const float ntk_factor = hparams.rope_ntk_factor; - const float ext_factor = hparams.rope_ext_factor; + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + const float ext_factor = hparams.rope_ext_factor; + const float attn_factor = hparams.rope_attn_factor; + const float beta_fast = hparams.rope_beta_fast; + const float beta_slow = hparams.rope_beta_slow; const float rms_norm_eps = hparams.f_rms_norm_eps; const int n_gpu_layers = model.n_gpu_layers; @@ -1567,14 +1582,16 @@ static struct ggml_cgraph * llama_build_graph( ggml_set_name(tmpq, "tmpq"); struct ggml_tensor * Kcur = ggml_rope_custom_inplace( - ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, - freq_scale, ntk_factor, ext_factor); + ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, + freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); struct ggml_tensor * Qcur = ggml_rope_custom_inplace( - ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, - freq_scale, ntk_factor, ext_factor); + ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, + freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -3216,11 +3233,13 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, - params.rms_norm_eps, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.mul_mat_q, - params.rope_freq_base, params.rope_freq_scale, params.rope_ntk_factor, params.rope_ext_factor, - params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, - params.progress_callback, params.progress_callback_user_data)) { + if (!llama_model_load( + path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, + params.n_gpu_layers, params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, + params.rope_freq_scale, params.rope_ext_factor, params.rope_attn_factor, params.rope_beta_fast, + params.rope_beta_slow, params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, + params.progress_callback, params.progress_callback_user_data + )) { delete model; fprintf(stderr, "%s: failed to load model\n", __func__); return nullptr; diff --git a/llama.h b/llama.h index 25bb3952a73f8..66c78c7619dd5 100644 --- a/llama.h +++ b/llama.h @@ -98,10 +98,12 @@ extern "C" { const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) // ref: https://github.com/ggerganov/llama.cpp/pull/2054 - float rope_freq_base; // RoPE base frequency - float rope_freq_scale; // RoPE frequency scaling factor - float rope_ntk_factor; // RoPE NTK mix factor - float rope_ext_factor; // RoPE extrapolation mix factor + float rope_freq_base; // RoPE base frequency + float rope_freq_scale; // RoPE frequency scaling factor + float rope_ext_factor; // RoPE extrapolation mix factor + float rope_attn_factor; // RoPE magnitude scaling factor + float rope_beta_fast; // RoPE low 
correction dim
+    float rope_beta_slow;   // RoPE high correction dim
 
     // called with a progress value between 0 and 1, pass NULL to disable
     llama_progress_callback progress_callback;

From 826269adc52894d4c44165cf1437fd8d1585c653 Mon Sep 17 00:00:00 2001
From: Cebtenzzre
Date: Mon, 4 Sep 2023 22:15:59 -0400
Subject: [PATCH 05/28] ggml : increase GGML_MAX_OP_PARAMS

---
 ggml.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml.h b/ggml.h
index 1d52f8f048a3d..7084687cf8caa 100644
--- a/ggml.h
+++ b/ggml.h
@@ -211,7 +211,7 @@
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
 #define GGML_MAX_NAME          64
-#define GGML_MAX_OP_PARAMS     32
+#define GGML_MAX_OP_PARAMS     48
 #define GGML_DEFAULT_N_THREADS 4
 
 #if UINTPTR_MAX == 0xFFFFFFFF

From cf731d56480b8f155cc163d9bd45b681c80fba47 Mon Sep 17 00:00:00 2001
From: Cebtenzzre
Date: Tue, 5 Sep 2023 14:14:05 -0400
Subject: [PATCH 06/28] YaRN : avoid NaN if unused betas are zero

---
 ggml-cuda.cu     | 7 +++++--
 ggml-metal.metal | 7 +++++--
 ggml.c           | 7 +++++--
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 73a3399c539aa..c649e90a13060 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -4058,8 +4058,11 @@ static __device__ void rope_yarn(
 ) {
     // Get n-d rotational scaling corrected for extrapolation
     float theta_interp = freq_scale * theta_extrap;
-    float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
-    float theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+    }
 
     // Get n-d magnitude scaling corrected for interpolation
     if (freq_scale > 1.0f)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 6b0194d51e4c2..a1eb2d0d8c677 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -688,8 +688,11 @@ static void rope_yarn(
 ) {
     // Get n-d rotational scaling corrected for extrapolation
     float theta_interp = freq_scale * theta_extrap;
-    float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-    float theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+    }
 
     // Get n-d magnitude scaling corrected for interpolation
     if (freq_scale > 1.0f)
diff --git a/ggml.c b/ggml.c
index 94a47faa166eb..85316a3a16812 100644
--- a/ggml.c
+++ b/ggml.c
@@ -12626,8 +12626,11 @@ static void rope_yarn(
 ) {
     // Get n-d rotational scaling corrected for extrapolation
     float theta_interp = freq_scale * theta_extrap;
-    float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-    float theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+    }
 
     // Get n-d magnitude scaling corrected for interpolation
     if (freq_scale > 1.0f)

From dcb058ce5dbed14cce4a4e3ee628a5df7802d71a Mon Sep 17 00:00:00 2001
From: Cebtenzzre
Date: Tue, 5 Sep 2023 14:17:50 -0400
Subject: [PATCH 07/28] YaRN : fix missing parameter in CUDA impl

---
 ggml-cuda.cu | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index
c649e90a13060..87ad07b0bc874 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4073,8 +4073,8 @@ static __device__ void rope_yarn( // rope == RoPE == rotary positional embedding static __global__ void rope_f32( - const float * x, float * dst, const int ncols, const float freq_scale, const float ext_factor, - const float theta_scale, const float p0, const int p_delta_rows, const rope_corr_dims corr_dims + float * x, float * dst, int ncols, float freq_scale, float ext_factor, float attn_factor, float theta_scale, + float p0, int p_delta_rows, rope_corr_dims corr_dims ) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); @@ -4086,7 +4086,7 @@ static __global__ void rope_f32( const int i = row*ncols + col; const float p = p0 + row / p_delta_rows; - const float theta_base = p*powf(theta_scale, col/2); + const float theta_base = p*powf(theta_scale, col/2); float cos_theta, sin_theta; rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); @@ -5001,15 +5001,15 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons } static void rope_f32_cuda( - const float * x, float * dst, const int ncols, const int nrows, const float freq_scale, const float ext_factor, - const float theta_scale, const float p0, const int p_delta_rows, const rope_corr_dims corr_dims, cudaStream_t stream + float * x, float * dst, int ncols, int nrows, float freq_scale, float ext_factor, float attn_factor, + float theta_scale, float p0, int p_delta_rows, rope_corr_dims corr_dims, cudaStream_t stream ) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); rope_f32<<>>( - x, dst, ncols, freq_scale, ext_factor, theta_scale, p0, p_delta_rows, corr_dims + x, dst, ncols, freq_scale, ext_factor, attn_factor, theta_scale, p0, p_delta_rows, corr_dims ); } @@ -5785,8 +5785,8 @@ inline void ggml_cuda_op_rope( ggml_rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims.v); rope_f32_cuda( - src0_ddf_i, dst_ddf_i, ne00, i01_diff, freq_scale, ext_factor, theta_scale, p0, ne01, corr_dims, - cudaStream_main + src0_ddf_i, dst_ddf_i, ne00, i01_diff, freq_scale, ext_factor, attn_factor, theta_scale, p0, ne01, + corr_dims, cudaStream_main ); } From 281b26e647cc4f6a8f53fed504a5a490c79cd0cd Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Wed, 6 Sep 2023 13:00:04 -0400 Subject: [PATCH 08/28] convert : reduce unnecessary variables in Params --- convert.py | 63 ++++++++++++++++-------------------------------------- 1 file changed, 18 insertions(+), 45 deletions(-) diff --git a/convert.py b/convert.py index 5a7483b43d563..5a05a90de3ec4 100755 --- a/convert.py +++ b/convert.py @@ -210,23 +210,12 @@ def guessed(model: LazyModel) -> Params: def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_vocab = config["vocab_size"] - n_embd = config["hidden_size"] - n_layer = config["num_hidden_layers"] - n_ff = config["intermediate_size"] - n_head = config["num_attention_heads"] - n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head - f_norm_eps = config["rms_norm_eps"] - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None - rope_scaling = config.get("rope_scaling") if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear": f_rope_scale = config["rope_scaling"].get("factor") 
else: f_rope_scale = None - n_mult = Params.find_n_mult(n_ff, n_embd) - if "max_sequence_length" in config: n_ctx = config["max_sequence_length"] elif "max_position_embeddings" in config: @@ -236,16 +225,16 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: "Suggestion: provide 'config.json' of the model in the same directory containing model files.") return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_mult = n_mult, - n_layer = n_layer, + n_vocab = config["vocab_size"], + n_embd = config["hidden_size"], + n_mult = Params.find_n_mult(n_ff, n_embd), + n_layer = config["num_hidden_layers"], n_ctx = n_ctx, - n_ff = n_ff, - n_head = n_head, - n_head_kv = n_head_kv, - f_norm_eps = f_norm_eps, - f_rope_freq_base = f_rope_freq_base, + n_ff = config["intermediate_size"], + n_head = config["num_attention_heads"], + n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head, + f_norm_eps = config["rms_norm_eps"], + f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None, f_rope_scale = f_rope_scale, ) @@ -255,16 +244,6 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_vocab = config["vocab_size"] if "vocab_size" in config else -1 - n_embd = config["dim"] - n_layer = config["n_layers"] - n_mult = config["multiple_of"] - n_ff = -1 - n_head = config["n_heads"] - n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head - f_norm_eps = config["norm_eps"] - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None - # hack to determine LLaMA v1 vs v2 vs CodeLlama if f_rope_freq_base and f_rope_freq_base == 1000000: # CodeLlama @@ -276,23 +255,17 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: # LLaMA v1 n_ctx = 2048 - if n_vocab == -1: - n_vocab = model["tok_embeddings.weight"].shape[0] - - if n_ff == -1: - n_ff = model["layers.0.feed_forward.w1.weight"].shape[0] - return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_mult = n_mult, - n_layer = n_layer, + n_vocab = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]), + n_embd = config["dim"], + n_mult = config["multiple_of"], + n_layer = config["n_layers"], n_ctx = n_ctx, - n_ff = n_ff, - n_head = n_head, - n_head_kv = n_head_kv, - f_norm_eps = f_norm_eps, - f_rope_freq_base = f_rope_freq_base, + n_ff = model["layers.0.feed_forward.w1.weight"].shape[0], + n_head = config["n_heads"], + n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head, + f_norm_eps = config["norm_eps"], + f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None, ) @staticmethod From dc26a0dd326cbdcdfd8b2e66536d06fe60189eeb Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Wed, 20 Sep 2023 21:33:33 -0400 Subject: [PATCH 09/28] llama : simplify use of context params --- llama.cpp | 95 +++++++++++++++++-------------------------------------- 1 file changed, 29 insertions(+), 66 deletions(-) diff --git a/llama.cpp b/llama.cpp index 3c4d1ef5e5df5..87aea24682087 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1650,22 +1650,21 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) { } } -static void llm_load_hparams( - llama_model_loader & ml, - llama_model & model, - int n_ctx, - float rope_freq_base, - float rope_freq_scale, - float rope_ext_factor, - float rope_attn_factor, - float rope_beta_fast, - float rope_beta_slow) { +static void 
llm_load_hparams(llama_model_loader & ml, llama_model & model, const llama_context_params & params) { struct gguf_context * ctx = ml.ctx_gguf; const auto kv = LLM_KV(model.arch); auto & hparams = model.hparams; + hparams.n_ctx = params.n_ctx; + hparams.rope_freq_base = params.rope_freq_base; + hparams.rope_freq_scale = params.rope_freq_scale; + hparams.rope_ext_factor = params.rope_ext_factor; + hparams.rope_attn_factor = params.rope_attn_factor; + hparams.rope_beta_fast = params.rope_beta_fast; + hparams.rope_beta_slow = params.rope_beta_slow; + // get general kv GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME)); @@ -1682,16 +1681,17 @@ static void llm_load_hparams( GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); // rope_freq_base (optional) - if (rope_freq_base == 0.0f) { - rope_freq_base = 10000.0f; + if (hparams.rope_freq_base == 0.0f) { + float rope_freq_base = 10000.0f; GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + hparams.rope_freq_base = rope_freq_base; } // rope_freq_scale (inverse of the kv) is optional - if (rope_freq_scale == 0.0f) { + if (hparams.rope_freq_scale == 0.0f) { float ropescale = 1.0f; GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - rope_freq_scale = 1.0f/ropescale; + hparams.rope_freq_scale = 1.0f/ropescale; } // sanity check for n_rot (optional) @@ -1759,14 +1759,6 @@ static void llm_load_hparams( }; model.ftype = ml.ftype; - - hparams.n_ctx = n_ctx; - hparams.rope_freq_base = rope_freq_base; - hparams.rope_freq_scale = rope_freq_scale; - hparams.rope_ext_factor = rope_ext_factor; - hparams.rope_attn_factor = rope_attn_factor; - hparams.rope_beta_fast = rope_beta_fast; - hparams.rope_beta_slow = rope_beta_slow; } // TODO: This should probably be in llama.h @@ -2388,37 +2380,13 @@ static void llm_load_tensors( model.t_load_us = ggml_time_us() - model.t_start_us; } -static bool llama_model_load( - const std::string & fname, - llama_model & model, - int n_ctx, - int n_batch, - int n_gpu_layers, - int main_gpu, - const float * tensor_split, - const bool mul_mat_q, - float rope_freq_base, - float rope_freq_scale, - float rope_ext_factor, - float rope_attn_factor, - float rope_beta_fast, - float rope_beta_slow, - bool low_vram, - ggml_type memory_type, - bool use_mmap, - bool use_mlock, - bool vocab_only, - llama_progress_callback progress_callback, - void *progress_callback_user_data) { +static bool llama_model_load(const std::string & fname, llama_model & model, const llama_context_params & params) { try { - std::unique_ptr ml(new llama_model_loader(fname, use_mmap)); + std::unique_ptr ml(new llama_model_loader(fname, params.use_mmap)); - llm_load_arch(*ml, model); - llm_load_hparams( - *ml, model, n_ctx, rope_freq_base, rope_freq_scale, rope_ext_factor, rope_attn_factor, rope_beta_fast, - rope_beta_slow - ); - llm_load_vocab(*ml, model); + llm_load_arch (*ml, model); + llm_load_hparams(*ml, model, params); + llm_load_vocab (*ml, model); llm_load_print_meta(*ml, model); @@ -2426,15 +2394,18 @@ static bool llama_model_load( throw std::runtime_error("vocab size mismatch"); } - if (vocab_only) { + if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); return true; } + ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; + llm_load_tensors( - *ml, model, n_batch, n_gpu_layers, - main_gpu, tensor_split, mul_mat_q, low_vram, memory_type, - use_mlock, progress_callback, progress_callback_user_data); + *ml, model, params.n_batch, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.mul_mat_q, + params.low_vram, memory_type, params.use_mlock, params.progress_callback, + params.progress_callback_user_data + ); } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); return false; @@ -5694,8 +5665,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::unique_ptr ml(new llama_model_loader(fname_inp, /*use_mmap*/ false)); llama_model model; - llm_load_arch(*ml, model); - llm_load_hparams(*ml, model, 0, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f); + llm_load_arch (*ml, model); + llm_load_hparams(*ml, model, llama_context_default_params()); if (params->only_copy) { ftype = model.ftype; @@ -6298,8 +6269,6 @@ struct llama_model * llama_load_model_from_file( llama_model * model = new llama_model; - ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - unsigned cur_percentage = 0; if (params.progress_callback == NULL) { params.progress_callback_user_data = &cur_percentage; @@ -6316,13 +6285,7 @@ struct llama_model * llama_load_model_from_file( }; } - if (!llama_model_load( - path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu, params.tensor_split, - params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale, params.rope_ext_factor, - params.rope_attn_factor, params.rope_beta_fast, params.rope_beta_slow, params.low_vram, memory_type, - params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, - params.progress_callback_user_data - )) { + if (!llama_model_load(path_model, *model, params)) { LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); delete model; return nullptr; From 904d4edfa1c3ad79e32e48365168bbe0e5bc36f2 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Thu, 14 Sep 2023 13:26:10 -0400 Subject: [PATCH 10/28] llama : store YaRN parameters in GGUF --- common/common.cpp | 73 +++++++++++++--------- common/common.h | 10 +-- convert.py | 53 +++++++++++----- examples/server/server.cpp | 39 ++++++++---- gguf-py/gguf/gguf.py | 27 ++++++-- llama.cpp | 124 ++++++++++++++++++++++++++----------- llama.h | 34 ++++++---- 7 files changed, 245 insertions(+), 115 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 9e4452dabd021..ca4b9c1cc8a16 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -192,36 +192,46 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--rope-scaling") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = 1.0f/std::stof(argv[i]); - } else if (arg == "--rope-ext-factor") { + } else if (arg == "--yarn-ext-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_ext_factor = std::stof(argv[i]); - } else if (arg == "--rope-attn-factor") { + 
params.yarn_ext_factor = std::stof(argv[i]); + } else if (arg == "--yarn-attn-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_attn_factor = std::stof(argv[i]); - } else if (arg == "--rope-beta-fast") { + params.yarn_attn_factor = std::stof(argv[i]); + } else if (arg == "--yarn-beta-fast") { if (++i >= argc) { invalid_param = true; break; } - params.rope_beta_fast = std::stof(argv[i]); - } else if (arg == "--rope-beta-slow") { + params.yarn_beta_fast = std::stof(argv[i]); + } else if (arg == "--yarn-beta-slow") { if (++i >= argc) { invalid_param = true; break; } - params.rope_beta_slow = std::stof(argv[i]); + params.yarn_beta_slow = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -671,13 +681,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --cfg-negative-prompt-file FNAME\n"); printf(" negative prompt file to use for guidance. (default: empty)\n"); printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); - printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n"); + printf(" --rope-scaling {none,linear,yarn}\n"); + printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); + printf(" --rope-scale N RoPE context scaling factor, inverse of --rope-freq-scale\n"); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n"); - printf(" --rope-ext-factor N RoPE extrapolation mix factor (default: %.1f)\n", params.rope_ext_factor); - printf(" --rope-attn-factor N RoPE magnitude scaling factor (default: %.1f)\n", params.rope_attn_factor); - printf(" --rope-beta-fast N RoPE low correction dim (default: %.1f)\n", params.rope_beta_fast); - printf(" --rope-beta-slow N RoPE high correction dim (default: %.1f)\n", params.rope_beta_slow); + printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); + printf(" --yarn-ext-factor N YaRN extrapolation mix factor (default: %.1f)\n", params.yarn_ext_factor); + printf(" --yarn-attn-factor N YaRN magnitude scaling factor (default: %.1f)\n", params.yarn_attn_factor); + printf(" --yarn-beta-fast N YaRN low correction dim (default: %.1f)\n", params.yarn_beta_fast); + printf(" --yarn-beta-slow N YaRN high correction dim (default: %.1f)\n", params.yarn_beta_slow); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); @@ -758,22 +770,23 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param if (params.n_gpu_layers != -1) { lparams.n_gpu_layers = params.n_gpu_layers; } - lparams.main_gpu = params.main_gpu; - lparams.tensor_split = params.tensor_split; - lparams.low_vram = params.low_vram; - lparams.mul_mat_q = params.mul_mat_q; - lparams.seed = params.seed; - lparams.f16_kv = params.memory_f16; - lparams.use_mmap = params.use_mmap; - lparams.use_mlock = params.use_mlock; - lparams.logits_all = params.perplexity; - lparams.embedding = params.embedding; - lparams.rope_freq_base = params.rope_freq_base; - lparams.rope_freq_scale = params.rope_freq_scale; - lparams.rope_ext_factor = params.rope_ext_factor; - 
lparams.rope_attn_factor = params.rope_attn_factor;
-    lparams.rope_beta_fast = params.rope_beta_fast;
-    lparams.rope_beta_slow = params.rope_beta_slow;
+    lparams.main_gpu          = params.main_gpu;
+    lparams.tensor_split      = params.tensor_split;
+    lparams.low_vram          = params.low_vram;
+    lparams.mul_mat_q         = params.mul_mat_q;
+    lparams.seed              = params.seed;
+    lparams.f16_kv            = params.memory_f16;
+    lparams.use_mmap          = params.use_mmap;
+    lparams.use_mlock         = params.use_mlock;
+    lparams.logits_all        = params.perplexity;
+    lparams.embedding         = params.embedding;
+    lparams.rope_scaling_type = params.rope_scaling_type;
+    lparams.rope_freq_base    = params.rope_freq_base;
+    lparams.rope_freq_scale   = params.rope_freq_scale;
+    lparams.yarn_ext_factor   = params.yarn_ext_factor;
+    lparams.yarn_attn_factor  = params.yarn_attn_factor;
+    lparams.yarn_beta_fast    = params.yarn_beta_fast;
+    lparams.yarn_beta_slow    = params.yarn_beta_slow;
 
     return lparams;
 }
diff --git a/common/common.h b/common/common.h
index 0b45b4278d73c..a1e7da128c113 100644
--- a/common/common.h
+++ b/common/common.h
@@ -50,10 +50,12 @@ struct gpt_params {
     int32_t n_beams = 0; // if non-zero then use beam search of given width.
     float   rope_freq_base  = 10000.0f; // RoPE base frequency
     float   rope_freq_scale = 1.0f;     // RoPE frequency scaling factor
-    float   rope_ext_factor  = 0.0f;  // RoPE extrapolation mix factor
-    float   rope_attn_factor = 1.0f;  // RoPE magnitude scaling factor
-    float   rope_beta_fast   = 32.0f; // RoPE low correction dim
-    float   rope_beta_slow   = 1.0f;  // RoPE high correction dim
+    float   yarn_ext_factor  = 0.0f;  // YaRN extrapolation mix factor
+    float   yarn_attn_factor = 1.0f;  // YaRN magnitude scaling factor
+    float   yarn_beta_fast   = 32.0f; // YaRN low correction dim
+    float   yarn_beta_slow   = 1.0f;  // YaRN high correction dim
+
+    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
 
     // sampling parameters
     int32_t top_k = 40; // <= 0 to use vocab size
diff --git a/convert.py b/convert.py
index 649624cff7e6d..f08cf01c5786e 100755
--- a/convert.py
+++ b/convert.py
@@ -152,8 +152,11 @@ class Params:
     n_head_kv:  int
     f_norm_eps: float
 
+    rope_scaling_type: gguf.RopeScalingType | None = None
     f_rope_freq_base: float | None = None
     f_rope_scale: float | None = None
+    n_orig_ctx: int | None = None
+    rope_finetuned: bool | None = None
 
     ftype: GGMLFileType | None = None
 
@@ -199,11 +202,20 @@ def guessed(model: LazyModel) -> Params:
     def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
         config = json.load(open(config_path))
 
+        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
         rope_scaling = config.get("rope_scaling")
-        if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
-            f_rope_scale = config["rope_scaling"].get("factor")
-        else:
-            f_rope_scale = None
+
+        if rope_scaling is not None and (typ := rope_scaling.get("type")):
+            rope_factor = rope_scaling.get("factor")
+            f_rope_scale = rope_factor
+            if typ == "linear":
+                rope_scaling_type = gguf.RopeScalingType.LINEAR
+            elif typ == "yarn":
+                rope_scaling_type = gguf.RopeScalingType.YARN
+                n_orig_ctx = rope_scaling['original_max_position_embeddings']
+                rope_finetuned = rope_scaling['finetuned']
+            else:
+                raise NotImplementedError(f'Unknown rope scaling type: {typ}')
 
         if "max_sequence_length" in config:
             n_ctx = config["max_sequence_length"]
@@ -214,16 +226,18 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
                             "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
 
         return Params(
-            n_vocab          =
config["vocab_size"],
-            n_embd           = config["hidden_size"],
-            n_layer          = config["num_hidden_layers"],
-            n_ctx            = n_ctx,
-            n_ff             = config["intermediate_size"],
-            n_head           = config["num_attention_heads"],
-            n_head_kv        = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head,
-            f_norm_eps       = config["rms_norm_eps"],
-            f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None,
-            f_rope_scale     = f_rope_scale,
+            n_vocab          = config["vocab_size"],
+            n_embd           = config["hidden_size"],
+            n_layer          = config["num_hidden_layers"],
+            n_ctx            = n_ctx,
+            n_ff             = config["intermediate_size"],
+            n_head           = config["num_attention_heads"],
+            n_head_kv        = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head,
+            f_norm_eps       = config["rms_norm_eps"],
+            f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None,
+            f_rope_scale     = f_rope_scale,
+            n_orig_ctx       = n_orig_ctx,
+            rope_finetuned   = rope_finetuned,
         )
 
     # LLaMA v2 70B params.json
@@ -819,8 +833,15 @@ def add_meta_arch(self, params: Params) -> None:
         if params.f_rope_freq_base is not None:
             self.gguf.add_rope_freq_base(params.f_rope_freq_base)
 
-        if params.f_rope_scale is not None:
-            self.gguf.add_rope_scale_linear(params.f_rope_scale)
+        if params.rope_scaling_type:
+            self.gguf.add_rope_scaling_type(params.rope_scaling_type)
+            self.gguf.add_rope_scaling_factor(params.f_rope_scale)
+
+        if params.n_orig_ctx is not None:
+            self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
+
+        if params.rope_finetuned is not None:
+            self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
 
         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0fb4e2c324b55..3a1c55b1c939c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -701,12 +701,14 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  -v, --verbose        verbose output (default: %s)\n", server_verbose ?
"enabled" : "disabled"); printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" --rope-scaling {none,linear,yarn}\n"); + printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); - printf(" --rope-ext-factor N RoPE extrapolation mix factor (default: %.1f)\n", params.rope_ext_factor); - printf(" --rope-attn-factor N RoPE magnitude scaling factor (default: %.1f)\n", params.rope_attn_factor); - printf(" --rope-beta-fast N RoPE low correction dim (default: %.1f)\n", params.rope_beta_fast); - printf(" --rope-beta-slow N RoPE high correction dim (default: %.1f)\n", params.rope_beta_slow); + printf(" --yarn-ext-factor N YaRN extrapolation mix factor (default: %.1f)\n", params.yarn_ext_factor); + printf(" --yarn-attn-factor N YaRN magnitude scaling factor (default: %.1f)\n", params.yarn_attn_factor); + printf(" --yarn-beta-fast N YaRN low correction dim (default: %.1f)\n", params.yarn_beta_fast); + printf(" --yarn-beta-slow N YaRN high correction dim (default: %.1f)\n", params.yarn_beta_slow); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); @@ -824,6 +826,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_ctx = std::stoi(argv[i]); } + else if (arg == "--rope-scaling") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + else { invalid_param = true; break; } + } else if (arg == "--rope-freq-base") { if (++i >= argc) @@ -842,37 +857,37 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.rope_freq_scale = std::stof(argv[i]); } - else if (arg == "--rope-ext-factor") + else if (arg == "--yarn-ext-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_ext_factor = std::stof(argv[i]); + params.yarn_ext_factor = std::stof(argv[i]); } - else if (arg == "--rope-attn-factor") + else if (arg == "--yarn-attn-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_attn_factor = std::stof(argv[i]); + params.yarn_attn_factor = std::stof(argv[i]); } - else if (arg == "--rope-beta-fast") + else if (arg == "--yarn-beta-fast") { if (++i >= argc) { invalid_param = true; break; } - params.rope_beta_fast = std::stof(argv[i]); + params.yarn_beta_fast = std::stof(argv[i]); } - else if (arg == "--rope-beta-slow") + else if (arg == "--yarn-beta-slow") { if (++i >= argc) { invalid_param = true; break; } - params.rope_beta_slow = std::stof(argv[i]); + params.yarn_beta_slow = std::stof(argv[i]); } else if (arg == "--memory-f32" || arg == "--memory_f32") { diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index e0e0dbcbbe840..742ce214356e2 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -52,9 
+52,12 @@ KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" # RoPE -KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count" -KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base" -KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear" +KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count" +KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base" +KEY_ROPE_SCALING_TYPE = "{arch}.rope.scaling.type" +KEY_ROPE_SCALING_FACTOR = "{arch}.rope.scaling.factor" +KEY_ROPE_SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" +KEY_ROPE_SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" # tokenization KEY_TOKENIZER_MODEL = "tokenizer.ggml.model" @@ -407,6 +410,11 @@ class TokenType(IntEnum): UNUSED = 5 BYTE = 6 +class RopeScalingType(IntEnum): + NONE = 0 + LINEAR = 1 + YARN = 2 + # # implementation # @@ -760,8 +768,17 @@ def add_rope_dimension_count(self, count: int): def add_rope_freq_base(self, value: float): self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value) - def add_rope_scale_linear(self, value: float): - self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value) + def add_rope_scaling_type(self, value: RopeScalingType): + self.add_uint8(KEY_ROPE_SCALING_TYPE.format(arch=self.arch), int(value)) + + def add_rope_scaling_factor(self, value: float): + self.add_float32(KEY_ROPE_SCALING_FACTOR.format(arch=self.arch), value) + + def add_rope_scaling_orig_ctx_len(self, value: int): + self.add_uint32(KEY_ROPE_SCALING_ORIG_CTX_LEN.format(arch=self.arch), value) + + def add_rope_scaling_finetuned(self, value: bool): + self.add_bool(KEY_ROPE_SCALING_FINETUNED.format(arch=self.arch), value) def add_tokenizer_model(self, model: str): self.add_string(KEY_TOKENIZER_MODEL, model) diff --git a/llama.cpp b/llama.cpp index 87aea24682087..cd545b254d1eb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -204,7 +204,10 @@ enum llm_kv { LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, - LLM_KV_ROPE_SCALE_LINEAR, + LLM_KV_ROPE_SCALING_TYPE, + LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, + LLM_KV_ROPE_SCALING_FINETUNED, LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_LIST, @@ -246,9 +249,12 @@ static std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, - { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, - { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, - { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, + { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, + { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -943,12 +949,17 @@ struct llama_hparams { float f_norm_eps; float f_norm_rms_eps; - float rope_freq_base; - float rope_freq_scale; - float rope_ext_factor; - float rope_attn_factor; - float rope_beta_fast; - float rope_beta_slow; + float rope_freq_base; + float rope_freq_scale; + bool rope_finetuned; + uint32_t n_yarn_orig_ctx; + + // These hyperparameters are not exposed in GGUF, because all + // existing YaRN models use the same values for them. 
+ float yarn_ext_factor; + float yarn_attn_factor; + float yarn_beta_fast; + float yarn_beta_slow; bool operator!=(const llama_hparams & other) const { return static_cast(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT @@ -1660,10 +1671,10 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const hparams.n_ctx = params.n_ctx; hparams.rope_freq_base = params.rope_freq_base; hparams.rope_freq_scale = params.rope_freq_scale; - hparams.rope_ext_factor = params.rope_ext_factor; - hparams.rope_attn_factor = params.rope_attn_factor; - hparams.rope_beta_fast = params.rope_beta_fast; - hparams.rope_beta_slow = params.rope_beta_slow; + hparams.yarn_ext_factor = params.yarn_ext_factor; + hparams.yarn_attn_factor = params.yarn_attn_factor; + hparams.yarn_beta_fast = params.yarn_beta_fast; + hparams.yarn_beta_slow = params.yarn_beta_slow; // get general kv GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME)); @@ -1680,6 +1691,14 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const hparams.n_head_kv = hparams.n_head; GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); + hparams.rope_finetuned = false; + GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false, + kv(LLM_KV_ROPE_SCALING_FINETUNED)); + + hparams.n_yarn_orig_ctx = 0; + GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, + kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN)); + // rope_freq_base (optional) if (hparams.rope_freq_base == 0.0f) { float rope_freq_base = 10000.0f; @@ -1687,13 +1706,28 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const hparams.rope_freq_base = rope_freq_base; } + llama_rope_scaling_type rope_scaling_type = params.rope_scaling_type; + + if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { + uint8_t type = LLAMA_ROPE_SCALING_LINEAR; + GGUF_GET_KEY(ctx, type, gguf_get_val_u8, GGUF_TYPE_UINT8, false, kv(LLM_KV_ROPE_SCALING_TYPE)); + rope_scaling_type = llama_rope_scaling_type(type); + } + GGML_ASSERT(rope_scaling_type >= 0 && rope_scaling_type <= LLAMA_ROPE_SCALING_MAX_VALUE); + // rope_freq_scale (inverse of the kv) is optional - if (hparams.rope_freq_scale == 0.0f) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { + hparams.rope_freq_scale = 1.0f; + } else if (hparams.rope_freq_scale == 0.0f) { float ropescale = 1.0f; - GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); hparams.rope_freq_scale = 1.0f/ropescale; } + if (rope_scaling_type == LLAMA_ROPE_SCALING_YARN) { + hparams.yarn_ext_factor = 1.0f; // enable YaRN + } + // sanity check for n_rot (optional) { hparams.n_rot = hparams.n_embd / hparams.n_head; @@ -1902,6 +1936,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); + LLAMA_LOG_INFO("%s: YaRN scaling = %g\n", __func__, hparams.yarn_ext_factor); + LLAMA_LOG_INFO("%s: YaRN orig ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); + LLAMA_LOG_INFO("%s: YaRN beta_fast = %f\n", __func__, hparams.yarn_beta_fast); + LLAMA_LOG_INFO("%s: YaRN beta_slow = %f\n", 
__func__, hparams.yarn_beta_slow); + LLAMA_LOG_INFO("%s: RoPE finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "no"); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); @@ -2444,10 +2483,10 @@ static struct ggml_cgraph * llm_build_llama( const float freq_base = hparams.rope_freq_base; const float freq_scale = hparams.rope_freq_scale; - const float ext_factor = hparams.rope_ext_factor; - const float attn_factor = hparams.rope_attn_factor; - const float beta_fast = hparams.rope_beta_fast; - const float beta_slow = hparams.rope_beta_slow; + const float ext_factor = hparams.yarn_ext_factor; + const float attn_factor = hparams.yarn_attn_factor; + const float beta_fast = hparams.yarn_beta_fast; + const float beta_slow = hparams.yarn_beta_slow; const float norm_rms_eps = hparams.f_norm_eps; const int n_gpu_layers = model.n_gpu_layers; @@ -2561,15 +2600,13 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * Kcur = ggml_rope_custom_inplace( ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, - freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); + freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); struct ggml_tensor * Qcur = ggml_rope_custom_inplace( ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, - freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); + freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -2786,6 +2823,10 @@ static struct ggml_cgraph * llm_build_baichaun( const float freq_base = hparams.rope_freq_base; const float freq_scale = hparams.rope_freq_scale; + const float ext_factor = hparams.yarn_ext_factor; + const float attn_factor = hparams.yarn_attn_factor; + const float beta_fast = hparams.yarn_beta_fast; + const float beta_slow = hparams.yarn_beta_slow; const float norm_rms_eps = hparams.f_norm_rms_eps; const int n_gpu_layers = model.n_gpu_layers; @@ -2901,8 +2942,16 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * Qcur; switch (model.type) { case MODEL_7B: - Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom_inplace( + ctx0, + ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), + n_past, n_embd_head, 0, 0, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + Qcur = ggml_rope_custom_inplace( + ctx0, + ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), + n_past, n_embd_head, 0, 0, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); break; case MODEL_13B: Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N); @@ -3146,10 +3195,10 @@ static struct ggml_cgraph * llm_build_falcon( const float freq_base = hparams.rope_freq_base; const float freq_scale = hparams.rope_freq_scale; - const float ext_factor = hparams.rope_ext_factor; - const float attn_factor = hparams.rope_attn_factor; - const float beta_fast = hparams.rope_beta_fast; - const float beta_slow = hparams.rope_beta_slow; 
+ const float ext_factor = hparams.yarn_ext_factor; + const float attn_factor = hparams.yarn_attn_factor; + const float beta_fast = hparams.yarn_beta_fast; + const float beta_slow = hparams.yarn_beta_slow; const float norm_eps = hparams.f_norm_eps; const int n_gpu_layers = model.n_gpu_layers; @@ -3302,11 +3351,13 @@ static struct ggml_cgraph * llm_build_falcon( // using mode = 2 for neox mode struct ggml_tensor * Qcur = ggml_rope_custom_inplace( - ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ctx0, tmpq, n_past, n_embd_head, 2, 0, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); offload_func_kq(Qcur); struct ggml_tensor * Kcur = ggml_rope_custom_inplace( - ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ctx0, tmpk, n_past, n_embd_head, 2, 0, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); offload_func_kq(Kcur); @@ -6186,10 +6237,11 @@ struct llama_context_params llama_context_default_params() { /*.tensor_split =*/ nullptr, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, - /*.rope_ext_factor =*/ 0.0f, - /*.rope_attn_factor =*/ 1.0f, - /*.rope_beta_fast =*/ 32.0f, - /*.rope_beta_slow =*/ 1.0f, + /*.yarn_ext_factor =*/ 0.0f, + /*.yarn_attn_factor =*/ 1.0f, + /*.yarn_beta_fast =*/ 32.0f, + /*.yarn_beta_slow =*/ 1.0f, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, diff --git a/llama.h b/llama.h index 762362bc6b51e..5d69997bf5a4b 100644 --- a/llama.h +++ b/llama.h @@ -108,6 +108,14 @@ extern "C" { LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; + enum llama_rope_scaling_type: int8_t { + LLAMA_ROPE_SCALING_UNSPECIFIED = -1, + LLAMA_ROPE_SCALING_NONE = 0, + LLAMA_ROPE_SCALING_LINEAR = 1, + LLAMA_ROPE_SCALING_YARN = 2, + LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, + }; + typedef struct llama_token_data { llama_token id; // token id float logit; // log-odds of the token @@ -134,10 +142,12 @@ extern "C" { // ref: https://github.com/ggerganov/llama.cpp/pull/2054 float rope_freq_base; // RoPE base frequency float rope_freq_scale; // RoPE frequency scaling factor - float rope_ext_factor; // RoPE extrapolation mix factor - float rope_attn_factor; // RoPE magnitude scaling factor - float rope_beta_fast; // RoPE low correction dim - float rope_beta_slow; // RoPE high correction dim + float yarn_ext_factor; // YaRN extrapolation mix factor + float yarn_attn_factor; // YaRN magnitude scaling factor + float yarn_beta_fast; // YaRN low correction dim + float yarn_beta_slow; // YaRN high correction dim + + llama_rope_scaling_type rope_scaling_type; // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; @@ -145,14 +155,14 @@ extern "C" { void * progress_callback_user_data; // Keep the booleans together to avoid misalignment during copy-by-value. 
- bool low_vram; // if true, reduce VRAM usage at the cost of performance - bool mul_mat_q; // if true, use experimental mul_mat_q kernels - bool f16_kv; // use fp16 for KV cache - bool logits_all; // the llama_eval() call computes all logits, not just the last one - bool vocab_only; // only load the vocabulary, no weights - bool use_mmap; // use mmap if possible - bool use_mlock; // force system to keep model in RAM - bool embedding; // embedding mode only + bool low_vram; // if true, reduce VRAM usage at the cost of performance + bool mul_mat_q; // if true, use experimental mul_mat_q kernels + bool f16_kv; // use fp16 for KV cache + bool logits_all; // the llama_eval() call computes all logits, not just the last one + bool vocab_only; // only load the vocabulary, no weights + bool use_mmap; // use mmap if possible + bool use_mlock; // force system to keep model in RAM + bool embedding; // embedding mode only }; // Signature for logging events From 56abb9a406ef34a995c56be838ebe6529cd50438 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Wed, 20 Sep 2023 22:25:21 -0400 Subject: [PATCH 11/28] fix convert scripts --- convert-baichuan-hf-to-gguf.py | 3 ++- convert.py | 25 +++++++++++++------------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py index 8bd34dc440769..ef68d5819fef2 100755 --- a/convert-baichuan-hf-to-gguf.py +++ b/convert-baichuan-hf-to-gguf.py @@ -154,7 +154,8 @@ def parse_args() -> argparse.Namespace: if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: if "type" in hparams["rope_scaling"]: if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) + gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) # TOKENIZATION diff --git a/convert.py b/convert.py index f08cf01c5786e..e098e9eef38ee 100755 --- a/convert.py +++ b/convert.py @@ -205,13 +205,13 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None rope_scaling = config.get("rope_scaling") - if rope_scaling is not None and typ := rope_scaling.get("type"): + if rope_scaling is not None and (typ := rope_scaling.get("type")): rope_factor = rope_scaling.get("factor") f_rope_scale = rope_factor if typ == "linear": - rope_scaling_type = RopeScalingType.LINEAR + rope_scaling_type = gguf.RopeScalingType.LINEAR elif typ == "yarn": - rope_scaling_type = RopeScalingType.YARN + rope_scaling_type = gguf.RopeScalingType.YARN n_orig_ctx = rope_scaling['original_max_position_embeddings'] rope_finetuned = rope_scaling['finetuned'] else: @@ -231,10 +231,10 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: n_layer = config["num_hidden_layers"], n_ctx = n_ctx, n_ff = config["intermediate_size"], - n_head = config["num_attention_heads"], - n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head, + n_head = (n_head := config["num_attention_heads"]), + n_head_kv = config.get("num_key_value_heads", n_head), f_norm_eps = config["rms_norm_eps"], - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None, + f_rope_freq_base = config.get("rope_theta"), f_rope_scale = f_rope_scale, n_orig_ctx = n_orig_ctx, rope_finetuned = rope_finetuned, @@ -247,7 +247,7 @@ def loadOriginalParamsJson(model: LazyModel, config_path: 
Path) -> Params: config = json.load(open(config_path)) # hack to determine LLaMA v1 vs v2 vs CodeLlama - if f_rope_freq_base == 1000000: + if config.get("rope_theta") == 1000000: # CodeLlama n_ctx = 16384 elif config["norm_eps"] == 1e-05: @@ -263,10 +263,10 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: n_layer = config["n_layers"], n_ctx = n_ctx, n_ff = model["layers.0.feed_forward.w1.weight"].shape[0], - n_head = config["n_heads"], - n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head, + n_head = (n_head := config["n_heads"]), + n_head_kv = config.get("n_kv_heads", n_head), f_norm_eps = config["norm_eps"], - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None, + f_rope_freq_base = config.get("rope_theta"), ) @staticmethod @@ -834,14 +834,15 @@ def add_meta_arch(self, params: Params) -> None: self.gguf.add_rope_freq_base(params.f_rope_freq_base) if params.rope_scaling_type: + assert params.f_rope_scale is not None self.gguf.add_rope_scaling_type(params.rope_scaling_type) self.gguf.add_rope_scaling_factor(params.f_rope_scale) if params.n_orig_ctx is not None: - self.gguf.add_rope_original_context_length(params.n_orig_ctx) + self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx) if params.rope_finetuned is not None: - self.gguf.add_rope_finetuned(params.rope_finetuned) + self.gguf.add_rope_scaling_finetuned(params.rope_finetuned) if params.ftype is not None: self.gguf.add_file_type(params.ftype) From 43eaf06a2f27e9a8dc65109a03b6025441c22f9a Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Wed, 20 Sep 2023 23:29:08 -0400 Subject: [PATCH 12/28] llama : fix C compatibility --- common/common.h | 3 +-- llama.cpp | 6 +++--- llama.h | 27 +++++++++++++-------------- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/common/common.h b/common/common.h index a1e7da128c113..e49db6fedcdaf 100644 --- a/common/common.h +++ b/common/common.h @@ -54,8 +54,7 @@ struct gpt_params { float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim - - llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; + int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // sampling parameters int32_t top_k = 40; // <= 0 to use vocab size diff --git a/llama.cpp b/llama.cpp index cd545b254d1eb..56c511b594333 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1706,12 +1706,12 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const hparams.rope_freq_base = rope_freq_base; } - llama_rope_scaling_type rope_scaling_type = params.rope_scaling_type; + int8_t rope_scaling_type = params.rope_scaling_type; if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { uint8_t type = LLAMA_ROPE_SCALING_LINEAR; GGUF_GET_KEY(ctx, type, gguf_get_val_u8, GGUF_TYPE_UINT8, false, kv(LLM_KV_ROPE_SCALING_TYPE)); - rope_scaling_type = llama_rope_scaling_type(type); + rope_scaling_type = int8_t(type); } GGML_ASSERT(rope_scaling_type >= 0 && rope_scaling_type <= LLAMA_ROPE_SCALING_MAX_VALUE); @@ -6234,6 +6234,7 @@ struct llama_context_params llama_context_default_params() { /*.n_batch =*/ 512, /*.n_gpu_layers =*/ 0, /*.main_gpu =*/ 0, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, /*.tensor_split =*/ nullptr, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, @@ -6241,7 +6242,6 @@ struct llama_context_params llama_context_default_params() { /*.yarn_attn_factor =*/ 1.0f, 
/*.yarn_beta_fast =*/ 32.0f, /*.yarn_beta_slow =*/ 1.0f, - /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, diff --git a/llama.h b/llama.h index 5d69997bf5a4b..6528254cb9f5d 100644 --- a/llama.h +++ b/llama.h @@ -108,7 +108,7 @@ extern "C" { LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; - enum llama_rope_scaling_type: int8_t { + enum llama_rope_scaling_type { LLAMA_ROPE_SCALING_UNSPECIFIED = -1, LLAMA_ROPE_SCALING_NONE = 0, LLAMA_ROPE_SCALING_LINEAR = 1, @@ -131,23 +131,22 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); struct llama_context_params { - uint32_t seed; // RNG seed, -1 for random - int32_t n_ctx; // text context - int32_t n_batch; // prompt processing batch size - int32_t n_gpu_layers; // number of layers to store in VRAM - int32_t main_gpu; // the GPU that is used for scratch and small tensors + uint32_t seed; // RNG seed, -1 for random + int32_t n_ctx; // text context + int32_t n_batch; // prompt processing batch size + int32_t n_gpu_layers; // number of layers to store in VRAM + int32_t main_gpu; // the GPU that is used for scratch and small tensors + int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) // ref: https://github.com/ggerganov/llama.cpp/pull/2054 - float rope_freq_base; // RoPE base frequency - float rope_freq_scale; // RoPE frequency scaling factor - float yarn_ext_factor; // YaRN extrapolation mix factor - float yarn_attn_factor; // YaRN magnitude scaling factor - float yarn_beta_fast; // YaRN low correction dim - float yarn_beta_slow; // YaRN high correction dim - - llama_rope_scaling_type rope_scaling_type; + float rope_freq_base; // RoPE base frequency + float rope_freq_scale; // RoPE frequency scaling factor + float yarn_ext_factor; // YaRN extrapolation mix factor + float yarn_attn_factor; // YaRN magnitude scaling factor + float yarn_beta_fast; // YaRN low correction dim + float yarn_beta_slow; // YaRN high correction dim // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; From fe788c45c8d5a2c6e6c2f0ec04978419555c65b2 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Thu, 21 Sep 2023 00:01:48 -0400 Subject: [PATCH 13/28] don't hardcode max_pos_emb --- .../train-text-from-scratch.cpp | 2 +- ggml-cuda.cu | 27 ++--- ggml-metal.m | 19 ++-- ggml-metal.metal | 17 +-- ggml.c | 100 ++++++++++-------- ggml.h | 7 +- llama.cpp | 67 +++++++----- 7 files changed, 131 insertions(+), 108 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 8a5ad82bf3441..36415398e6fee 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -687,7 +687,7 @@ struct ggml_tensor * llama_build_train_graphs( const int rope_mode = 0; return ggml_rope_custom( - ctx, t, n_past, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f + ctx, t, n_past, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f ); }; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 394fd81fa3cd3..bd788ce4da4e1 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4386,7 +4386,7 @@ static __device__ void rope_yarn( // rope == RoPE == 
rotary positional embedding static __global__ void rope_f32( - float * x, float * dst, int ncols, float freq_scale, float ext_factor, float attn_factor, float theta_scale, + const float * x, float * dst, int ncols, float freq_scale, float ext_factor, float attn_factor, float theta_scale, float p0, int p_delta_rows, rope_corr_dims corr_dims ) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); @@ -5396,7 +5396,7 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons } static void rope_f32_cuda( - float * x, float * dst, int ncols, int nrows, float freq_scale, float ext_factor, float attn_factor, + const float * x, float * dst, int ncols, int nrows, float freq_scale, float ext_factor, float attn_factor, float theta_scale, float p0, int p_delta_rows, rope_corr_dims corr_dims, cudaStream_t stream ) { GGML_ASSERT(ncols % 2 == 0); @@ -6109,19 +6109,20 @@ inline void ggml_cuda_op_rope( const int64_t ne01 = src0->ne[1]; const int64_t nrows = ggml_nrows(src0); - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; // RoPE alteration for extended context float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); const float theta_scale = powf(freq_base, -2.0f/n_dims); const float p0 = (mode & 1) == 0 ? 
n_past : 0; @@ -6137,7 +6138,7 @@ inline void ggml_cuda_op_rope( rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream); } else { rope_corr_dims corr_dims; - ggml_rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims.v); + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); rope_f32_cuda( src0_dd, dst_dd, ne00, nrows, freq_scale, ext_factor, attn_factor, theta_scale, p0, ne01, corr_dims, diff --git a/ggml-metal.m b/ggml-metal.m index 06d97695bc0b2..c0607e844d146 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1176,17 +1176,18 @@ void ggml_metal_graph_compute( } break; case GGML_OP_ROPE: { - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[3]; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; diff --git a/ggml-metal.metal b/ggml-metal.metal index c5e0ee8a042f9..ddf81fb7b9e04 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -832,18 +832,18 @@ static void rope_yarn( *sin_theta = sinf(theta) * mscale; } -constant float max_pos_emb = 2048; - // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get // `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -static float rope_yarn_corr_factor(const int n_dims, const float n_rot, const float base) { - return n_dims * log(max_pos_emb / (n_rot * 2 * M_PI_F)) / (2 * log(base)); +static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base)); } -static void rope_yarn_corr_dims(int n_dims, const float freq_base, float beta_fast, float beta_slow, float dims[2]) { +static void rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { // start and end correction dims - dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, beta_fast, freq_base))); - dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, beta_slow, freq_base))); + dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base))); } kernel void kernel_rope( @@ 
-868,6 +868,7 @@ kernel void kernel_rope( constant int & n_past, constant int & n_dims, constant int & mode, + constant int & n_orig_ctx, constant float & freq_base, constant float & freq_scale, constant float & ext_factor, @@ -884,7 +885,7 @@ kernel void kernel_rope( const bool is_neox = mode & 2; float corr_dims[2]; - rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims); + rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); const int64_t p = (mode & 1) == 0 ? n_past + i2 : i2; diff --git a/ggml.c b/ggml.c index 53137924d0014..56b9fdd291aa0 100644 --- a/ggml.c +++ b/ggml.c @@ -6973,6 +6973,7 @@ static struct ggml_tensor * ggml_rope_impl( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, float ext_factor, @@ -6991,15 +6992,15 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[12] = { n_past, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); - memcpy(params + 5, &freq_scale, sizeof(float)); - memcpy(params + 6, &ext_factor, sizeof(float)); - memcpy(params + 7, &attn_factor, sizeof(float)); - memcpy(params + 8, &beta_fast, sizeof(float)); - memcpy(params + 9, &beta_slow, sizeof(float)); - memcpy(params + 10, &xpos_base, sizeof(float)); - memcpy(params + 11, &xpos_down, sizeof(bool)); + int32_t params[13] = { n_past, n_dims, mode, n_ctx, n_orig_ctx }; + memcpy(params + 5, &freq_base, sizeof(float)); + memcpy(params + 6, &freq_scale, sizeof(float)); + memcpy(params + 7, &ext_factor, sizeof(float)); + memcpy(params + 8, &attn_factor, sizeof(float)); + memcpy(params + 9, &beta_fast, sizeof(float)); + memcpy(params + 10, &beta_slow, sizeof(float)); + memcpy(params + 11, &xpos_base, sizeof(float)); + memcpy(params + 12, &xpos_down, sizeof(bool)); ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE; @@ -7017,7 +7018,7 @@ struct ggml_tensor * ggml_rope( int mode, int n_ctx) { return ggml_rope_impl( - ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false + ctx, a, n_past, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false ); } @@ -7029,7 +7030,7 @@ struct ggml_tensor * ggml_rope_inplace( int mode, int n_ctx) { return ggml_rope_impl( - ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true + ctx, a, n_past, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true ); } @@ -7040,6 +7041,7 @@ struct ggml_tensor * ggml_rope_custom( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, float ext_factor, @@ -7047,8 +7049,8 @@ struct ggml_tensor * ggml_rope_custom( float beta_fast, float beta_slow) { return ggml_rope_impl( - ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, - false, false + ctx, a, n_past, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false ); } @@ -7059,6 +7061,7 @@ struct ggml_tensor * ggml_rope_custom_inplace( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, float ext_factor, @@ -7066,8 +7069,8 @@ struct ggml_tensor * ggml_rope_custom_inplace( float beta_fast, float beta_slow) { return ggml_rope_impl( - ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, - 
false, true + ctx, a, n_past, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true ); } @@ -7078,7 +7081,7 @@ struct ggml_tensor * ggml_rope_xpos_inplace( int n_dims, float base, bool down) { - return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true); + return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true); } // ggml_rope_back @@ -12675,15 +12678,16 @@ static void rope_yarn( // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get // `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -static float ggml_rope_yarn_corr_dim(const int n_dims, const float n_rot, const float base) { - static const float max_pos_emb = 2048; - return n_dims * logf(max_pos_emb / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); } -void ggml_rope_yarn_corr_dims(int n_dims, const float freq_base, float beta_fast, float beta_slow, float dims[2]) { +void ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { // start and end correction dims - dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, beta_fast, freq_base))); - dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, beta_slow, freq_base))); + dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base))); } static void ggml_compute_forward_rope_f32( @@ -12701,18 +12705,20 @@ static void ggml_compute_forward_rope_f32( float xpos_base; bool xpos_down; - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&xpos_base, (int32_t *) dst->op_params + 10, sizeof(float)); - memcpy(&xpos_down, (int32_t *) dst->op_params + 11, sizeof(bool)); + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool)); assert(n_past >= 0); 
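As a quick aside on the correction-dimension formula carried through the hunks above (ggml_rope_yarn_corr_dim / ggml_rope_yarn_corr_dims), the standalone C sketch below is not taken from these patches; the head dimension of 128, RoPE base of 10000, 4096-token original context, and the default beta_fast = 32 / beta_slow = 1 are illustrative assumptions only.

    /*
     * Standalone sketch -- not part of these patches -- showing how the YaRN
     * correction dims fall out of the corr_dim formula above. All inputs are
     * illustrative assumptions: head dim 128, RoPE base 10000, a 4096-token
     * original context, and the default beta_fast = 32 / beta_slow = 1.
     */
    #include <math.h>
    #include <stdio.h>

    #ifndef M_PI
    #define M_PI 3.14159265358979323846
    #endif

    /* rotary pair index whose frequency completes n_rot rotations over n_orig_ctx */
    static float corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
        return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
    }

    int main(void) {
        const int   n_dims = 128, n_orig_ctx = 4096;
        const float freq_base = 10000.0f, beta_fast = 32.0f, beta_slow = 1.0f;

        float lo = floorf(corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
        float hi = ceilf (corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
        if (lo < 0)          lo = 0;
        if (hi > n_dims - 1) hi = n_dims - 1;

        /* pairs below lo rotate often enough to keep pure extrapolation, pairs
           above hi are fully interpolated, and rope_yarn_ramp blends in between */
        printf("correction dims: [%g, %g]\n", lo, hi);
        return 0;
    }

With those inputs the window works out to roughly rotary pairs 20 through 46 of the 64 pairs in a 128-dim head, which is the range over which rope_yarn mixes the extrapolated and interpolated theta values.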
@@ -12743,7 +12749,7 @@ static void ggml_compute_forward_rope_f32( const float theta_scale = powf(freq_base, -2.0f/n_dims); float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims); + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12844,16 +12850,17 @@ static void ggml_compute_forward_rope_f16( float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); assert(n_past >= 0); @@ -12884,7 +12891,7 @@ static void ggml_compute_forward_rope_f16( const float theta_scale = powf(freq_base, -2.0f/n_dims); float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims); + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -16641,6 +16648,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor n_past, n_dims, mode, + 0, n_ctx, freq_base, freq_scale, diff --git a/ggml.h b/ggml.h index 5078fb7b5c4b7..26f7cf024f8ab 100644 --- a/ggml.h +++ b/ggml.h @@ -219,7 +219,7 @@ #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 6 #define GGML_MAX_NAME 64 -#define GGML_MAX_OP_PARAMS 48 +#define GGML_MAX_OP_PARAMS 64 #define GGML_DEFAULT_N_THREADS 4 #if UINTPTR_MAX == 0xFFFFFFFF @@ -1248,6 +1248,7 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, float ext_factor, @@ -1263,6 +1264,7 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, float ext_factor, @@ -1271,7 +1273,8 @@ extern "C" { float beta_slow); // compute correction dims for YaRN RoPE scaling - void ggml_rope_yarn_corr_dims(int n_dims, const float freq_base, float beta_fast, float beta_slow, float dims[2]); + void ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); // xPos RoPE, in-place, returns view(a) GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( diff --git a/llama.cpp b/llama.cpp index 56c511b594333..7184c376c8a45 100644 --- a/llama.cpp +++ b/llama.cpp @@ 
-2471,13 +2471,14 @@ static struct ggml_cgraph * llm_build_llama( GGML_ASSERT(!!kv_self.ctx); - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int32_t n_embd = hparams.n_embd; + const int32_t n_layer = hparams.n_layer; + const int32_t n_ctx = hparams.n_ctx; + const int32_t n_orig_ctx = hparams.n_yarn_orig_ctx; + const int32_t n_head = hparams.n_head; + const int32_t n_head_kv = hparams.n_head_kv; + const int32_t n_embd_head = hparams.n_embd_head(); + const int32_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -2599,14 +2600,18 @@ static struct ggml_cgraph * llm_build_llama( ggml_set_name(tmpq, "tmpq"); struct ggml_tensor * Kcur = ggml_rope_custom_inplace( - ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, - freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), + n_past, n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); struct ggml_tensor * Qcur = ggml_rope_custom_inplace( - ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, - freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), + n_past, n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -2811,13 +2816,14 @@ static struct ggml_cgraph * llm_build_baichaun( GGML_ASSERT(!!kv_self.ctx); - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int32_t n_embd = hparams.n_embd; + const int32_t n_layer = hparams.n_layer; + const int32_t n_ctx = hparams.n_ctx; + const int32_t n_orig_ctx = hparams.n_yarn_orig_ctx; + const int32_t n_head = hparams.n_head; + const int32_t n_head_kv = hparams.n_head_kv; + const int32_t n_embd_head = hparams.n_embd_head(); + const int32_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -2945,12 +2951,14 @@ static struct ggml_cgraph * llm_build_baichaun( Kcur = ggml_rope_custom_inplace( ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), - n_past, n_embd_head, 0, 0, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + n_past, n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow ); Qcur = ggml_rope_custom_inplace( ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), - n_past, n_embd_head, 0, 0, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + n_past, n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow ); break; case MODEL_13B: @@ -3183,13 +3191,14 @@ static struct ggml_cgraph * llm_build_falcon( GGML_ASSERT(!!kv_self.ctx); - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx 
= hparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int32_t n_embd = hparams.n_embd; + const int32_t n_layer = hparams.n_layer; + const int32_t n_ctx = hparams.n_ctx; + const int32_t n_orig_ctx = hparams.n_yarn_orig_ctx; + const int32_t n_head = hparams.n_head; + const int32_t n_head_kv = hparams.n_head_kv; + const int32_t n_embd_head = hparams.n_embd_head(); + const int32_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3351,12 +3360,12 @@ static struct ggml_cgraph * llm_build_falcon( // using mode = 2 for neox mode struct ggml_tensor * Qcur = ggml_rope_custom_inplace( - ctx0, tmpq, n_past, n_embd_head, 2, 0, + ctx0, tmpq, n_past, n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); offload_func_kq(Qcur); struct ggml_tensor * Kcur = ggml_rope_custom_inplace( - ctx0, tmpk, n_past, n_embd_head, 2, 0, + ctx0, tmpk, n_past, n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); offload_func_kq(Kcur); From e0b120c3cae9e38f3a0b5eae26d43fda87c1c910 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Thu, 21 Sep 2023 15:00:08 -0400 Subject: [PATCH 14/28] address review comments --- common/common.cpp | 4 ++-- examples/server/server.cpp | 2 +- ggml.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index ca4b9c1cc8a16..3e3bdf18b348a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -683,9 +683,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); printf(" --rope-scaling {none,linear,yarn}\n"); printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); - printf(" --rope-scale N RoPE context scaling factor, inverse of --rope-freq-scale\n"); + printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n"); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); printf(" --yarn-ext-factor N YaRN extrapolation mix factor (default: %.1f)\n", params.yarn_ext_factor); printf(" --yarn-attn-factor N YaRN magnitude scaling factor (default: %.1f)\n", params.yarn_attn_factor); printf(" --yarn-beta-fast N YaRN low correction dim (default: %.1f)\n", params.yarn_beta_fast); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3a1c55b1c939c..e87bfd508fcdc 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -704,7 +704,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --rope-scaling {none,linear,yarn}\n"); printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); printf(" --yarn-ext-factor N YaRN extrapolation mix factor (default: %.1f)\n", 
params.yarn_ext_factor); printf(" --yarn-attn-factor N YaRN magnitude scaling factor (default: %.1f)\n", params.yarn_attn_factor); printf(" --yarn-beta-fast N YaRN low correction dim (default: %.1f)\n", params.yarn_beta_fast); diff --git a/ggml.c b/ggml.c index 56b9fdd291aa0..398b88cbe37d2 100644 --- a/ggml.c +++ b/ggml.c @@ -12650,7 +12650,7 @@ static void ggml_compute_forward_clamp( // ggml_compute_forward_rope -static inline float rope_yarn_ramp(const float low, const float high, const int i0) { +static float rope_yarn_ramp(const float low, const float high, const int i0) { const float y = (i0 / 2 - low) / MAX(0.001f, high - low); return 1 - MIN(1, MAX(0, y)); } From 19bb74e74c036e8eddfcfe101cfd8940f42df078 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Thu, 21 Sep 2023 15:10:39 -0400 Subject: [PATCH 15/28] restore backwards compatibility with *.rope.scale_linear --- llama.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 7184c376c8a45..d862541d1f80c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -204,6 +204,7 @@ enum llm_kv { LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, + LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_FACTOR, LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, @@ -251,6 +252,7 @@ static std::map LLM_KV_NAMES = { { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, @@ -1719,9 +1721,12 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { hparams.rope_freq_scale = 1.0f; } else if (hparams.rope_freq_scale == 0.0f) { - float ropescale = 1.0f; + float ropescale = 0.0f; GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); - hparams.rope_freq_scale = 1.0f/ropescale; + if (ropescale == 0.0f) { // try the old key name + GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + } + hparams.rope_freq_scale = ropescale == 0.0f ? 
1.0f : 1.0f/ropescale; } if (rope_scaling_type == LLAMA_ROPE_SCALING_YARN) { From 4d5fe73449ae71f6163b0ec165cae5de56415efc Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Thu, 21 Sep 2023 15:41:06 -0400 Subject: [PATCH 16/28] better option descriptions in help --- common/common.cpp | 8 ++++---- examples/server/server.cpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 3e3bdf18b348a..6159df5654211 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -686,10 +686,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n"); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); - printf(" --yarn-ext-factor N YaRN extrapolation mix factor (default: %.1f)\n", params.yarn_ext_factor); - printf(" --yarn-attn-factor N YaRN magnitude scaling factor (default: %.1f)\n", params.yarn_attn_factor); - printf(" --yarn-beta-fast N YaRN low correction dim (default: %.1f)\n", params.yarn_beta_fast); - printf(" --yarn-beta-slow N YaRN high correction dim (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); + printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); + printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e87bfd508fcdc..33bc8c2b8c76f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -705,10 +705,10 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); - printf(" --yarn-ext-factor N YaRN extrapolation mix factor (default: %.1f)\n", params.yarn_ext_factor); - printf(" --yarn-attn-factor N YaRN magnitude scaling factor (default: %.1f)\n", params.yarn_attn_factor); - printf(" --yarn-beta-fast N YaRN low correction dim (default: %.1f)\n", params.yarn_beta_fast); - printf(" --yarn-beta-slow N YaRN high correction dim (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); + printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); + printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --memory-f32 use f32 instead of f16 for memory 
key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); From 746641574a5597a3c0d4c11fcea05327e9c9556c Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sat, 7 Oct 2023 12:57:55 -0400 Subject: [PATCH 17/28] gguf : store scaling type as a string instead of an int --- gguf-py/gguf/gguf.py | 10 +++++----- llama.cpp | 22 +++++++++++++++++++--- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 742ce214356e2..6bcb33701f71e 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -410,10 +410,10 @@ class TokenType(IntEnum): UNUSED = 5 BYTE = 6 -class RopeScalingType(IntEnum): - NONE = 0 - LINEAR = 1 - YARN = 2 +class RopeScalingType(Enum): + NONE = 'none' + LINEAR = 'linear' + YARN = 'yarn' # # implementation @@ -769,7 +769,7 @@ def add_rope_freq_base(self, value: float): self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value) def add_rope_scaling_type(self, value: RopeScalingType): - self.add_uint8(KEY_ROPE_SCALING_TYPE.format(arch=self.arch), int(value)) + self.add_string(KEY_ROPE_SCALING_TYPE.format(arch=self.arch), value.value) def add_rope_scaling_factor(self, value: float): self.add_float32(KEY_ROPE_SCALING_FACTOR.format(arch=self.arch), value) diff --git a/llama.cpp b/llama.cpp index d862541d1f80c..4f68ba69259bc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -470,6 +470,22 @@ struct LLM_TN { } \ } +static std::map LLAMA_ROPE_SCALING_TYPES = { + { LLAMA_ROPE_SCALING_NONE, "none" }, + { LLAMA_ROPE_SCALING_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_YARN, "yarn" }, +}; + +static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) { + for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) { + if (kv.second == name) { + return kv.first; + } + } + + return LLAMA_ROPE_SCALING_UNSPECIFIED; +} + // // ggml helpers // @@ -1711,9 +1727,9 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const int8_t rope_scaling_type = params.rope_scaling_type; if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { - uint8_t type = LLAMA_ROPE_SCALING_LINEAR; - GGUF_GET_KEY(ctx, type, gguf_get_val_u8, GGUF_TYPE_UINT8, false, kv(LLM_KV_ROPE_SCALING_TYPE)); - rope_scaling_type = int8_t(type); + std::string type("linear"); + GGUF_GET_KEY(ctx, type, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE)); + rope_scaling_type = int8_t(llama_rope_scaling_type_from_string(type)); } GGML_ASSERT(rope_scaling_type >= 0 && rope_scaling_type <= LLAMA_ROPE_SCALING_MAX_VALUE); From 4f4e94804dd986e00c7a8978900f015c5ac08c7e Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sat, 7 Oct 2023 12:59:27 -0400 Subject: [PATCH 18/28] improve printing of YaRN parameters --- llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 4f68ba69259bc..25f129e570205 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1957,11 +1957,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); - LLAMA_LOG_INFO("%s: YaRN scaling = %g\n", __func__, hparams.yarn_ext_factor); + LLAMA_LOG_INFO("%s: YaRN extension = %g\n", __func__, hparams.yarn_ext_factor); LLAMA_LOG_INFO("%s: YaRN orig ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); - LLAMA_LOG_INFO("%s: 
YaRN beta_fast = %f\n", __func__, hparams.yarn_beta_fast); - LLAMA_LOG_INFO("%s: YaRN beta_slow = %f\n", __func__, hparams.yarn_beta_slow); - LLAMA_LOG_INFO("%s: RoPE finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "no"); + LLAMA_LOG_INFO("%s: YaRN beta_fast = %.1f\n", __func__, hparams.yarn_beta_fast); + LLAMA_LOG_INFO("%s: YaRN beta_slow = %.1f\n", __func__, hparams.yarn_beta_slow); + LLAMA_LOG_INFO("%s: RoPE finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); From 5d7a3a5c0dcfb4ad31a2532d3223e43dde78741b Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sat, 7 Oct 2023 13:20:33 -0400 Subject: [PATCH 19/28] allow forcing ext_factor to zero if scaling type is YaRN --- llama.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 25f129e570205..e6ce4486b17ca 100644 --- a/llama.cpp +++ b/llama.cpp @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -1735,7 +1736,7 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const // rope_freq_scale (inverse of the kv) is optional if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { - hparams.rope_freq_scale = 1.0f; + hparams.rope_freq_scale = 1.0f; // never scale if scaling type is none } else if (hparams.rope_freq_scale == 0.0f) { float ropescale = 0.0f; GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); @@ -1745,8 +1746,8 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const hparams.rope_freq_scale = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; } - if (rope_scaling_type == LLAMA_ROPE_SCALING_YARN) { - hparams.yarn_ext_factor = 1.0f; // enable YaRN + if (std::isnan(hparams.yarn_ext_factor)) { // NaN indicates 'not set' + hparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 
1.0f : 0.0f; } // sanity check for n_rot (optional) @@ -6268,7 +6269,7 @@ struct llama_context_params llama_context_default_params() { /*.tensor_split =*/ nullptr, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, - /*.yarn_ext_factor =*/ 0.0f, + /*.yarn_ext_factor =*/ NAN, /*.yarn_attn_factor =*/ 1.0f, /*.yarn_beta_fast =*/ 32.0f, /*.yarn_beta_slow =*/ 1.0f, From babf0e0c8fce8c2045c36e1ba26c456968967664 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sun, 8 Oct 2023 14:31:35 -0400 Subject: [PATCH 20/28] fix rope_cuda parameter order --- ggml-cuda.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 487ccdd18c7d4..8e1ad45620b73 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5469,8 +5469,8 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons template static void rope_cuda( - const T * x, T * dst, int ncols, int nrows, float freq_scale, float ext_factor, float attn_factor, - float theta_scale, const int32_t * pos, int p_delta_rows, rope_corr_dims corr_dims, cudaStream_t stream + const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float theta_scale, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream ) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); From 0050e1ecc6fcb3f659ccbc964866350c35376114 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sun, 8 Oct 2023 16:51:51 -0400 Subject: [PATCH 21/28] default n_yarn_orig_ctx to n_ctx_train --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index c5403902816eb..6b3627c0fab3d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2011,7 +2011,7 @@ static void llm_load_hparams( GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false, kv(LLM_KV_ROPE_SCALING_FINETUNED)); - hparams.n_yarn_orig_ctx = 0; + hparams.n_yarn_orig_ctx = hparams.n_ctx_train; GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN)); From 09c31027db2e620d7b97b827fe5b6e3945fd7504 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sun, 8 Oct 2023 16:59:23 -0400 Subject: [PATCH 22/28] fix uninitialized cparams --- llama.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/llama.cpp b/llama.cpp index 6b3627c0fab3d..ab2342bf390b9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7888,14 +7888,18 @@ struct llama_context * llama_new_context_with_model( const auto & hparams = model->hparams; auto & cparams = ctx->cparams; - cparams.n_batch = params.n_batch; - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? 
hparams.rope_freq_scale_train : params.rope_freq_scale; - cparams.yarn_ext_factor = params.yarn_ext_factor; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.mul_mat_q = params.mul_mat_q; + cparams.n_batch = params.n_batch; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.mul_mat_q = params.mul_mat_q; + + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; auto rope_scaling_type = params.rope_scaling_type; if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { From 57c3442ea5e0a84339e932807671835181220e63 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sun, 8 Oct 2023 18:10:02 -0400 Subject: [PATCH 23/28] make printed param formatting more consistent --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index ab2342bf390b9..107dca5a63a2b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2256,11 +2256,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); - LLAMA_LOG_INFO("%s: RoPE scaling = %s\n", __func__, rope_scaling_type.c_str()); + LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); - LLAMA_LOG_INFO("%s: YaRN orig ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); - LLAMA_LOG_INFO("%s: RoPE finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); + LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); + LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? 
"yes" : "unknown"); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); From a20b3e6cf05d7d4d9135bb418ceef4cdf1f34c65 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Wed, 11 Oct 2023 17:36:11 -0400 Subject: [PATCH 24/28] fix missing import --- gguf-py/gguf/gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 3a6cd75dd0afe..0a54bc7da3800 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -7,7 +7,7 @@ import struct import sys import tempfile -from enum import IntEnum, auto +from enum import Enum, IntEnum, auto from io import BufferedWriter from pathlib import Path from typing import IO, Any, BinaryIO, Callable, Sequence From 9ae10b3aee5e104a8f6477e9ed0f9708281e16e2 Mon Sep 17 00:00:00 2001 From: Jeffrey Quesnelle Date: Thu, 19 Oct 2023 19:36:16 -0700 Subject: [PATCH 25/28] Fix YaRN inverted scaling and add "rope.scaling.type" to GGUF (#1) --- convert.py | 1 + ggml-cuda.cu | 4 ++-- ggml-metal.metal | 4 ++-- ggml.c | 4 ++-- llama.cpp | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/convert.py b/convert.py index 6e294b503e785..175f4b14b588c 100755 --- a/convert.py +++ b/convert.py @@ -234,6 +234,7 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: n_head_kv = config.get("num_key_value_heads", n_head), f_norm_eps = config["rms_norm_eps"], f_rope_freq_base = config.get("rope_theta"), + rope_scaling_type = rope_scaling_type, f_rope_scale = f_rope_scale, n_orig_ctx = n_orig_ctx, rope_finetuned = rope_finetuned, diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 95e1ae4c6c9f2..ff7b1e90a2758 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4429,8 +4429,8 @@ static __device__ void rope_yarn( } // Get n-d magnitude scaling corrected for interpolation - if (freq_scale > 1.0f) - mscale *= 1.0f + 0.1f * logf(freq_scale); + if (freq_scale < 1.0f) + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); *cos_theta = cosf(theta) * mscale; *sin_theta = sinf(theta) * mscale; } diff --git a/ggml-metal.metal b/ggml-metal.metal index 6fd3f9aa006ff..2064884fffe41 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -899,8 +899,8 @@ static void rope_yarn( } // Get n-d magnitude scaling corrected for interpolation - if (freq_scale > 1.0f) - mscale *= 1.0f + 0.1f * logf(freq_scale); + if (freq_scale < 1.0f) + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); *cos_theta = cosf(theta) * mscale; *sin_theta = sinf(theta) * mscale; } diff --git a/ggml.c b/ggml.c index 4b40a4e71fa6b..a243418106ee3 100644 --- a/ggml.c +++ b/ggml.c @@ -13364,8 +13364,8 @@ static void rope_yarn( } // Get n-d magnitude scaling corrected for interpolation - if (freq_scale > 1.0f) - mscale *= 1.0f + 0.1f * logf(freq_scale); + if (freq_scale < 1.0f) + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); *cos_theta = cosf(theta) * mscale; *sin_theta = sinf(theta) * mscale; } diff --git a/llama.cpp b/llama.cpp index faeee0d3ac073..cbab5f580969c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2055,7 +2055,7 @@ static void llm_load_hparams( GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); std::string rope_scaling("linear"); - GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_UINT8, false, kv(LLM_KV_ROPE_SCALING_TYPE)); + GGUF_GET_KEY(ctx, rope_scaling, 
gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE)); hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); From 14cf93b14c3e5160b383136f09bbb1344c1bf0ba Mon Sep 17 00:00:00 2001 From: Jeffrey Quesnelle Date: Fri, 20 Oct 2023 06:18:17 -0700 Subject: [PATCH 26/28] fix YaRN ramp, make mscale conditional, add --yarn-orig-ctx (#2) --- common/common.cpp | 8 ++++++++ common/common.h | 5 +++-- ggml-cuda.cu | 7 +++---- ggml-metal.metal | 7 +++---- ggml.c | 7 +++---- llama.cpp | 10 ++++++---- llama.h | 13 +++++++------ 7 files changed, 33 insertions(+), 24 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 3fafdfb38fef1..d0b05c1ba0a83 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -220,6 +220,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = 1.0f/std::stof(argv[i]); + } else if (arg == "--yarn-orig-ctx") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_orig_ctx = std::stoi(argv[i]); } else if (arg == "--yarn-ext-factor") { if (++i >= argc) { invalid_param = true; @@ -737,6 +743,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n"); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); + printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n"); printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); @@ -861,6 +868,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.yarn_attn_factor = params.yarn_attn_factor; cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.yarn_orig_ctx = params.yarn_orig_ctx; return cparams; } diff --git a/common/common.h b/common/common.h index 91993dba1850f..01c2661b03fe9 100644 --- a/common/common.h +++ b/common/common.h @@ -57,8 +57,9 @@ struct gpt_params { float rope_freq_scale = 0.0f; // RoPE frequency scaling factor float yarn_ext_factor = NAN; // YaRN extrapolation mix factor float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor - float yarn_beta_fast = 32.0f; // YaRN low correction dim - float yarn_beta_slow = 1.0f; // YaRN high correction dim + float yarn_beta_fast = 32.0f;// YaRN low correction dim + float yarn_beta_slow = 1.0f; // YaRN high correction dim + int32_t yarn_orig_ctx = 0; // YaRN original context length int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // // sampling parameters diff --git a/ggml-cuda.cu b/ggml-cuda.cu index ff7b1e90a2758..4c6a36ca11d3a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4406,7 +4406,7 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, } static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) { - const float y = (i0 / 2 - low) / min(0.001f, high - low); + const float y = (i0 / 2 - low) / max(0.001f, high - low); return 1.0f - min(1.0f, 
max(0.0f, y)); } @@ -4426,11 +4426,10 @@ static __device__ void rope_yarn( if (ext_factor != 0.0f) { float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - } - // Get n-d magnitude scaling corrected for interpolation - if (freq_scale < 1.0f) + // Get n-d magnitude scaling corrected for interpolation mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } *cos_theta = cosf(theta) * mscale; *sin_theta = sinf(theta) * mscale; } diff --git a/ggml-metal.metal b/ggml-metal.metal index 2064884fffe41..42729177433ba 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -880,7 +880,7 @@ kernel void kernel_alibi_f32( } static float rope_yarn_ramp(const float low, const float high, const int i0) { - const float y = (i0 / 2 - low) / min(0.001f, high - low); + const float y = (i0 / 2 - low) / max(0.001f, high - low); return 1.0f - min(1.0f, max(0.0f, y)); } @@ -896,11 +896,10 @@ static void rope_yarn( if (ext_factor != 0.0f) { ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - } - // Get n-d magnitude scaling corrected for interpolation - if (freq_scale < 1.0f) + // Get n-d magnitude scaling corrected for interpolation mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } *cos_theta = cosf(theta) * mscale; *sin_theta = sinf(theta) * mscale; } diff --git a/ggml.c b/ggml.c index a243418106ee3..111ee3e56fe75 100644 --- a/ggml.c +++ b/ggml.c @@ -13345,7 +13345,7 @@ static void ggml_compute_forward_clamp( // ggml_compute_forward_rope static float rope_yarn_ramp(const float low, const float high, const int i0) { - const float y = (i0 / 2 - low) / MIN(0.001f, high - low); + const float y = (i0 / 2 - low) / MAX(0.001f, high - low); return 1 - MIN(1, MAX(0, y)); } @@ -13361,11 +13361,10 @@ static void rope_yarn( if (ext_factor != 0.0f) { float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - } - // Get n-d magnitude scaling corrected for interpolation - if (freq_scale < 1.0f) + // Get n-d magnitude scaling corrected for interpolation mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } *cos_theta = cosf(theta) * mscale; *sin_theta = sinf(theta) * mscale; } diff --git a/llama.cpp b/llama.cpp index cbab5f580969c..01e219a48e523 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1113,6 +1113,7 @@ struct llama_cparams { float rope_freq_base; float rope_freq_scale; + uint32_t n_yarn_orig_ctx; // These hyperparameters are not exposed in GGUF, because all // existing YaRN models use the same values for them. 
float yarn_ext_factor; @@ -3028,7 +3029,7 @@ static struct ggml_cgraph * llm_build_llama( const int32_t n_embd = hparams.n_embd; const int32_t n_layer = hparams.n_layer; const int32_t n_ctx = cparams.n_ctx; - const int32_t n_orig_ctx = hparams.n_yarn_orig_ctx; + const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; const int32_t n_head = hparams.n_head; const int32_t n_head_kv = hparams.n_head_kv; const int32_t n_embd_head = hparams.n_embd_head(); @@ -3430,7 +3431,7 @@ static struct ggml_cgraph * llm_build_baichaun( const int32_t n_embd = hparams.n_embd; const int32_t n_layer = hparams.n_layer; const int32_t n_ctx = cparams.n_ctx; - const int32_t n_orig_ctx = hparams.n_yarn_orig_ctx; + const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; const int32_t n_head = hparams.n_head; const int32_t n_head_kv = hparams.n_head_kv; const int32_t n_embd_head = hparams.n_embd_head(); @@ -4194,7 +4195,7 @@ static struct ggml_cgraph * llm_build_falcon( const int32_t n_embd = hparams.n_embd; const int32_t n_layer = hparams.n_layer; const int32_t n_ctx = cparams.n_ctx; - const int32_t n_orig_ctx = hparams.n_yarn_orig_ctx; + const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; const int32_t n_head = hparams.n_head; const int32_t n_head_kv = hparams.n_head_kv; const int32_t n_embd_head = hparams.n_embd_head(); @@ -4818,7 +4819,7 @@ static struct ggml_cgraph * llm_build_persimmon( const int64_t n_embd = hparams.n_embd; const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; - const int32_t n_orig_ctx = hparams.n_yarn_orig_ctx; + const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_head = hparams.n_head; const int64_t n_embd_head = hparams.n_embd_head(); @@ -8676,6 +8677,7 @@ struct llama_context * llama_new_context_with_model( cparams.mul_mat_q = params.mul_mat_q; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.n_yarn_orig_ctx = params.yarn_orig_ctx == 0 ? hparams.n_ctx_train : params.yarn_orig_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; diff --git a/llama.h b/llama.h index 48e12cfea79d5..5f6b14e192c7a 100644 --- a/llama.h +++ b/llama.h @@ -182,12 +182,13 @@ extern "C" { int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` // ref: https://github.com/ggerganov/llama.cpp/pull/2054 - float rope_freq_base; // RoPE base frequency, 0 = from model - float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model - float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model - float yarn_attn_factor; // YaRN magnitude scaling factor - float yarn_beta_fast; // YaRN low correction dim - float yarn_beta_slow; // YaRN high correction dim + float rope_freq_base; // RoPE base frequency, 0 = from model + float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model + float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model + float yarn_attn_factor; // YaRN magnitude scaling factor + float yarn_beta_fast; // YaRN low correction dim + float yarn_beta_slow; // YaRN high correction dim + uint32_t yarn_orig_ctx; // YaRN original context size // Keep the booleans together to avoid misalignment during copy-by-value. 
bool mul_mat_q; // if true, use experimental mul_mat_q kernels From 9fc823826e815f0fba3544f0818326fb0045bfb7 Mon Sep 17 00:00:00 2001 From: Jeffrey Quesnelle Date: Mon, 30 Oct 2023 08:35:51 -0700 Subject: [PATCH 27/28] fix loading rope.scaling.original_context_length from GGUF (#3) Co-authored-by: cebtenzzre --- llama.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index e0794f90cac32..a28c6f9601ae2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8968,10 +8968,13 @@ struct llama_context * llama_new_context_with_model( cparams.mul_mat_q = params.mul_mat_q; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.n_yarn_orig_ctx = params.yarn_orig_ctx == 0 ? hparams.n_ctx_train : params.yarn_orig_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx : + hparams.n_ctx_train; + auto rope_scaling_type = params.rope_scaling_type; if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { rope_scaling_type = hparams.rope_scaling_type_train; From 15f26efdb129c8ed116c58bb276897c760a8bbe2 Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Wed, 1 Nov 2023 16:44:49 -0400 Subject: [PATCH 28/28] implement YaRN for GPT-NeoX RoPE --- ggml-cuda.cu | 81 ++++++++++++++++++++++++++++++------------------ ggml-metal.metal | 9 ++++-- ggml.c | 22 ++++++++++--- 3 files changed, 74 insertions(+), 38 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index a43a5873d4a2d..4c117344bc849 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4439,7 +4439,7 @@ static __device__ void rope_yarn( // rope == RoPE == rotary positional embedding template static __global__ void rope( - const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float theta_scale, + const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims ) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); @@ -4453,7 +4453,7 @@ static __global__ void rope( const int i2 = row/p_delta_rows; const int p = has_pos ? pos[i2] : 0; - const float theta_base = p*powf(theta_scale, col/2); + const float theta_base = p*powf(freq_base, -col/ncols); float cos_theta, sin_theta; rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); @@ -4466,8 +4466,10 @@ static __global__ void rope( } template -static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale) { +static __global__ void rope_neox( + const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + float ext_factor, float attn_factor, rope_corr_dims corr_dims +) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (col >= ncols) { @@ -4478,11 +4480,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in const int i = row*ncols + col/2; const int i2 = row/p_delta_rows; + // simplified from `(row * ncols + col) * (-1 / ncols)` + const float cur_rot = -col/ncols - row; + const int p = has_pos ? 
pos[i2] : 0; - const float p0 = p*freq_scale; - const float theta = p0*powf(theta_scale, col/2); - const float sin_theta = sinf(theta); - const float cos_theta = cosf(theta); + const float theta_base = p*powf(freq_base, cur_rot); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); const float x0 = x[i + 0]; const float x1 = x[i + ncols/2]; @@ -4491,8 +4496,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in dst[i + ncols/2] = x0*sin_theta + x1*cos_theta; } -static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale, const int n_ctx) { +static __global__ void rope_glm_f32( + const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + int n_ctx +) { const int col = blockDim.x*blockIdx.x + threadIdx.x; const int half_n_dims = ncols/4; @@ -4504,7 +4511,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol const int i = row*ncols + col; const int i2 = row/p_delta_rows; - const float col_theta_scale = powf(theta_scale, col); + const float col_theta_scale = powf(freq_base, -2.0f*col/ncols); // FIXME: this is likely wrong const int p = pos != nullptr ? pos[i2] : 0; @@ -5525,7 +5532,7 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const template static void rope_cuda( const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, - float theta_scale, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream ) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); @@ -5533,36 +5540,44 @@ static void rope_cuda( const dim3 block_nums(nrows, num_blocks_x, 1); if (pos == nullptr) { rope<<>>( - x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, ext_factor, attn_factor, corr_dims + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims ); } else { rope<<>>( - x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, ext_factor, attn_factor, corr_dims + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims ); } } template -static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale, cudaStream_t stream) { +static void rope_neox_cuda( + const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream +) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); if (pos == nullptr) { - rope_neox<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + rope_neox<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); } else { - rope_neox<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + rope_neox<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); } } 
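// A minimal, host-side C sketch of the YaRN angle mixing that the CUDA kernels in this hunk
// now share, included to make the ramp fix easier to follow: the max(0.001f, high - low) guard
// prevents a division blow-up when the two correction dims coincide, and the
// 1 + 0.1*ln(1/freq_scale) attention-magnitude correction is applied only when ext_factor is
// non-zero. The initialization of theta_interp and mscale is not visible in these hunks and is
// reconstructed here from the interpolation definition, so treat this as an illustration rather
// than the exact kernel code; the example values in main() are hypothetical.
#include <math.h>
#include <stdio.h>

static float yarn_ramp(float low, float high, int i0) {
    const float y = (i0 / 2 - low) / fmaxf(0.001f, high - low);
    return 1.0f - fminf(1.0f, fmaxf(0.0f, y));
}

static void yarn_mix(
    float theta_extrap, float freq_scale, const float corr_dims[2], int i0,
    float ext_factor, float attn_factor, float * cos_theta, float * sin_theta
) {
    float theta_interp = freq_scale * theta_extrap; // plain linear (PI) scaling
    float theta  = theta_interp;
    float mscale = attn_factor;
    if (ext_factor != 0.0f) {
        // inside the ramp, blend back towards the unscaled (extrapolated) angle
        const float ramp_mix = yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
        theta = theta_interp * (1.0f - ramp_mix) + theta_extrap * ramp_mix;
        // YaRN attention magnitude correction: 1 + 0.1*ln(1/freq_scale)
        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
    }
    *cos_theta = cosf(theta) * mscale;
    *sin_theta = sinf(theta) * mscale;
}

int main(void) {
    // hypothetical values: 4x context extension, correction dims spanning pairs 16..24
    const float corr_dims[2] = { 16.0f, 24.0f };
    float c, s;
    yarn_mix(/*theta_extrap=*/3.0f, /*freq_scale=*/0.25f, corr_dims, /*i0=*/40,
             /*ext_factor=*/1.0f, /*attn_factor=*/1.0f, &c, &s);
    printf("cos=%f sin=%f\n", c, s);
    return 0;
}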
-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) { +static void rope_glm_f32_cuda( + const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, int n_ctx, cudaStream_t stream +) { GGML_ASSERT(ncols % 4 == 0); const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1); const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; const dim3 block_nums(num_blocks_x, nrows, 1); - rope_glm_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx); + rope_glm_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx); } static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, @@ -6425,8 +6440,6 @@ inline void ggml_cuda_op_rope( memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - const float theta_scale = powf(freq_base, -2.0f/n_dims); - const int32_t * pos = nullptr; if ((mode & 1) == 0) { GGML_ASSERT(src1->type == GGML_TYPE_I32); @@ -6437,31 +6450,37 @@ inline void ggml_cuda_op_rope( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + rope_corr_dims corr_dims; + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); + // compute if (is_glm) { GGML_ASSERT(false); - rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream); + rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream); } else if (is_neox) { GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet"); if (src0->type == GGML_TYPE_F32) { - rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + rope_neox_cuda( + (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); } else if (src0->type == GGML_TYPE_F16) { - rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + rope_neox_cuda( + (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); } else { GGML_ASSERT(false); } } else { - rope_corr_dims corr_dims; - ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); - if (src0->type == GGML_TYPE_F32) { rope_cuda( - (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, ext_factor, + (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, attn_factor, corr_dims, main_stream ); } else if (src0->type == GGML_TYPE_F16) { rope_cuda( - (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, ext_factor, + (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, attn_factor, corr_dims, main_stream ); } else { diff --git a/ggml-metal.metal b/ggml-metal.metal index ddfe378131ce6..791b0c0c47de9 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1125,9 +1125,12 @@ kernel void kernel_rope( for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { - const float 
theta = theta_0 * pow(freq_base, inv_ndims*ic - ib); - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + // simplified from `(ib * n_dims + ic) * inv_ndims` + const float cur_rot = inv_ndims*ic - ib; + + const float theta = theta_0 * pow(freq_base, cur_rot); + float cos_theta, sin_theta; + rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); const int64_t i0 = ib*n_dims + ic/2; diff --git a/ggml.c b/ggml.c index 0034067d55a3f..2c6555e15964c 100644 --- a/ggml.c +++ b/ggml.c @@ -13486,6 +13486,7 @@ static void ggml_compute_forward_rope_f32( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); @@ -13556,8 +13557,14 @@ static void ggml_compute_forward_rope_f32( theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; + + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); theta_base *= theta_scale; @@ -13628,6 +13635,7 @@ static void ggml_compute_forward_rope_f16( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); @@ -13694,8 +13702,14 @@ static void ggml_compute_forward_rope_f16( theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; + + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); theta_base *= theta_scale;
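// The `cur_rot` introduced for the NeoX path above is just an algebraic rewrite of the
// per-element exponent: with inv_ndims = -1/n_dims,
//   (ib*n_dims + ic) * inv_ndims = -ib - ic/n_dims = inv_ndims*ic - ib.
// A small standalone C check of that identity (the n_dims value below is an arbitrary
// example, not taken from the patch):
#include <math.h>
#include <stdio.h>

int main(void) {
    const int   n_dims    = 128;
    const float inv_ndims = -1.f/n_dims;
    for (int ib = 0; ib < 4; ++ib) {
        for (int ic = 0; ic < n_dims; ic += 2) {
            const float full       = (ib*n_dims + ic) * inv_ndims;
            const float simplified = inv_ndims*ic - ib;
            if (fabsf(full - simplified) > 1e-5f) {
                printf("mismatch at ib=%d ic=%d\n", ib, ic);
                return 1;
            }
        }
    }
    printf("cur_rot simplification holds\n");
    return 0;
}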