This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Ported quantize.cpp #84

Merged 22 commits on Apr 25, 2023
Changes from 5 commits
16 changes: 16 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

114 changes: 112 additions & 2 deletions ggml-raw/ggml/ggml.c
@@ -397,6 +397,53 @@ static inline __m128i packNibbles( __m256i bytes )
}
#endif

// method 5
// blocks of QK elements
// represented with a single float (delta) and QK/2 8-bit ints (i.e. QK 4-bit signed integer factors)

// reference implementation for deterministic creation of model files
static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) {
assert(k % QK == 0);
const int nb = k / QK;

const size_t bs = sizeof(float) + QK/2;

uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));

uint8_t pp[QK/2];

for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max

for (int l = 0; l < QK; l++) {
const float v = x[i*QK + l];
amax = MAX(amax, fabsf(v));
}

const float d = amax / ((1 << 3) - 1);
const float id = d ? 1.0f/d : 0.0f;

*(float *)pd = d;
pd += bs;

for (int l = 0; l < QK; l += 2) {
const float v0 = x[i*QK + l + 0]*id;
const float v1 = x[i*QK + l + 1]*id;

const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;

assert(vi0 < 16);
assert(vi1 < 16);

pp[l/2] = vi0 | (vi1 << 4);
}

memcpy(pb, pp, sizeof(pp));
pb += bs;
}
}

// method 5
// blocks of QK elements
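
As a side note for readers of the port, the per-block math above boils down to the short Rust sketch below. This is a hypothetical helper assuming QK = 32, not code from this PR; the PR keeps this logic on the C side.

// Quantize one block of QK floats into a Q4_0 block:
// an f32 delta plus QK/2 bytes of packed nibbles.
const QK: usize = 32;

fn quantize_block_q4_0(x: &[f32; QK]) -> (f32, [u8; QK / 2]) {
    // The delta is chosen so the largest-magnitude value maps to +/-7.
    let amax = x.iter().fold(0.0f32, |m, v| m.max(v.abs()));
    let d = amax / 7.0;
    let id = if d != 0.0 { 1.0 / d } else { 0.0 };

    let mut packed = [0u8; QK / 2];
    for l in (0..QK).step_by(2) {
        // Scale into [-7, 7], round, shift into [1, 15], and pack two values per byte.
        let v0 = ((x[l] * id).round() as i8 + 8) as u8;
        let v1 = ((x[l + 1] * id).round() as i8 + 8) as u8;
        packed[l / 2] = v0 | (v1 << 4);
    }
    (d, packed)
}
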
@@ -640,8 +687,8 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
const uint8_t vi0 = round(v0);
const uint8_t vi1 = round(v1);

-assert(vi0 >= 0 && vi0 < 16);
-assert(vi1 >= 0 && vi1 < 16);
+assert(vi0 < 16);
+assert(vi1 < 16);

pp[l/2] = vi0 | (vi1 << 4);
}
@@ -10630,6 +10677,69 @@ enum ggml_opt_result ggml_opt(
return result;
}

////////////////////////////////////////////////////////////////////////////////

size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs;

assert(k % qk == 0);

char * pdst = (char *) dst;

for (int j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));

quantize_row_q4_0_reference(src + j, pd, k);

for (int i = 0; i < nb; i++) {
for (int l = 0; l < qk; l += 2) {
const uint8_t vi0 = pb[l/2] & 0xF;
const uint8_t vi1 = pb[l/2] >> 4;

hist[vi0]++;
hist[vi1]++;
}
pb += bs;
}
}

return (n/k)*row_size;
}

size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs;

assert(k % qk == 0);

char * pdst = (char *) dst;

for (int j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));

quantize_row_q4_1(src + j, pd, k);

for (int i = 0; i < nb; i++) {
for (int l = 0; l < qk; l += 2) {
const uint8_t vi0 = pb[l/2] & 0xF;
const uint8_t vi1 = pb[l/2] >> 4;

hist[vi0]++;
hist[vi1]++;
}
pb += bs;
}
}

return (n/k)*row_size;
}


////////////////////////////////////////////////////////////////////////////////

int ggml_cpu_has_avx(void) {
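
The sizing arithmetic in ggml_quantize_q4_0 above (bs, row_size, and the returned byte count) can be restated as a small Rust helper. This is only a sketch with hypothetical names that mirrors the C code shown:

// Bytes occupied by one quantized Q4_0 row of k values in blocks of qk:
// one f32 delta plus qk/2 packed bytes per block (the C code's bs and row_size).
fn q4_0_row_size(k: usize, qk: usize) -> usize {
    assert!(k % qk == 0);
    let nb = k / qk;
    nb * (std::mem::size_of::<f32>() + qk / 2)
}

// Total bytes written for n values processed as rows of length k,
// i.e. the value ggml_quantize_q4_0 returns.
fn q4_0_quantized_size(n: usize, k: usize, qk: usize) -> usize {
    (n / k) * q4_0_row_size(k, qk)
}

For example, with k = 4096 and qk = 32 a row packs into 128 blocks of 20 bytes each, so 2,560 bytes instead of the 16,384 bytes of the original f32 row.
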
8 changes: 8 additions & 0 deletions ggml-raw/ggml/ggml.h
@@ -741,6 +741,14 @@ enum ggml_opt_result ggml_opt(
struct ggml_opt_params params,
struct ggml_tensor * f);

//
// quantization
//

size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);


//
// system info
//
18 changes: 18 additions & 0 deletions ggml-raw/src/lib.rs
@@ -228,4 +228,22 @@ extern "C" {
pub fn ggml_build_forward_expand(cgraph: *mut ggml_cgraph, tensor: *mut ggml_tensor);

pub fn ggml_graph_compute(ctx: *mut ggml_context, cgraph: *mut ggml_cgraph);

pub fn ggml_quantize_q4_0(
src: *mut f32,
work: *mut c_void,
n: i32,
k: i32,
qk: i32,
hist: *mut i64,
) -> usize;

pub fn ggml_quantize_q4_1(
src: *mut f32,
work: *mut c_void,
n: i32,
k: i32,
qk: i32,
hist: *mut i64,
) -> usize;
}
1 change: 1 addition & 0 deletions llama-rs/Cargo.toml
@@ -8,6 +8,7 @@ rust-version = "1.65"

[dependencies]
bytemuck = "1.13.1"
half = "2.2.1"
ggml-raw = { path = "../ggml-raw" }
partial_sort = "0.2.0"
thiserror = "1.0"
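
The new half dependency is presumably used to read tensors stored as f16 and widen them to f32 before quantizing. That use is not visible in this diff; a minimal conversion sketch with the half crate might look like the following (assumed usage, hypothetical helper name):

use half::f16;

// Widen a little-endian f16 byte buffer into f32 values.
fn f16_bytes_to_f32(raw: &[u8]) -> Vec<f32> {
    raw.chunks_exact(2)
        .map(|b| f16::from_le_bytes([b[0], b[1]]).to_f32())
        .collect()
}
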
44 changes: 42 additions & 2 deletions llama-rs/src/ggml.rs
@@ -6,8 +6,8 @@ use std::{

pub use ggml_raw::ggml_type as Type;

-pub const FILE_MAGIC: i32 = 0x67676d66;
-pub const FILE_MAGIC_UNVERSIONED: i32 = 0x67676d6c;
+pub const FILE_MAGIC: u32 = 0x67676d66;
+pub const FILE_MAGIC_UNVERSIONED: u32 = 0x67676d6c;

pub const FORMAT_VERSION: u32 = 1;

@@ -291,3 +291,43 @@ pub fn type_sizef(x: ggml_raw::ggml_type) -> f64 {
pub fn blck_size(t: Type) -> i32 {
unsafe { ggml_raw::ggml_blck_size(t) }
}

pub fn quantize_q4_0(
src: &mut Vec<f32>,
work: &mut Vec<f32>,
n: i32,
k: i32,
qk: i32,
hist: &mut Vec<i64>,
) -> usize {
unsafe {
ggml_raw::ggml_quantize_q4_0(
src.as_mut_ptr(),
work.as_mut_ptr() as *mut c_void,
n,
k,
qk,
hist.as_mut_ptr(),
)
}
}

pub fn quantize_q4_1(
src: &mut Vec<f32>,
work: &mut Vec<f32>,
n: i32,
k: i32,
qk: i32,
hist: &mut Vec<i64>,
) -> usize {
unsafe {
ggml_raw::ggml_quantize_q4_1(
src.as_mut_ptr(),
work.as_mut_ptr() as *mut c_void,
n,
k,
qk,
hist.as_mut_ptr(),
)
}
}
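
A hypothetical call site for the wrapper above, shown only as a sketch (not code from this PR, and ignoring module visibility): quantize a buffer of n f32 values laid out as rows of length k, in blocks of qk.

// Returns the number of bytes written and the histogram of emitted 4-bit values.
fn quantize_example(mut src: Vec<f32>, k: i32, qk: i32) -> (usize, Vec<i64>) {
    let n = src.len() as i32;
    // The packed output is smaller than the f32 input, so an f32 work buffer
    // of the same length is always large enough.
    let mut work = vec![0.0f32; src.len()];
    // One histogram bucket per possible 4-bit value.
    let mut hist = vec![0i64; 16];
    let written = ggml::quantize_q4_0(&mut src, &mut work, n, k, qk, &mut hist);
    (written, hist)
}
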
11 changes: 10 additions & 1 deletion llama-rs/src/lib.rs
@@ -1,4 +1,5 @@
mod ggml;
mod quantize;

use std::{
collections::{HashMap, VecDeque},
@@ -14,6 +15,7 @@ use thiserror::Error;
use partial_sort::PartialSort;
use rand::{distributions::WeightedIndex, prelude::Distribution};

pub use quantize::llama_model_quantize;
pub const EOD_TOKEN_ID: TokenId = 2; // Hardcoded (for now?)

#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
@@ -345,6 +347,11 @@ pub enum LoadError {
source: std::io::Error,
path: PathBuf,
},
#[error("could not create file {path:?}")]
CreateFileFailed {
source: std::io::Error,
path: PathBuf,
},
#[error("no parent path for {path:?}")]
NoParentPath { path: PathBuf },
#[error("unable to read exactly {bytes} bytes")]
@@ -374,6 +381,8 @@
TensorWrongSize { tensor_name: String, path: PathBuf },
#[error("invalid ftype {ftype} in {path:?}")]
InvalidFtype { ftype: i32, path: PathBuf },
#[error("itype supplied was invalid: {0}")]
InvalidItype(u8),
}

#[derive(Error, Debug)]
@@ -465,7 +474,7 @@ impl Model {
}

// Verify magic
-let is_legacy_model: bool = match read_i32(&mut reader)? {
+let is_legacy_model: bool = match read_u32(&mut reader)? {
ggml::FILE_MAGIC => false,
ggml::FILE_MAGIC_UNVERSIONED => true,
_ => {
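
For reference, the read_u32 used in the magic check above is presumably a little-endian helper of the usual shape. The sketch below shows the expected behaviour; it is not the crate's actual code:

use std::io::{BufRead, Result};

// Read a little-endian u32 (for example the file magic) from the reader.
fn read_u32(reader: &mut impl BufRead) -> Result<u32> {
    let mut bytes = [0u8; 4];
    reader.read_exact(&mut bytes)?;
    Ok(u32::from_le_bytes(bytes))
}
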