This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Ported quantize.cpp #84

Merged 22 commits on Apr 25, 2023
Changes from 5 commits
16 changes: 16 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

114 changes: 112 additions & 2 deletions ggml-raw/ggml/ggml.c
@@ -397,6 +397,53 @@ static inline __m128i packNibbles( __m256i bytes )
}
#endif

// method 5
// blocks of QK elements
// represented with a single float (delta) and QK/2 8-bit ints (i.e. QK 4-bit signed integer factors)

// reference implementation for deterministic creation of model files
static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) {
assert(k % QK == 0);
const int nb = k / QK;

const size_t bs = sizeof(float) + QK/2;

uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));

uint8_t pp[QK/2];

for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max

for (int l = 0; l < QK; l++) {
const float v = x[i*QK + l];
amax = MAX(amax, fabsf(v));
}

const float d = amax / ((1 << 3) - 1);
const float id = d ? 1.0f/d : 0.0f;

*(float *)pd = d;
pd += bs;

for (int l = 0; l < QK; l += 2) {
const float v0 = x[i*QK + l + 0]*id;
const float v1 = x[i*QK + l + 1]*id;

const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;

assert(vi0 < 16);
assert(vi1 < 16);

pp[l/2] = vi0 | (vi1 << 4);
}

memcpy(pb, pp, sizeof(pp));
pb += bs;
}
}

// method 5
// blocks of QK elements
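
As a side note for readers of the port, the per-block math above boils down to the short Rust sketch below. This is a hypothetical helper assuming QK = 32, not code from this PR; the PR keeps this logic on the C side.

// Quantize one block of QK floats into a Q4_0 block:
// an f32 delta plus QK/2 bytes of packed nibbles.
const QK: usize = 32;

fn quantize_block_q4_0(x: &[f32; QK]) -> (f32, [u8; QK / 2]) {
    // The delta is chosen so the largest-magnitude value maps to +/-7.
    let amax = x.iter().fold(0.0f32, |m, v| m.max(v.abs()));
    let d = amax / 7.0;
    let id = if d != 0.0 { 1.0 / d } else { 0.0 };

    let mut packed = [0u8; QK / 2];
    for l in (0..QK).step_by(2) {
        // Scale into [-7, 7], round, shift into [1, 15], and pack two values per byte.
        let v0 = ((x[l] * id).round() as i8 + 8) as u8;
        let v1 = ((x[l + 1] * id).round() as i8 + 8) as u8;
        packed[l / 2] = v0 | (v1 << 4);
    }
    (d, packed)
}
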
@@ -640,8 +687,8 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
const uint8_t vi0 = round(v0);
const uint8_t vi1 = round(v1);

-assert(vi0 >= 0 && vi0 < 16);
-assert(vi1 >= 0 && vi1 < 16);
+assert(vi0 < 16);
+assert(vi1 < 16);

pp[l/2] = vi0 | (vi1 << 4);
}
@@ -10630,6 +10677,69 @@ enum ggml_opt_result ggml_opt(
return result;
}

////////////////////////////////////////////////////////////////////////////////

size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs;

assert(k % qk == 0);

char * pdst = (char *) dst;

for (int j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));

quantize_row_q4_0_reference(src + j, pd, k);

for (int i = 0; i < nb; i++) {
for (int l = 0; l < qk; l += 2) {
const uint8_t vi0 = pb[l/2] & 0xF;
const uint8_t vi1 = pb[l/2] >> 4;

hist[vi0]++;
hist[vi1]++;
}
pb += bs;
}
}

return (n/k)*row_size;
}

size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs;

assert(k % qk == 0);

char * pdst = (char *) dst;

for (int j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));

quantize_row_q4_1(src + j, pd, k);

for (int i = 0; i < nb; i++) {
for (int l = 0; l < qk; l += 2) {
const uint8_t vi0 = pb[l/2] & 0xF;
const uint8_t vi1 = pb[l/2] >> 4;

hist[vi0]++;
hist[vi1]++;
}
pb += bs;
}
}

return (n/k)*row_size;
}


////////////////////////////////////////////////////////////////////////////////

int ggml_cpu_has_avx(void) {
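
The sizing arithmetic in ggml_quantize_q4_0 above (bs, row_size, and the returned byte count) can be restated as a small Rust helper. This is only a sketch with hypothetical names that mirrors the C code shown:

// Bytes occupied by one quantized Q4_0 row of k values in blocks of qk:
// one f32 delta plus qk/2 packed bytes per block (the C code's bs and row_size).
fn q4_0_row_size(k: usize, qk: usize) -> usize {
    assert!(k % qk == 0);
    let nb = k / qk;
    nb * (std::mem::size_of::<f32>() + qk / 2)
}

// Total bytes written for n values processed as rows of length k,
// i.e. the value ggml_quantize_q4_0 returns.
fn q4_0_quantized_size(n: usize, k: usize, qk: usize) -> usize {
    (n / k) * q4_0_row_size(k, qk)
}

For example, with k = 4096 and qk = 32 a row packs into 128 blocks of 20 bytes each, so 2,560 bytes instead of the 16,384 bytes of the original f32 row.
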
8 changes: 8 additions & 0 deletions ggml-raw/ggml/ggml.h
@@ -741,6 +741,14 @@ enum ggml_opt_result ggml_opt(
struct ggml_opt_params params,
struct ggml_tensor * f);

//
// quantization
//

size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);


//
// system info
//
18 changes: 18 additions & 0 deletions ggml-raw/src/lib.rs
@@ -228,4 +228,22 @@ extern "C" {
pub fn ggml_build_forward_expand(cgraph: *mut ggml_cgraph, tensor: *mut ggml_tensor);

pub fn ggml_graph_compute(ctx: *mut ggml_context, cgraph: *mut ggml_cgraph);

pub fn ggml_quantize_q4_0(
src: *mut f32,
work: *mut c_void,
n: i32,
k: i32,
qk: i32,
hist: *mut i64,
) -> usize;

pub fn ggml_quantize_q4_1(
src: *mut f32,
work: *mut c_void,
n: i32,
k: i32,
qk: i32,
hist: *mut i64,
) -> usize;
}
1 change: 1 addition & 0 deletions llama-rs/Cargo.toml
@@ -8,6 +8,7 @@ rust-version = "1.65"

[dependencies]
bytemuck = "1.13.1"
half = "2.2.1"
ggml-raw = { path = "../ggml-raw" }
partial_sort = "0.2.0"
thiserror = "1.0"
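
The new half dependency is presumably used to read tensors stored as f16 and widen them to f32 before quantizing. That use is not visible in this diff; a minimal conversion sketch with the half crate might look like the following (assumed usage, hypothetical helper name):

use half::f16;

// Widen a little-endian f16 byte buffer into f32 values.
fn f16_bytes_to_f32(raw: &[u8]) -> Vec<f32> {
    raw.chunks_exact(2)
        .map(|b| f16::from_le_bytes([b[0], b[1]]).to_f32())
        .collect()
}
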
44 changes: 42 additions & 2 deletions llama-rs/src/ggml.rs
@@ -6,8 +6,8 @@ use std::{

pub use ggml_raw::ggml_type as Type;

-pub const FILE_MAGIC: i32 = 0x67676d66;
-pub const FILE_MAGIC_UNVERSIONED: i32 = 0x67676d6c;
+pub const FILE_MAGIC: u32 = 0x67676d66;
+pub const FILE_MAGIC_UNVERSIONED: u32 = 0x67676d6c;

pub const FORMAT_VERSION: u32 = 1;

@@ -291,3 +291,43 @@ pub fn type_sizef(x: ggml_raw::ggml_type) -> f64 {
pub fn blck_size(t: Type) -> i32 {
unsafe { ggml_raw::ggml_blck_size(t) }
}

pub fn quantize_q4_0(
src: &mut Vec<f32>,
work: &mut Vec<f32>,
n: i32,
k: i32,
qk: i32,
hist: &mut Vec<i64>,
) -> usize {
unsafe {
ggml_raw::ggml_quantize_q4_0(
src.as_mut_ptr(),
work.as_mut_ptr() as *mut c_void,
n,
k,
qk,
hist.as_mut_ptr(),
)
}
}

pub fn quantize_q4_1(
src: &mut Vec<f32>,
work: &mut Vec<f32>,
n: i32,
k: i32,
qk: i32,
hist: &mut Vec<i64>,
) -> usize {
unsafe {
ggml_raw::ggml_quantize_q4_1(
src.as_mut_ptr(),
work.as_mut_ptr() as *mut c_void,
n,
k,
qk,
hist.as_mut_ptr(),
)
}
}
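
A hypothetical call site for the wrapper above, shown only as a sketch (not code from this PR, and ignoring module visibility): quantize a buffer of n f32 values laid out as rows of length k, in blocks of qk.

// Returns the number of bytes written and the histogram of emitted 4-bit values.
fn quantize_example(mut src: Vec<f32>, k: i32, qk: i32) -> (usize, Vec<i64>) {
    let n = src.len() as i32;
    // The packed output is smaller than the f32 input, so an f32 work buffer
    // of the same length is always large enough.
    let mut work = vec![0.0f32; src.len()];
    // One histogram bucket per possible 4-bit value.
    let mut hist = vec![0i64; 16];
    let written = ggml::quantize_q4_0(&mut src, &mut work, n, k, qk, &mut hist);
    (written, hist)
}
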
11 changes: 10 additions & 1 deletion llama-rs/src/lib.rs
@@ -1,4 +1,5 @@
mod ggml;
mod quantize;

use std::{
collections::{HashMap, VecDeque},
@@ -14,6 +15,7 @@ use thiserror::Error;
use partial_sort::PartialSort;
use rand::{distributions::WeightedIndex, prelude::Distribution};

pub use quantize::llama_model_quantize;
pub const EOD_TOKEN_ID: TokenId = 2; // Hardcoded (for now?)

#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
@@ -345,6 +347,11 @@ pub enum LoadError {
source: std::io::Error,
path: PathBuf,
},
#[error("could not create file {path:?}")]
CreateFileFailed {
source: std::io::Error,
path: PathBuf,
},
#[error("no parent path for {path:?}")]
NoParentPath { path: PathBuf },
#[error("unable to read exactly {bytes} bytes")]
@@ -374,6 +381,8 @@
TensorWrongSize { tensor_name: String, path: PathBuf },
#[error("invalid ftype {ftype} in {path:?}")]
InvalidFtype { ftype: i32, path: PathBuf },
#[error("itype supplied was invalid: {0}")]
InvalidItype(u8),
}

#[derive(Error, Debug)]
@@ -465,7 +474,7 @@ impl Model {
}

// Verify magic
-let is_legacy_model: bool = match read_i32(&mut reader)? {
+let is_legacy_model: bool = match read_u32(&mut reader)? {
ggml::FILE_MAGIC => false,
ggml::FILE_MAGIC_UNVERSIONED => true,
_ => {
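
For reference, the read_u32 used in the magic check above is presumably a little-endian helper of the usual shape. The sketch below shows the expected behaviour; it is not the crate's actual code:

use std::io::{BufRead, Result};

// Read a little-endian u32 (for example the file magic) from the reader.
fn read_u32(reader: &mut impl BufRead) -> Result<u32> {
    let mut bytes = [0u8; 4];
    reader.read_exact(&mut bytes)?;
    Ok(u32::from_le_bytes(bytes))
}
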