This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Ported quantize.cpp #84

Merged 22 commits on Apr 25, 2023
Changes from 1 commit
feat(quantize): rewrite to use ggml-format
philpax committed Apr 25, 2023
commit 6f86e32d8c5c90cd9640f9f201edcdbe37d93492
13 changes: 13 additions & 0 deletions ggml-format/src/loader.rs
@@ -68,6 +68,19 @@ impl TensorInfo {
    pub fn calc_size(&self) -> usize {
        data_size(self.element_type, self.dims().iter().product())
    }
+
+    /// Reads the tensor's data from the given reader in an owned fashion.
+    ///
+    /// The behaviour is undefined if the reader does not correspond to this info.
+    ///
+    /// Do not use this if loading with `mmap`.
+    pub fn read_data<R: BufRead + Seek>(&self, reader: &mut R) -> std::io::Result<Vec<u8>> {
+        let n_bytes = self.n_elements * ggml::type_size(self.element_type);
+        let mut data = vec![0; n_bytes];
+        reader.seek(SeekFrom::Start(self.start_offset))?;
+        reader.read_exact(&mut data)?;
+        Ok(data)
+    }
}

/// Returns the size occupied by a tensor's data in bytes given the element type and number of elements.
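As a quick usage sketch (not part of the diff): the new `read_data` helper pairs naturally with a buffered file reader. The import path and reader setup below are assumptions for illustration.

```rust
use std::{fs::File, io::BufReader};

use ggml_format::TensorInfo; // assumed import path

/// Sketch: read a single tensor's bytes out of a model file on disk.
/// `info` must describe a tensor in that same file, per the safety note
/// on `read_data` above.
fn load_tensor_bytes(info: &TensorInfo, path: &str) -> std::io::Result<Vec<u8>> {
    let mut reader = BufReader::new(File::open(path)?);
    // Seeks to the tensor's start offset, then reads it into an owned Vec.
    info.read_data(&mut reader)
}
```

The tests.rs change below demonstrates the in-memory variant of the same call via `std::io::Cursor`.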
26 changes: 9 additions & 17 deletions ggml-format/src/tests.rs
@@ -162,23 +162,15 @@ impl LoadHandler<DummyError> for MockLoadHandler<'_> {
    }

    fn tensor_buffer(&mut self, info: TensorInfo) -> Result<(), DummyError> {
-        self.loaded_model.tensors.insert(
-            info.name,
-            TensorData {
-                n_dims: info.n_dims,
-                dims: info.dims,
-                element_type: info.element_type,
-                data: {
-                    let n_bytes = info.n_elements * ggml::type_size(info.element_type);
-                    let mut data = vec![0; n_bytes];
-                    data.copy_from_slice(
-                        &self.data
-                            [info.start_offset as usize..info.start_offset as usize + n_bytes],
-                    );
-                    data
-                },
-            },
-        );
+        let data = TensorData {
+            n_dims: info.n_dims,
+            dims: info.dims,
+            element_type: info.element_type,
+            data: info
+                .read_data(&mut std::io::Cursor::new(self.data))
+                .unwrap(),
+        };
+        self.loaded_model.tensors.insert(info.name, data);
        Ok(())
    }
}
37 changes: 0 additions & 37 deletions ggml-format/src/util.rs
@@ -48,43 +48,6 @@ pub fn write_f32(writer: &mut dyn Write, value: f32) -> Result<(), std::io::Error> {
    writer.write_all(&value.to_le_bytes())
}

-/// Read and write a `i32` from a reader to a writer.
-pub fn rw_i32(reader: &mut impl BufRead, writer: &mut impl Write) -> Result<i32, std::io::Error> {
-    Ok(i32::from_le_bytes(rw::<4>(reader, writer)?))
-}
-
-/// Read and write a `u32` from a reader to a writer.
-pub fn rw_u32(reader: &mut impl BufRead, writer: &mut impl Write) -> Result<u32, std::io::Error> {
-    Ok(u32::from_le_bytes(rw::<4>(reader, writer)?))
-}
-
-/// Read and write a `f32` from a reader to a writer.
-pub fn rw_f32(reader: &mut impl BufRead, writer: &mut impl Write) -> Result<f32, std::io::Error> {
-    Ok(f32::from_le_bytes(rw::<4>(reader, writer)?))
-}
-
-/// Read and write a variable-length array of bytes from a reader to a writer.
-pub fn rw_bytes_with_len(
-    reader: &mut impl BufRead,
-    writer: &mut impl Write,
-    len: usize,
-) -> Result<Vec<u8>, std::io::Error> {
-    let mut buf = vec![0; len];
-    reader.read_exact(&mut buf)?;
-    writer.write_all(&buf)?;
-    Ok(buf)
-}
-
-/// Read and write a fixed-size array of bytes from a reader to a writer.
-fn rw<const N: usize>(
-    reader: &mut impl BufRead,
-    writer: &mut impl Write,
-) -> Result<[u8; N], std::io::Error> {
-    let bytes: [u8; N] = read_bytes(reader)?;
-    writer.write_all(&bytes)?;
-    Ok(bytes)
-}
-
// NOTE: Implementation from #![feature(buf_read_has_data_left)]
/// Check if there is any data left in the reader.
pub fn has_data_left(reader: &mut impl BufRead) -> Result<bool, std::io::Error> {
20 changes: 10 additions & 10 deletions ggml/src/lib.rs
@@ -697,17 +697,17 @@ fn i64_to_usize(val: i64) -> usize {
/// You must ensure the arrays passed in are of the correct size.
pub unsafe fn quantize_q4_0(
    src: &[f32],
-    dst: &mut [f32],
-    n: i32,
-    k: i32,
+    dst: &mut [u8],
+    n: usize,
+    k: usize,
    hist: &mut [i64],
) -> usize {
    unsafe {
        ggml_sys::ggml_quantize_q4_0(
            src.as_ptr(),
            dst.as_mut_ptr() as *mut c_void,
-            n,
-            k,
+            n.try_into().unwrap(),
+            k.try_into().unwrap(),
            hist.as_mut_ptr(),
        )
    }
@@ -720,17 +720,17 @@ pub unsafe fn quantize_q4_0(
/// You must ensure the arrays passed in are of the correct size.
pub unsafe fn quantize_q4_1(
    src: &[f32],
-    dst: &mut [f32],
-    n: i32,
-    k: i32,
+    dst: &mut [u8],
+    n: usize,
+    k: usize,
    hist: &mut [i64],
) -> usize {
    unsafe {
        ggml_sys::ggml_quantize_q4_1(
            src.as_ptr(),
            dst.as_mut_ptr() as *mut c_void,
-            n,
-            k,
+            n.try_into().unwrap(),
+            k.try_into().unwrap(),
            hist.as_mut_ptr(),
        )
    }
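For review context, here is a sketch of a call site under the new `&mut [u8]`/`usize` signature. The buffer and histogram sizes are assumptions (Q4_0 output is block-packed and smaller than the f32 input), not values taken from this PR.

```rust
fn main() {
    // Sketch: quantize 1024 f32 values laid out as a single 1024-wide row.
    let src = vec![0.5f32; 1024];
    // Conservative assumption: the packed Q4_0 output never exceeds the
    // input's byte length.
    let mut dst = vec![0u8; src.len() * std::mem::size_of::<f32>()];
    // Assumption: 16 histogram buckets, one per possible 4-bit value.
    let mut hist = [0i64; 16];

    // Safety: the slices must be sized per the function's contract above.
    let bytes_written =
        unsafe { ggml::quantize_q4_0(&src, &mut dst, src.len(), src.len(), &mut hist) };
    println!("quantized into {bytes_written} bytes");
}
```

Taking `dst` as `&mut [u8]` matches the byte-oriented output, and `usize` keeps the safe Rust surface free of C integer widths; the `try_into().unwrap()` conversions happen once at the FFI boundary.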
43 changes: 31 additions & 12 deletions llama-cli/src/cli_args.rs
Original file line number Diff line number Diff line change
@@ -379,18 +379,6 @@ pub struct Convert {
    #[arg(long, short = 't', value_enum, default_value_t = FileType::Q4_0)]
    pub file_type: FileType,
}
-
-#[derive(Parser, Debug)]
-pub struct Quantize {
-    /// The path to the model to quantize
-    #[arg()]
-    pub source: PathBuf,
-
-    /// The path to save the quantized model to
-    #[arg()]
-    pub destination: PathBuf,
-}
-
#[derive(Parser, Debug, ValueEnum, Clone, Copy)]
pub enum FileType {
    /// Quantized 4-bit (type 0).
@@ -412,3 +400,34 @@ impl From<FileType> for llama_rs::FileType {
        }
    }
}
+
+#[derive(Parser, Debug)]
+pub struct Quantize {
+    /// The path to the model to quantize
+    #[arg()]
+    pub source: PathBuf,
+
+    /// The path to save the quantized model to
+    #[arg()]
+    pub destination: PathBuf,
+
+    /// The format to convert to
+    pub target: QuantizationTarget,
+}
+
+#[derive(Parser, Debug, ValueEnum, Clone, Copy)]
+#[clap(rename_all = "snake_case")]
+pub enum QuantizationTarget {
+    /// Quantized 4-bit (type 0).
+    Q4_0,
+    /// Quantized 4-bit (type 1).
+    Q4_1,
+}
+impl From<QuantizationTarget> for llama_rs::ElementType {
+    fn from(t: QuantizationTarget) -> Self {
+        match t {
+            QuantizationTarget::Q4_0 => llama_rs::ElementType::Q4_0,
+            QuantizationTarget::Q4_1 => llama_rs::ElementType::Q4_1,
+        }
+    }
+}
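A sketch of how the new positional argument should parse, assuming the top-level `cli_args::Args` derives `Parser` with a `quantize` subcommand as used in `main.rs` below; the model paths are placeholders.

```rust
use clap::Parser;
use cli_args::{Args, QuantizationTarget};

fn main() {
    // Placeholder paths; `q4_1` exercises the snake_case ValueEnum rename.
    let args = Args::parse_from([
        "llama-cli",
        "quantize",
        "models/ggml-model-f16.bin",
        "models/ggml-model-q4_1.bin",
        "q4_1",
    ]);
    // Expect Args::Quantize with target Q4_1.
    assert!(matches!(
        args,
        Args::Quantize(q) if matches!(q.target, QuantizationTarget::Q4_1)
    ));
}
```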
42 changes: 34 additions & 8 deletions llama-cli/src/main.rs
@@ -2,7 +2,7 @@ use std::{convert::Infallible, io::Write};

use clap::Parser;
use cli_args::Args;
-use color_eyre::eyre::Result;
+use color_eyre::eyre::{Context, Result};
use llama_rs::{convert::convert_pth_to_ggml, InferenceError};
use rustyline::error::ReadlineError;

@@ -23,7 +23,7 @@ fn main() -> Result<()> {
        Args::Repl(args) => interactive(&args, false)?,
        Args::ChatExperimental(args) => interactive(&args, true)?,
        Args::Convert(args) => convert_pth_to_ggml(&args.directory, args.file_type.into()),
-        Args::Quantize(args) => quantize(&args),
+        Args::Quantize(args) => quantize(&args)?,
    }

    Ok(())
@@ -185,16 +185,42 @@ fn interactive(
    Ok(())
}

-fn quantize(args: &cli_args::Quantize) {
-    llama_rs::quantize::quantize(
+fn quantize(args: &cli_args::Quantize) -> Result<()> {
+    use llama_rs::quantize::{quantize, QuantizeProgress::*};
+    quantize(
        &args.source,
        &args.destination,
-        llama_rs::ElementType::Q4_0,
-        |p| {
-            println!("{p:?}");
+        args.target.into(),
+        |progress| match progress {
+            HyperparametersLoaded(_) => log::info!("Loaded hyperparameters"),
+            TensorLoading {
+                name,
+                dims,
+                element_type,
+                n_elements,
+            } => log::info!(
+                "Loading tensor `{name}` ({n_elements} ({dims:?}) {element_type} elements)"
+            ),
+            TensorQuantizing { name } => log::info!("Quantizing tensor `{name}`"),
+            TensorQuantized {
+                name,
+                original_size,
+                reduced_size,
+                history,
+            } => log::info!(
+                "Quantized tensor `{name}` from {original_size} to {reduced_size} bytes ({history:?})"
+            ),
+            TensorSkipped { name, size } => log::info!("Skipped tensor `{name}` ({size} bytes)"),
+            Finished {
+                original_size,
+                reduced_size,
+                history,
+            } => log::info!(
+                "Finished quantization from {original_size} to {reduced_size} bytes ({history:?})"
+            ),
        },
    )
-    .unwrap();
+    .wrap_err("failed to quantize model")
}

fn load_prompt_file_with_prompt(
24 changes: 10 additions & 14 deletions llama-rs/src/loader2.rs
@@ -72,8 +72,7 @@ pub(crate) fn load(
        total_parts: 1,
    });

-    let mut loader = Loader::new(n_context_tokens, prefer_mmap, load_progress_callback);
-    let use_mmap = loader.mmap_active();
+    let mut loader = Loader::new(n_context_tokens, load_progress_callback);

    ggml_format::load_model(&mut reader, &mut loader)
        .map_err(|err| LoadError::from_format_error(err, path.clone()))?;
@@ -83,12 +82,15 @@
        vocabulary,
        tensors,
        mut load_progress_callback,
+        container_type,
        ..
    } = loader;

    let Hyperparameters { n_embd, n_mult, .. } = hyperparameters;
    let n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult;

+    let use_mmap = prefer_mmap && container_type.support_mmap();
+
    let ctx_size = tensors
        .values()
        .map(|ti| {
@@ -192,23 +194,21 @@
    Ok(model)
}

-struct Loader<F: FnMut(LoadProgress)> {
+pub(crate) struct Loader<F: FnMut(LoadProgress)> {
    // Input
    n_ctx: usize,
-    prefer_mmap: bool,
    load_progress_callback: F,

    // Output
-    container_type: ContainerType,
-    hyperparameters: Hyperparameters,
-    vocabulary: Vocabulary,
-    tensors: HashMap<String, TensorInfo>,
+    pub(crate) container_type: ContainerType,
+    pub(crate) hyperparameters: Hyperparameters,
+    pub(crate) vocabulary: Vocabulary,
+    pub(crate) tensors: HashMap<String, TensorInfo>,
}
impl<F: FnMut(LoadProgress)> Loader<F> {
-    fn new(n_ctx: usize, prefer_mmap: bool, load_progress_callback: F) -> Self {
+    pub(crate) fn new(n_ctx: usize, load_progress_callback: F) -> Self {
        Self {
            n_ctx,
-            prefer_mmap,
            load_progress_callback,

            container_type: ContainerType::Ggjt,
@@ -217,10 +217,6 @@ impl<F: FnMut(LoadProgress)> Loader<F> {
            tensors: HashMap::default(),
        }
    }
-
-    fn mmap_active(&mut self) -> bool {
-        self.prefer_mmap && self.container_type.support_mmap()
-    }
}
impl<F: FnMut(LoadProgress)> ggml_format::LoadHandler<LoadError> for Loader<F> {
    fn container_type(&mut self, container_type: ContainerType) -> Result<(), LoadError> {