This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Ported quantize.cpp #84

Merged 22 commits on Apr 25, 2023
Changes from 1 commit
feat(quantize): rewrite to use ggml-format
philpax committed Apr 25, 2023
commit 6f86e32d8c5c90cd9640f9f201edcdbe37d93492
13 changes: 13 additions & 0 deletions ggml-format/src/loader.rs
@@ -68,6 +68,19 @@ impl TensorInfo {
    pub fn calc_size(&self) -> usize {
        data_size(self.element_type, self.dims().iter().product())
    }
+
+    /// Reads the tensor's data from the given reader in an owned fashion.
+    ///
+    /// The behaviour is undefined if the reader does not correspond to this info.
+    ///
+    /// Do not use this if loading with `mmap`.
+    pub fn read_data<R: BufRead + Seek>(&self, reader: &mut R) -> std::io::Result<Vec<u8>> {
+        let n_bytes = self.n_elements * ggml::type_size(self.element_type);
+        let mut data = vec![0; n_bytes];
+        reader.seek(SeekFrom::Start(self.start_offset))?;
+        reader.read_exact(&mut data)?;
+        Ok(data)
+    }
}

/// Returns the size occupied by a tensor's data in bytes given the element type and number of elements.
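As a quick usage sketch (not part of the diff): the new `read_data` helper pairs naturally with a buffered file reader. The import path and reader setup below are assumptions for illustration.

```rust
use std::{fs::File, io::BufReader};

use ggml_format::TensorInfo; // assumed import path

/// Sketch: read a single tensor's bytes out of a model file on disk.
/// `info` must describe a tensor in that same file, per the safety note
/// on `read_data` above.
fn load_tensor_bytes(info: &TensorInfo, path: &str) -> std::io::Result<Vec<u8>> {
    let mut reader = BufReader::new(File::open(path)?);
    // Seeks to the tensor's start offset, then reads it into an owned Vec.
    info.read_data(&mut reader)
}
```

The tests.rs change below demonstrates the in-memory variant of the same call via `std::io::Cursor`.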
26 changes: 9 additions & 17 deletions ggml-format/src/tests.rs
@@ -162,23 +162,15 @@ impl LoadHandler<DummyError> for MockLoadHandler<'_> {
    }

    fn tensor_buffer(&mut self, info: TensorInfo) -> Result<(), DummyError> {
-        self.loaded_model.tensors.insert(
-            info.name,
-            TensorData {
-                n_dims: info.n_dims,
-                dims: info.dims,
-                element_type: info.element_type,
-                data: {
-                    let n_bytes = info.n_elements * ggml::type_size(info.element_type);
-                    let mut data = vec![0; n_bytes];
-                    data.copy_from_slice(
-                        &self.data
-                            [info.start_offset as usize..info.start_offset as usize + n_bytes],
-                    );
-                    data
-                },
-            },
-        );
+        let data = TensorData {
+            n_dims: info.n_dims,
+            dims: info.dims,
+            element_type: info.element_type,
+            data: info
+                .read_data(&mut std::io::Cursor::new(self.data))
+                .unwrap(),
+        };
+        self.loaded_model.tensors.insert(info.name, data);
        Ok(())
    }
}
37 changes: 0 additions & 37 deletions ggml-format/src/util.rs
@@ -48,43 +48,6 @@ pub fn write_f32(writer: &mut dyn Write, value: f32) -> Result<(), std::io::Error> {
    writer.write_all(&value.to_le_bytes())
}

-/// Read and write a `i32` from a reader to a writer.
-pub fn rw_i32(reader: &mut impl BufRead, writer: &mut impl Write) -> Result<i32, std::io::Error> {
-    Ok(i32::from_le_bytes(rw::<4>(reader, writer)?))
-}
-
-/// Read and write a `u32` from a reader to a writer.
-pub fn rw_u32(reader: &mut impl BufRead, writer: &mut impl Write) -> Result<u32, std::io::Error> {
-    Ok(u32::from_le_bytes(rw::<4>(reader, writer)?))
-}
-
-/// Read and write a `f32` from a reader to a writer.
-pub fn rw_f32(reader: &mut impl BufRead, writer: &mut impl Write) -> Result<f32, std::io::Error> {
-    Ok(f32::from_le_bytes(rw::<4>(reader, writer)?))
-}
-
-/// Read and write a variable-length array of bytes from a reader to a writer.
-pub fn rw_bytes_with_len(
-    reader: &mut impl BufRead,
-    writer: &mut impl Write,
-    len: usize,
-) -> Result<Vec<u8>, std::io::Error> {
-    let mut buf = vec![0; len];
-    reader.read_exact(&mut buf)?;
-    writer.write_all(&buf)?;
-    Ok(buf)
-}
-
-/// Read and write a fixed-size array of bytes from a reader to a writer.
-fn rw<const N: usize>(
-    reader: &mut impl BufRead,
-    writer: &mut impl Write,
-) -> Result<[u8; N], std::io::Error> {
-    let bytes: [u8; N] = read_bytes(reader)?;
-    writer.write_all(&bytes)?;
-    Ok(bytes)
-}
-
// NOTE: Implementation from #![feature(buf_read_has_data_left)]
/// Check if there is any data left in the reader.
pub fn has_data_left(reader: &mut impl BufRead) -> Result<bool, std::io::Error> {
20 changes: 10 additions & 10 deletions ggml/src/lib.rs
@@ -697,17 +697,17 @@ fn i64_to_usize(val: i64) -> usize {
/// You must ensure the arrays passed in are of the correct size.
pub unsafe fn quantize_q4_0(
    src: &[f32],
-    dst: &mut [f32],
-    n: i32,
-    k: i32,
+    dst: &mut [u8],
+    n: usize,
+    k: usize,
    hist: &mut [i64],
) -> usize {
    unsafe {
        ggml_sys::ggml_quantize_q4_0(
            src.as_ptr(),
            dst.as_mut_ptr() as *mut c_void,
-            n,
-            k,
+            n.try_into().unwrap(),
+            k.try_into().unwrap(),
            hist.as_mut_ptr(),
        )
    }
@@ -720,17 +720,17 @@ pub unsafe fn quantize_q4_0(
/// You must ensure the arrays passed in are of the correct size.
pub unsafe fn quantize_q4_1(
    src: &[f32],
-    dst: &mut [f32],
-    n: i32,
-    k: i32,
+    dst: &mut [u8],
+    n: usize,
+    k: usize,
    hist: &mut [i64],
) -> usize {
    unsafe {
        ggml_sys::ggml_quantize_q4_1(
            src.as_ptr(),
            dst.as_mut_ptr() as *mut c_void,
-            n,
-            k,
+            n.try_into().unwrap(),
+            k.try_into().unwrap(),
            hist.as_mut_ptr(),
        )
    }
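For review context, here is a sketch of a call site under the new `&mut [u8]`/`usize` signature. The buffer and histogram sizes are assumptions (Q4_0 output is block-packed and smaller than the f32 input), not values taken from this PR.

```rust
fn main() {
    // Sketch: quantize 1024 f32 values laid out as a single 1024-wide row.
    let src = vec![0.5f32; 1024];
    // Conservative assumption: the packed Q4_0 output never exceeds the
    // input's byte length.
    let mut dst = vec![0u8; src.len() * std::mem::size_of::<f32>()];
    // Assumption: 16 histogram buckets, one per possible 4-bit value.
    let mut hist = [0i64; 16];

    // Safety: the slices must be sized per the function's contract above.
    let bytes_written =
        unsafe { ggml::quantize_q4_0(&src, &mut dst, src.len(), src.len(), &mut hist) };
    println!("quantized into {bytes_written} bytes");
}
```

Taking `dst` as `&mut [u8]` matches the byte-oriented output, and `usize` keeps the safe Rust surface free of C integer widths; the `try_into().unwrap()` conversions happen once at the FFI boundary.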
43 changes: 31 additions & 12 deletions llama-cli/src/cli_args.rs
Original file line number Diff line number Diff line change
@@ -379,18 +379,6 @@ pub struct Convert {
    #[arg(long, short = 't', value_enum, default_value_t = FileType::Q4_0)]
    pub file_type: FileType,
}
-
-#[derive(Parser, Debug)]
-pub struct Quantize {
-    /// The path to the model to quantize
-    #[arg()]
-    pub source: PathBuf,
-
-    /// The path to save the quantized model to
-    #[arg()]
-    pub destination: PathBuf,
-}
-
#[derive(Parser, Debug, ValueEnum, Clone, Copy)]
pub enum FileType {
    /// Quantized 4-bit (type 0).
@@ -412,3 +400,34 @@ impl From<FileType> for llama_rs::FileType {
        }
    }
}
+
+#[derive(Parser, Debug)]
+pub struct Quantize {
+    /// The path to the model to quantize
+    #[arg()]
+    pub source: PathBuf,
+
+    /// The path to save the quantized model to
+    #[arg()]
+    pub destination: PathBuf,
+
+    /// The format to convert to
+    pub target: QuantizationTarget,
+}
+
+#[derive(Parser, Debug, ValueEnum, Clone, Copy)]
+#[clap(rename_all = "snake_case")]
+pub enum QuantizationTarget {
+    /// Quantized 4-bit (type 0).
+    Q4_0,
+    /// Quantized 4-bit (type 1).
+    Q4_1,
+}
+impl From<QuantizationTarget> for llama_rs::ElementType {
+    fn from(t: QuantizationTarget) -> Self {
+        match t {
+            QuantizationTarget::Q4_0 => llama_rs::ElementType::Q4_0,
+            QuantizationTarget::Q4_1 => llama_rs::ElementType::Q4_1,
+        }
+    }
+}
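A sketch of how the new positional argument should parse, assuming the top-level `cli_args::Args` derives `Parser` with a `quantize` subcommand as used in `main.rs` below; the model paths are placeholders.

```rust
use clap::Parser;
use cli_args::{Args, QuantizationTarget};

fn main() {
    // Placeholder paths; `q4_1` exercises the snake_case ValueEnum rename.
    let args = Args::parse_from([
        "llama-cli",
        "quantize",
        "models/ggml-model-f16.bin",
        "models/ggml-model-q4_1.bin",
        "q4_1",
    ]);
    // Expect Args::Quantize with target Q4_1.
    assert!(matches!(
        args,
        Args::Quantize(q) if matches!(q.target, QuantizationTarget::Q4_1)
    ));
}
```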
42 changes: 34 additions & 8 deletions llama-cli/src/main.rs
@@ -2,7 +2,7 @@ use std::{convert::Infallible, io::Write};

use clap::Parser;
use cli_args::Args;
-use color_eyre::eyre::Result;
+use color_eyre::eyre::{Context, Result};
use llama_rs::{convert::convert_pth_to_ggml, InferenceError};
use rustyline::error::ReadlineError;

@@ -23,7 +23,7 @@ fn main() -> Result<()> {
        Args::Repl(args) => interactive(&args, false)?,
        Args::ChatExperimental(args) => interactive(&args, true)?,
        Args::Convert(args) => convert_pth_to_ggml(&args.directory, args.file_type.into()),
-        Args::Quantize(args) => quantize(&args),
+        Args::Quantize(args) => quantize(&args)?,
    }

    Ok(())
@@ -185,16 +185,42 @@ fn interactive(
    Ok(())
}

-fn quantize(args: &cli_args::Quantize) {
-    llama_rs::quantize::quantize(
+fn quantize(args: &cli_args::Quantize) -> Result<()> {
+    use llama_rs::quantize::{quantize, QuantizeProgress::*};
+    quantize(
        &args.source,
        &args.destination,
-        llama_rs::ElementType::Q4_0,
-        |p| {
-            println!("{p:?}");
+        args.target.into(),
+        |progress| match progress {
+            HyperparametersLoaded(_) => log::info!("Loaded hyperparameters"),
+            TensorLoading {
+                name,
+                dims,
+                element_type,
+                n_elements,
+            } => log::info!(
+                "Loading tensor `{name}` ({n_elements} ({dims:?}) {element_type} elements)"
+            ),
+            TensorQuantizing { name } => log::info!("Quantizing tensor `{name}`"),
+            TensorQuantized {
+                name,
+                original_size,
+                reduced_size,
+                history,
+            } => log::info!(
+                "Quantized tensor `{name}` from {original_size} to {reduced_size} bytes ({history:?})"
+            ),
+            TensorSkipped { name, size } => log::info!("Skipped tensor `{name}` ({size} bytes)"),
+            Finished {
+                original_size,
+                reduced_size,
+                history,
+            } => log::info!(
+                "Finished quantization from {original_size} to {reduced_size} bytes ({history:?})"
+            ),
        },
    )
-    .unwrap();
+    .wrap_err("failed to quantize model")
}

fn load_prompt_file_with_prompt(
24 changes: 10 additions & 14 deletions llama-rs/src/loader2.rs
@@ -72,8 +72,7 @@ pub(crate) fn load(
        total_parts: 1,
    });

-    let mut loader = Loader::new(n_context_tokens, prefer_mmap, load_progress_callback);
-    let use_mmap = loader.mmap_active();
+    let mut loader = Loader::new(n_context_tokens, load_progress_callback);

    ggml_format::load_model(&mut reader, &mut loader)
        .map_err(|err| LoadError::from_format_error(err, path.clone()))?;
@@ -83,12 +82,15 @@
        vocabulary,
        tensors,
        mut load_progress_callback,
+        container_type,
        ..
    } = loader;

    let Hyperparameters { n_embd, n_mult, .. } = hyperparameters;
    let n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult;

+    let use_mmap = prefer_mmap && container_type.support_mmap();
+
    let ctx_size = tensors
        .values()
        .map(|ti| {
@@ -192,23 +194,21 @@
    Ok(model)
}

-struct Loader<F: FnMut(LoadProgress)> {
+pub(crate) struct Loader<F: FnMut(LoadProgress)> {
    // Input
    n_ctx: usize,
-    prefer_mmap: bool,
    load_progress_callback: F,

    // Output
-    container_type: ContainerType,
-    hyperparameters: Hyperparameters,
-    vocabulary: Vocabulary,
-    tensors: HashMap<String, TensorInfo>,
+    pub(crate) container_type: ContainerType,
+    pub(crate) hyperparameters: Hyperparameters,
+    pub(crate) vocabulary: Vocabulary,
+    pub(crate) tensors: HashMap<String, TensorInfo>,
}
impl<F: FnMut(LoadProgress)> Loader<F> {
-    fn new(n_ctx: usize, prefer_mmap: bool, load_progress_callback: F) -> Self {
+    pub(crate) fn new(n_ctx: usize, load_progress_callback: F) -> Self {
        Self {
            n_ctx,
-            prefer_mmap,
            load_progress_callback,

            container_type: ContainerType::Ggjt,
@@ -217,10 +217,6 @@ impl<F: FnMut(LoadProgress)> Loader<F> {
            tensors: HashMap::default(),
        }
    }
-
-    fn mmap_active(&mut self) -> bool {
-        self.prefer_mmap && self.container_type.support_mmap()
-    }
}
impl<F: FnMut(LoadProgress)> ggml_format::LoadHandler<LoadError> for Loader<F> {
    fn container_type(&mut self, container_type: ContainerType) -> Result<(), LoadError> {