Q4_0 scale selection using RMSE #835

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Q4_0 scale selection using RMSE
sw committed Apr 8, 2023
commit 40ebf819b0fd367e65c97c6d9cef3863dd54f882
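
For orientation: the change adds an "rmse" quantization implementation that, per 32-value Q4_0 block, tries a range of candidate scale factors and keeps the one whose quantize/dequantize round trip has the lowest RMSE (the committed scale.py below sweeps divisors from -10.0 to -4.0 in steps of 0.1). Below is a minimal NumPy sketch of that per-block selection; the function name, candidate grid, and rounding/clamping details are illustrative assumptions, not the actual ggml.c code.

import numpy as np

def q4_0_block_scale_rmse(block, divisors=np.arange(-10.0, -3.9, 0.1)):
    # block: 32 float weights. Try each candidate divisor of the block's
    # signed absolute maximum and keep the scale with the lowest
    # round-trip RMSE.
    amax = block[np.argmax(np.abs(block))]
    best = (np.inf, None, None)
    for k in divisors:
        d = amax / k                                   # candidate scale
        if d == 0.0:
            continue                                   # all-zero block
        q = np.clip(np.round(block / d) + 8, 0, 15)    # 4-bit codes in [0, 15]
        err = np.sqrt(np.mean(((q - 8) * d - block) ** 2))
        if err < best[0]:
            best = (err, d, q.astype(np.uint8))
    return best  # (rmse, scale, codes)
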
2 changes: 1 addition & 1 deletion Makefile
@@ -133,7 +133,7 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )

-default: main quantize perplexity embedding
+default: main quantize quantize-stats perplexity embedding

#
# Build library
4 changes: 4 additions & 0 deletions SHA256SUMS
@@ -1,7 +1,11 @@
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
0cc0b0a3dc8cd29f005946f8364ac2bbce797e792a40c0fb4114615e4f825976 models/7B/ggml-model-f16.bin
5dec1979849d73e361a8bcc10bc8f53237cbbe435a572882dc87629e011e24b3 models/7B/ggml-model-q4_0.bin
Collaborator commented:

Could you please remove the quantized models, since everyone will have their own unique quantized models.

sw (Collaborator, Author) commented Apr 9, 2023:

The idea is that model generation is deterministic across platforms and SIMD optimizations, so the files should be identical. Of course, if you keep your Q4_0 files without updating to minor version 1, this won't match. I might remove it for this PR, but in the long term I think it's a good idea to ensure everyone uses the same inputs.
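
(For anyone reproducing the comparison: the committed SHA256SUMS file can be checked with sha256sum -c SHA256SUMS, or with a short Python sketch like the one below; the helper names are illustrative and not part of this PR.)

import hashlib
import os

def sha256_of(path, chunk=1 << 20):
    # Stream the file so multi-gigabyte model files are not read into memory at once.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk), b""):
            h.update(block)
    return h.hexdigest()

def verify_sums(sums_path="SHA256SUMS"):
    # Each line is "<sha256>  <path>"; skip files that are not present locally.
    for line in open(sums_path):
        expected, path = line.split(maxsplit=1)
        path = path.strip()
        if not os.path.exists(path):
            continue
        print(("OK       " if sha256_of(path) == expected else "MISMATCH ") + path)
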

Collaborator commented:

OK, I have generated a new quantized model and the checksum matches yours.

ivanstepanovftw (Collaborator) commented Apr 9, 2023:

Sorry, is this checksum for the q4_0 model that has no minor version yet?

Edit: Oh, I see, it is for minor v1. 4 bytes longer than the previous version 😅

7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
7da75a2a164a8fb4cfbdd4823111f3545c690c5d75c345a2419a9f1e2d24080f models/13B/ggml-model-f16.bin
4c5a285985bac6b8dcc56a97752b8ab70687ce0584daa6bb418ee458d91126e8 models/13B/ggml-model-q4_0.bin
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth
103 changes: 61 additions & 42 deletions examples/quantize-stats/quantize-stats.cpp
@@ -17,12 +17,15 @@
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");

+static const char * impl_strs[] = { "simd", "reference", "rmse" };
+static_assert(sizeof(impl_strs) == GGML_QUANTIZE_IMPL_COUNT * sizeof(char *), "Incomplete implementation list");
+
struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.bin";
bool verbose = false;
bool per_layer_stats = false;
bool print_histogram = false;
-bool reference = false;
+std::vector<ggml_quantize_impl_t> include_impl;
std::vector<std::string> include_layers;
std::vector<std::string> exclude_layers;
std::vector<enum ggml_type> include_types;
@@ -48,8 +51,8 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
-fprintf(stderr, " -r, --reference\n");
-fprintf(stderr, " use reference implementation (default: false)\n");
+fprintf(stderr, " -i, --implementation\n");
+fprintf(stderr, " select implementation (simd, reference, rmse)\n");
fprintf(stderr, " -v, --verbose\n");
fprintf(stderr, " verbose output (default: false)\n");
fprintf(stderr, " -p, --per-layer-stats\n");
@@ -104,11 +107,12 @@ double find_quantile(const error_stats & stats, double quantile) {
return INFINITY;
}

-void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+void print_error_stats(const std::string & name, ggml_quantize_impl_t impl, const error_stats & stats, bool print_histogram) {
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
double median = find_quantile(stats, .5);
double pct95 = find_quantile(stats, .95);
-printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
+printf("%-4s %-10s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n",
+ name.c_str(), impl_strs[impl], rmse, stats.max_error, pct95, median);
if (print_histogram) {
printf("Error distribution:\n");
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
@@ -136,7 +140,7 @@ void test_roundtrip_on_layer(
std::string & name,
bool print_layer_stats,
const quantize_fns_t & qfns,
-bool use_reference,
+ggml_quantize_impl_t impl,
const ggml_tensor * layer,
float * input_scratch,
char *quantized_scratch,
@@ -158,11 +162,7 @@ void test_roundtrip_on_layer(
input_scratch = ggml_get_data_f32(layer) + offset;
}

-if (use_reference) {
-qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
-} else {
-qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
-}
+qfns.quantize_row_q[impl](input_scratch, quantized_scratch, chunk_size);
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);

update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
@@ -171,7 +171,7 @@ void test_roundtrip_on_layer(
}
}
if (print_layer_stats) {
-print_error_stats(name, layer_error, false);
+print_error_stats(name, impl, layer_error, false);
}
}

@@ -190,8 +190,21 @@ int main(int argc, char ** argv) {
if (arg == "-h" || arg == "--help") {
quantize_stats_print_usage(argc, argv);
exit(0);
-} else if (arg == "-r" || arg == "--reference") {
-params.reference = true;
+} else if (arg == "-i" || arg == "--implementation") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+int j;
+for (j = 0; j < GGML_QUANTIZE_IMPL_COUNT && strcmp(argv[i], impl_strs[j]) != 0; j++) {
+// find match
+}
+if (j < GGML_QUANTIZE_IMPL_COUNT) {
+params.include_impl.push_back((ggml_quantize_impl_t)j);
+} else {
+fprintf(stderr, "error: %s not in list of implementations\n", argv[i]);
+invalid_param = true;
+}
} else if (arg == "-v") {
ivanstepanovftw (Collaborator) commented Apr 9, 2023:

Could you please add || --verbose?

params.verbose = true;
} else if (arg == "-p" || arg == "--per-layer-stats") {
@@ -302,42 +315,48 @@ int main(int argc, char ** argv) {
std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
std::vector<float> output_scratch(SCRATCH_ELEMENTS);

-// loop throught quantization types
-for (int i = 0; i < GGML_TYPE_COUNT; i++) {
-if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
+// loop through quantization types
+for (int type = 0; type < GGML_TYPE_COUNT; type++) {
+if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), type) == params.include_types.end()) {
continue;
}
-quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
-if (params.verbose) {
-printf("testing %s ...\n", type_strs[i]);
-}
-
-error_stats global_stats {};
-
-for (const auto& kv_tensor : tensors_sorted) {
-if (!layer_included(params, kv_tensor.first)) {
+for (int impl = 0; impl < GGML_QUANTIZE_IMPL_COUNT; impl++) {
+if (!params.include_impl.empty() && std::find(params.include_impl.begin(), params.include_impl.end(), impl) == params.include_impl.end()) {
continue;
}
+
if (params.verbose) {
-printf(" %s ...\n", kv_tensor.first.c_str());
+printf("testing %s %s ...\n", type_strs[type], impl_strs[impl]);
}
-std::string layer_name { type_strs[i] };
-layer_name += "::" + kv_tensor.first;
-test_roundtrip_on_layer(
-layer_name,
-params.per_layer_stats,
-qfns,
-params.reference,
-kv_tensor.second,
-input_scratch.data(),
-quantized_scratch.data(),
-output_scratch.data(),
-global_stats
-);
-}

-print_error_stats(type_strs[i], global_stats, params.print_histogram);
+error_stats global_stats {};
+
+for (const auto& kv_tensor : tensors_sorted) {
+if (!layer_included(params, kv_tensor.first)) {
+continue;
+}
+if (params.verbose) {
+printf(" %s ...\n", kv_tensor.first.c_str());
+}
+std::string layer_name { type_strs[type] };
+layer_name += "::" + kv_tensor.first;
+test_roundtrip_on_layer(
+layer_name,
+params.per_layer_stats,
+qfns,
+(ggml_quantize_impl_t)impl,
+kv_tensor.second,
+input_scratch.data(),
+quantized_scratch.data(),
+output_scratch.data(),
+global_stats
+);
+}
+
+print_error_stats(type_strs[type], (ggml_quantize_impl_t)impl, global_stats, params.print_histogram);
+}
}
}

76 changes: 76 additions & 0 deletions examples/quantize/scale.py
@@ -0,0 +1,76 @@
import matplotlib.pyplot as plt

# Generated by quantizing the entire 7B model with the first element of each tuple as the scale factor.
# The second element of the tuple is the number of q4_0 blocks for which that scale factor has lowest RMSE.
data = (
(-10.0, 0),
(-9.9, 1),
(-9.8, 3),
(-9.7, 65),
(-9.6, 738),
(-9.5, 5779),
(-9.4, 30880),
(-9.3, 121078),
(-9.2, 375674),
(-9.1, 941350),
(-9.0, 1990278),
(-8.9, 3635317),
(-8.8, 5891752),
(-8.7, 8678748),
(-8.6, 11771759),
(-8.5, 14873993),
(-8.4, 17594260),
(-8.3, 19553100),
(-8.2, 20415428),
(-8.1, 20017134),
(-8.0, 18357204),
(-7.9, 15597612),
(-7.8, 11993688),
(-7.7, 7842970),
(-7.6, 2880878),
(-7.5, 3478),
(-7.4, 2648437),
(-7.3, 5641970),
(-7.2, 5935890),
(-7.1, 4910790),
(-7.0, 3425891),
(-6.9, 2068250),
(-6.8, 1089883),
(-6.7, 502462),
(-6.6, 156356),
(-6.5, 205),
(-6.4, 163500),
(-6.3, 386291),
(-6.2, 423018),
(-6.1, 319360),
(-6.0, 180783),
(-5.9, 78822),
(-5.8, 28254),
(-5.7, 8698),
(-5.6, 1969),
(-5.5, 0),
(-5.4, 2069),
(-5.3, 5722),
(-5.2, 7107),
(-5.1, 5113),
(-5.0, 2332),
(-4.9, 636),
(-4.8, 130),
(-4.7, 12),
(-4.6, 1),
(-4.5, 0),
(-4.4, 3),
(-4.3, 4),
(-4.2, 8),
(-4.1, 8),
(-4.0, 27),
)
x, y = zip(*data)

fig, ax = plt.subplots()
b = ax.bar(x, y, 0.1, bottom=1)
ax.set_yscale("log")
ax.set_xlabel("scale")
ax.set_ylabel("N")
plt.title("Quantization scale factor with lowest RMS error")
plt.show()
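
The (scale, count) tuples above were gathered by the C code while quantizing the full 7B model; a rough, vectorized NumPy sketch of the same tally is shown below. The helper is hypothetical, and for a real 7B weight array it should be applied in chunks of blocks to keep memory bounded.

import numpy as np

def tally_best_divisors(weights, divisors=np.round(np.arange(-10.0, -3.95, 0.1), 1), block=32):
    # weights: 1-D float array; returns {divisor: number of Q4_0 blocks for which
    # that divisor gives the lowest round-trip RMSE}.
    w = weights[: weights.size // block * block].reshape(-1, block)      # (B, 32)
    idx = np.abs(w).argmax(axis=1)
    amax = w[np.arange(len(w)), idx][:, None]                            # signed per-block max, (B, 1)
    d = amax / divisors                                                  # candidate scales, (B, D)
    d = np.where(d == 0.0, np.finfo(np.float32).tiny, d)                 # guard all-zero blocks
    q = np.clip(np.round(w[:, None, :] / d[:, :, None]) + 8, 0, 15)      # codes, (B, D, 32)
    err = np.sqrt(np.mean(((q - 8) * d[:, :, None] - w[:, None, :]) ** 2, axis=2))
    winners = divisors[err.argmin(axis=1)]
    vals, counts = np.unique(winners, return_counts=True)
    return dict(zip(vals.tolist(), counts.tolist()))
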