Q4_0 scale selection using RMSE #835

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Q4_0 scale selection using RMSE
sw committed Apr 8, 2023
commit 40ebf819b0fd367e65c97c6d9cef3863dd54f882
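
For orientation: the change adds an "rmse" quantization implementation that, per 32-value Q4_0 block, tries a range of candidate scale factors and keeps the one whose quantize/dequantize round trip has the lowest RMSE (the committed scale.py below sweeps divisors from -10.0 to -4.0 in steps of 0.1). Below is a minimal NumPy sketch of that per-block selection; the function name, candidate grid, and rounding/clamping details are illustrative assumptions, not the actual ggml.c code.

import numpy as np

def q4_0_block_scale_rmse(block, divisors=np.arange(-10.0, -3.9, 0.1)):
    # block: 32 float weights. Try each candidate divisor of the block's
    # signed absolute maximum and keep the scale with the lowest
    # round-trip RMSE.
    amax = block[np.argmax(np.abs(block))]
    best = (np.inf, None, None)
    for k in divisors:
        d = amax / k                                   # candidate scale
        if d == 0.0:
            continue                                   # all-zero block
        q = np.clip(np.round(block / d) + 8, 0, 15)    # 4-bit codes in [0, 15]
        err = np.sqrt(np.mean(((q - 8) * d - block) ** 2))
        if err < best[0]:
            best = (err, d, q.astype(np.uint8))
    return best  # (rmse, scale, codes)
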
2 changes: 1 addition & 1 deletion Makefile
@@ -133,7 +133,7 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )

-default: main quantize perplexity embedding
+default: main quantize quantize-stats perplexity embedding

#
# Build library
4 changes: 4 additions & 0 deletions SHA256SUMS
@@ -1,7 +1,11 @@
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
0cc0b0a3dc8cd29f005946f8364ac2bbce797e792a40c0fb4114615e4f825976 models/7B/ggml-model-f16.bin
5dec1979849d73e361a8bcc10bc8f53237cbbe435a572882dc87629e011e24b3 models/7B/ggml-model-q4_0.bin
Collaborator commented:

Could you please remove the quantized models, since everyone will have their own unique quantized models.

sw (Collaborator, Author) commented Apr 9, 2023:

The idea is that model generation is deterministic across platforms and SIMD optimizations, so the files should be identical. Of course, if you keep your Q4_0 files without updating to minor version 1, this won't match. I might remove it for this PR, but in the long term I think it's a good idea to ensure everyone uses the same inputs.
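
(For anyone reproducing the comparison: the committed SHA256SUMS file can be checked with sha256sum -c SHA256SUMS, or with a short Python sketch like the one below; the helper names are illustrative and not part of this PR.)

import hashlib
import os

def sha256_of(path, chunk=1 << 20):
    # Stream the file so multi-gigabyte model files are not read into memory at once.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk), b""):
            h.update(block)
    return h.hexdigest()

def verify_sums(sums_path="SHA256SUMS"):
    # Each line is "<sha256>  <path>"; skip files that are not present locally.
    for line in open(sums_path):
        expected, path = line.split(maxsplit=1)
        path = path.strip()
        if not os.path.exists(path):
            continue
        print(("OK       " if sha256_of(path) == expected else "MISMATCH ") + path)
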

Collaborator commented:

OK, I have generated a new quantized model and the checksum matches yours.

ivanstepanovftw (Collaborator) commented Apr 9, 2023:

Sorry, is this checksum for the q4_0 model that has no minor version yet?

Edit: Oh, I see, it is for minor v1. 4 bytes longer than the previous version 😅

7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
7da75a2a164a8fb4cfbdd4823111f3545c690c5d75c345a2419a9f1e2d24080f models/13B/ggml-model-f16.bin
4c5a285985bac6b8dcc56a97752b8ab70687ce0584daa6bb418ee458d91126e8 models/13B/ggml-model-q4_0.bin
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth
103 changes: 61 additions & 42 deletions examples/quantize-stats/quantize-stats.cpp
@@ -17,12 +17,15 @@
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");

+static const char * impl_strs[] = { "simd", "reference", "rmse" };
+static_assert(sizeof(impl_strs) == GGML_QUANTIZE_IMPL_COUNT * sizeof(char *), "Incomplete implementation list");
+
struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.bin";
bool verbose = false;
bool per_layer_stats = false;
bool print_histogram = false;
-bool reference = false;
+std::vector<ggml_quantize_impl_t> include_impl;
std::vector<std::string> include_layers;
std::vector<std::string> exclude_layers;
std::vector<enum ggml_type> include_types;
@@ -48,8 +51,8 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
-fprintf(stderr, " -r, --reference\n");
-fprintf(stderr, " use reference implementation (default: false)\n");
+fprintf(stderr, " -i, --implementation\n");
+fprintf(stderr, " select implementation (simd, reference, rmse)\n");
fprintf(stderr, " -v, --verbose\n");
fprintf(stderr, " verbose output (default: false)\n");
fprintf(stderr, " -p, --per-layer-stats\n");
@@ -104,11 +107,12 @@ double find_quantile(const error_stats & stats, double quantile) {
return INFINITY;
}

-void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+void print_error_stats(const std::string & name, ggml_quantize_impl_t impl, const error_stats & stats, bool print_histogram) {
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
double median = find_quantile(stats, .5);
double pct95 = find_quantile(stats, .95);
-printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
+printf("%-4s %-10s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n",
+ name.c_str(), impl_strs[impl], rmse, stats.max_error, pct95, median);
if (print_histogram) {
printf("Error distribution:\n");
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
@@ -136,7 +140,7 @@ void test_roundtrip_on_layer(
std::string & name,
bool print_layer_stats,
const quantize_fns_t & qfns,
-bool use_reference,
+ggml_quantize_impl_t impl,
const ggml_tensor * layer,
float * input_scratch,
char *quantized_scratch,
@@ -158,11 +162,7 @@ void test_roundtrip_on_layer(
input_scratch = ggml_get_data_f32(layer) + offset;
}

-if (use_reference) {
-qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
-} else {
-qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
-}
+qfns.quantize_row_q[impl](input_scratch, quantized_scratch, chunk_size);
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);

update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
@@ -171,7 +171,7 @@ void test_roundtrip_on_layer(
}
}
if (print_layer_stats) {
-print_error_stats(name, layer_error, false);
+print_error_stats(name, impl, layer_error, false);
}
}

@@ -190,8 +190,21 @@ int main(int argc, char ** argv) {
if (arg == "-h" || arg == "--help") {
quantize_stats_print_usage(argc, argv);
exit(0);
-} else if (arg == "-r" || arg == "--reference") {
-params.reference = true;
+} else if (arg == "-i" || arg == "--implementation") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+int j;
+for (j = 0; j < GGML_QUANTIZE_IMPL_COUNT && strcmp(argv[i], impl_strs[j]) != 0; j++) {
+// find match
+}
+if (j < GGML_QUANTIZE_IMPL_COUNT) {
+params.include_impl.push_back((ggml_quantize_impl_t)j);
+} else {
+fprintf(stderr, "error: %s not in list of implementations\n", argv[i]);
+invalid_param = true;
+}
} else if (arg == "-v") {
ivanstepanovftw (Collaborator) commented Apr 9, 2023:

Could you please add || --verbose?

params.verbose = true;
} else if (arg == "-p" || arg == "--per-layer-stats") {
@@ -302,42 +315,48 @@ int main(int argc, char ** argv) {
std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
std::vector<float> output_scratch(SCRATCH_ELEMENTS);

-// loop throught quantization types
-for (int i = 0; i < GGML_TYPE_COUNT; i++) {
-if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
+// loop through quantization types
+for (int type = 0; type < GGML_TYPE_COUNT; type++) {
+if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), type) == params.include_types.end()) {
continue;
}
-quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
-if (params.verbose) {
-printf("testing %s ...\n", type_strs[i]);
-}
-
-error_stats global_stats {};
-
-for (const auto& kv_tensor : tensors_sorted) {
-if (!layer_included(params, kv_tensor.first)) {
+for (int impl = 0; impl < GGML_QUANTIZE_IMPL_COUNT; impl++) {
+if (!params.include_impl.empty() && std::find(params.include_impl.begin(), params.include_impl.end(), impl) == params.include_impl.end()) {
continue;
}
+
if (params.verbose) {
-printf(" %s ...\n", kv_tensor.first.c_str());
+printf("testing %s %s ...\n", type_strs[type], impl_strs[impl]);
}
-std::string layer_name { type_strs[i] };
-layer_name += "::" + kv_tensor.first;
-test_roundtrip_on_layer(
-layer_name,
-params.per_layer_stats,
-qfns,
-params.reference,
-kv_tensor.second,
-input_scratch.data(),
-quantized_scratch.data(),
-output_scratch.data(),
-global_stats
-);
-}

-print_error_stats(type_strs[i], global_stats, params.print_histogram);
+error_stats global_stats {};
+
+for (const auto& kv_tensor : tensors_sorted) {
+if (!layer_included(params, kv_tensor.first)) {
+continue;
+}
+if (params.verbose) {
+printf(" %s ...\n", kv_tensor.first.c_str());
+}
+std::string layer_name { type_strs[type] };
+layer_name += "::" + kv_tensor.first;
+test_roundtrip_on_layer(
+layer_name,
+params.per_layer_stats,
+qfns,
+(ggml_quantize_impl_t)impl,
+kv_tensor.second,
+input_scratch.data(),
+quantized_scratch.data(),
+output_scratch.data(),
+global_stats
+);
+}
+
+print_error_stats(type_strs[type], (ggml_quantize_impl_t)impl, global_stats, params.print_histogram);
+}
}
}

76 changes: 76 additions & 0 deletions examples/quantize/scale.py
@@ -0,0 +1,76 @@
import matplotlib.pyplot as plt

# Generated by quantizing the entire 7B model with the first element of each tuple as the scale factor.
# The second element of the tuple is the number of q4_0 blocks for which that scale factor has lowest RMSE.
data = (
(-10.0, 0),
(-9.9, 1),
(-9.8, 3),
(-9.7, 65),
(-9.6, 738),
(-9.5, 5779),
(-9.4, 30880),
(-9.3, 121078),
(-9.2, 375674),
(-9.1, 941350),
(-9.0, 1990278),
(-8.9, 3635317),
(-8.8, 5891752),
(-8.7, 8678748),
(-8.6, 11771759),
(-8.5, 14873993),
(-8.4, 17594260),
(-8.3, 19553100),
(-8.2, 20415428),
(-8.1, 20017134),
(-8.0, 18357204),
(-7.9, 15597612),
(-7.8, 11993688),
(-7.7, 7842970),
(-7.6, 2880878),
(-7.5, 3478),
(-7.4, 2648437),
(-7.3, 5641970),
(-7.2, 5935890),
(-7.1, 4910790),
(-7.0, 3425891),
(-6.9, 2068250),
(-6.8, 1089883),
(-6.7, 502462),
(-6.6, 156356),
(-6.5, 205),
(-6.4, 163500),
(-6.3, 386291),
(-6.2, 423018),
(-6.1, 319360),
(-6.0, 180783),
(-5.9, 78822),
(-5.8, 28254),
(-5.7, 8698),
(-5.6, 1969),
(-5.5, 0),
(-5.4, 2069),
(-5.3, 5722),
(-5.2, 7107),
(-5.1, 5113),
(-5.0, 2332),
(-4.9, 636),
(-4.8, 130),
(-4.7, 12),
(-4.6, 1),
(-4.5, 0),
(-4.4, 3),
(-4.3, 4),
(-4.2, 8),
(-4.1, 8),
(-4.0, 27),
)
x, y = zip(*data)

fig, ax = plt.subplots()
b = ax.bar(x, y, 0.1, bottom=1)
ax.set_yscale("log")
ax.set_xlabel("scale")
ax.set_ylabel("N")
plt.title("Quantization scale factor with lowest RMS error")
plt.show()
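
The (scale, count) tuples above were gathered by the C code while quantizing the full 7B model; a rough, vectorized NumPy sketch of the same tally is shown below. The helper is hypothetical, and for a real 7B weight array it should be applied in chunks of blocks to keep memory bounded.

import numpy as np

def tally_best_divisors(weights, divisors=np.round(np.arange(-10.0, -3.95, 0.1), 1), block=32):
    # weights: 1-D float array; returns {divisor: number of Q4_0 blocks for which
    # that divisor gives the lowest round-trip RMSE}.
    w = weights[: weights.size // block * block].reshape(-1, block)      # (B, 32)
    idx = np.abs(w).argmax(axis=1)
    amax = w[np.arange(len(w)), idx][:, None]                            # signed per-block max, (B, 1)
    d = amax / divisors                                                  # candidate scales, (B, D)
    d = np.where(d == 0.0, np.finfo(np.float32).tiny, d)                 # guard all-zero blocks
    q = np.clip(np.round(w[:, None, :] / d[:, :, None]) + 8, 0, 15)      # codes, (B, D, 32)
    err = np.sqrt(np.mean(((q - 8) * d[:, :, None] - w[:, None, :]) ** 2, axis=2))
    winners = divisors[err.argmin(axis=1)]
    vals, counts = np.unique(winners, return_counts=True)
    return dict(zip(vals.tolist(), counts.tolist()))
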