-
Notifications
You must be signed in to change notification settings - Fork 9.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Q4_0 scale selection using RMSE #835
Draft
sw
wants to merge
3
commits into
ggerganov:master
Choose a base branch
from
sw:scale-rmse
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from 1 commit
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next
Next commit
Q4_0 scale selection using RMSE
- Loading branch information
commit 40ebf819b0fd367e65c97c6d9cef3863dd54f882
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,12 +17,15 @@ | |
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" }; | ||
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list"); | ||
|
||
static const char * impl_strs[] = { "simd", "reference", "rmse" }; | ||
static_assert(sizeof(impl_strs) == GGML_QUANTIZE_IMPL_COUNT * sizeof(char *), "Incomplete implementation list"); | ||
|
||
struct quantize_stats_params { | ||
std::string model = "models/7B/ggml-model-f16.bin"; | ||
bool verbose = false; | ||
bool per_layer_stats = false; | ||
bool print_histogram = false; | ||
bool reference = false; | ||
std::vector<ggml_quantize_impl_t> include_impl; | ||
std::vector<std::string> include_layers; | ||
std::vector<std::string> exclude_layers; | ||
std::vector<enum ggml_type> include_types; | ||
|
@@ -48,8 +51,8 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) { | |
fprintf(stderr, " -h, --help show this help message and exit\n"); | ||
fprintf(stderr, " -m FNAME, --model FNAME\n"); | ||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); | ||
fprintf(stderr, " -r, --reference\n"); | ||
fprintf(stderr, " use reference implementation (default: false)\n"); | ||
fprintf(stderr, " -i, --implementation\n"); | ||
fprintf(stderr, " select implementation (simd, reference, rmse)\n"); | ||
fprintf(stderr, " -v, --verbose\n"); | ||
fprintf(stderr, " verbose output (default: false)\n"); | ||
fprintf(stderr, " -p, --per-layer-stats\n"); | ||
|
@@ -104,11 +107,12 @@ double find_quantile(const error_stats & stats, double quantile) { | |
return INFINITY; | ||
} | ||
|
||
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) { | ||
void print_error_stats(const std::string & name, ggml_quantize_impl_t impl, const error_stats & stats, bool print_histogram) { | ||
double rmse = sqrt(stats.total_error / (double) stats.num_samples); | ||
double median = find_quantile(stats, .5); | ||
double pct95 = find_quantile(stats, .95); | ||
printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median); | ||
printf("%-4s %-10s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", | ||
name.c_str(), impl_strs[impl], rmse, stats.max_error, pct95, median); | ||
if (print_histogram) { | ||
printf("Error distribution:\n"); | ||
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) { | ||
|
@@ -136,7 +140,7 @@ void test_roundtrip_on_layer( | |
std::string & name, | ||
bool print_layer_stats, | ||
const quantize_fns_t & qfns, | ||
bool use_reference, | ||
ggml_quantize_impl_t impl, | ||
const ggml_tensor * layer, | ||
float * input_scratch, | ||
char *quantized_scratch, | ||
|
@@ -158,11 +162,7 @@ void test_roundtrip_on_layer( | |
input_scratch = ggml_get_data_f32(layer) + offset; | ||
} | ||
|
||
if (use_reference) { | ||
qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size); | ||
} else { | ||
qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size); | ||
} | ||
qfns.quantize_row_q[impl](input_scratch, quantized_scratch, chunk_size); | ||
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size); | ||
|
||
update_error_stats(chunk_size, input_scratch, output_scratch, total_error); | ||
|
@@ -171,7 +171,7 @@ void test_roundtrip_on_layer( | |
} | ||
} | ||
if (print_layer_stats) { | ||
print_error_stats(name, layer_error, false); | ||
print_error_stats(name, impl, layer_error, false); | ||
} | ||
} | ||
|
||
|
@@ -190,8 +190,21 @@ int main(int argc, char ** argv) { | |
if (arg == "-h" || arg == "--help") { | ||
quantize_stats_print_usage(argc, argv); | ||
exit(0); | ||
} else if (arg == "-r" || arg == "--reference") { | ||
params.reference = true; | ||
} else if (arg == "-i" || arg == "--implementation") { | ||
if (++i >= argc) { | ||
invalid_param = true; | ||
break; | ||
} | ||
int j; | ||
for (j = 0; j < GGML_QUANTIZE_IMPL_COUNT && strcmp(argv[i], impl_strs[j]) != 0; j++) { | ||
// find match | ||
} | ||
if (j < GGML_QUANTIZE_IMPL_COUNT) { | ||
params.include_impl.push_back((ggml_quantize_impl_t)j); | ||
} else { | ||
fprintf(stderr, "error: %s not in list of implementations\n", argv[i]); | ||
invalid_param = true; | ||
} | ||
} else if (arg == "-v") { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you please add |
||
params.verbose = true; | ||
} else if (arg == "-p" || arg == "--per-layer-stats") { | ||
|
@@ -302,42 +315,48 @@ int main(int argc, char ** argv) { | |
std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4); | ||
std::vector<float> output_scratch(SCRATCH_ELEMENTS); | ||
|
||
// loop throught quantization types | ||
for (int i = 0; i < GGML_TYPE_COUNT; i++) { | ||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { | ||
// loop through quantization types | ||
for (int type = 0; type < GGML_TYPE_COUNT; type++) { | ||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), type) == params.include_types.end()) { | ||
continue; | ||
} | ||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i); | ||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(type); | ||
if (qfns.quantize_row_q && qfns.dequantize_row_q) { | ||
if (params.verbose) { | ||
printf("testing %s ...\n", type_strs[i]); | ||
} | ||
|
||
error_stats global_stats {}; | ||
|
||
for (const auto& kv_tensor : tensors_sorted) { | ||
if (!layer_included(params, kv_tensor.first)) { | ||
for (int impl = 0; impl < GGML_QUANTIZE_IMPL_COUNT; impl++) { | ||
if (!params.include_impl.empty() && std::find(params.include_impl.begin(), params.include_impl.end(), impl) == params.include_impl.end()) { | ||
continue; | ||
} | ||
|
||
if (params.verbose) { | ||
printf(" %s ...\n", kv_tensor.first.c_str()); | ||
printf("testing %s %s ...\n", type_strs[type], impl_strs[impl]); | ||
} | ||
std::string layer_name { type_strs[i] }; | ||
layer_name += "::" + kv_tensor.first; | ||
test_roundtrip_on_layer( | ||
layer_name, | ||
params.per_layer_stats, | ||
qfns, | ||
params.reference, | ||
kv_tensor.second, | ||
input_scratch.data(), | ||
quantized_scratch.data(), | ||
output_scratch.data(), | ||
global_stats | ||
); | ||
} | ||
|
||
print_error_stats(type_strs[i], global_stats, params.print_histogram); | ||
error_stats global_stats {}; | ||
|
||
for (const auto& kv_tensor : tensors_sorted) { | ||
if (!layer_included(params, kv_tensor.first)) { | ||
continue; | ||
} | ||
if (params.verbose) { | ||
printf(" %s ...\n", kv_tensor.first.c_str()); | ||
} | ||
std::string layer_name { type_strs[type] }; | ||
layer_name += "::" + kv_tensor.first; | ||
test_roundtrip_on_layer( | ||
layer_name, | ||
params.per_layer_stats, | ||
qfns, | ||
(ggml_quantize_impl_t)impl, | ||
kv_tensor.second, | ||
input_scratch.data(), | ||
quantized_scratch.data(), | ||
output_scratch.data(), | ||
global_stats | ||
); | ||
} | ||
|
||
print_error_stats(type_strs[type], (ggml_quantize_impl_t)impl, global_stats, params.print_histogram); | ||
} | ||
} | ||
} | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
import matplotlib.pyplot as plt | ||
|
||
# Generated by quantizing the entire 7B model with the first element of each tuple as the scale factor. | ||
# The second element of the tuple is the number of q4_0 blocks for which that scale factor has lowest RMSE. | ||
data = ( | ||
(-10.0, 0), | ||
(-9.9, 1), | ||
(-9.8, 3), | ||
(-9.7, 65), | ||
(-9.6, 738), | ||
(-9.5, 5779), | ||
(-9.4, 30880), | ||
(-9.3, 121078), | ||
(-9.2, 375674), | ||
(-9.1, 941350), | ||
(-9.0, 1990278), | ||
(-8.9, 3635317), | ||
(-8.8, 5891752), | ||
(-8.7, 8678748), | ||
(-8.6, 11771759), | ||
(-8.5, 14873993), | ||
(-8.4, 17594260), | ||
(-8.3, 19553100), | ||
(-8.2, 20415428), | ||
(-8.1, 20017134), | ||
(-8.0, 18357204), | ||
(-7.9, 15597612), | ||
(-7.8, 11993688), | ||
(-7.7, 7842970), | ||
(-7.6, 2880878), | ||
(-7.5, 3478), | ||
(-7.4, 2648437), | ||
(-7.3, 5641970), | ||
(-7.2, 5935890), | ||
(-7.1, 4910790), | ||
(-7.0, 3425891), | ||
(-6.9, 2068250), | ||
(-6.8, 1089883), | ||
(-6.7, 502462), | ||
(-6.6, 156356), | ||
(-6.5, 205), | ||
(-6.4, 163500), | ||
(-6.3, 386291), | ||
(-6.2, 423018), | ||
(-6.1, 319360), | ||
(-6.0, 180783), | ||
(-5.9, 78822), | ||
(-5.8, 28254), | ||
(-5.7, 8698), | ||
(-5.6, 1969), | ||
(-5.5, 0), | ||
(-5.4, 2069), | ||
(-5.3, 5722), | ||
(-5.2, 7107), | ||
(-5.1, 5113), | ||
(-5.0, 2332), | ||
(-4.9, 636), | ||
(-4.8, 130), | ||
(-4.7, 12), | ||
(-4.6, 1), | ||
(-4.5, 0), | ||
(-4.4, 3), | ||
(-4.3, 4), | ||
(-4.2, 8), | ||
(-4.1, 8), | ||
(-4.0, 27), | ||
) | ||
x, y = zip(*data) | ||
|
||
fig, ax = plt.subplots() | ||
b = ax.bar(x, y, 0.1, bottom=1) | ||
ax.set_yscale("log") | ||
ax.set_xlabel("scale") | ||
ax.set_ylabel("N") | ||
plt.title("Quantization scale factor with lowest RMS error") | ||
plt.show() |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you please remove quantized models, since everyone would have their unique quantized models.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The idea would be that model generation is deterministic across platforms and SIMD optimizations, so the files should be identical. Of course if you keep your Q4_0 files without updating to minor version 1, this wouldn't match. I might remove it for this PR, but in the long term I think it's a good idea to ensure everyone uses the same inputs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, I have generated a new quantized model and the checksum matches with yours.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am sorry, is this checksum for q4_0 that has no minor version yet?
Edit: Oh, I see, for minor v1. 4 bytes longer than the previous version 😅