FP8 work in progress #678
Status: Draft. ademeure wants to merge 20 commits into base branch master.
The diff below shows changes from 1 commit (1ff0dad) out of the 20 listed here.

Commits (all by ademeure):
9755510 (Jul 8, 2024) — Advanced Copy & Transpose - mostly done, including new fused absmax k…
1ff0dad (Jul 8, 2024) — Disable fused absmax rescale by default
6d4546b (Jul 8, 2024) — Added in-place rescaling for fused absmax/scale (not properly tested …
da3597a (Jul 8, 2024) — more bugfixes for fused absmax+scale+rescale
7d4f058 (Jul 8, 2024) — extra comments + disable rescale for fused absmax by default
25e3f6e (Jul 8, 2024) — Simplified by moving code to update_global_absmax() and update_local_…
d86e14c (Jul 10, 2024) — WIP FP8 with history! (but always on-the-fly conversion for now)
e9ebf19 (Jul 11, 2024) — WIP FP8: now includes absmax history and FP8 matmul weights (only wor…
f7c53e3 (Jul 11, 2024) — FCH activations in FP8!!! (+all matmul weights + ...)
524b0a4 (Jul 12, 2024) — FP8 Layernorm/Fused Residual outputs!!! ... but now loss is broken ag…
3b286d7 (Jul 12, 2024) — setting default to more FP8
18ca6ee (Jul 12, 2024) — Extra warnings for absmax_history if called without init in a step + …
fa89a8a (Jul 21, 2024) — FP8 critical bugfixes - how to handle weights especially with multi-G…
a161a00 (Jul 24, 2024) — FP8 transpose cache + non-fused GELU optimisations + ...
349ff33 (Jul 25, 2024) — Transpose & GELU optimisations (+failed GELU fusion attempt, cuBLAS n…
154c2ea (Jul 25, 2024) — WIP misc. FP8 (this is the point of my full 124M run)
414c3e6 (Jul 28, 2024) — FP8 WIP + better outlier detection
c7e08c9 (Jul 28, 2024) — Make -sg 8/-sl 8 the default (skip updates for outliers), and fix pri…
f06c6c5 (Jul 29, 2024) — More FP8 WIP - now with competitive training loss :)
8d6c68f (Jul 31, 2024) — FP8 WIP layernorm backward activation gradient in native FP8
Disable fused absmax rescale by default
ademeure committed Jul 8, 2024
commit 1ff0dadf3f9f79f2cf0dca1c733e17274ff40506
22 changes: 15 additions & 7 deletions dev/cuda/advanced_copy_transpose.cu
@@ -50,6 +50,7 @@ Usage example: ./transpose 12
 //#define WIDTH 8192
 //#define HEIGHT 768
 //#define ABSMAX_ITERATIONS_PER_THREAD 1
+//#define FUSED_RESCALE_IN_PLACE true
 
 #if !defined(IN_TYPE)
 #define IN_TYPE __nv_bfloat16
@@ -92,13 +93,17 @@ Usage example: ./transpose 12
 #define WIDTH 32768
 #endif
 #if !defined(HEIGHT)
-#define HEIGHT 16384
+#define HEIGHT 3072
 #endif
 
 #if !defined(ABSMAX_ITERATIONS_PER_THREAD)
 #define ABSMAX_ITERATIONS_PER_THREAD 2
 #endif
 
+#if !defined(FUSED_RESCALE_IN_PLACE)
+#define FUSED_RESCALE_IN_PLACE false // WIP not ready yet
+#endif
+
 // ----------------------------------------------------------------------------
 // these are passed as default kernel parameters to avoid making everything too messy
 unsigned int* d_absmax_estimate = NULL;
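(Reviewer aside on why d_absmax_estimate is an unsigned int* rather than a float*: for non-negative IEEE-754 floats, the integer ordering of the bit pattern matches the float ordering, so a grid-wide absmax can be reduced with integer atomicMax and the result reinterpreted as a float afterwards — which is why main() casts d_absmax_estimate to float* in the validation hunk further down. A minimal sketch of the trick, with a hypothetical helper name:)

// Hypothetical helper illustrating the absmax-as-uint trick used by these kernels.
__device__ void update_absmax_bits(unsigned int* absmax_bits, float val) {
    // fabsf clears the sign bit, so the bit pattern is monotonic in the value
    atomicMax(absmax_bits, __float_as_uint(fabsf(val)));
}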
@@ -709,9 +714,10 @@ __global__ void __launch_bounds__(1024, 2) fused_absmax_scale_persistent(TOut* _
 }
 
     // todo - this is a WIP path that rescales the tensor in-place
-    // it should work except it requires a separate absmax_counter parameter
-    // and we need to make sure the numerics are good enough, especially with subnormals
-    //#if FUSED_RESCALE_IN_PLACE == true
+    // right now, this will result in overflowed values just being scaled down, which is obviously not what we want
+    // it would require separate metadata to track the scaling factor used for each part of the tensor
+    // or just stop scaling as soon as we detect a value that is too big locally, and read BF16 version here instead
+    #if FUSED_RESCALE_IN_PLACE == true
     if (warp_id == 0) {
         if (threadIdx.x == 0) {
             unsigned int old = atomicInc(absmax_counter, gridDim.x-1);
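The atomicInc on absmax_counter above is the usual "is this the last block to arrive?" handshake for a grid-wide reduction: the counter wraps at gridDim.x-1, so exactly one block observes the final count and performs the rescale. The rewritten comments also sketch a possible fix for the overflow problem: stop scaling as soon as a value is locally too big and fall back to the BF16 source. Below is a hedged sketch of that fallback, not the PR's code; the names out8, inp_bf16 and overflow_flag are hypothetical.

#include <cuda_fp8.h>
#include <cuda_bf16.h>

// Sketch: write the scaled FP8 value only if it fits at this scale; otherwise
// raise a global flag so consumers know to read the original BF16 tensor instead.
__device__ bool store_scaled_or_bail(__nv_fp8_e4m3* out8, const __nv_bfloat16* inp_bf16,
                                     int idx, float scale, unsigned int* overflow_flag) {
    float v = __bfloat162float(inp_bf16[idx]) * scale;
    if (fabsf(v) > 448.0f) {            // out of E4M3 range at this scale
        atomicExch(overflow_flag, 1u);  // mark the FP8 copy as invalid
        return false;                   // caller should use the BF16 version
    }
    out8[idx] = __nv_fp8_e4m3(v);
    return true;
}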
@@ -756,7 +762,7 @@ __global__ void __launch_bounds__(1024, 2) fused_absmax_scale_persistent(TOut* _
             float rescaled_absmax = estimated_absmax * ratio_power_of_2;
             *absmax_scaling = __float_as_uint(rescaled_absmax);
         }
-    //#endif
+    #endif
 }
 
 
@@ -1076,7 +1082,9 @@ int main(int argc, const char **argv) {
 
         // check absmax if it was calculated
         if (enable_absmax || kernel_num >= FIRST_ABSMAX_ONLY_KERNEL) {
-            validate_result((float*)d_absmax_estimate, (float*)&absmax_storage, "absmax", 1, 1e-5f);
+            if (kernel_num != 30) { // don't check for the WIP fused absmax kernel yet
+                validate_result((float*)d_absmax_estimate, (float*)&absmax_storage, "absmax", 1, 1e-5f);
+            }
         }
     }
     printf("All results match. Starting benchmarks.\n\n");
@@ -1094,7 +1102,7 @@ int main(int argc, const char **argv) {
         memory_ops += W * H * sizeof(OUT_TYPE);
     }
 #endif
-    if (kernel_num >= FIRST_ABSMAX_ONLY_KERNEL) {
+    if (kernel_num >= FIRST_ABSMAX_ONLY_KERNEL && kernel_num != 30) {
         if (kernel_num < 23) {
             memory_ops = 0; // 20/21/22 only do the absmax, no copy
         }
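For reference, a hypothetical way to exercise the WIP path locally (the exact nvcc flags depend on the dev/cuda build setup; kernel 30 is the fused absmax kernel that the validation above deliberately skips):

nvcc -O3 advanced_copy_transpose.cu -o transpose -DFUSED_RESCALE_IN_PLACE=true
./transpose 30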