FP8 work in progress #678
Status: Draft. ademeure wants to merge 20 commits into base branch master.
The diff below shows changes from 1 commit (1ff0dad) out of the 20 listed here.

Commits (all by ademeure):
9755510 (Jul 8, 2024) — Advanced Copy & Transpose - mostly done, including new fused absmax k…
1ff0dad (Jul 8, 2024) — Disable fused absmax rescale by default
6d4546b (Jul 8, 2024) — Added in-place rescaling for fused absmax/scale (not properly tested …
da3597a (Jul 8, 2024) — more bugfixes for fused absmax+scale+rescale
7d4f058 (Jul 8, 2024) — extra comments + disable rescale for fused absmax by default
25e3f6e (Jul 8, 2024) — Simplified by moving code to update_global_absmax() and update_local_…
d86e14c (Jul 10, 2024) — WIP FP8 with history! (but always on-the-fly conversion for now)
e9ebf19 (Jul 11, 2024) — WIP FP8: now includes absmax history and FP8 matmul weights (only wor…
f7c53e3 (Jul 11, 2024) — FCH activations in FP8!!! (+all matmul weights + ...)
524b0a4 (Jul 12, 2024) — FP8 Layernorm/Fused Residual outputs!!! ... but now loss is broken ag…
3b286d7 (Jul 12, 2024) — setting default to more FP8
18ca6ee (Jul 12, 2024) — Extra warnings for absmax_history if called without init in a step + …
fa89a8a (Jul 21, 2024) — FP8 critical bugfixes - how to handle weights especially with multi-G…
a161a00 (Jul 24, 2024) — FP8 transpose cache + non-fused GELU optimisations + ...
349ff33 (Jul 25, 2024) — Transpose & GELU optimisations (+failed GELU fusion attempt, cuBLAS n…
154c2ea (Jul 25, 2024) — WIP misc. FP8 (this is the point of my full 124M run)
414c3e6 (Jul 28, 2024) — FP8 WIP + better outlier detection
c7e08c9 (Jul 28, 2024) — Make -sg 8/-sl 8 the default (skip updates for outliers), and fix pri…
f06c6c5 (Jul 29, 2024) — More FP8 WIP - now with competitive training loss :)
8d6c68f (Jul 31, 2024) — FP8 WIP layernorm backward activation gradient in native FP8
Disable fused absmax rescale by default
ademeure committed Jul 8, 2024
commit 1ff0dadf3f9f79f2cf0dca1c733e17274ff40506
22 changes: 15 additions & 7 deletions dev/cuda/advanced_copy_transpose.cu
@@ -50,6 +50,7 @@ Usage example: ./transpose 12
 //#define WIDTH 8192
 //#define HEIGHT 768
 //#define ABSMAX_ITERATIONS_PER_THREAD 1
+//#define FUSED_RESCALE_IN_PLACE true
 
 #if !defined(IN_TYPE)
 #define IN_TYPE __nv_bfloat16
@@ -92,13 +93,17 @@ Usage example: ./transpose 12
 #define WIDTH 32768
 #endif
 #if !defined(HEIGHT)
-#define HEIGHT 16384
+#define HEIGHT 3072
 #endif
 
 #if !defined(ABSMAX_ITERATIONS_PER_THREAD)
 #define ABSMAX_ITERATIONS_PER_THREAD 2
 #endif
 
+#if !defined(FUSED_RESCALE_IN_PLACE)
+#define FUSED_RESCALE_IN_PLACE false // WIP not ready yet
+#endif
+
 // ----------------------------------------------------------------------------
 // these are passed as default kernel parameters to avoid making everything too messy
 unsigned int* d_absmax_estimate = NULL;
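(Reviewer aside on why d_absmax_estimate is an unsigned int* rather than a float*: for non-negative IEEE-754 floats, the integer ordering of the bit pattern matches the float ordering, so a grid-wide absmax can be reduced with integer atomicMax and the result reinterpreted as a float afterwards — which is why main() casts d_absmax_estimate to float* in the validation hunk further down. A minimal sketch of the trick, with a hypothetical helper name:)

// Hypothetical helper illustrating the absmax-as-uint trick used by these kernels.
__device__ void update_absmax_bits(unsigned int* absmax_bits, float val) {
    // fabsf clears the sign bit, so the bit pattern is monotonic in the value
    atomicMax(absmax_bits, __float_as_uint(fabsf(val)));
}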
@@ -709,9 +714,10 @@ __global__ void __launch_bounds__(1024, 2) fused_absmax_scale_persistent(TOut* _
 }
 
     // todo - this is a WIP path that rescales the tensor in-place
-    // it should work except it requires a separate absmax_counter parameter
-    // and we need to make sure the numerics are good enough, especially with subnormals
-    //#if FUSED_RESCALE_IN_PLACE == true
+    // right now, this will result in overflowed values just being scaled down, which is obviously not what we want
+    // it would require separate metadata to track the scaling factor used for each part of the tensor
+    // or just stop scaling as soon as we detect a value that is too big locally, and read BF16 version here instead
+    #if FUSED_RESCALE_IN_PLACE == true
     if (warp_id == 0) {
         if (threadIdx.x == 0) {
             unsigned int old = atomicInc(absmax_counter, gridDim.x-1);
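The atomicInc on absmax_counter above is the usual "is this the last block to arrive?" handshake for a grid-wide reduction: the counter wraps at gridDim.x-1, so exactly one block observes the final count and performs the rescale. The rewritten comments also sketch a possible fix for the overflow problem: stop scaling as soon as a value is locally too big and fall back to the BF16 source. Below is a hedged sketch of that fallback, not the PR's code; the names out8, inp_bf16 and overflow_flag are hypothetical.

#include <cuda_fp8.h>
#include <cuda_bf16.h>

// Sketch: write the scaled FP8 value only if it fits at this scale; otherwise
// raise a global flag so consumers know to read the original BF16 tensor instead.
__device__ bool store_scaled_or_bail(__nv_fp8_e4m3* out8, const __nv_bfloat16* inp_bf16,
                                     int idx, float scale, unsigned int* overflow_flag) {
    float v = __bfloat162float(inp_bf16[idx]) * scale;
    if (fabsf(v) > 448.0f) {            // out of E4M3 range at this scale
        atomicExch(overflow_flag, 1u);  // mark the FP8 copy as invalid
        return false;                   // caller should use the BF16 version
    }
    out8[idx] = __nv_fp8_e4m3(v);
    return true;
}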
@@ -756,7 +762,7 @@ __global__ void __launch_bounds__(1024, 2) fused_absmax_scale_persistent(TOut* _
             float rescaled_absmax = estimated_absmax * ratio_power_of_2;
             *absmax_scaling = __float_as_uint(rescaled_absmax);
         }
-    //#endif
+    #endif
 }
 
 
@@ -1076,7 +1082,9 @@ int main(int argc, const char **argv) {
 
         // check absmax if it was calculated
         if (enable_absmax || kernel_num >= FIRST_ABSMAX_ONLY_KERNEL) {
-            validate_result((float*)d_absmax_estimate, (float*)&absmax_storage, "absmax", 1, 1e-5f);
+            if (kernel_num != 30) { // don't check for the WIP fused absmax kernel yet
+                validate_result((float*)d_absmax_estimate, (float*)&absmax_storage, "absmax", 1, 1e-5f);
+            }
         }
     }
     printf("All results match. Starting benchmarks.\n\n");
@@ -1094,7 +1102,7 @@ int main(int argc, const char **argv) {
         memory_ops += W * H * sizeof(OUT_TYPE);
     }
 #endif
-    if (kernel_num >= FIRST_ABSMAX_ONLY_KERNEL) {
+    if (kernel_num >= FIRST_ABSMAX_ONLY_KERNEL && kernel_num != 30) {
         if (kernel_num < 23) {
             memory_ops = 0; // 20/21/22 only do the absmax, no copy
         }
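For reference, a hypothetical way to exercise the WIP path locally (the exact nvcc flags depend on the dev/cuda build setup; kernel 30 is the fused absmax kernel that the validation above deliberately skips):

nvcc -O3 advanced_copy_transpose.cu -o transpose -DFUSED_RESCALE_IN_PLACE=true
./transpose 30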