Added cudaCheck around CUDA runtime calls whose return codes were previously ignored, for proper error checking.

karpathy · karpathy · Jul 15, 2024 · Jul 13, 2024 · Jul 13, 2024 · 6bb562bd88a6d55c4f4db33b33c434fcfc3d007f
commit 6bb562bd88a6d55c4f4db33b33c434fcfc3d007f
diff --git a/llmc/layernorm.cuh b/llmc/layernorm.cuh
@@ -444,7 +444,7 @@ void layernorm_forward(floatX* out, float* mean, float* rstd,
  // this may fail, in which case we fall back to the smem free implementation.
  cudaCheck(cudaGetLastError());
  auto status = cudaFuncSetAttribute(layernorm_forward_kernel6, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
- cudaGetLastError();
+ cudaCheck(cudaGetLastError());
  if (status == cudaSuccess) {
  layernorm_forward_kernel6<<<grid_size, dim3(WARP_SIZE, block_y), smem, stream>>>(out, mean, rstd, inp, weight, bias, N, C);
  } else {
@@ -477,7 +477,7 @@ void fused_residual_forward5(floatX* residual, floatX* normed, float* mean, floa
  // this may fail, in which case we fall back to the smem free implementation.
  cudaCheck(cudaGetLastError());
  auto status = cudaFuncSetAttribute(fused_residual_forward_kernel5, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
- cudaGetLastError();
+ cudaCheck(cudaGetLastError());
  if(status == cudaSuccess) {
  fused_residual_forward_kernel5<<<grid_size, dim3(WARP_SIZE, block_y), smem, stream>>>(residual, normed,
  mean, rstd, inp1, inp2,

diff --git a/train_gpt2.cu b/train_gpt2.cu
@@ -1153,14 +1153,14 @@ void gpt2_free(GPT2 *model) {
 void common_start(bool override_enable_tf32 = true, bool print_device_info = true) {
 
  // get CUDA device infos
- cudaGetDeviceProperties(&deviceProp, multi_gpu_config.local_device_idx);
+ cudaCheck(cudaGetDeviceProperties(&deviceProp, multi_gpu_config.local_device_idx));
  if (print_device_info) {
  printf("[System]\n");
  printf("Device %d: %s\n", multi_gpu_config.local_device_idx, deviceProp.name);
  }
 
  // set up the cuda streams. atm everything is on the single main stream
- cudaStreamCreate(&main_stream);
+ cudaCheck(cudaStreamCreate(&main_stream));
  nvtxNameCudaStreamA(main_stream, "main stream");
 
  // set up cuBLAS and cuBLASLt
@@ -1788,7 +1788,7 @@ int main(int argc, char *argv[]) {
  dataloader_reset(&train_loader);
  }
  // do one training step, doing forward/backward/update on total_batch_size tokens
- cudaEventRecord(start);
+ cudaCheck(cudaEventRecord(start));
  // gradient and loss accumulation loop over micro-batches
  for (int micro_step = 0; micro_step < grad_accum_steps; micro_step++) {
  // fetch the next data batch