
sync : llama.cpp #856

Merged: 43 commits, Jun 15, 2024
Changes from 1 commit
1230387
ggml : use atomic_flag for critical section (llama/7598)
slaren May 29, 2024
8fcca7c
llama-bench : add support for the RPC backend (llama/7435)
rgerganov May 29, 2024
1b7ff70
cuda : non-cont concat support (llama/7610)
ggerganov May 29, 2024
f2703f7
ggml : fix YARN + add tests + add asserts (llama/7617)
ggerganov May 29, 2024
79751ef
metal : add missing asserts (llama/7617)
ggerganov May 29, 2024
0269773
metal : remove invalid asserts (llama/7617)
ggerganov May 29, 2024
ef948cb
ggml : fix loongarch build (O2 issue) (llama/7636)
junchao-loongson May 30, 2024
a539f93
faster avx512 exp implementation (llama/7551)
chriselrod May 30, 2024
80d21d4
ggml : fix loongson compile warnings (llama/7537)
ggerganov May 31, 2024
5e6eeed
CUDA: quantized KV support for FA vec (llama/7527)
JohannesGaessler Jun 1, 2024
2a2e184
CUDA: fix Pascal FA, deq. KV to FP16 for batch > 8 (llama/7681)
JohannesGaessler Jun 1, 2024
030c282
Fix FlashAttention debug test, FP32 assert (llama/7684)
JohannesGaessler Jun 1, 2024
6ec5cfc
fix bug introduced in using calloc (llama/7701)
airlied Jun 2, 2024
9f8d074
kompute : implement op_getrows_f32 (llama/6403)
woachk Jun 3, 2024
225883c
Vulkan Mixture of Experts (MoE) support (llama/7628)
0cc4m Jun 3, 2024
597c758
ggml : use OpenMP as a thread pool (llama/7606)
msy-kato Jun 3, 2024
cac02b4
llama : offload to RPC in addition to other backends (llama/7640)
rgerganov Jun 3, 2024
a96df45
ggml : prevent builds with -ffinite-math-only (llama/7726)
ggerganov Jun 4, 2024
ad2ed7f
ggml : remove OpenCL (llama/7735)
ggerganov Jun 4, 2024
6eb6783
Allow number of nodes in CUDA graph to change (llama/7738)
agray3 Jun 4, 2024
c943c8e
ggml : refactor rope norm/neox (llama/7634)
ggerganov Jun 5, 2024
024c5bc
CUDA: refactor mmq, dmmv, mmvq (llama/7716)
JohannesGaessler Jun 5, 2024
b89a6ff
fix softmax r2r result wrong issue (llama/7811)
pengxin99 Jun 7, 2024
c7b818b
vulkan : reuse parent extra for views (llama/7806)
slaren Jun 7, 2024
ea4c21b
CUDA: revise q8_1 data layout for mul_mat_q (llama/7824)
JohannesGaessler Jun 9, 2024
2552787
use the correct SYCL context for host USM allocations (llama/7777)
bashbaug Jun 10, 2024
52d4a6d
CUDA: use tensor cores for MMQ (llama/7676)
JohannesGaessler Jun 10, 2024
2238cd2
CUDA: int8 tensor cores for MMQ (q4_K, q5_K, q6_K) (llama/7860)
JohannesGaessler Jun 11, 2024
9279216
Update Vulkan RoPE implementation (llama/7818)
0cc4m Jun 11, 2024
5de7ab4
vulkan: select only one device for single gpu with multiple drivers (…
Adriankhl Jun 11, 2024
47968ff
ggml : improve ggml_is_contiguous logic (llama/7856)
ggerganov Jun 12, 2024
c29e392
tests : add non-cont unary tests (llama/7857)
ggerganov Jun 12, 2024
228a35f
CUDA: fix broken oob check for FA vec f32 kernel (llama/7904)
JohannesGaessler Jun 12, 2024
5a8910e
move BLAS to a separate backend (llama/6210)
slaren Jun 13, 2024
d13c89f
rpc : fix ggml_backend_rpc_supports_buft() (llama/7918)
rgerganov Jun 13, 2024
ca9e524
metal : utilize max shared memory for mul_mat_id (llama/7935)
ggerganov Jun 14, 2024
65d8379
CUDA: faster q2_K, q3_K MMQ + int8 tensor cores (llama/7921)
JohannesGaessler Jun 14, 2024
f00648a
remove global variables (llama/7710)
airMeng Jun 15, 2024
77ea030
tests : adapt to changes (#0)
ggerganov Jun 15, 2024
872e074
sync : llama.cpp
ggerganov Jun 15, 2024
1a9eb9c
cuda : update build (#0)
ggerganov Jun 15, 2024
8714ee5
ggml : remove opencl (#0)
ggerganov Jun 15, 2024
e2b8b50
ci : add GG_BUILD_NO_DOWNLOAD
ggerganov Jun 15, 2024
metal : utilize max shared memory for mul_mat_id (llama/7935)
ggerganov committed Jun 15, 2024
commit ca9e5242c9a7c773882a3137cb742bf495a45129
3 changes: 2 additions & 1 deletion src/ggml-metal.m
@@ -1862,9 +1862,10 @@ static enum ggml_status ggml_metal_graph_compute(
             // ne21 = n_rows
             const int dst_rows = ne20*ne21;
             const int dst_rows_min = n_as;
+            const int dst_rows_max = (ctx->device.maxThreadgroupMemoryLength - 32 - 8192)/4;

             // max size of the rowids array in the kernel shared buffer
-            GGML_ASSERT(dst_rows <= 2048);
+            GGML_ASSERT(dst_rows <= dst_rows_max);

             // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
             // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
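
For reference, a minimal sketch of what the new bound works out to, assuming a device that reports 32 KB of threadgroup memory (the variable names and the 32 KB figure are illustrative assumptions; the 32-byte and 8192-byte reservations and the 4-byte per-entry size are the constants visible in the diff above):

    // Hypothetical stand-alone sketch; the real code reads maxThreadgroupMemoryLength
    // from the MTLDevice at graph-compute time instead of hard-coding it.
    #include <stdio.h>

    int main(void) {
        const int max_tg_mem = 32768; // assumed: device reports 32 KB of threadgroup (shared) memory
        const int reserved   = 32;    // bytes the kernel keeps aside (constant taken from the diff)
        const int mm_tile    = 8192;  // bytes used by the matmul tile (constant taken from the diff)
        const int rowid_size = 4;     // bytes per entry of the rowids array in shared memory

        const int dst_rows_max = (max_tg_mem - reserved - mm_tile) / rowid_size;
        printf("dst_rows_max = %d\n", dst_rows_max); // prints 6136 under these assumptions
        return 0;
    }

With these numbers the assert allows up to 6136 destination rows, roughly three times the previous hard-coded limit of 2048, and the bound now scales with whatever maxThreadgroupMemoryLength the device actually reports rather than assuming a fixed shared-memory budget.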