ROCm Port (#1087)
* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (ggerganov#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <[email protected]>
Co-authored-by: ardfork <[email protected]>
Co-authored-by: funnbot <[email protected]>
Co-authored-by: Engininja2 <[email protected]>
Co-authored-by: Kerfuffle <[email protected]>
Co-authored-by: jammm <[email protected]>
Co-authored-by: jdecourval <[email protected]>
8 people committed Aug 25, 2023
1 parent 3f460a2 commit 6bbc598
Showing 12 changed files with 335 additions and 59 deletions.
44 changes: 44 additions & 0 deletions .devops/full-rocm.Dockerfile
@@ -0,0 +1,44 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102

COPY requirements.txt requirements.txt

RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set the GPU architectures to build for
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

RUN make

ENTRYPOINT ["/app/.devops/tools.sh"]
44 changes: 44 additions & 0 deletions .devops/main-rocm.Dockerfile
@@ -0,0 +1,44 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102

COPY requirements.txt requirements.txt

RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set the GPU architectures to build for
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

RUN make

ENTRYPOINT [ "/app/main" ]
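For reference, below is a hedged sketch of building and running this image; the image tag, architecture override, model path, prompt, and layer count are illustrative, and the `--device` mappings are the standard way to expose AMD GPUs to a container:

```bash
# Build the inference image, optionally narrowing the fat arch list to the host's GPU
docker build -t llama-main-rocm -f .devops/main-rocm.Dockerfile \
  --build-arg ROCM_DOCKER_ARCH=gfx1030 .

# Run it with access to the host's ROCm device nodes
docker run --rm -it --device=/dev/kfd --device=/dev/dri \
  -v "$PWD/models":/models llama-main-rocm \
  -m /models/7B/ggml-model-q4_0.gguf -ngl 32 -n 128 \
  -p "Building a website can be done in 10 simple steps:"
```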
9 changes: 1 addition & 8 deletions .dockerignore
@@ -5,14 +5,7 @@
.vscode/
.DS_Store

build/
build-em/
build-debug/
build-release/
build-static/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
build*/

models/*

15 changes: 1 addition & 14 deletions .gitignore
@@ -16,20 +16,7 @@
.vs/
.vscode/

build/
build-em/
build-debug/
build-release/
build-ci-debug/
build-ci-release/
build-static/
build-cublas/
build-opencl/
build-metal/
build-mpi/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
build*/
out/
tmp/

38 changes: 38 additions & 0 deletions CMakeLists.txt
@@ -74,6 +74,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
@@ -352,6 +353,43 @@ if (LLAMA_CLBLAST)
endif()
endif()

if (LLAMA_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)

if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
endif()
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
endif()

find_package(hip)
find_package(hipblas)
find_package(rocblas)

if (${hipblas_FOUND} AND ${hip_FOUND})
message(STATUS "HIP and hipBLAS found")
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
if (LLAMA_CUDA_FORCE_DMMV)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
endif()
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)

if (LLAMA_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
else()
message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
endif()
endif()

if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
set(c_flags
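As a hedged usage sketch of this new CMake path, assuming a standard ROCm install under /opt/rocm (the `CMAKE_PREFIX_PATH` hint mirrors the warning message above and may be unnecessary if CMake already finds the ROCm packages):

```bash
mkdir build && cd build
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
  cmake .. -DLLAMA_HIPBLAS=ON -DCMAKE_PREFIX_PATH=/opt/rocm
cmake --build .
```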
24 changes: 24 additions & 0 deletions Makefile
@@ -280,6 +280,30 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_CLBLAST

ifdef LLAMA_HIPBLAS
ROCM_PATH ?= /opt/rocm
HIPCC ?= $(ROCM_PATH)/bin/hipcc
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
HIPFLAGS += -DCC_TURING=1000000000
ifdef LLAMA_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
OBJS += ggml-cuda.o
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # LLAMA_HIPBLAS

ifdef LLAMA_METAL
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
CXXFLAGS += -DGGML_USE_METAL
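And a hedged sketch of driving the new Makefile block directly; `GPU_TARGETS` defaults to whatever `$(ROCM_PATH)/llvm/bin/amdgpu-arch` reports on the build machine, and the gfx value here is illustrative:

```bash
# Build with hipBLAS/ROCm, targeting a specific AMD architecture
make clean
make LLAMA_HIPBLAS=1 GPU_TARGETS=gfx1030 -j"$(nproc)"
```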
29 changes: 29 additions & 0 deletions README.md
@@ -422,6 +422,35 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |

- #### hipBLAS

This provides BLAS acceleration on HIP-supported GPUs such as AMD GPUs.
Make sure to have ROCm installed.
You can install it from your Linux distro's package manager or download it from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
Windows support is coming soon...

- Using `make`:
```bash
make LLAMA_HIPBLAS=1
```
- Using `CMake`:
```bash
mkdir build
cd build
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
cmake --build .
```

The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported, you can set the environment variable [`HSA_OVERRIDE_GFX_VERSION`] to a similar supported GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):

| Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
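A hedged example of the two environment variables in practice, running on the first GPU of a system with a not officially supported RDNA2 card (binary name, model path, and layer count are illustrative):

```bash
# Restrict llama.cpp to GPU 0 and report a supported gfx version to the HSA runtime
HIP_VISIBLE_DEVICES=0 HSA_OVERRIDE_GFX_VERSION=10.3.0 \
  ./main -m models/7B/ggml-model-q4_0.gguf -ngl 32 -p "Hello"
```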

- #### CLBlast

OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
4 changes: 3 additions & 1 deletion common/common.cpp
@@ -613,9 +613,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
#ifdef GGML_USE_CUBLAS
fprintf(stdout, " -nommq, --no-mul-mat-q\n");
fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
fprintf(stdout, " use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
#endif // GGML_USE_CUBLAS
#endif
fprintf(stdout, " --mtest compute maximum memory usage\n");
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
4 changes: 1 addition & 3 deletions examples/llama-bench/llama-bench.cpp
@@ -18,9 +18,7 @@
#include "llama.h"
#include "common.h"
#include "build-info.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif

// utils
static uint64_t get_time_ns() {
@@ -504,7 +502,7 @@ struct test {

static std::string get_backend() {
if (cuda) {
return "CUDA";
return GGML_CUDA_NAME;
}
if (opencl) {
return "OpenCL";