Feature: Integrate with unified SYCL backend for Intel GPUs #2690

Merged Jan 28, 2024 (92 commits).

The diff below shows changes from 82 of the 92 commits.

Commits
- 7a4343d first update for migration (NeoZhangJianyu, Dec 27, 2023)
- 2338769 update init_cublas (NeoZhangJianyu, Dec 28, 2023)
- 0c00b4f add debug functio, commit all help code (NeoZhangJianyu, Dec 29, 2023)
- ff83711 step 1 (NeoZhangJianyu, Dec 29, 2023)
- 02dffb6 step 2 (NeoZhangJianyu, Dec 29, 2023)
- 43f2c35 step3 add fp16, slower 31->28 (NeoZhangJianyu, Dec 31, 2023)
- da752ed add GGML_LIST_DEVICE function (NeoZhangJianyu, Dec 31, 2023)
- 6dd3278 step 5 format device and print (NeoZhangJianyu, Dec 31, 2023)
- 3a9d2c5 step6, enhance error check, remove CUDA macro, enhance device id to f… (NeoZhangJianyu, Jan 4, 2024)
- 65f895d support main device is non-zero (NeoZhangJianyu, Jan 4, 2024)
- 3b1a743 step7 add debug for code path, rm log (NeoZhangJianyu, Jan 6, 2024)
- c2ef7a9 step 8, rename all macro & func from cuda by sycl (NeoZhangJianyu, Jan 7, 2024)
- 69d76c8 fix error of select non-zero device, format device list (NeoZhangJianyu, Jan 8, 2024)
- c709c3c ren ggml-sycl.hpp -> ggml-sycl.h (NeoZhangJianyu, Jan 9, 2024)
- fa3a586 clear CMAKE to rm unused lib and options (NeoZhangJianyu, Jan 9, 2024)
- 3645f25 correct queue: rm dtct:get_queue (NeoZhangJianyu, Jan 10, 2024)
- bd38129 add print tensor function to debug (NeoZhangJianyu, Jan 12, 2024)
- 5b53899 fix error: wrong result in 658746bb26702e50f2c59c0e4ada8e9da6010481 (NeoZhangJianyu, Jan 13, 2024)
- a47f5ec summary dpct definition in one header file to replace folder:dpct (NeoZhangJianyu, Jan 13, 2024)
- c67c2ab refactor device log (NeoZhangJianyu, Jan 13, 2024)
- c3c5b20 mv dpct definition from folder dpct to ggml-sycl.h (NeoZhangJianyu, Jan 15, 2024)
- ca2cb69 update readme, refactor build script (NeoZhangJianyu, Jan 15, 2024)
- 95daece fix build with sycl (NeoZhangJianyu, Jan 15, 2024)
- a8936f4 set nthread=1 when sycl, increase performance (NeoZhangJianyu, Jan 15, 2024)
- 79d30d7 add run script, comment debug code (NeoZhangJianyu, Jan 15, 2024)
- 0d6e721 add ls-sycl-device tool (NeoZhangJianyu, Jan 15, 2024)
- 7350fd4 add ls-sycl-device, rm unused files (NeoZhangJianyu, Jan 15, 2024)
- 09b5619 rm rear space (NeoZhangJianyu, Jan 15, 2024)
- d80dd65 dos2unix (NeoZhangJianyu, Jan 15, 2024)
- 593ce00 Update README_sycl.md (NeoZhangJianyu, Jan 18, 2024)
- 57e9fba fix return type (luoyu-intel, Jan 18, 2024)
- d5f7d36 remove sycl version from include path (luoyu-intel, Jan 18, 2024)
- 35a0daa restore rm code to fix hang issue (NeoZhangJianyu, Jan 18, 2024)
- ae941b1 add syc and link for sycl readme (NeoZhangJianyu, Jan 19, 2024)
- e3481fa rm original sycl code before refactor (NeoZhangJianyu, Jan 19, 2024)
- 623d803 fix code err (luoyu-intel, Jan 19, 2024)
- f396a3b add know issue for pvc hang issue (NeoZhangJianyu, Jan 20, 2024)
- f008cc7 enable SYCL_F16 support (luoyu-intel, Jan 22, 2024)
- 67e6b3c align pr4766 (airMeng, Jan 23, 2024)
- 533c647 check for sycl blas, better performance (NeoZhangJianyu, Jan 23, 2024)
- dd7f139 cleanup 1 (abhilash1910, Jan 23, 2024)
- b403784 remove extra endif (airMeng, Jan 23, 2024)
- a0a1304 add build&run script, clean CMakefile, update guide by review comments (NeoZhangJianyu, Jan 23, 2024)
- 27c08c0 Merge branch 'sycl' of https://github.com/abhilash1910/llama.cpp into… (NeoZhangJianyu, Jan 23, 2024)
- 97cbe18 rename macro to intel hardware (NeoZhangJianyu, Jan 23, 2024)
- 1ddaf44 editor config format (abhilash1910, Jan 23, 2024)
- bd716b2 format fixes (abhilash1910, Jan 23, 2024)
- be31379 format fixes (abhilash1910, Jan 23, 2024)
- d097e2a editor format fix (abhilash1910, Jan 23, 2024)
- 88f64b7 Remove unused headers (abhilash1910, Jan 23, 2024)
- 756c4ac skip build sycl tool for other code path (NeoZhangJianyu, Jan 23, 2024)
- b42a32d replace tab by space (NeoZhangJianyu, Jan 23, 2024)
- 5f83a12 fix blas matmul function (abhilash1910, Jan 23, 2024)
- d6fc1a0 fix mac build (abhilash1910, Jan 23, 2024)
- c7e745e restore hip dependency (abhilash1910, Jan 23, 2024)
- 3bfb846 fix conflict (NeoZhangJianyu, Jan 23, 2024)
- 498121b ren as review comments (NeoZhangJianyu, Jan 24, 2024)
- 91b1461 mv internal function to .cpp file (NeoZhangJianyu, Jan 24, 2024)
- 816f480 export funciton print_sycl_devices(), mv class dpct definition to sou… (NeoZhangJianyu, Jan 24, 2024)
- 7a44a95 update CI/action for sycl code, fix CI error of repeat/dup (NeoZhangJianyu, Jan 24, 2024)
- 7babd76 fix action ID format issue (NeoZhangJianyu, Jan 24, 2024)
- 04a46c4 rm unused strategy (NeoZhangJianyu, Jan 24, 2024)
- 799af05 enable llama_f16 in ci (airMeng, Jan 24, 2024)
- ec5c8bc fix conflict (NeoZhangJianyu, Jan 24, 2024)
- 22e1b45 fix build break on MacOS, due to CI of MacOS depend on external ggml,… (NeoZhangJianyu, Jan 24, 2024)
- 238ec31 Merge branch 'master' into sycl (abhilash1910, Jan 24, 2024)
- 67de350 fix ci cases for unsupported data type (NeoZhangJianyu, Jan 24, 2024)
- fb15de3 revert unrelated changed in cuda cmake (airMeng, Jan 24, 2024)
- 96186a7 revert hip cmake changes (airMeng, Jan 24, 2024)
- d07a88d fix indent (airMeng, Jan 24, 2024)
- 8dd1b60 add prefix in func name (NeoZhangJianyu, Jan 24, 2024)
- 3aabd8a revert no mmq (airMeng, Jan 24, 2024)
- 18742f7 rm cpu blas duplicate (abhilash1910, Jan 24, 2024)
- 0e235fb fix no_new_line (airMeng, Jan 24, 2024)
- 5600118 fix src1->type==F16 bug. (luoyu-intel, Jan 24, 2024)
- eef5faa pass batch offset for F16 src1 (luoyu-intel, Jan 24, 2024)
- 5bb93d4 fix batch error (luoyu-intel, Jan 24, 2024)
- 0635f84 fix wrong code (luoyu-intel, Jan 24, 2024)
- f1bab50 revert sycl checking in test-sampling (airMeng, Jan 25, 2024)
- 66e24c2 pass void as arguments of ggml_backend_sycl_print_sycl_devices (airMeng, Jan 25, 2024)
- b06dca6 remove extra blank line in test-sampling (airMeng, Jan 25, 2024)
- 05b7f9b revert setting n_threads in sycl (airMeng, Jan 25, 2024)
- d6a6505 implement std::isinf for icpx with fast math. (luoyu-intel, Jan 26, 2024)
- 174c9a0 Update ci/run.sh (abhilash1910, Jan 26, 2024)
- c08fec2 Update examples/sycl/run-llama2.sh (abhilash1910, Jan 26, 2024)
- 2cba564 Update examples/sycl/run-llama2.sh (abhilash1910, Jan 26, 2024)
- f707051 Update CMakeLists.txt (abhilash1910, Jan 26, 2024)
- 45b0618 Update CMakeLists.txt (abhilash1910, Jan 26, 2024)
- 5531754 Update CMakeLists.txt (abhilash1910, Jan 26, 2024)
- b9ffaab Update CMakeLists.txt (abhilash1910, Jan 26, 2024)
- 2ab9715 add copyright and MIT license declare (NeoZhangJianyu, Jan 26, 2024)
- d394ca7 update the cmd example (NeoZhangJianyu, Jan 28, 2024)
41 changes: 41 additions & 0 deletions .github/workflows/build.yml

@@ -143,6 +143,47 @@ jobs:
        cd build
        ctest --verbose

  ubuntu-22-cmake-sycl:
    runs-on: ubuntu-22.04

    continue-on-error: true

    steps:
      - uses: actions/checkout@v2

      - name: add oneAPI to apt
        shell: bash
        run: |
          cd /tmp
          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"

      - name: install oneAPI dpcpp compiler
        shell: bash
        run: |
          sudo apt update
          sudo apt install intel-oneapi-compiler-dpcpp-cpp

      - name: install oneAPI MKL library
        shell: bash
        run: |
          sudo apt install intel-oneapi-mkl-devel

      - name: Clone
        id: checkout
        uses: actions/checkout@v3

      - name: Build
        id: cmake_build
        run: |
          source /opt/intel/oneapi/setvars.sh
          mkdir build
          cd build
          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
          cmake --build . --config Release -j $(nproc)

  # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
  #       how to debug it.
  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
49 changes: 44 additions & 5 deletions CMakeLists.txt

@@ -1,5 +1,6 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX)
include(CheckIncludeFileCXX)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

@@ -103,6 +104,8 @@ option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging"
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)

option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -121,8 +124,12 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
#
# Compile flags
#
if (LLAMA_SYCL)
set(CMAKE_CXX_STANDARD 17)
Owner:

How deep is the C++17 dependency in the SYCL backend?

It's okay to optionally include it like this, but I'm wondering if it is realistic to implement this in C++11 at some point - it would be in better harmony with the rest of the codebase.

Collaborator (Author):

Actually, the icpx compiler expects the C++17 standard, and SYCL depends on that version. We considered keeping C++11 here as well, but it causes compilation errors due to dependencies on C++17 headers.
@NeoZhangJianyu, @AidanBeltonS and others can add on this.

Contributor:

Just to add a bit more info, it is not just that the SYCL compiler, icpx, expects C++17. C++17 is a core aspect within the SYCL open standard. Any SYCL2020 code is expected to be C++17 conformant, so the relationship is deeper than just the specific implementation of the Khronos specification. I would say the dependency between SYCL and C++17 is hard, and it would likely not work well if SYCL specific features were compiled with C++11.

From the spec: https://registry.khronos.org/SYCL/specs/sycl-2020/pdf/sycl-2020.pdf
The SYCL specification is now based on the core language of C++17, as described in Section 3.9.1. Features of C++17 are now enabled within the specification, such as deduction guides for class template argument deduction
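
To see the dependency in code, here is a minimal SYCL 2020 sketch (standard API only; the kernel itself is illustrative and not taken from this PR). Both `sycl::buffer buf(data)` and `sycl::accessor acc(buf, h, ...)` rely on class template argument deduction, the C++17 feature cited above, so this cannot compile as C++11:

```cpp
// Minimal SYCL 2020 example that requires C++17.
#include <sycl/sycl.hpp>
#include <vector>

int main() {
    std::vector<float> data(1024, 1.0f);

    sycl::queue q;           // default device selection
    sycl::buffer buf(data);  // CTAD deduces sycl::buffer<float, 1>

    q.submit([&](sycl::handler &h) {
        // CTAD plus the access-mode tag deduce the accessor type
        sycl::accessor acc(buf, h, sycl::read_write);
        h.parallel_for(sycl::range<1>(data.size()), [=](sycl::id<1> i) {
            acc[i] *= 2.0f;  // simple elementwise kernel
        });
    }).wait();

    return 0;
}
```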

else()
set(CMAKE_CXX_STANDARD 11)
endif()

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
@@ -454,6 +461,35 @@ if (LLAMA_HIPBLAS)
endif()
endif()


if (LLAMA_SYCL)
    if (NOT DEFINED ENV{ONEAPI_ROOT})
        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
    endif()
    #todo: AOT

    find_package(IntelSYCL REQUIRED)
    if (LLAMA_SYCL_F16)
        add_compile_definitions(GGML_SYCL_F16)
    endif()
    add_compile_definitions(GGML_USE_SYCL)

    add_compile_options(-I./) #include DPCT
    add_compile_options(-I/${SYCL_INCLUDE_DIR})

    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")

    set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
    set(GGML_SOURCES_SYCL ggml-sycl.cpp)

    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
endif()
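
For context on the `LLAMA_SYCL_F16` option and the `GGML_SYCL_F16` definition it sets, here is a hedged sketch of how such a compile definition can gate half-precision math in a SYCL backend; the actual ggml-sycl.cpp internals are not reproduced here:

```cpp
// Hedged sketch: compile-time selection of the compute type,
// assuming a GGML_SYCL_F16-style define (illustrative only).
#include <sycl/sycl.hpp>

#ifdef GGML_SYCL_F16
using compute_t = sycl::half;  // 16-bit floats when LLAMA_SYCL_F16=ON
#else
using compute_t = float;       // default: 32-bit floats
#endif

// Every kernel written against compute_t then switches precision
// with the build flag, without touching the kernel body.
compute_t scale(compute_t x) { return x * compute_t(0.5f); }
```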



function(get_flags CCID CCVER)
set(C_FLAGS "")
set(CXX_FLAGS "")
@@ -479,10 +515,12 @@ function(get_flags CCID CCVER)
set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
endif()
elseif (CCID MATCHES "Intel")
# enable max optimization level when using Intel compiler
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
add_link_options(-fuse-ld=lld -static-intel)
if (NOT LLAMA_SYCL)
# enable max optimization level when using Intel compiler
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
add_link_options(-fuse-ld=lld -static-intel)
endif()
endif()

set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
@@ -795,6 +833,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
)

target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
11 changes: 10 additions & 1 deletion README.md

@@ -63,7 +63,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
- AVX, AVX2 and AVX512 support for x86 architectures
- Mixed F16 / F32 precision
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
- CUDA, Metal and OpenCL GPU backend support
- CUDA, Metal, OpenCL, SYCL GPU backend support

The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
@@ -597,6 +597,15 @@ Building the program with BLAS support may lead to some performance improvements

You can get a list of platforms and devices from the `clinfo -l` command, etc.

- #### SYCL

SYCL is a higher-level programming model that improves programming productivity across various hardware accelerators.

The SYCL build of llama.cpp supports Intel GPUs (Data Center Max series, Flex series, Arc series, built-in GPU and iGPU).

For detailed info, please refer to [llama.cpp for SYCL](README_sycl.md).
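
The PR also adds an `ls-sycl-device` tool (see the commit list above) for inspecting available devices. As a hedged sketch, a device listing can be built on the standard SYCL 2020 API as follows; the output format is illustrative, not the tool's exact output:

```cpp
// Enumerate all SYCL platforms and their devices (standard SYCL 2020 API).
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
    int id = 0;
    for (const auto &platform : sycl::platform::get_platforms()) {
        std::cout << "Platform: "
                  << platform.get_info<sycl::info::platform::name>() << '\n';
        for (const auto &device : platform.get_devices()) {
            std::cout << "  [" << id++ << "] "
                      << device.get_info<sycl::info::device::name>()
                      << " | compute units: "
                      << device.get_info<sycl::info::device::max_compute_units>()
                      << '\n';
        }
    }
    return 0;
}
```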


### Prepare Data & Run
