Merge remote-tracking branch 'upstream/concedo'

YellowRoseCx committed Jun 29, 2023
2 parents 096f0b0 + 1347d3a commit 69a0c25
Showing 54 changed files with 5,154 additions and 860 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -1,5 +1,6 @@
*.o
*.a
+*.so
.DS_Store
.build/
.cache/
@@ -36,6 +37,7 @@ out/
/vdot
/server
/Pipfile
+/embd-input-test
/libllama.so

arm_neon.h
@@ -64,4 +66,5 @@ koboldcpp.dll
koboldcpp_failsafe.dll
koboldcpp_openblas.dll
koboldcpp_openblas_noavx2.dll
-koboldcpp_clblast.dll
+koboldcpp_clblast.dll
+koboldcpp_cublas.dll
14 changes: 8 additions & 6 deletions CMakeLists.txt
@@ -1,5 +1,5 @@
-# DO NOT USE THIS FILE.
-# IT'S ONLY FOR CUBLAS BUILD PURPOSES ON WINDOWS VISUAL STUDIO.
+# DO NOT USE THIS FILE.
+# IT'S ONLY FOR CUBLAS BUILD PURPOSES ON WINDOWS VISUAL STUDIO.
# IT WILL NOT BE UPDATED OR MAINTAINED !!!

message(STATUS "============== ============== ==============")
@@ -69,6 +69,7 @@ if (LLAMA_CUBLAS)

set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
+set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)

add_compile_definitions(GGML_USE_CUBLAS)

@@ -259,7 +260,8 @@ set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)
add_library(ggml_v2 OBJECT
otherarch/ggml_v2.c
otherarch/ggml_v2.h
-${GGML_V2_CUDA_SOURCES})
+${GGML_V2_CUDA_SOURCES}
+${GGML_V2_LEGACY_CUDA_SOURCES})
target_include_directories(ggml_v2 PUBLIC . ./otherarch ./otherarch/tools)
target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump
target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
@@ -273,7 +275,7 @@ target_compile_features(common2 PUBLIC cxx_std_11) # don't bump
target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON)

-add_library(gpttype_adapter
+add_library(gpttype_adapter
gpttype_adapter.cpp)
target_include_directories(gpttype_adapter PUBLIC . ./otherarch ./otherarch/tools ./examples)
target_compile_features(gpttype_adapter PUBLIC cxx_std_11) # don't bump
@@ -287,12 +289,12 @@ if (GGML_CUDA_SOURCES)
set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
endif()

-set(TARGET koboldcpp)
+set(TARGET koboldcpp_cublas)
add_library(${TARGET} SHARED expose.cpp expose.h)
target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples)
target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
set_target_properties(${TARGET} PROPERTIES PREFIX "")
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp")
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${TARGET} PUBLIC ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
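With this change, a CMake build produces a library named `koboldcpp_cublas` instead of `koboldcpp`. A minimal sketch of the Windows workflow the file header describes, assuming CMake, Visual Studio and the CUDA toolkit are installed (the `LLAMA_CUBLAS` option name is taken from the `if (LLAMA_CUBLAS)` guard above; everything else is an assumption about your setup):

```sh
# Hypothetical out-of-tree CuBLAS build using the provided CMakeLists.txt
mkdir build && cd build
cmake .. -DLLAMA_CUBLAS=ON        # enable the cuBLAS code path
cmake --build . --config Release  # should emit koboldcpp_cublas.dll
```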
52 changes: 34 additions & 18 deletions Makefile
@@ -1,4 +1,4 @@
-default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast
+default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast koboldcpp_cublas
tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt
dev: koboldcpp_openblas
dev2: koboldcpp_clblast
@@ -53,6 +53,9 @@ NONECFLAGS =
OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
CLBLAST_FLAGS = -DGGML_USE_CLBLAST
FAILSAFE_FLAGS = -DUSE_FAILSAFE
+CUBLAS_FLAGS = -DGGML_USE_CUBLAS
+CUBLASLD_FLAGS =
+CUBLAS_OBJS =

#lets try enabling everything
CFLAGS += -pthread -s
@@ -133,10 +136,9 @@ endif

# it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
ifdef LLAMA_CUBLAS
-CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
-OBJS += ggml-cuda.o ggml_v2-cuda.o
+CUBLAS_FLAGS = -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+CUBLASLD_FLAGS = -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
ifdef LLAMA_CUDA_DMMV_X
@@ -158,9 +160,11 @@ else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
-$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h
+$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS

ifdef LLAMA_HIPBLAS
@@ -225,15 +229,19 @@ FAILSAFE_BUILD =
OPENBLAS_BUILD =
OPENBLAS_NOAVX2_BUILD =
CLBLAST_BUILD =
-CLBLAST_NOAVX2_BUILD =
+CUBLAS_BUILD =

ifeq ($(OS),Windows_NT)
DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)
-CLBLAST_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)

+ifdef LLAMA_CUBLAS
+CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.dll $(CUBLASLD_FLAGS) $(LDFLAGS)
+endif

else
DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
@@ -244,20 +252,26 @@ else
ifdef LLAMA_CLBLAST
ifeq ($(UNAME_S),Darwin)
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
-CLBLAST_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
else
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
-CLBLAST_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
endif
endif

+ifdef LLAMA_CUBLAS
+CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS)
+endif

ifndef LLAMA_OPENBLAS
ifndef LLAMA_CLBLAST
+ifndef LLAMA_CUBLAS
OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.'
endif
+endif
endif
endif



#
# Print build information
#
@@ -287,8 +301,8 @@ ggml_openblas_noavx2.o: ggml.c ggml.h
$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
ggml_clblast.o: ggml.c ggml.h
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
-ggml_clblast_noavx2.o: ggml.c ggml.h
-$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
+ggml_cublas.o: ggml.c ggml.h
+$(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@

#quants K
k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
@@ -309,8 +323,8 @@ ggml_v2_openblas_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
-ggml_v2_clblast_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
-$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
+ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
+$(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@

#extreme old version compat
ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
@@ -339,9 +353,11 @@ gpttype_adapter.o: gpttype_adapter.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@
gpttype_adapter_clblast.o: gpttype_adapter.cpp
$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
+gpttype_adapter_cublas.o: gpttype_adapter.cpp
+$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@

clean:
-rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so
+rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so

main: examples/main/main.cpp build-info.h ggml.o k_quants.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -360,8 +376,8 @@ koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v2_openblas_noavx2.o ggml
$(OPENBLAS_NOAVX2_BUILD)
koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o $(OBJS)
$(CLBLAST_BUILD)
-koboldcpp_clblast_noavx2: ggml_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants_noavx2.o $(OBJS)
-$(CLBLAST_NOAVX2_BUILD)
+koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o $(CUBLAS_OBJS) $(OBJS)
+$(CUBLAS_BUILD)

quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
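Taken together, the Makefile changes move all CUDA-specific options into `CUBLAS_FLAGS`, `CUBLASLD_FLAGS` and `CUBLAS_OBJS`, so only the cuBLAS objects and the `koboldcpp_cublas` target pick them up rather than every build flavor. A sketch of driving the new target on Linux, assuming `nvcc` is on the PATH and `CUDA_PATH` points at a CUDA install (the path is an assumption):

```sh
# Build only the new cuBLAS shared library, koboldcpp_cublas.so
CUDA_PATH=/usr/local/cuda make LLAMA_CUBLAS=1 koboldcpp_cublas
```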
6 changes: 3 additions & 3 deletions README.md
@@ -24,7 +24,7 @@ What does it mean? You get llama.cpp with a fancy UI, persistent stories, editin
![Preview](media/preview.png)

## Usage
-- [Download the latest release here](https://github.com/LostRuins/koboldcpp/releases/latest) or clone the repo.
+- **[Download the latest .exe release here](https://github.com/LostRuins/koboldcpp/releases/latest)** or clone the git repo.
- Windows binaries are provided in the form of **koboldcpp.exe**, which is a pyinstaller wrapper for a few **.dll** files and **koboldcpp.py**. If you feel concerned, you may prefer to rebuild it yourself with the provided makefiles and scripts.
- Weights are not included, you can use the official llama.cpp `quantize.exe` to generate them from your official weight files (or download them from other places).
- To run, execute **koboldcpp.exe** or drag and drop your quantized `ggml_model.bin` file onto the .exe, and then connect with Kobold or Kobold Lite. If you're not on windows, then run the script **KoboldCpp.py** after compiling the libraries.
@@ -40,9 +40,9 @@ For more information, be sure to run the program with the `--help` flag.
- You will have to compile your binaries from source. A makefile is provided, simply run `make`
- If you want you can also link your own install of OpenBLAS manually with `make LLAMA_OPENBLAS=1`
- Alternatively, if you want you can also link your own install of CLBlast manually with `make LLAMA_CLBLAST=1`, for this you will need to obtain and link OpenCL and CLBlast libraries.
-- For a full featured build, do `make LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1`
- For Arch Linux: Install `cblas` `openblas` and `clblast`.
- For Debian: Install `libclblast-dev` and `libopenblas-dev`.
+- For a full featured build, do `make LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1`
- After all binaries are built, you can run the python script with the command `koboldcpp.py [ggml_model.bin] [port]`
- Note: Many OSX users have found that using Accelerate is actually faster than OpenBLAS. To try, you may wish to run with `--noblas` and compare speeds.
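For reference, the updated instructions boil down to something like the following on Linux, assuming OpenBLAS, CLBlast and the CUDA toolkit are all installed (the model filename and port are placeholders):

```sh
# Full-featured build, then launch with a quantized model
make LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1
python koboldcpp.py ggml_model.bin 5001
```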

@@ -65,7 +65,7 @@ For more information, be sure to run the program with the `--help` flag.
- See https://github.com/ggerganov/llama.cpp/pull/1828/files

## CuBLAS?
-- You can attempt a CuBLAS build with LLAMA_CUBLAS=1 or using the provided CMake file (best for visual studio users). Note that support for CuBLAS is limited.
+- You can attempt a CuBLAS build with `LLAMA_CUBLAS=1` or using the provided CMake file (best for visual studio users). If you use the CMake file to build, copy the `koboldcpp_cublas.dll` generated into the same directory as the `koboldcpp.py` file. If you are bundling executables, you may need to include CUDA dynamic libraries (such as `cublasLt64_11.dll` and `cublas64_11.dll`) in order for the executable to work correctly on a different PC. Note that support for CuBLAS is limited.

## Considerations
- For Windows: No installation, single file executable, (It Just Works)
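Following the CuBLAS note above, a hypothetical packaging step after a CMake build might look like this; the DLL names assume CUDA 11 and the `build/Release` output path assumes a Visual Studio generator, so verify both against your toolkit:

```sh
# Copy the built library next to koboldcpp.py, plus the CUDA runtime
# DLLs needed when bundling for machines without CUDA installed
cp build/Release/koboldcpp_cublas.dll .
cp "$CUDA_PATH/bin/cublas64_11.dll" .
cp "$CUDA_PATH/bin/cublasLt64_11.dll" .
```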
6 changes: 5 additions & 1 deletion convert-lora-to-ggml.py
@@ -113,14 +113,18 @@ def write_tensor_header(

write_file_header(fout, params)
for k, v in model.items():
if k.endswith(".default.weight"):
k = k.replace(".default.weight", ".weight")
if k in ["llama_proj.weight", "llama_proj.bias"]:
continue
if k.endswith("lora_A.weight"):
if v.dtype != torch.float16 and v.dtype != torch.float32:
v = v.float()
v = v.T
else:
v = v.float()

-t = v.numpy()
+t = v.detach().numpy()
tname = translate_tensor_name(k)
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype)
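The converter now strips the `.default.weight` suffix that PEFT adds, skips the `llama_proj` multimodal tensors, and calls `detach()` before `numpy()` so tensors that still carry gradients can be exported. A hypothetical invocation (the adapter directory is a placeholder; the script reads the PEFT files inside it):

```sh
# Convert a PEFT LoRA adapter directory to a ggml LoRA file
python convert-lora-to-ggml.py ./my-lora-adapter
```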
Binary file added cudart64_110.dll
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -39,6 +39,7 @@ else()
add_subdirectory(baby-llama)
add_subdirectory(train-text-from-scratch)
add_subdirectory(simple)
+add_subdirectory(embd-input)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()
12 changes: 6 additions & 6 deletions examples/baby-llama/baby-llama.cpp
@@ -566,8 +566,8 @@ struct ggml_tensor * forward(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, 1]
// Kcur shape [n_embd/n_head, n_head, N, 1]
-struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
-struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);

// store key and value to memory
{
@@ -823,8 +823,8 @@ struct ggml_tensor * forward_batch(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, n_batch]
// Kcur shape [n_embd/n_head, n_head, N, n_batch]
-struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
-struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
+struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

@@ -1116,7 +1116,7 @@ struct ggml_tensor * forward_lora(
model->layers[il].wqb,
cur)),
n_embd/n_head, n_head, N),
-n_past, n_rot, 0);
+n_past, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0,
ggml_reshape_3d(ctx0,
ggml_mul_mat(ctx0,
Expand All @@ -1125,7 +1125,7 @@ struct ggml_tensor * forward_lora(
model->layers[il].wkb,
cur)),
n_embd/n_head, n_head, N),
-n_past, n_rot, 0);
+n_past, n_rot, 0, 0);

// store key and value to memory
{
12 changes: 5 additions & 7 deletions examples/common.cpp
@@ -343,6 +343,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.use_mmap = false;
} else if (arg == "--mtest") {
params.mem_test = true;
} else if (arg == "--numa") {
params.numa = true;
} else if (arg == "--export") {
params.export_cgraph = true;
} else if (arg == "--verbose-prompt") {
@@ -414,13 +416,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
exit(1);
}

-#ifdef GGML_USE_CUBLAS
-if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
-fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
-exit(1);
-}
-#endif // GGML_USE_CUBLAS

if (escape_prompt) {
process_escapes(params.prompt);
}
@@ -488,6 +483,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
if (llama_mmap_supported()) {
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
fprintf(stderr, " --numa attempt optimizations that help on some NUMA systems\n");
fprintf(stderr, " if run without this previously, it is recommended to drop the system page cache before using this\n");
fprintf(stderr, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
fprintf(stderr, " number of layers to store in VRAM\n");
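The new `--numa` switch sets `params.numa`, and the usage text suggests dropping the system page cache if the model was previously loaded without it. A sketch of trying it with the `main` example (the model path and prompt are placeholders):

```sh
# Optionally drop the page cache first, per the usage note above
sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'
./main -m ggml_model.bin --numa -p "Hello"
```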
1 change: 1 addition & 0 deletions examples/common.h
@@ -76,6 +76,7 @@ struct gpt_params {
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
+bool numa = false; // attempt optimizations that help on some NUMA systems
bool export_cgraph = false; // export the computation graph
bool verbose_prompt = false; // print prompt tokens before generation
};
4 changes: 4 additions & 0 deletions examples/embd-input/.gitignore
@@ -0,0 +1,4 @@
+PandaGPT
+MiniGPT-4
+*.pth
+
(The remaining changed files are not shown.)
