Merge remote-tracking branch 'upstream/concedo'

YellowRoseCx · Jul 8, 2023 · 912e31e · 912e31e
2 parents 74e2703 + ddaa4f2
commit 912e31e
Show file tree

Hide file tree

Showing 37 changed files with 6,583 additions and 2,127 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -44,6 +44,7 @@ endif()
 option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS "llama: use hipBLAS" ON)
@@ -77,8 +78,11 @@ if (LLAMA_CUBLAS)
  set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
 
  add_compile_definitions(GGML_USE_CUBLAS)
+ add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me
+
  add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
  add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
+ add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) 
  if (LLAMA_CUDA_DMMV_F16)
  add_compile_definitions(GGML_CUDA_DMMV_F16)
  endif()
@@ -90,6 +94,15 @@ if (LLAMA_CUBLAS)
  set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
  endif()
 
+ if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+ if (LLAMA_CUDA_DMMV_F16)
+ set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
+ else()
+ set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
+ endif()
+ endif()
+ message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
  else()
  message(WARNING "cuBLAS not found")
  endif()

diff --git a/Makefile b/Makefile
@@ -144,16 +144,18 @@ ifdef LLAMA_CUBLAS
  CUBLASLD_FLAGS = -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
  CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
  NVCC = nvcc
- NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+ NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_FORCE_DMMV
 ifdef LLAMA_CUDA_DMMV_X
  NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
  NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
 ifdef LLAMA_CUDA_DMMV_Y
+ NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
  NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
 else
  NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
+ NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_DMMV_Y
 ifdef LLAMA_CUDA_DMMV_F16
  NVCCFLAGS += -DGGML_CUDA_DMMV_F16

diff --git a/convert.py b/convert.py
@@ -154,9 +154,15 @@ def guessed(model: 'LazyModel') -> 'Params':
  # try transformer naming first
  if "model.layers.0.self_attn.q_proj.weight" in model:
  n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+ elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
+ n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
  else:
  n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
 
+ if n_layer < 1:
+ raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+ "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+
  n_head=n_embd // 128 # guessed
 
  return Params(

diff --git a/examples/alpaca.sh b/examples/alpaca.sh
@@ -7,7 +7,7 @@
 cd `dirname $0`
 cd ..
 
-./main -m ./models/ggml-alpaca-7b-q4.bin \
+./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
  --color \
  -f ./prompts/alpaca.txt \
  --ctx_size 2048 \

diff --git a/examples/common.h b/examples/common.h
@@ -31,7 +31,7 @@ struct gpt_params {
  int32_t n_gpu_layers = 0; // number of layers to store in VRAM
  int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
  float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
- bool  low_vram  = 0; // if true, reduce VRAM usage at the cost of performance
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
 
  // sampling parameters
  std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -59,6 +59,7 @@ struct gpt_params {
  std::string lora_adapter = ""; // lora adapter path
  std::string lora_base = ""; // base model path for the lora adapter
 
+ bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
  bool memory_f16 = true; // use f16 instead of f32 for memory kv
  bool random_prompt = false; // do not randomize prompt if none provided
  bool use_color = false; // use color to distinguish generations and inputs

diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp
@@ -29,7 +29,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
 
  fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
- if (params.seed < 0) {
+ if (params.seed == LLAMA_DEFAULT_SEED) {
  params.seed = time(NULL);
  }
  fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
@@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
  params.embedding = true;
 
  if (params.n_ctx > 2048) {
- fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+ fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
  "expect poor results\n", __func__, params.n_ctx);
  }
 

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
@@ -85,7 +85,7 @@ int main(int argc, char ** argv) {
  }
 
  if (params.n_ctx > 2048) {
- fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+ fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
  "expect poor results\n", __func__, params.n_ctx);
  } else if (params.n_ctx < 8) {
  fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
@@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
  params.n_batch = std::min(params.n_batch, params.n_ctx);
 
  if (params.n_ctx > 2048) {
- fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+ fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
  "expect poor results\n", __func__, params.n_ctx);
  }
 

diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
@@ -147,7 +147,7 @@ void test_roundtrip_on_chunk(
  const ggml_tensor * layer,
  int64_t offset,
  int64_t chunk_size,
- const quantize_fns_t & qfns,
+ const ggml_type_traits_t & qfns,
  bool use_reference,
  float * input_scratch,
  char * quantized_scratch,
@@ -163,11 +163,11 @@ void test_roundtrip_on_chunk(
  }
 
  if (use_reference) {
- qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+ qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size);
  } else {
- qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+ qfns.from_float(input_scratch, quantized_scratch, chunk_size);
  }
- qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+ qfns.to_float(quantized_scratch, output_scratch, chunk_size);
 
  update_error_stats(chunk_size, input_scratch, output_scratch, stats);
 }
@@ -177,7 +177,7 @@ void test_roundtrip_on_chunk(
 void test_roundtrip_on_layer(
  std::string & name,
  bool print_layer_stats,
- const quantize_fns_t & qfns,
+ const ggml_type_traits_t & qfns,
  bool use_reference,
  const ggml_tensor * layer,
  std::vector<float> & input_scratch,
@@ -388,8 +388,8 @@ int main(int argc, char ** argv) {
  if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
  continue;
  }
- quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
- if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+ ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+ if (qfns.from_float && qfns.to_float) {
  if (params.verbose) {
  printf("testing %s ...\n", ggml_type_name(type));
  }

diff --git a/examples/server/README.md b/examples/server/README.md
@@ -1,13 +1,13 @@
 # llama.cpp/example/server
 
-This example demonstrates a simple HTTP API server to interact with llama.cpp.
+This example demonstrates a simple HTTP API server and a simple web front end to interact with llama.cpp.
 
 Command line options:
 
 - `--threads N`, `-t N`: Set the number of threads to use during computation.
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
-- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were built with a context of 4096.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
@@ -21,24 +21,22 @@ Command line options:
 - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
 - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
 - `--port`: Set the port to listen. Default: `8080`.
+- `--path`: path from which to serve static files (default examples/server/public)
 - `--embedding`: Enable embedding extraction, Default: disabled.
 
 ## Build
 
-Build llama.cpp with server from repository root with either make or CMake.
+The server is built alongside everything else from the root of the project.
 
 - Using `make`:
 
  ```bash
- LLAMA_BUILD_SERVER=1 make
+ make
  ```
 
 - Using `CMake`:
 
  ```bash
- mkdir build-server
- cd build-server
- cmake -DLLAMA_BUILD_SERVER=ON ..
  cmake --build . --config Release
  ```
 
@@ -59,7 +57,7 @@ server.exe -m models\7B\ggml-model.bin -c 2048
 ```
 
 The above command will start a server that by default listens on `127.0.0.1:8080`.
-You can consume the endpoints with Postman or NodeJS with axios library.
+You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
 
 ## Testing with CURL
 
@@ -190,3 +188,49 @@ Run with bash:
 ```sh
 bash chat.sh
 ```
+
+### API like OAI
+
+API example using Python Flask: [api_like_OAI.py](api_like_OAI.py)
+This example must be used with server.cpp
+
+```sh
+python api_like_OAI.py
+```
+
+After running the API server, you can use it in Python by setting the API base URL.
+```python
+openai.api_base = "http://<Your api-server IP>:port"
+```
+
+Then you can utilize llama.cpp as an OpenAI-style **chat.completion** or **text_completion** API.
+
+### Extending or building alternative Web Front End
+
+The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
+
+Read the documentation in `/completion.js` to see convenient ways to access llama.
+
+A simple example is below:
+
+```html
+<html>
+ <body>
+ <pre>
+ <script type="module">
+ import { llama } from '/completion.js'
+
+ const prompt = `### Instruction:
+Write dad jokes, each one paragraph.
+You can use html formatting if needed.
+
+### Response:`
+
+ for await (const chunk of llama(prompt)) {
+ document.write(chunk.data.content)
+ }
+ </script>
+ </pre>
+ </body>
+</html>
+```