
Commit

Merge branch 'master' into HEAD
ggerganov committed Jun 26, 2023
2 parents 67ba34e + 9225bae commit 8f98035
Showing 30 changed files with 3,524 additions and 531 deletions.
39 changes: 18 additions & 21 deletions CMakeLists.txt
@@ -75,6 +75,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -225,6 +226,14 @@ if (LLAMA_BLAS)
endif()
endif()

if (LLAMA_K_QUANTS)
    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
    add_compile_definitions(GGML_USE_K_QUANTS)
    if (LLAMA_QKK_64)
        add_compile_definitions(GGML_QKK_64)
    endif()
endif()

if (LLAMA_CUBLAS)
cmake_minimum_required(VERSION 3.17)

@@ -250,6 +259,15 @@ if (LLAMA_CUBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()

    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        if (LLAMA_CUDA_DMMV_F16)
            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
        else()
            set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard
        endif()
    endif()
    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

else()
message(WARNING "cuBLAS not found")
endif()
@@ -280,11 +298,6 @@ if (LLAMA_METAL)
)
endif()

if (LLAMA_K_QUANTS)
    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
    add_compile_definitions(GGML_USE_K_QUANTS)
endif()

if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
@@ -493,22 +506,6 @@ if (BUILD_SHARED_LIBS)
endif()
endif()

if (GGML_SOURCES_CUDA)
    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "native")
    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")

    set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native")
    set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")

    if (BUILD_SHARED_LIBS)
        set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native")
        set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
    endif()

    set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native")
endif()


#
# programs, examples and tests
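The options touched in this file are ordinary CMake cache variables. A minimal configure sketch using them (the build directory layout and the architecture value are illustrative, not part of this commit):

```sh
# Build with k-quants enabled and the new 64-value super-block variant.
mkdir -p build && cd build
cmake .. -DLLAMA_K_QUANTS=ON -DLLAMA_QKK_64=ON
cmake --build . --config Release

# To pin a GPU generation instead of relying on the new CUDA defaults, set it explicitly, e.g.:
# cmake .. -DLLAMA_CUBLAS=ON -DCMAKE_CUDA_ARCHITECTURES="61"
```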
9 changes: 8 additions & 1 deletion Makefile
@@ -43,8 +43,11 @@ endif

# keep standard at C11 and C++11
# -Ofast tends to produce faster code, but may not be available for some compilers.
#OPT = -Ofast
ifdef LLAMA_FAST
OPT = -Ofast
else
OPT = -O3
endif
CFLAGS = -I. $(OPT) -std=c11 -fPIC
CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
LDFLAGS =
@@ -131,6 +134,10 @@ ifndef LLAMA_NO_K_QUANTS
CFLAGS += -DGGML_USE_K_QUANTS
CXXFLAGS += -DGGML_USE_K_QUANTS
OBJS += k_quants.o
ifdef LLAMA_QKK_64
CFLAGS += -DGGML_QKK_64
CXXFLAGS += -DGGML_QKK_64
endif
endif

ifndef LLAMA_NO_ACCELERATE
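The Makefile gains matching switches; a minimal sketch of passing them on the command line (this particular combination is illustrative):

```sh
# Rebuild with -Ofast instead of -O3 and with the 64-value super-block k-quants.
make clean
make LLAMA_FAST=1 LLAMA_QKK_64=1
```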
29 changes: 19 additions & 10 deletions README.md
@@ -5,16 +5,16 @@
[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)

[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

**Hot topics:**

- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642
- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684
- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607
- Training LLaMA models from scratch: https://github.com/ggerganov/llama.cpp/pull/1652
- CPU threading improvements: https://github.com/ggerganov/llama.cpp/pull/1632
- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
- New roadmap: https://github.com/users/ggerganov/projects/7
- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1

<details>
<summary>Table of Contents</summary>
@@ -33,6 +33,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
<li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
<li><a href="#using-openllama">Using OpenLLaMA</a></li>
<li><a href="#using-gpt4all">Using GPT4All</a></li>
<li><a href="#using-pygmalion-7b--metharme-7b">Using Pygmalion 7B & Metharme 7B</a></li>
<li><a href="#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data">Obtaining the Facebook LLaMA original model and Stanford Alpaca model data</a></li>
@@ -344,7 +345,7 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value 2 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
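For example, a CMake configuration that enables cuBLAS and tunes these knobs for an older, slower GPU could look like the sketch below (the values are illustrative, not recommendations):

```sh
# Keep the f16 kernels off (the default) and halve the per-thread k-quant work.
cmake .. -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_KQUANTS_ITER=1
cmake --build . --config Release
```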

- #### CLBlast

@@ -378,7 +379,7 @@ Building the program with BLAS support may lead to some performance improvements
```sh
git clone https://github.com/CNugteren/CLBlast.git
mkdir CLBlast/build
cd CLBLast/build
cd CLBlast/build
cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
cmake --build . --config Release
cmake --install . --prefix /some/path
@@ -547,6 +548,13 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
```

### Using [OpenLLaMA](https://github.com/openlm-research/open_llama)

OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights.

- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face.
- Convert the model to ggml FP16 format using `python convert.py <path to OpenLLaMA directory>` (see the sketch below).
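A minimal end-to-end sketch, assuming the 7B weights were downloaded to `models/open_llama_7b` (the paths and prompt are placeholders):

```sh
# Convert the Hugging Face checkpoint to ggml FP16, quantize it, then run it.
python convert.py models/open_llama_7b
./quantize models/open_llama_7b/ggml-model-f16.bin models/open_llama_7b/ggml-model-q4_0.bin q4_0
./main -m models/open_llama_7b/ggml-model-q4_0.bin -p "Hello, my name is" -n 64
```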

### Using [GPT4All](https://github.com/nomic-ai/gpt4all)

- Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
@@ -676,12 +684,13 @@ Upon completion of the aforementioned steps, you will have successfully compiled
```
GGML_OPENCL_PLATFORM=0
GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH
./main (...)
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
```

For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.

Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
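A wrapper script along those lines might look like the sketch below (the model path and prompt are placeholders, not part of this commit):

```sh
#!/bin/sh
# run-llama.sh - hypothetical helper for re-running on the device.
export GGML_OPENCL_PLATFORM=0
export GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
./main -m models/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 128
```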

### Docker

#### Prerequisites
91 changes: 44 additions & 47 deletions build.zig
@@ -1,61 +1,58 @@
const std = @import("std");

// Zig Version: 0.11.0-dev.3379+629f0d23b
pub fn build(b: *std.build.Builder) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardReleaseOptions();
const want_lto = b.option(bool, "lto", "Want -fLTO");

const lib = b.addStaticLibrary("llama", null);
lib.want_lto = want_lto;
lib.setTarget(target);
lib.setBuildMode(optimize);
const optimize = b.standardOptimizeOption(.{});
const lib = b.addStaticLibrary(.{
.name = "llama",
.target = target,
.optimize = optimize,
});
lib.linkLibC();
lib.linkLibCpp();
lib.addIncludePath(".");
lib.addIncludePath("examples");
lib.addIncludePath("./examples");
lib.addCSourceFiles(&.{
"ggml.c",
}, &.{"-std=c11"});
lib.addCSourceFiles(&.{
"llama.cpp",
}, &.{"-std=c++11"});
lib.install();

const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };

const exe = build_example("main", build_args);
_ = build_example("quantize", build_args);
_ = build_example("perplexity", build_args);
_ = build_example("embedding", build_args);

// create "zig build run" command for ./main

const run_cmd = exe.run();
run_cmd.step.dependOn(b.getInstallStep());
if (b.args) |args| {
run_cmd.addArgs(args);
b.installArtifact(lib);

const examples = .{
"main",
"baby-llama",
"embedding",
// "metal",
"perplexity",
"quantize",
"quantize-stats",
"save-load-state",
// "server",
"simple",
"train-text-from-scratch",
};

inline for (examples) |example_name| {
const exe = b.addExecutable(.{
.name = example_name,
.target = target,
.optimize = optimize,
});
exe.addIncludePath(".");
exe.addIncludePath("./examples");
exe.addCSourceFiles(&.{
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
"examples/common.cpp",
}, &.{"-std=c++11"});
exe.linkLibrary(lib);
b.installArtifact(exe);
const run_cmd = b.addRunArtifact(exe);
run_cmd.step.dependOn(b.getInstallStep());
if (b.args) |args| run_cmd.addArgs(args);
const run_step = b.step("run_" ++ example_name, "Run the app");
run_step.dependOn(&run_cmd.step);
}

const run_step = b.step("run", "Run the app");
run_step.dependOn(&run_cmd.step);
}

fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
const b = args.b;
const lib = args.lib;
const want_lto = args.want_lto;

const exe = b.addExecutable(name, null);
exe.want_lto = want_lto;
lib.setTarget(args.target);
lib.setBuildMode(args.optimize);
exe.addIncludePath(".");
exe.addIncludePath("examples");
exe.addCSourceFiles(&.{
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
"examples/common.cpp",
}, &.{"-std=c++11"});
exe.linkLibrary(lib);
exe.install();

return exe;
}
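With the rewritten script, each example is installed as its own artifact and gets a `run_<example>` step. A usage sketch, assuming the Zig version noted at the top of the file (the model path is a placeholder):

```sh
# Build everything in release mode, then run ./main through its generated run step.
zig build -Doptimize=ReleaseFast
zig build run_main -- -m models/7B/ggml-model-q4_0.bin -p "Hello" -n 32
```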
