
Commit

Merge branch 'master' into HEAD
ggerganov committed Jun 26, 2023
2 parents 67ba34e + 9225bae commit 8f98035
Showing 30 changed files with 3,524 additions and 531 deletions.
39 changes: 18 additions & 21 deletions CMakeLists.txt
@@ -75,6 +75,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -225,6 +226,14 @@ if (LLAMA_BLAS)
endif()
endif()

if (LLAMA_K_QUANTS)
    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
    add_compile_definitions(GGML_USE_K_QUANTS)
    if (LLAMA_QKK_64)
        add_compile_definitions(GGML_QKK_64)
    endif()
endif()

if (LLAMA_CUBLAS)
cmake_minimum_required(VERSION 3.17)

@@ -250,6 +259,15 @@ if (LLAMA_CUBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()

    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        if (LLAMA_CUDA_DMMV_F16)
            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
        else()
            set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard
        endif()
    endif()
    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

else()
message(WARNING "cuBLAS not found")
endif()
@@ -280,11 +298,6 @@ if (LLAMA_METAL)
)
endif()

if (LLAMA_K_QUANTS)
    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
    add_compile_definitions(GGML_USE_K_QUANTS)
endif()

if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
@@ -493,22 +506,6 @@ if (BUILD_SHARED_LIBS)
endif()
endif()

if (GGML_SOURCES_CUDA)
    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "native")
    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")

    set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native")
    set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")

    if (BUILD_SHARED_LIBS)
        set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native")
        set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
    endif()

    set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native")
endif()


#
# programs, examples and tests
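The options touched in this file are ordinary CMake cache variables. A minimal configure sketch using them (the build directory layout and the architecture value are illustrative, not part of this commit):

```sh
# Build with k-quants enabled and the new 64-value super-block variant.
mkdir -p build && cd build
cmake .. -DLLAMA_K_QUANTS=ON -DLLAMA_QKK_64=ON
cmake --build . --config Release

# To pin a GPU generation instead of relying on the new CUDA defaults, set it explicitly, e.g.:
# cmake .. -DLLAMA_CUBLAS=ON -DCMAKE_CUDA_ARCHITECTURES="61"
```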
9 changes: 8 additions & 1 deletion Makefile
@@ -43,8 +43,11 @@ endif

# keep standard at C11 and C++11
# -Ofast tends to produce faster code, but may not be available for some compilers.
#OPT = -Ofast
ifdef LLAMA_FAST
OPT = -Ofast
else
OPT = -O3
endif
CFLAGS = -I. $(OPT) -std=c11 -fPIC
CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
LDFLAGS =
@@ -131,6 +134,10 @@ ifndef LLAMA_NO_K_QUANTS
CFLAGS += -DGGML_USE_K_QUANTS
CXXFLAGS += -DGGML_USE_K_QUANTS
OBJS += k_quants.o
ifdef LLAMA_QKK_64
CFLAGS += -DGGML_QKK_64
CXXFLAGS += -DGGML_QKK_64
endif
endif

ifndef LLAMA_NO_ACCELERATE
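The Makefile gains matching switches; a minimal sketch of passing them on the command line (this particular combination is illustrative):

```sh
# Rebuild with -Ofast instead of -O3 and with the 64-value super-block k-quants.
make clean
make LLAMA_FAST=1 LLAMA_QKK_64=1
```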
29 changes: 19 additions & 10 deletions README.md
@@ -5,16 +5,16 @@
[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)

[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

**Hot topics:**

- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642
- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684
- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607
- Training LLaMA models from scratch: https://github.com/ggerganov/llama.cpp/pull/1652
- CPU threading improvements: https://github.com/ggerganov/llama.cpp/pull/1632
- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
- New roadmap: https://github.com/users/ggerganov/projects/7
- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1

<details>
<summary>Table of Contents</summary>
@@ -33,6 +33,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
<li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
<li><a href="#using-openllama">Using OpenLLaMA</a></li>
<li><a href="#using-gpt4all">Using GPT4All</a></li>
<li><a href="#using-pygmalion-7b--metharme-7b">Using Pygmalion 7B & Metharme 7B</a></li>
<li><a href="#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data">Obtaining the Facebook LLaMA original model and Stanford Alpaca model data</a></li>
@@ -344,7 +345,7 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value 2 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
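For example, a CMake configuration that enables cuBLAS and tunes these knobs for an older, slower GPU could look like the sketch below (the values are illustrative, not recommendations):

```sh
# Keep the f16 kernels off (the default) and halve the per-thread k-quant work.
cmake .. -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_KQUANTS_ITER=1
cmake --build . --config Release
```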

- #### CLBlast

@@ -378,7 +379,7 @@ Building the program with BLAS support may lead to some performance improvements
```sh
git clone https://github.com/CNugteren/CLBlast.git
mkdir CLBlast/build
cd CLBLast/build
cd CLBlast/build
cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
cmake --build . --config Release
cmake --install . --prefix /some/path
@@ -547,6 +548,13 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
```

### Using [OpenLLaMA](https://github.com/openlm-research/open_llama)

OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights.

- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face.
- Convert the model to ggml FP16 format using `python convert.py <path to OpenLLaMA directory>` (see the sketch below).
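A minimal end-to-end sketch, assuming the 7B weights were downloaded to `models/open_llama_7b` (the paths and prompt are placeholders):

```sh
# Convert the Hugging Face checkpoint to ggml FP16, quantize it, then run it.
python convert.py models/open_llama_7b
./quantize models/open_llama_7b/ggml-model-f16.bin models/open_llama_7b/ggml-model-q4_0.bin q4_0
./main -m models/open_llama_7b/ggml-model-q4_0.bin -p "Hello, my name is" -n 64
```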

### Using [GPT4All](https://github.com/nomic-ai/gpt4all)

- Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
@@ -676,12 +684,13 @@ Upon completion of the aforementioned steps, you will have successfully compiled
```
GGML_OPENCL_PLATFORM=0
GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH
./main (...)
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
```

For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.

Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
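A wrapper script along those lines might look like the sketch below (the model path and prompt are placeholders, not part of this commit):

```sh
#!/bin/sh
# run-llama.sh - hypothetical helper for re-running on the device.
export GGML_OPENCL_PLATFORM=0
export GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
./main -m models/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 128
```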

### Docker

#### Prerequisites
91 changes: 44 additions & 47 deletions build.zig
@@ -1,61 +1,58 @@
const std = @import("std");

// Zig Version: 0.11.0-dev.3379+629f0d23b
pub fn build(b: *std.build.Builder) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardReleaseOptions();
const want_lto = b.option(bool, "lto", "Want -fLTO");

const lib = b.addStaticLibrary("llama", null);
lib.want_lto = want_lto;
lib.setTarget(target);
lib.setBuildMode(optimize);
const optimize = b.standardOptimizeOption(.{});
const lib = b.addStaticLibrary(.{
.name = "llama",
.target = target,
.optimize = optimize,
});
lib.linkLibC();
lib.linkLibCpp();
lib.addIncludePath(".");
lib.addIncludePath("examples");
lib.addIncludePath("./examples");
lib.addCSourceFiles(&.{
"ggml.c",
}, &.{"-std=c11"});
lib.addCSourceFiles(&.{
"llama.cpp",
}, &.{"-std=c++11"});
lib.install();

const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };

const exe = build_example("main", build_args);
_ = build_example("quantize", build_args);
_ = build_example("perplexity", build_args);
_ = build_example("embedding", build_args);

// create "zig build run" command for ./main

const run_cmd = exe.run();
run_cmd.step.dependOn(b.getInstallStep());
if (b.args) |args| {
run_cmd.addArgs(args);
b.installArtifact(lib);

const examples = .{
"main",
"baby-llama",
"embedding",
// "metal",
"perplexity",
"quantize",
"quantize-stats",
"save-load-state",
// "server",
"simple",
"train-text-from-scratch",
};

inline for (examples) |example_name| {
const exe = b.addExecutable(.{
.name = example_name,
.target = target,
.optimize = optimize,
});
exe.addIncludePath(".");
exe.addIncludePath("./examples");
exe.addCSourceFiles(&.{
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
"examples/common.cpp",
}, &.{"-std=c++11"});
exe.linkLibrary(lib);
b.installArtifact(exe);
const run_cmd = b.addRunArtifact(exe);
run_cmd.step.dependOn(b.getInstallStep());
if (b.args) |args| run_cmd.addArgs(args);
const run_step = b.step("run_" ++ example_name, "Run the app");
run_step.dependOn(&run_cmd.step);
}

const run_step = b.step("run", "Run the app");
run_step.dependOn(&run_cmd.step);
}

fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
const b = args.b;
const lib = args.lib;
const want_lto = args.want_lto;

const exe = b.addExecutable(name, null);
exe.want_lto = want_lto;
lib.setTarget(args.target);
lib.setBuildMode(args.optimize);
exe.addIncludePath(".");
exe.addIncludePath("examples");
exe.addCSourceFiles(&.{
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
"examples/common.cpp",
}, &.{"-std=c++11"});
exe.linkLibrary(lib);
exe.install();

return exe;
}
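With the rewritten script, each example is installed as its own artifact and gets a `run_<example>` step. A usage sketch, assuming the Zig version noted at the top of the file (the model path is a placeholder):

```sh
# Build everything in release mode, then run ./main through its generated run step.
zig build -Doptimize=ReleaseFast
zig build run_main -- -m models/7B/ggml-model-q4_0.bin -p "Hello" -n 32
```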
