Merge remote-tracking branch 'upstream/concedo'
YellowRoseCx committed Jul 1, 2023
2 parents 2c3b46f + eda663f commit 540f4e0
Showing 16 changed files with 78 additions and 36 deletions.
3 changes: 1 addition & 2 deletions CMakeLists.txt
@@ -46,7 +46,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
-option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
+option(LLAMA_HIPBLAS "llama: use hipBLAS" ON)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)


@@ -339,4 +339,3 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${TARGET} PUBLIC ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

8 changes: 6 additions & 2 deletions Makefile
@@ -42,7 +42,7 @@ endif

# keep standard at C11 and C++11
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS
-CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -O3 -DNDEBUG -std=c++11 -fPIC
+CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -O3 -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
LDFLAGS =

# these are used on windows, to build some libraries with extra old device compatibility
@@ -53,7 +53,11 @@ NONECFLAGS =
OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
CLBLAST_FLAGS = -DGGML_USE_CLBLAST
FAILSAFE_FLAGS = -DUSE_FAILSAFE
-CUBLAS_FLAGS = -DGGML_USE_CUBLAS
+ifdef LLAMA_CUBLAS
+CUBLAS_FLAGS = -DGGML_USE_CUBLAS
+else
+CUBLAS_FLAGS =
+endif
CUBLASLD_FLAGS =
CUBLAS_OBJS =

2 changes: 1 addition & 1 deletion examples/common.cpp
@@ -110,7 +110,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
-params.seed = std::stoi(argv[i]);
+params.seed = std::stoul(argv[i]);
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
2 changes: 1 addition & 1 deletion examples/common.h
@@ -22,7 +22,7 @@
int32_t get_num_physical_cores();

struct gpt_params {
-int32_t seed = -1; // RNG seed
+uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
4 changes: 2 additions & 2 deletions examples/embedding/embedding.cpp
@@ -24,11 +24,11 @@ int main(int argc, char ** argv) {

fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

-if (params.seed < 0) {
+if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}

-fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);

std::mt19937 rng(params.seed);
if (params.random_prompt) {
2 changes: 1 addition & 1 deletion examples/main/README.md
@@ -242,7 +242,7 @@ Example usage: `--logit-bias 29905-inf`

### RNG Seed

-- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
+- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).

The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
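
Because the seed is now stored as an unsigned 32-bit integer, passing -1 still selects a random seed: the value wraps to 0xFFFFFFFF, which is exactly the `LLAMA_DEFAULT_SEED` sentinel added to llama.h further down in this diff. A minimal standalone sketch of that equivalence (not part of the commit):

// Sketch only: shows why "-1" still means "random seed" with an unsigned 32-bit seed.
#include <cstdint>
#include <cstdio>

#define LLAMA_DEFAULT_SEED 0xFFFFFFFF   // same sentinel that this commit adds to llama.h

int main() {
    uint32_t seed = -1;                              // wraps around to 4294967295
    std::printf("%u\n", seed);                       // prints 4294967295
    std::printf("%d\n", seed == LLAMA_DEFAULT_SEED); // prints 1: -1 maps onto the "random" sentinel
    return 0;
}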

4 changes: 2 additions & 2 deletions examples/main/main.cpp
@@ -94,11 +94,11 @@ int main(int argc, char ** argv) {

fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

-if (params.seed < 0) {
+if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}

-fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);

std::mt19937 rng(params.seed);
if (params.random_prompt) {
4 changes: 2 additions & 2 deletions examples/perplexity/perplexity.cpp
@@ -136,11 +136,11 @@ int main(int argc, char ** argv) {

fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

-if (params.seed < 0) {
+if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}

-fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);

std::mt19937 rng(params.seed);
if (params.random_prompt) {
2 changes: 1 addition & 1 deletion examples/server/README.md
@@ -152,7 +152,7 @@ node .

`mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).

-`seed`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
+`seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).

`ignore_eos`: Ignore end of stream token and continue generating (default: false).

6 changes: 3 additions & 3 deletions examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -2768,7 +2768,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in);
fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out);
fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out);
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n");
fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx);
fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd);
fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult);
@@ -3034,10 +3034,10 @@ int main(int argc, char ** argv) {
return 1;
}

-if (params.seed < 0) {
+if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
printf("%s: seed: %d\n", __func__, params.seed);
printf("%s: seed: %u\n", __func__, params.seed);
srand(params.seed);

struct llama_context_params llama_params = llama_context_default_params();
6 changes: 3 additions & 3 deletions klite.embd

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion koboldcpp.py
@@ -236,7 +236,7 @@ def utfprint(str):
maxhordelen = 256
modelbusy = False
defaultport = 5001
KcppVersion = "1.33"
KcppVersion = "1.34"
showdebug = True

class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
49 changes: 41 additions & 8 deletions llama.cpp
@@ -12,7 +12,8 @@
#include "ggml.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
-#elif defined(GGML_USE_CLBLAST)
+#endif
+#if defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif

@@ -66,6 +67,7 @@ enum e_model {
MODEL_65B,
};

static const size_t kB = 1024;
static const size_t MB = 1024*1024;

// computed for n_ctx == 2048
@@ -129,6 +131,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
return k_sizes;
}

// amount of VRAM needed per batch size to hold temporary results
// the values for 3b and 65b are not derived from testing but instead chosen conservatively
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
{
static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, 512ull * kB },
{ MODEL_7B, 512ull * kB },
{ MODEL_13B, 640ull * kB },
{ MODEL_30B, 768ull * kB },
{ MODEL_65B, 1536ull * kB },
};
return k_sizes;
}

// amount of VRAM needed per batch size and context to hold temporary results
// the values for 3b and 65b are not derived from testing but instead chosen conservatively
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
{
static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, 128ull },
{ MODEL_7B, 128ull },
{ MODEL_13B, 160ull },
{ MODEL_30B, 208ull },
{ MODEL_65B, 416ull },
};
return k_sizes;
}

// default hparams (LLaMA 7B)
struct llama_hparams {
uint32_t n_vocab = 32000;
@@ -777,7 +807,7 @@ static bool kv_cache_init(

struct llama_context_params llama_context_default_params() {
struct llama_context_params result = {
-/*.seed =*/ -1,
+/*.seed =*/ LLAMA_DEFAULT_SEED,
/*.n_ctx =*/ 512,
/*.n_batch =*/ 512,
/*.gpu_layers =*/ 0,
@@ -1113,11 +1143,14 @@ static void llama_model_load_internal(
fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
ggml_cuda_set_scratch_size(0); // disable scratch
} else {
-vram_scratch = n_batch * MB;
+const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
ggml_cuda_set_scratch_size(vram_scratch);
if (n_gpu_layers > 0) {
fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
__func__, vram_scratch / MB);
fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
__func__, vram_scratch_base / kB, vram_scratch_per_context,
(vram_scratch + MB - 1) / MB); // round up
}
}
#endif // GGML_USE_CUBLAS
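
The replaced line allocated a flat n_batch * 1 MB scratch buffer; the new code scales it per model type and per context length. As a rough worked example (n_batch = 512 and n_ctx = 2048 are assumed values, not taken from this commit), the MODEL_7B entries give 512 * (512 kB + 2048 * 128 B) = 384 MB. A standalone sketch of the same arithmetic:

// Sketch only: reproduces the new scratch-size formula with assumed n_batch/n_ctx values.
#include <cstddef>
#include <cstdio>

int main() {
    // MODEL_7B constants copied from VRAM_REQ_SCRATCH_BASE() and VRAM_REQ_SCRATCH_PER_CONTEXT() above.
    const std::size_t kB = 1024, MB = 1024 * 1024;
    const std::size_t n_batch = 512;                        // assumed batch size
    const std::size_t n_ctx   = 2048;                       // assumed context size
    const std::size_t vram_scratch_base        = 512 * kB;  // per batch element
    const std::size_t vram_scratch_per_context = 128;       // per batch element, per context token

    const std::size_t vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
    std::printf("%zu bytes = %zu MB\n", vram_scratch, (vram_scratch + MB - 1) / MB);
    // prints: 402653184 bytes = 384 MB
    return 0;
}
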
@@ -2540,7 +2573,7 @@ struct llama_context * llama_new_context_with_model(

llama_context * ctx = new llama_context(*model, model->vocab);

-if (params.seed < 0) {
+if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}

@@ -2974,8 +3007,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {

#define LLAMA_MAX_RNG_STATE (64*1024)

-void llama_set_rng_seed(struct llama_context * ctx, int seed) {
-if (seed < 0) {
+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+if (seed == LLAMA_DEFAULT_SEED) {
seed = time(NULL);
}
ctx->rng.seed(seed);
14 changes: 8 additions & 6 deletions llama.h
@@ -46,6 +46,8 @@
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 1

#define LLAMA_DEFAULT_SEED 0xFFFFFFFF

#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
@@ -81,11 +83,11 @@ extern "C" {
typedef void (*llama_progress_callback)(float progress, void *ctx);

struct llama_context_params {
-int seed; // RNG seed, -1 for random
-int n_ctx; // text context
-int n_batch; // prompt processing batch size
-int n_gpu_layers; // number of layers to store in VRAM
-int main_gpu; // the GPU that is used for scratch and small tensors
+uint32_t seed; // RNG seed, -1 for random
+int32_t n_ctx; // text context
+int32_t n_batch; // prompt processing batch size
+int32_t n_gpu_layers; // number of layers to store in VRAM
+int32_t main_gpu; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
@@ -196,7 +198,7 @@ extern "C" {
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);

// Sets the current rng seed.
-LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);

// Returns the maximum size in bytes of the state (rng, logits, embedding
// and kv_cache) - will often be smaller after compacting tokens
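
For API consumers, the practical effect of these header changes is that llama_context_params.seed and llama_set_rng_seed() now take an unsigned value, with LLAMA_DEFAULT_SEED requesting a time-based seed. A minimal usage sketch (the helper and seed values are illustrative; it assumes building against this tree's llama.h and library):

// Sketch only: illustrates the unsigned-seed convention introduced in llama.h above.
#include <cstdint>
#include <cstdio>
#include "llama.h"

static void report_seed(const struct llama_context_params & params) {
    // LLAMA_DEFAULT_SEED (0xFFFFFFFF) asks the library to pick a time-based seed,
    // matching the `params.seed == LLAMA_DEFAULT_SEED` checks added in this commit.
    std::printf("seed = %u (%s)\n", params.seed,
                params.seed == LLAMA_DEFAULT_SEED ? "random at init" : "fixed");
}

int main() {
    struct llama_context_params params = llama_context_default_params();
    report_seed(params);   // default is LLAMA_DEFAULT_SEED -> time-based seed
    params.seed = 42;      // any other value gives reproducible sampling
    report_seed(params);
    return 0;
}
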
4 changes: 3 additions & 1 deletion otherarch/gpt2_v3.cpp
@@ -18,10 +18,12 @@

#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
-#elif defined(GGML_USE_CLBLAST)
+#endif
+#if defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif


// load the model's weights from a file
ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
2 changes: 2 additions & 0 deletions otherarch/llama_v2.cpp
@@ -9,13 +9,15 @@
#include "llama_v2.h"

#include "ggml_v2.h"

#ifdef GGML_USE_CUBLAS
#include "ggml_v2-cuda.h"
#endif
#if defined(GGML_USE_CLBLAST)
#include "ggml_v2-opencl.h"
#endif


#include <array>
#include <ctime>
#include <cinttypes>
