Implement multimodal models (LLaVA) #3436

Merged
merged 36 commits into from
Oct 12, 2023

Changes from 1 commit (of 36)
59aa1ac
WIP: start implementing LLaVA
monatis Oct 2, 2023
0f0e7c6
rm scratch buf for now, will revert after cleanup
monatis Oct 2, 2023
7e9120f
LLaVA image encoder is working. will combine with llama
monatis Oct 2, 2023
d37ed47
Add llava inference code, but it's buggy. debugging
monatis Oct 3, 2023
8690f42
LLaVA is working e2e, needs to optimize memory allocation + cleanup
monatis Oct 7, 2023
94eeac3
Use ggml_allocr + rm unnecessary code
monatis Oct 8, 2023
0c2bd79
fix: crlf -> lf
monatis Oct 8, 2023
204d08b
fix: new line at EoF
monatis Oct 8, 2023
95da79e
fix: trailing whitespace
monatis Oct 8, 2023
2a04d0b
Merge branch 'master' into llava
monatis Oct 8, 2023
444dbce
Add readme
monatis Oct 9, 2023
8af7e21
Update readme
monatis Oct 9, 2023
54495c9
Some cleanup
monatis Oct 9, 2023
9b0ec4d
Are you happy editorconfig?
monatis Oct 9, 2023
8278a73
rm unused batch image preprocessing
monatis Oct 9, 2023
d78e816
rm unused import
monatis Oct 9, 2023
4759bfd
fix: rm designated initializers
monatis Oct 9, 2023
325d240
introduce pad-to-square mode for non-square images
monatis Oct 9, 2023
d75a031
are you happy editorconfig?
monatis Oct 9, 2023
ae01c85
gitignore /llava
monatis Oct 9, 2023
5009ae9
Handle cases where image file does not exist
monatis Oct 9, 2023
96171de
add llava target to Makefile
monatis Oct 9, 2023
d640aae
add support for 13b model variant
monatis Oct 10, 2023
587bde8
Maybe seed is unlucky?
monatis Oct 11, 2023
f1564bb
Merge branch 'master' into llava
monatis Oct 11, 2023
ab21587
Check if apples are compared to apples
monatis Oct 11, 2023
0409ae0
are you happy editorconfig?
monatis Oct 11, 2023
f0f7834
Use temperature = 0.1 by default
monatis Oct 11, 2023
2bc1710
command line: use gpt_params_parse()
monatis Oct 11, 2023
1403d87
Merge master and fix conflicts
monatis Oct 11, 2023
dc913ea
minor
monatis Oct 12, 2023
56ccf97
handle default n_predict
monatis Oct 12, 2023
e9534ea
fix typo
monatis Oct 12, 2023
346e3c1
Merge branch 'master' into llava
ggerganov Oct 12, 2023
4bc5c9c
llava : code formatting, rename files, fix compile warnings
ggerganov Oct 12, 2023
0bd7e69
do not use Wno-cast-qual for MSVC
monatis Oct 12, 2023
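
Commit 325d240 above introduces a pad-to-square mode so that non-square inputs are not distorted by the square resize the vision tower expects. A minimal sketch of the idea, assuming the image is centered on a constant-color canvas before the usual resize path; the fill color and exact policy in the PR may differ:

```cpp
// Hypothetical sketch of pad-to-square preprocessing: embed a W x H RGB
// image into a max(W, H) x max(W, H) canvas before the square resize.
// The fill color and centering are illustrative assumptions.
#include <algorithm>
#include <cstdint>
#include <vector>

struct image_u8 {
    int width  = 0;
    int height = 0;
    std::vector<uint8_t> data; // RGB, 3 bytes per pixel
};

static image_u8 pad_to_square(const image_u8 & img, uint8_t fill = 0) {
    const int side = std::max(img.width, img.height);
    image_u8 out;
    out.width  = side;
    out.height = side;
    out.data.assign((size_t) side * side * 3, fill);

    // center the original image in the square canvas
    const int off_x = (side - img.width)  / 2;
    const int off_y = (side - img.height) / 2;
    for (int y = 0; y < img.height; y++) {
        for (int x = 0; x < img.width; x++) {
            for (int c = 0; c < 3; c++) {
                out.data[(((size_t) (y + off_y) * side) + (x + off_x)) * 3 + c] =
                    img.data[(((size_t) y * img.width) + x) * 3 + c];
            }
        }
    }
    return out;
}
```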
Some cleanup
monatis committed Oct 9, 2023
commit 54495c947474b45d7125505f898ca2ad29c87645
77 changes: 30 additions & 47 deletions examples/llava/clip.cpp
@@ -89,7 +89,7 @@ static std::string format(const char * fmt, ...) {
 // utilities to get data from a gguf file
 //
 
-int get_key_idx(const gguf_context * ctx, const char * key) {
+static int get_key_idx(const gguf_context * ctx, const char * key) {
     int i = gguf_find_key(ctx, key);
     if (i == -1) {
         fprintf(stderr, "key %s not found in file\n", key);
@@ -99,19 +99,19 @@ int get_key_idx(const gguf_context * ctx, const char * key) {
     return i;
 }
 
-const uint32_t get_u32(const gguf_context * ctx, std::string key) {
+static const uint32_t get_u32(const gguf_context * ctx, std::string key) {
     const int i = get_key_idx(ctx, key.c_str());
 
     return gguf_get_val_u32(ctx, i);
 }
 
-const float get_f32(const gguf_context * ctx, std::string key) {
+static const float get_f32(const gguf_context * ctx, std::string key) {
     const int i = get_key_idx(ctx, key.c_str());
 
     return gguf_get_val_f32(ctx, i);
 }
 
-struct ggml_tensor * get_tensor(struct ggml_context * ctx, std::string name) {
+static struct ggml_tensor * get_tensor(struct ggml_context * ctx, std::string name) {
     struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
     if (!cur) {
         printf("unable to find tensor %s\n", name.c_str());
@@ -121,7 +121,7 @@ struct ggml_tensor * get_tensor(struct ggml_context * ctx, std::string name) {
     return cur;
 }
 
-std::string get_ftype(int ftype) {
+static std::string get_ftype(int ftype) {
     switch (ftype) {
         case 0:
             return "f32";
@@ -231,20 +231,13 @@ struct clip_ctx {
     int32_t ftype = 1;
     struct ggml_context * ctx;
     struct gguf_context * ctx_gguf;
-    //struct clip_buffer buf_compute;
-
     // reusable buffer for `struct ggml_graph_plan.work_data`
    std::vector<uint8_t> work_buffer;
-
-    // memory buffers used to evaluate the model
+    // memory buffers to evaluate the model
     clip_buffer buf_compute;
-
     clip_buffer buf_alloc;
     ggml_allocr * alloc = NULL;
-
 };
 
-
 static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) {
-
     if (!ctx->has_vision_encoder) {
@@ -436,7 +429,8 @@ if (!ggml_allocr_is_measure(ctx->alloc)) {
         embeddings = cur;
     }
 
-    if (ctx->has_llava_projector) {
+    // llava projector
+    {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
 
         struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
@@ -457,8 +451,6 @@
 
         embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
         embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_2_b, embeddings), embeddings);
-
-        ggml_set_name(embeddings, "check");
     }
 
     // build the graph
@@ -551,6 +543,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }
 
     GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
+    GGML_ASSERT(new_clip->has_vision_encoder);
+    GGML_ASSERT(!new_clip->has_text_encoder);
 
     idx = get_key_idx(ctx, KEY_USE_GELU);
     new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
@@ -643,16 +637,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     vision_model.class_embedding = get_tensor(new_clip->ctx, TN_CLASS_EMBD);
     vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v"));
     vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight"));
-    vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias"));if (new_clip->has_llava_projector) {
-        vision_model.mm_0_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "weight"));
-        vision_model.mm_0_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "bias"));
-        vision_model.mm_2_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "weight"));
-        vision_model.mm_2_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "bias"));
-    } else {
-        vision_model.post_ln_w = get_tensor(new_clip->ctx, format(TN_LN_POST, "v", "weight"));
-        vision_model.post_ln_b = get_tensor(new_clip->ctx, format(TN_LN_POST, "v", "bias"));
-        vision_model.projection = get_tensor(new_clip->ctx, TN_VIS_PROJ);
-    }
+    vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias"));
+    vision_model.mm_0_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "weight"));
+    vision_model.mm_0_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "bias"));
+    vision_model.mm_2_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "weight"));
+    vision_model.mm_2_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "bias"));
+
     vision_model.layers.resize(hparams.n_layer);
     for (int il = 0; il < hparams.n_layer; ++il) {
         auto & layer = vision_model.layers[il];
@@ -861,7 +851,7 @@ void clip_free(clip_ctx * ctx) {
     delete ctx;
 }
 
-bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, const bool normalize) {
+bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
         return false;
@@ -870,37 +860,25 @@ bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     clip_image_f32_batch imgs{};
     imgs.size = 1;
     imgs.data = img;
-    return clip_image_batch_encode(ctx, n_threads, &imgs, vec, normalize);
+    return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
 }
 
-bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec,
-                             const bool normalize) {
+bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
 
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
         return false;
     }
 
-    const auto & model = ctx->vision_model;
-    const auto & hparams = model.hparams;
-
-    const int image_size = hparams.image_size;
-    const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
-    const int num_positions = num_patches + 1;
-    const int hidden_size = hparams.hidden_size;
-    const int n_head = hparams.n_head;
-    const int d_head = hidden_size / n_head;
-    const int n_layer = hparams.n_layer;
-    const int n_intermediate = hparams.n_intermediate;
-    const int projection_dim = hparams.projection_dim;
-    const float eps = hparams.eps;
     int batch_size = imgs->size;
+    if(ctx->has_llava_projector) {
+        GGML_ASSERT(batch_size == 1); // TODO: support multiple images
+    }
 
     // reset alloc buffer to clean the memory from previous invocations
     ggml_allocr_reset(ctx->alloc);
 
     // build the inference graph
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
     ggml_allocr_alloc_graph(ctx->alloc, gf);
 
@@ -911,7 +889,10 @@
 
     ggml_graph_compute(gf, &plan);
 
+    // the last node is the embedding tensor
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
+
+    // copy the embeddings to the location passed by the user
     memcpy(vec, ggml_get_data_f32(embeddings), ggml_nbytes(embeddings));
 
     if (plan.work_size > 0) {
@@ -921,7 +902,6 @@ struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
     return true;
 }
 
 /*
 bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
-
     ggml_type type = GGML_TYPE_Q4_1;
@@ -1106,6 +1086,9 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
 
     return true;
 }
 */
 
+struct clip_vision_hparams * clip_get_vision_hparams(struct clip_ctx * ctx) { return &ctx->vision_model.hparams; }
+size_t clip_embd_nbytes(struct clip_ctx * ctx) {
+    auto & params = ctx->vision_model.hparams;
+
+    return (params.image_size / params.patch_size) * (params.image_size / params.patch_size) * 4096 * sizeof(float);
+}
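
The new clip_embd_nbytes() tells the caller how large a buffer one image's projected embeddings need; the hardcoded 4096 is the LLaMA embedding width the projector maps into. A quick check of the arithmetic, assuming the ViT-L/14 geometry commonly used with LLaVA (image_size = 336, patch_size = 14; assumed values for illustration, the real ones come from the gguf hparams):

```cpp
// Back-of-the-envelope for clip_embd_nbytes(); image_size and patch_size
// are assumed here, in practice they are read from the model's hparams.
#include <cstdio>

int main() {
    const int image_size = 336;
    const int patch_size = 14;
    const int n_patches  = (image_size / patch_size) * (image_size / patch_size); // 24 * 24 = 576
    const size_t nbytes  = (size_t) n_patches * 4096 * sizeof(float);             // 4096 = LLaMA embedding width
    printf("%d patches -> %zu bytes (~%.1f MiB) per image\n",
           n_patches, nbytes, nbytes / (1024.0 * 1024.0));
    // prints: 576 patches -> 9437184 bytes (~9.0 MiB) per image
    return 0;
}
```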
39 changes: 3 additions & 36 deletions examples/llava/clip.h
@@ -9,17 +9,6 @@ struct clip_ctx;
 extern "C" {
 #endif
 
-struct clip_text_hparams {
-    int32_t n_vocab;
-    int32_t num_positions;
-    int32_t hidden_size;
-    int32_t n_intermediate;
-    int32_t projection_dim;
-    int32_t n_head;
-    int32_t n_layer;
-    float eps;
-};
-
 struct clip_vision_hparams {
     int32_t image_size;
     int32_t patch_size;
@@ -31,18 +20,11 @@ struct clip_vision_hparams {
     float eps;
 };
 
-typedef int32_t clip_vocab_id;
-struct clip_tokens {
-    clip_vocab_id * data;
-    size_t size;
-};
-
 struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
 
 void clip_free(struct clip_ctx * ctx);
 
-struct clip_text_hparams * clip_get_text_hparams(struct clip_ctx * ctx);
-struct clip_vision_hparams * clip_get_vision_hparams(struct clip_ctx * ctx);
+size_t clip_embd_nbytes(struct clip_ctx * ctx);
 
 // RGB uint8 image
 struct clip_image_u8 {
@@ -71,31 +53,16 @@ struct clip_image_f32_batch {
     size_t size;
 };
 
-bool clip_tokenize(const struct clip_ctx * ctx, const char * text, struct clip_tokens * tokens);
-
 struct clip_image_u8 * make_clip_image_u8();
 struct clip_image_f32 * make_clip_image_f32();
 bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
 bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res);
 
-bool clip_text_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_tokens * tokens, float * vec,
-                      const bool normalize);
-bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec,
-                       const bool normalize);
+bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
 
 void clip_image_batch_preprocess(const struct clip_ctx * ctx, const int n_threads,
                                  const struct clip_image_u8_batch * img_inputs, struct clip_image_f32_batch * imgs_resized);
 bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
-                             float * vec, const bool normalize);
-
-// bool image_normalize(const clip_image_u8 *img, clip_image_f32 *res);
-
-bool clip_compare_text_and_image(const struct clip_ctx * ctx, const int n_threads, const char * text,
-                                 const struct clip_image_u8 * image, float * score);
-float clip_similarity_score(const float * vec1, const float * vec2, const int vec_dim);
-bool softmax_with_sorting(float * arr, const int length, float * sorted_scores, int * indices);
-bool clip_zero_shot_label_image(struct clip_ctx * ctx, const int n_threads, const struct clip_image_u8 * input_img,
-                                const char ** labels, const size_t n_labels, float * scores, int * indices);
+                             float * vec);
 
 bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
-
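
After the cuts, clip.h exposes a small image-only pipeline: load the model, load and preprocess an image, encode, and read out one projected embedding per patch. A minimal sketch of driving it; the file paths, thread count, and minimal error handling are placeholders rather than code from this PR:

```cpp
// Hypothetical driver for the pared-down clip.h API. "clip.gguf" and
// "input.jpg" are placeholder paths; n_threads = 4 is an arbitrary choice.
#include "clip.h"
#include <cstdio>
#include <cstdlib>

int main() {
    struct clip_ctx * ctx = clip_model_load("clip.gguf", /*verbosity=*/1);
    if (!ctx) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    struct clip_image_u8  * img     = make_clip_image_u8();
    struct clip_image_f32 * img_res = make_clip_image_f32();
    if (!clip_image_load_from_file("input.jpg", img)) {
        fprintf(stderr, "failed to load image\n");
        clip_free(ctx);
        return 1;
    }
    clip_image_preprocess(ctx, img, img_res);

    // one projected embedding per patch; clip_embd_nbytes() sizes the buffer
    float * embd = (float *) malloc(clip_embd_nbytes(ctx));
    clip_image_encode(ctx, /*n_threads=*/4, img_res, embd);

    // ... splice embd into the LLaMA token stream (see examples/llava) ...

    free(embd);
    clip_free(ctx); // note: img/img_res are leaked; the header exposes no destructor
    return 0;
}
```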