ggerganov · monatis · Oct 12, 2023 · Oct 2, 2023 · Oct 2, 2023 · Oct 2, 2023
diff --git a/.gitignore b/.gitignore
@@ -44,6 +44,7 @@ models-mnt
 /infill
 /libllama.so
 /llama-bench
+/llava
 /main
 /metal
 /perplexity

diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
@@ -594,6 +594,9 @@ convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggm
 llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h examples/llava/stb_image.h ggml.o llama.o common.o $(OBJS)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o train.o $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 

diff --git a/common/common.cpp b/common/common.cpp
@@ -383,6 +383,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  break;
  }
  params.lora_base = argv[i];
+ } else if (arg == "--mmproj") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.mmproj = argv[i];
+ } else if (arg == "--image") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.image = argv[i];
  } else if (arg == "-i" || arg == "--interactive") {
  params.interactive = true;
  } else if (arg == "--embedding") {
@@ -700,6 +712,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
  printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
  printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+ printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
+ printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
  if (llama_mlock_supported()) {
  printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
  }

diff --git a/common/common.h b/common/common.h
@@ -122,6 +122,10 @@ struct gpt_params {
  bool numa = false; // attempt optimizations that help on some NUMA systems
  bool verbose_prompt = false; // print prompt tokens before generation
  bool infill = false; // use infill mode
+
+ // multimodal models (see examples/llava)
+ std::string mmproj = ""; // path to multimodal projector
+ std::string image = ""; // path to an image file
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -28,6 +28,7 @@ else()
  add_subdirectory(speculative)
  add_subdirectory(parallel)
  add_subdirectory(embd-input)
+ add_subdirectory(llava)
  add_subdirectory(llama-bench)
  add_subdirectory(beam-search)
  if (LLAMA_METAL)

diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
@@ -0,0 +1,17 @@
+set(TARGET clip)
+add_library(${TARGET} clip.cpp clip.h)
+install(TARGETS ${TARGET} LIBRARY)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+ add_dependencies(${TARGET} BUILD_INFO)
+endif()
+
+set(TARGET llava)
+add_executable(${TARGET} llava.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+ add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/llava/README.md b/examples/llava/README.md
@@ -0,0 +1,56 @@
+# LLaVA
+
+Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants.
+
+The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
+and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
+models are available.
+
+After the API is confirmed, more models will be supported / uploaded.
+## Usage
+Build with cmake or run `make llava` to build it.
+
+After building, run: `./llava` to see the usage. For example:
+
+```sh
+./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+```
+
+**Note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
+
+## Model conversion
+
+1. Clone `llava-v1.5-7b` and `clip-vit-large-patch14-336` locally:
+
+```sh
+git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
+
+git clone https://huggingface.co/openai/clip-vit-large-patch14-336
+```
+
+2. Use `llava_surgery.py` to split the LLaVA model into LLaMA and multimodal projector constituents:
+
+```sh
+python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
+```
+
+3. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
+
+```sh
+python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
+```
+
+4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+
+```sh
+python ./convert.py ../llava-v1.5-7b
+```
+
+Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
+
+## TODO
+
+- [ ] Support server mode.
+- [ ] Support non-CPU backend for the image encoding part.
+- [ ] Support different sampling methods.
+- [ ] Support more model variants.