mobile vit
datduonguva committed Feb 24, 2024
1 parent 9074a16 commit 7a12ef5
Showing 7 changed files with 473 additions and 5 deletions.
8 changes: 6 additions & 2 deletions examples/mnist_training/train_mnist.cpp
@@ -52,14 +52,18 @@ struct ggml_tensor * square_error_loss(
     return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
 }
 
-struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
+struct ggml_tensor * cross_entropy_loss(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b
+) {
     const float eps = 1e-3f;
     return
         ggml_sum(ctx,
             ggml_neg(ctx,
                 ggml_sum_rows(ctx,
                     ggml_mul(ctx,
-                        a,
+                        a,
                         ggml_log(ctx,
                             ggml_add1(ctx,
                                 ggml_soft_max(ctx, b),
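(Reading the visible part of this truncated hunk: only the signature is reflowed and one operand line re-indented; the loss itself is unchanged, i.e. sum(-sum_rows(a * log(softmax(b) + eps))) with eps = 1e-3 guarding against log(0).)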
26 changes: 26 additions & 0 deletions mobilevit/Makefile
@@ -0,0 +1,26 @@
OBJ=./objects
# NOTE: adjust this to your local ggml checkout
GGML=/home/datduong/github/ggml

all: mkdir main

main: ${OBJ}/ggml.o ${OBJ}/mobilevit.o ${OBJ}/ggml-quants.o
	g++ -g -o main ${OBJ}/ggml.o ${OBJ}/mobilevit.o ${OBJ}/ggml-quants.o -lm -lpthread

mkdir:
	mkdir -p ${OBJ}

${OBJ}/ggml.o: ${GGML}/src/ggml.c
	gcc -D_GNU_SOURCE -c -I${GGML}/include/ggml -o ${OBJ}/ggml.o ${GGML}/src/ggml.c

${OBJ}/ggml-quants.o: ${GGML}/src/ggml-quants.c
	gcc -c -D_GNU_SOURCE -I${GGML}/include/ggml -o ${OBJ}/ggml-quants.o ${GGML}/src/ggml-quants.c

${OBJ}/mobilevit.o: main.cpp
	g++ -std=c++11 -I${GGML}/include -I${GGML}/examples/ -o ${OBJ}/mobilevit.o -c main.cpp

.PHONY: all mkdir clean

clean:
	rm -f ${OBJ}/*.o
1 change: 1 addition & 0 deletions mobilevit/README.md
@@ -0,0 +1 @@
MobileViT
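
A rough build-and-run sketch, inferred from the Makefile and main.cpp in this
directory (`model.pt` is a stand-in name for your MobileViT state_dict file;
`weight.ggml` is the path main.cpp loads):

    make
    python convert-h5-to-ggml.py model.pt
    ./main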
63 changes: 63 additions & 0 deletions mobilevit/convert-h5-to-ggml.py
@@ -0,0 +1,63 @@
# Convert a MobileViT PyTorch (state_dict) model to ggml format
#
# Load the saved state_dict using PyTorch.
# Iterate over all variables and write them to a binary file.
#
# For each variable, write the following (the order read_weights() in
# main.cpp expects):
# - Name length (int)
# - Name (char[name_length])
# - Number of dimensions (int)
# - Dimensions (int[n_dims], reversed so ggml's fastest dim comes first)
# - Data (float32[n_elements])
#
# At the start of the ggml file we write a magic number
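#
# Hypothetical example record for a tensor named "conv_stem.kernel"
# with shape (16, 3, 3, 3):
#   int32   16                   -- name length
#   char    "conv_stem.kernel"   -- name (16 bytes)
#   int32   4                    -- number of dimensions
#   int32   3, 3, 3, 16          -- dims, reversed
#   float32 16*3*3*3 values      -- data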

import sys
import struct

import numpy as np
import torch

if len(sys.argv) != 2:
    print("Usage: convert-h5-to-ggml.py model\n")
    sys.exit(1)

state_dict_file = sys.argv[1]
fname_out = "weight.ggml"  # the path main.cpp loads

state_dict = torch.load(state_dict_file, map_location=torch.device('cpu'))

list_vars = state_dict
print(list_vars)

fout = open(fname_out, "wb")

fout.write(struct.pack("i", 0x67676d6c))  # magic: "ggml" in hex


for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
    print("Processing variable: " + name + " with shape: ", data.shape)
    n_dims = len(data.shape)

    # write the name first, matching the order read_weights() in main.cpp reads
    name_bytes = name.encode("utf-8")
    fout.write(struct.pack("i", len(name_bytes)))
    fout.write(name_bytes)

    fout.write(struct.pack("i", n_dims))

    data = data.astype(np.float32)
    for i in range(n_dims):
        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))

    # data
    data.tofile(fout)

fout.close()

print("Done. Output file: " + fname_out)
print("")
274 changes: 274 additions & 0 deletions mobilevit/main.cpp
@@ -0,0 +1,274 @@
#include "ggml/ggml.h"

#include "common.h"
#include <iostream>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <string>
#include <vector>
#include <algorithm>

struct mobilevit_hparams {
int num_channels = 3;
int image_size = 256;
int patch_size = 2;
int hidden_sizes[3] = {144, 192, 240};
int neck_hidden_sizes[7] = {16, 32, 64, 96, 128, 160, 640};
int num_attention_heads = 4;
float mlp_ratio = 2.0;
float expand_ratio = 4.0;
std::string hidden_act = "silu";
int conv_kernel_size = 3;
int output_stride = 32;
float hidden_dropout_prob = 0.1;
float attention_probs_dropout_prob = 0.0;
float classifier_dropout_prob = 0.1;
    float initializer_range = 0.02;
float layer_norm_eps = 1e-5;
bool qkv_bias = true;
};
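// NOTE: these defaults appear to mirror Hugging Face's MobileViTConfig
// (e.g. apple/mobilevit-small); they are not stored in the weight file,
// which convert-h5-to-ggml.py prefixes with only a magic number.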

struct mobilevit_conv_layer {
struct ggml_tensor * kernel;
    // struct ggml_tensor * bias; // these conv layers never use a bias
struct ggml_tensor * gamma;
struct ggml_tensor * beta;
};

struct inverted_residual_layer {
int in_channels;
int out_channels;
int strides;
int dilation = 1;
bool use_residual;
mobilevit_conv_layer expand_1x1;
mobilevit_conv_layer conv_3x3;
mobilevit_conv_layer reduce_1x1;
};

struct mobile_net_layer {
int in_channels;
int out_channels;
int num_stages;
int strides;
std::vector<inverted_residual_layer> residual_layers;
};
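
// NOTE: mobilevit_transformer is not defined anywhere in this commit, though
// mobile_vit_layer below uses it. A minimal assumed placeholder so the file
// compiles; its weight tensors are to be filled in once the transformer
// weights are actually loaded (see the layer_3 TODO).
struct mobilevit_transformer {
    // attention / feed-forward weight tensors go here
};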

struct mobile_vit_layer {
int in_channels;
int out_channels;
int num_stages;
int strides;
int hidden_size;
int dilation;

inverted_residual_layer downsampling_layer;
mobilevit_conv_layer conv_kxk;
mobilevit_conv_layer conv_1x1;
mobilevit_transformer transformer;
mobilevit_conv_layer conv_projection;
mobilevit_conv_layer fusion;
};

struct mobilevit_encoder {
mobile_net_layer layer_1;
mobile_net_layer layer_2;


mobile_vit_layer layer_3;
mobile_vit_layer layer_4;
mobile_vit_layer layer_5;
};

struct mobilevit_model {
mobilevit_hparams hparams;

mobilevit_conv_layer conv_stem;
mobilevit_encoder encoder;

struct ggml_context * ctx_w; // context for model's weights
};


// `tensor` is a reference to the caller's pointer: the ggml_tensor allocated
// below must remain visible to the caller, which stores it in the model
void read_weights(ggml_tensor *& tensor, ggml_context * ctx_w, std::ifstream &fin){
int name_length, n_dims;
// read name_length
fin.read(reinterpret_cast<char *>(&name_length), sizeof(name_length));
std::cout << "name length: " << name_length << std::endl;

// read name
std::string name(name_length, 0);
fin.read(&name[0], name_length);
std::cout << "name: " << name << std::endl;

// read n_dims
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
std::cout << "n_dims: " << n_dims << std::endl;

int dims[4];
for (int i = 0; i < n_dims; i++){
fin.read(reinterpret_cast<char *>(&dims[i]), sizeof(int));
std::cout << "dim: " << dims[i] << std::endl;
}
// read the kernel
if (n_dims == 4){
tensor = ggml_new_tensor_4d(ctx_w, GGML_TYPE_F32, dims[0], dims[1], dims[2], dims[3]);
}else if (n_dims == 3){
tensor = ggml_new_tensor_3d(ctx_w, GGML_TYPE_F32, dims[0], dims[1], dims[2]);
}else if (n_dims == 2){
tensor = ggml_new_tensor_2d(ctx_w, GGML_TYPE_F32, dims[0], dims[1]);
}else if (n_dims == 1){
tensor = ggml_new_tensor_1d(ctx_w, GGML_TYPE_F32, dims[0]);
}


fin.read(
reinterpret_cast<char *>(tensor->data),
ggml_nbytes(tensor)
);
}

void load_model(mobilevit_model & model, std::string model_path){
    auto fin = std::ifstream(model_path, std::ios::binary);
    if (!fin){
        std::cout << "Error opening file: " << model_path << std::endl;
        return;
    }

    // read and verify the magic number written by convert-h5-to-ggml.py
    int magic = 0;
    fin.read(reinterpret_cast<char *>(&magic), sizeof(magic));
    if (magic != 0x67676d6c){
        std::cout << "Invalid model file (bad magic): " << model_path << std::endl;
        return;
    }

    // read layer conv_stem
    {
        read_weights(model.conv_stem.kernel, model.ctx_w, fin);
        read_weights(model.conv_stem.gamma, model.ctx_w, fin);
        read_weights(model.conv_stem.beta, model.ctx_w, fin);
    }


// read encoder
{
// read layer_1
{
int in_channels = model.hparams.neck_hidden_sizes[0];
int out_channels = model.hparams.neck_hidden_sizes[1];
int strides = 1;
int num_stages = 1;
            // set the params
model.encoder.layer_1.in_channels = in_channels;
model.encoder.layer_1.out_channels = out_channels;
model.encoder.layer_1.num_stages = num_stages;
model.encoder.layer_1.strides = strides;
model.encoder.layer_1.residual_layers.resize(num_stages);

for (int i = 0; i < num_stages; i++){
                auto & residual_layer = model.encoder.layer_1.residual_layers[i]; // reference: the loaded tensors must persist in the model
residual_layer.in_channels = in_channels;
residual_layer.out_channels = out_channels;
residual_layer.strides = i == 0 ? strides : 1;

read_weights(residual_layer.expand_1x1.kernel, model.ctx_w, fin);
read_weights(residual_layer.expand_1x1.gamma, model.ctx_w, fin);
read_weights(residual_layer.expand_1x1.beta, model.ctx_w, fin);

read_weights(residual_layer.conv_3x3.kernel, model.ctx_w, fin);
read_weights(residual_layer.conv_3x3.gamma, model.ctx_w, fin);
read_weights(residual_layer.conv_3x3.beta, model.ctx_w, fin);

read_weights(residual_layer.reduce_1x1.kernel, model.ctx_w, fin);
read_weights(residual_layer.reduce_1x1.gamma, model.ctx_w, fin);
read_weights(residual_layer.reduce_1x1.beta, model.ctx_w, fin);

// after the first residual layer, in_channels equals out_channels
in_channels = out_channels;
}

}

// read layer 2
{
int in_channels = model.hparams.neck_hidden_sizes[1];
int out_channels = model.hparams.neck_hidden_sizes[2];
int strides = 2;
int num_stages = 3;
            // set the params
model.encoder.layer_2.in_channels = in_channels;
model.encoder.layer_2.out_channels = out_channels;
model.encoder.layer_2.num_stages = num_stages;
model.encoder.layer_2.strides = strides;
model.encoder.layer_2.residual_layers.resize(num_stages);

for (int i = 0; i < num_stages; i++){
                auto & residual_layer = model.encoder.layer_2.residual_layers[i]; // reference: the loaded tensors must persist in the model
residual_layer.in_channels = in_channels;
residual_layer.out_channels = out_channels;
residual_layer.strides = i == 0 ? strides : 1;

read_weights(residual_layer.expand_1x1.kernel, model.ctx_w, fin);
read_weights(residual_layer.expand_1x1.gamma, model.ctx_w, fin);
read_weights(residual_layer.expand_1x1.beta, model.ctx_w, fin);

read_weights(residual_layer.conv_3x3.kernel, model.ctx_w, fin);
read_weights(residual_layer.conv_3x3.gamma, model.ctx_w, fin);
read_weights(residual_layer.conv_3x3.beta, model.ctx_w, fin);

read_weights(residual_layer.reduce_1x1.kernel, model.ctx_w, fin);
read_weights(residual_layer.reduce_1x1.gamma, model.ctx_w, fin);
read_weights(residual_layer.reduce_1x1.beta, model.ctx_w, fin);

// after the first residual layer, in_channels equals out_channels
in_channels = out_channels;
}
}

// read layer_3
{
int in_channels = model.hparams.neck_hidden_sizes[2];
int out_channels = model.hparams.neck_hidden_sizes[3];
int strides = 2;
int num_stages = 2;
int hidden_size = model.hparams.hidden_sizes[0];

model.encoder.layer_3.in_channels = in_channels;

model.encoder.layer_3.out_channels = out_channels;
model.encoder.layer_3.num_stages = num_stages;
model.encoder.layer_3.strides = strides;
model.encoder.layer_3.hidden_size = hidden_size;
model.encoder.layer_3.dilation = 1;
            // TODO: continue here: load conv_kxk, conv_1x1, transformer,
            // conv_projection and fusion weights for this layer


}

// read layer 4
{

}

// read layer 5
{

}

}

fin.close();
}

int main(int argc, char ** argv) {
ggml_time_init();
mobilevit_model model;

    // create a ggml context to hold the model weights
    // (ggml_init_params: 128 MB mem_size, NULL mem_buffer so ggml allocates it, no_alloc = false)
    {
        struct ggml_init_params params = {128*1024*1024, NULL, false};

model.ctx_w = ggml_init(params);
        if (!model.ctx_w) {
            std::cout << "Cannot create context for model's weights" << std::endl;
            return 1;
        }
}

load_model(model, "weight.ggml");
return 0;
}