mobile vit
datduonguva committed Feb 24, 2024
1 parent 9074a16 commit 7a12ef5
Showing 7 changed files with 473 additions and 5 deletions.
8 changes: 6 additions & 2 deletions examples/mnist_training/train_mnist.cpp
@@ -52,14 +52,18 @@ struct ggml_tensor * square_error_loss(
     return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
 }
 
-struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
+struct ggml_tensor * cross_entropy_loss(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b
+) {
     const float eps = 1e-3f;
     return
         ggml_sum(ctx,
             ggml_neg(ctx,
                 ggml_sum_rows(ctx,
                     ggml_mul(ctx,
-                        a,
+                        a,
                         ggml_log(ctx,
                             ggml_add1(ctx,
                                 ggml_soft_max(ctx, b),
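(Reading the visible part of this truncated hunk: only the signature is reflowed and one operand line re-indented; the loss itself is unchanged, i.e. sum(-sum_rows(a * log(softmax(b) + eps))) with eps = 1e-3 guarding against log(0).)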
26 changes: 26 additions & 0 deletions mobilevit/Makefile
@@ -0,0 +1,26 @@
OBJ=./objects
# NOTE: adjust this to your local ggml checkout
GGML=/home/datduong/github/ggml

all: mkdir main

main: ${OBJ}/ggml.o ${OBJ}/mobilevit.o ${OBJ}/ggml-quants.o
	g++ -g -o main ${OBJ}/ggml.o ${OBJ}/mobilevit.o ${OBJ}/ggml-quants.o -lm -lpthread

mkdir:
	mkdir -p ${OBJ}

${OBJ}/ggml.o: ${GGML}/src/ggml.c
	gcc -D_GNU_SOURCE -c -I${GGML}/include/ggml -o ${OBJ}/ggml.o ${GGML}/src/ggml.c

${OBJ}/ggml-quants.o: ${GGML}/src/ggml-quants.c
	gcc -c -D_GNU_SOURCE -I${GGML}/include/ggml -o ${OBJ}/ggml-quants.o ${GGML}/src/ggml-quants.c

${OBJ}/mobilevit.o: main.cpp
	g++ -std=c++11 -I${GGML}/include -I${GGML}/examples/ -o ${OBJ}/mobilevit.o -c main.cpp

.PHONY: all mkdir clean

clean:
	rm -f ${OBJ}/*.o
1 change: 1 addition & 0 deletions mobilevit/README.md
@@ -0,0 +1 @@
MobileViT
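
A rough build-and-run sketch, inferred from the Makefile and main.cpp in this
directory (`model.pt` is a stand-in name for your MobileViT state_dict file;
`weight.ggml` is the path main.cpp loads):

    make
    python convert-h5-to-ggml.py model.pt
    ./main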
63 changes: 63 additions & 0 deletions mobilevit/convert-h5-to-ggml.py
@@ -0,0 +1,63 @@
# Convert a MobileViT PyTorch (state_dict) model to ggml format
#
# Load the saved state_dict using PyTorch.
# Iterate over all variables and write them to a binary file.
#
# For each variable, write the following (the order read_weights() in
# main.cpp expects):
# - Name length (int)
# - Name (char[name_length])
# - Number of dimensions (int)
# - Dimensions (int[n_dims], reversed so ggml's fastest dim comes first)
# - Data (float32[n_elements])
#
# At the start of the ggml file we write a magic number
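#
# Hypothetical example record for a tensor named "conv_stem.kernel"
# with shape (16, 3, 3, 3):
#   int32   16                   -- name length
#   char    "conv_stem.kernel"   -- name (16 bytes)
#   int32   4                    -- number of dimensions
#   int32   3, 3, 3, 16          -- dims, reversed
#   float32 16*3*3*3 values      -- data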

import sys
import struct

import numpy as np
import torch

if len(sys.argv) != 2:
    print("Usage: convert-h5-to-ggml.py model\n")
    sys.exit(1)

state_dict_file = sys.argv[1]
fname_out = "weight.ggml"  # the path main.cpp loads

state_dict = torch.load(state_dict_file, map_location=torch.device('cpu'))

list_vars = state_dict
print(list_vars)

fout = open(fname_out, "wb")

fout.write(struct.pack("i", 0x67676d6c))  # magic: "ggml" in hex


for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
    print("Processing variable: " + name + " with shape: ", data.shape)
    n_dims = len(data.shape)

    # write the name first, matching the order read_weights() in main.cpp reads
    name_bytes = name.encode("utf-8")
    fout.write(struct.pack("i", len(name_bytes)))
    fout.write(name_bytes)

    fout.write(struct.pack("i", n_dims))

    data = data.astype(np.float32)
    for i in range(n_dims):
        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))

    # data
    data.tofile(fout)

fout.close()

print("Done. Output file: " + fname_out)
print("")
274 changes: 274 additions & 0 deletions mobilevit/main.cpp
@@ -0,0 +1,274 @@
#include "ggml/ggml.h"

#include "common.h"
#include <iostream>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <string>
#include <vector>
#include <algorithm>

struct mobilevit_hparams {
int num_channels = 3;
int image_size = 256;
int patch_size = 2;
int hidden_sizes[3] = {144, 192, 240};
int neck_hidden_sizes[7] = {16, 32, 64, 96, 128, 160, 640};
int num_attention_heads = 4;
float mlp_ratio = 2.0;
float expand_ratio = 4.0;
std::string hidden_act = "silu";
int conv_kernel_size = 3;
int output_stride = 32;
float hidden_dropout_prob = 0.1;
float attention_probs_dropout_prob = 0.0;
float classifier_dropout_prob = 0.1;
    float initializer_range = 0.02;
float layer_norm_eps = 1e-5;
bool qkv_bias = true;
};
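// NOTE: these defaults appear to mirror Hugging Face's MobileViTConfig
// (e.g. apple/mobilevit-small); they are not stored in the weight file,
// which convert-h5-to-ggml.py prefixes with only a magic number.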

struct mobilevit_conv_layer {
struct ggml_tensor * kernel;
    // struct ggml_tensor * bias; // these conv layers never use a bias
struct ggml_tensor * gamma;
struct ggml_tensor * beta;
};

struct inverted_residual_layer {
int in_channels;
int out_channels;
int strides;
int dilation = 1;
bool use_residual;
mobilevit_conv_layer expand_1x1;
mobilevit_conv_layer conv_3x3;
mobilevit_conv_layer reduce_1x1;
};

struct mobile_net_layer {
int in_channels;
int out_channels;
int num_stages;
int strides;
std::vector<inverted_residual_layer> residual_layers;
};
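
// NOTE: mobilevit_transformer is not defined anywhere in this commit, though
// mobile_vit_layer below uses it. A minimal assumed placeholder so the file
// compiles; its weight tensors are to be filled in once the transformer
// weights are actually loaded (see the layer_3 TODO).
struct mobilevit_transformer {
    // attention / feed-forward weight tensors go here
};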

struct mobile_vit_layer {
int in_channels;
int out_channels;
int num_stages;
int strides;
int hidden_size;
int dilation;

inverted_residual_layer downsampling_layer;
mobilevit_conv_layer conv_kxk;
mobilevit_conv_layer conv_1x1;
mobilevit_transformer transformer;
mobilevit_conv_layer conv_projection;
mobilevit_conv_layer fusion;
};

struct mobilevit_encoder {
mobile_net_layer layer_1;
mobile_net_layer layer_2;


mobile_vit_layer layer_3;
mobile_vit_layer layer_4;
mobile_vit_layer layer_5;
};

struct mobilevit_model {
mobilevit_hparams hparams;

mobilevit_conv_layer conv_stem;
mobilevit_encoder encoder;

struct ggml_context * ctx_w; // context for model's weights
};


// `tensor` is a reference to the caller's pointer: the ggml_tensor allocated
// below must remain visible to the caller, which stores it in the model
void read_weights(ggml_tensor *& tensor, ggml_context * ctx_w, std::ifstream &fin){
int name_length, n_dims;
// read name_length
fin.read(reinterpret_cast<char *>(&name_length), sizeof(name_length));
std::cout << "name length: " << name_length << std::endl;

// read name
std::string name(name_length, 0);
fin.read(&name[0], name_length);
std::cout << "name: " << name << std::endl;

// read n_dims
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
std::cout << "n_dims: " << n_dims << std::endl;

int dims[4];
for (int i = 0; i < n_dims; i++){
fin.read(reinterpret_cast<char *>(&dims[i]), sizeof(int));
std::cout << "dim: " << dims[i] << std::endl;
}
// read the kernel
if (n_dims == 4){
tensor = ggml_new_tensor_4d(ctx_w, GGML_TYPE_F32, dims[0], dims[1], dims[2], dims[3]);
}else if (n_dims == 3){
tensor = ggml_new_tensor_3d(ctx_w, GGML_TYPE_F32, dims[0], dims[1], dims[2]);
}else if (n_dims == 2){
tensor = ggml_new_tensor_2d(ctx_w, GGML_TYPE_F32, dims[0], dims[1]);
}else if (n_dims == 1){
tensor = ggml_new_tensor_1d(ctx_w, GGML_TYPE_F32, dims[0]);
}


fin.read(
reinterpret_cast<char *>(tensor->data),
ggml_nbytes(tensor)
);
}

void load_model(mobilevit_model & model, std::string model_path){
    auto fin = std::ifstream(model_path, std::ios::binary);
    if (!fin){
        std::cout << "Error opening file: " << model_path << std::endl;
        return;
    }

    // read and verify the magic number written by convert-h5-to-ggml.py
    int magic = 0;
    fin.read(reinterpret_cast<char *>(&magic), sizeof(magic));
    if (magic != 0x67676d6c){
        std::cout << "Invalid model file (bad magic): " << model_path << std::endl;
        return;
    }

    // read layer conv_stem
    {
        read_weights(model.conv_stem.kernel, model.ctx_w, fin);
        read_weights(model.conv_stem.gamma, model.ctx_w, fin);
        read_weights(model.conv_stem.beta, model.ctx_w, fin);
    }


// read encoder
{
// read layer_1
{
int in_channels = model.hparams.neck_hidden_sizes[0];
int out_channels = model.hparams.neck_hidden_sizes[1];
int strides = 1;
int num_stages = 1;
            // set the params
model.encoder.layer_1.in_channels = in_channels;
model.encoder.layer_1.out_channels = out_channels;
model.encoder.layer_1.num_stages = num_stages;
model.encoder.layer_1.strides = strides;
model.encoder.layer_1.residual_layers.resize(num_stages);

for (int i = 0; i < num_stages; i++){
                auto & residual_layer = model.encoder.layer_1.residual_layers[i]; // reference: the loaded tensors must persist in the model
residual_layer.in_channels = in_channels;
residual_layer.out_channels = out_channels;
residual_layer.strides = i == 0 ? strides : 1;

read_weights(residual_layer.expand_1x1.kernel, model.ctx_w, fin);
read_weights(residual_layer.expand_1x1.gamma, model.ctx_w, fin);
read_weights(residual_layer.expand_1x1.beta, model.ctx_w, fin);

read_weights(residual_layer.conv_3x3.kernel, model.ctx_w, fin);
read_weights(residual_layer.conv_3x3.gamma, model.ctx_w, fin);
read_weights(residual_layer.conv_3x3.beta, model.ctx_w, fin);

read_weights(residual_layer.reduce_1x1.kernel, model.ctx_w, fin);
read_weights(residual_layer.reduce_1x1.gamma, model.ctx_w, fin);
read_weights(residual_layer.reduce_1x1.beta, model.ctx_w, fin);

// after the first residual layer, in_channels equals out_channels
in_channels = out_channels;
}

}

// read layer 2
{
int in_channels = model.hparams.neck_hidden_sizes[1];
int out_channels = model.hparams.neck_hidden_sizes[2];
int strides = 2;
int num_stages = 3;
            // set the params
model.encoder.layer_2.in_channels = in_channels;
model.encoder.layer_2.out_channels = out_channels;
model.encoder.layer_2.num_stages = num_stages;
model.encoder.layer_2.strides = strides;
model.encoder.layer_2.residual_layers.resize(num_stages);

for (int i = 0; i < num_stages; i++){
                auto & residual_layer = model.encoder.layer_2.residual_layers[i]; // reference: the loaded tensors must persist in the model
residual_layer.in_channels = in_channels;
residual_layer.out_channels = out_channels;
residual_layer.strides = i == 0 ? strides : 1;

read_weights(residual_layer.expand_1x1.kernel, model.ctx_w, fin);
read_weights(residual_layer.expand_1x1.gamma, model.ctx_w, fin);
read_weights(residual_layer.expand_1x1.beta, model.ctx_w, fin);

read_weights(residual_layer.conv_3x3.kernel, model.ctx_w, fin);
read_weights(residual_layer.conv_3x3.gamma, model.ctx_w, fin);
read_weights(residual_layer.conv_3x3.beta, model.ctx_w, fin);

read_weights(residual_layer.reduce_1x1.kernel, model.ctx_w, fin);
read_weights(residual_layer.reduce_1x1.gamma, model.ctx_w, fin);
read_weights(residual_layer.reduce_1x1.beta, model.ctx_w, fin);

// after the first residual layer, in_channels equals out_channels
in_channels = out_channels;
}
}

// read layer_3
{
int in_channels = model.hparams.neck_hidden_sizes[2];
int out_channels = model.hparams.neck_hidden_sizes[3];
int strides = 2;
int num_stages = 2;
int hidden_size = model.hparams.hidden_sizes[0];

model.encoder.layer_3.in_channels = in_channels;

model.encoder.layer_3.out_channels = out_channels;
model.encoder.layer_3.num_stages = num_stages;
model.encoder.layer_3.strides = strides;
model.encoder.layer_3.hidden_size = hidden_size;
model.encoder.layer_3.dilation = 1;
            // TODO: continue here: load conv_kxk, conv_1x1, transformer,
            // conv_projection and fusion weights for this layer


}

// read layer 4
{

}

// read layer 5
{

}

}

fin.close();
}

int main(int argc, char ** argv) {
ggml_time_init();
mobilevit_model model;

    // create a ggml context to hold the model weights
    // (ggml_init_params: 128 MB mem_size, NULL mem_buffer so ggml allocates it, no_alloc = false)
    {
        struct ggml_init_params params = {128*1024*1024, NULL, false};

model.ctx_w = ggml_init(params);
        if (!model.ctx_w) {
            std::cout << "Cannot create context for model's weights" << std::endl;
            return 1;
        }
}

load_model(model, "weight.ggml");
return 0;
}