-
-
Notifications
You must be signed in to change notification settings - Fork 29
/
benchmark.cpp
183 lines (145 loc) · 6 KB
/
benchmark.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#include "ggml/ggml.h"
#include "clip.h"
#include "common-clip.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <exception>
#include <iostream>
#include <map>
#include <string>
#include <vector>
/**
 * Zero-shot classification benchmark.
 *
 * Usage: <model_path> <images_dir> <num_images_per_dir> [output_file]
 *
 * Each entry returned by get_dir_keyed_files() is treated as one class: the
 * key is encoded as a text label and the associated files are encoded as
 * images. Every image is compared against all labels and the per-class
 * acc@1 / acc@5 are written as a table, followed by the mean over classes
 * and the text / image encoding timings.
 *
 * Images are processed in full batches only: files left over after the last
 * full batch of a class are not evaluated. A class with fewer files than one
 * batch is reported as "n/a" and left out of the mean.
 *
 * @return 0 on success, 1 on a usage error or on any load / encode failure.
 */
int main(int argc, char **argv)
{
    if (argc != 4 && argc != 5)
    {
        printf("usage: %s <model_path> <images_dir> <num_images_per_dir> [output_file]\n\n", argv[0]);
        printf("model_path: path to CLIP model in GGML format\n");
        printf("images_dir: path to a directory of images where images are organized into subdirectories named classes\n");
        printf("num_images_per_dir: maximum number of images to read from each one of subdirectories. if 0, read all files\n");
        printf("output_file: optional. if specified, dump the output to this file instead of stdout\n");
        return 1;
    }

    const std::string model_path = argv[1];
    const std::string dir_path = argv[2];

    // std::stoi throws on non-numeric / out-of-range input, and a negative
    // value would silently wrap when stored in the unsigned limit below.
    int max_files_arg = -1;
    try
    {
        max_files_arg = std::stoi(argv[3]);
    }
    catch (const std::exception &)
    {
        max_files_arg = -1;
    }
    if (max_files_arg < 0)
    {
        printf("%s: num_images_per_dir must be a non-negative integer, but got '%s'\n", __func__, argv[3]);
        return 1;
    }
    const uint32_t max_files_per_dir = static_cast<uint32_t>(max_files_arg);

    FILE *fout = stdout;
    if (argc == 5)
    {
        fout = fopen(argv[4], "w");
        if (fout == NULL)
        {
            printf("%s: cannot open file %s\n", __func__, argv[4]);
            return 1;
        }
        printf("%s: dumping benchmarking results to %s...\n", __func__, argv[4]);
    }

    // Releases the output file on every exit path; stdout is never closed.
    const auto close_output = [&fout]()
    {
        if (fout != stdout)
        {
            fclose(fout);
        }
    };

    const auto result = get_dir_keyed_files(dir_path, max_files_per_dir);
    const size_t n_labels = result.size();
    if (n_labels < 2)
    {
        printf("%s There must be at least 2 directories of images, but %zu found\n", __func__, n_labels);
        close_output();
        return 1;
    }

    fprintf(fout, "%s: %zu directories found in %s\n\n", __func__, n_labels, dir_path.c_str());

    auto ctx = clip_model_load(model_path.c_str(), 2);
    if (!ctx)
    {
        printf("%s: unable to load model from %s\n", __func__, model_path.c_str());
        close_output();
        return 1;
    }

    const int n_threads = 4;
    const size_t batch_size = 4;
    const int vec_dim = ctx->text_model.hparams.projection_dim;

    // Heap storage instead of runtime-sized stack arrays: those are a
    // compiler extension in C++ and may overflow the stack for many labels.
    std::vector<float> txt_vecs(n_labels * static_cast<size_t>(vec_dim));

    ggml_time_init();

    // walk through directory names and encode them as texts
    int label_idx = 0;
    const int64_t t_start_encode_texts = ggml_time_us();
    for (const auto &entry : result)
    {
        auto tokens = clip_tokenize(ctx, entry.first);
        if (!clip_text_encode(ctx, n_threads, tokens, txt_vecs.data() + label_idx * vec_dim))
        {
            printf("%s: Could not encode the label at index %d: %s\n", __func__, label_idx, entry.first.c_str());
            clip_free(ctx);
            close_output();
            return 1;
        }
        label_idx += 1;
    }
    const int64_t t_end_encode_texts = ggml_time_us();

    label_idx = 0;                 // reset label index
    int n_total_items = 0;         // total number of images processed
    size_t n_scored_labels = 0;    // classes with at least one evaluated image
    float total_acc1_score = 0.0f; // sum of per-class accuracy at 1 over the entire dataset
    float total_acc5_score = 0.0f; // sum of per-class accuracy at 5 over the entire dataset

    std::vector<float> img_vecs(static_cast<size_t>(vec_dim) * batch_size);
    std::vector<float> similarities(n_labels);
    std::vector<float> sorted_scores(n_labels);
    std::vector<int> indices(n_labels);
    std::vector<clip_image_u8> img_inputs(batch_size);
    std::vector<clip_image_f32> imgs_resized(batch_size);

    // Only n_labels entries exist in indices, so the "top 5" window must be
    // clamped when fewer than 5 classes are present.
    const size_t top_k = std::min<size_t>(5, n_labels);

    // print table headers
    fprintf(fout, "| class name           | acc@1  | acc@5  |\n");
    fprintf(fout, "| -------------------- | ------ | ------ |\n");

    const int64_t t_start_encode_images = ggml_time_us();
    for (const auto &entry : result)
    {
        int n_items = 0;
        int n_acc1 = 0;
        int n_acc5 = 0;

        // Round down to a whole number of batches; the remainder is skipped.
        const size_t n_batched = (entry.second.size() / batch_size) * batch_size;
        for (size_t i = 0; i < n_batched; i += batch_size)
        {
            for (size_t ib = i; ib < i + batch_size; ib++)
            {
                const std::string &file_path = entry.second[ib];
                if (!clip_image_load_from_file(file_path, img_inputs[ib - i]))
                {
                    printf("%s: cannot load file from %s\n", __func__, file_path.c_str());
                    clip_free(ctx);
                    close_output();
                    return 1;
                }
            }

            // NOTE(review): results of these two calls are not checked here because
            // their return types are not visible in this file — confirm in clip.h.
            clip_image_batch_preprocess(ctx, n_threads, img_inputs, imgs_resized);
            clip_image_batch_encode(ctx, n_threads, imgs_resized, img_vecs.data());

            for (size_t b = 0; b < batch_size; b++)
            {
                for (size_t j = 0; j < n_labels; j++)
                {
                    similarities[j] = clip_similarity_score(img_vecs.data() + b * vec_dim, txt_vecs.data() + j * vec_dim, vec_dim);
                }

                // presumably fills indices with label positions ordered best-first;
                // verify against softmax_with_sorting
                softmax_with_sorting(similarities.data(), n_labels, sorted_scores.data(), indices.data());

                for (size_t k = 0; k < top_k; k++)
                {
                    if (indices[k] == label_idx)
                    {
                        if (k == 0)
                        {
                            n_acc1 += 1;
                        }
                        n_acc5 += 1;
                        break;
                    }
                }

                n_items += 1;
                n_total_items += 1;
            }
        }

        fprintf(fout, "| %-*s ", 20, entry.first.c_str());
        if (n_items > 0)
        {
            const float acc1_score = static_cast<float>(n_acc1) / n_items;
            const float acc5_score = static_cast<float>(n_acc5) / n_items;
            total_acc1_score += acc1_score;
            total_acc5_score += acc5_score;
            n_scored_labels += 1;
            fprintf(fout, "| %2.4f | %2.4f |\n", acc1_score, acc5_score);
        }
        else
        {
            // Fewer files than one batch: nothing was evaluated, so dividing
            // by n_items would yield NaN and poison the totals.
            fprintf(fout, "| n/a    | n/a    |\n");
        }

        label_idx += 1;
    }
    const int64_t t_end_encode_images = ggml_time_us();

    const float mean_acc1 = n_scored_labels > 0 ? total_acc1_score / static_cast<float>(n_scored_labels) : 0.0f;
    const float mean_acc5 = n_scored_labels > 0 ? total_acc5_score / static_cast<float>(n_scored_labels) : 0.0f;
    fprintf(fout, "| total                | %2.4f | %2.4f |\n\n", mean_acc1, mean_acc5);

    // print timings (microseconds -> milliseconds)
    const float total_text_duration = (t_end_encode_texts - t_start_encode_texts) / 1000.0f;
    const float total_image_duration = (t_end_encode_images - t_start_encode_images) / 1000.0f;
    const float per_image_duration = n_total_items > 0 ? total_image_duration / static_cast<float>(n_total_items) : 0.0f;

    fprintf(fout, "# Timings\n");
    fprintf(fout, "- %zu texts encoded in %8.2f ms (%8.2f ms per text)\n", n_labels, total_text_duration, total_text_duration / static_cast<float>(n_labels));
    fprintf(fout, "- %d images encoded in %8.2f ms (%8.2f ms per image)\n", n_total_items, total_image_duration, per_image_duration);

    close_output();
    clip_free(ctx);
    return 0;
}