-
Notifications
You must be signed in to change notification settings - Fork 0
/
kernel.cu
191 lines (167 loc) · 8.34 KB
/
kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#include "kernel.cuh"
__global__ void sum_row(float *img, float *l1_dev, float *l2_dev, float *lx_dev, float *ly_dev, int I_width, int I_height) // width of image
{
// Shared memory for four tables
__shared__ float buffer_l1[BLOCK_HEIGHT][BLOCK_WIDTH];
__shared__ float buffer_l2[BLOCK_HEIGHT][BLOCK_WIDTH];
__shared__ float buffer_lx[BLOCK_HEIGHT][BLOCK_WIDTH];
__shared__ float buffer_ly[BLOCK_HEIGHT][BLOCK_WIDTH];
int col = threadIdx.x + blockDim.x * blockIdx.x;
int row = threadIdx.y + blockDim.y * blockIdx.y;
int m, cursor;
// Temporal values for prefix sum
float temp_l1, temp_l2, temp_lx, temp_ly;
float last_l1, last_l2, last_lx, last_ly;
for (m = 0; m < (I_width / BLOCK_WIDTH + (I_width%BLOCK_WIDTH>0)); m++)
{
int index_in = row * I_width + blockDim.x * m + threadIdx.x;
int thread_width = BLOCK_WIDTH;
// thread_width is the width of blocks with activated threads
if ((m == I_width / BLOCK_WIDTH) && I_width%BLOCK_WIDTH > 0)
thread_width = I_width%BLOCK_WIDTH;
if (index_in < (I_width * I_height) && (threadIdx.x < thread_width))
{
// Dealing with images whose width is larger than the buffer width
if (m > 0) {
last_l1 = buffer_l1[threadIdx.y][BLOCK_WIDTH - 1];
last_l2 = buffer_l2[threadIdx.y][BLOCK_WIDTH - 1];
last_lx = buffer_lx[threadIdx.y][BLOCK_WIDTH - 1];
last_ly = buffer_ly[threadIdx.y][BLOCK_WIDTH - 1];
}
// Copy from global memory to shared memory
buffer_l1[threadIdx.y][threadIdx.x] = img[index_in];
buffer_l2[threadIdx.y][threadIdx.x] = powf(img[index_in], 2);
buffer_lx[threadIdx.y][threadIdx.x] = img[index_in] * (blockDim.x * m + threadIdx.x);
buffer_ly[threadIdx.y][threadIdx.x] = img[index_in] * row;
// Prefix sum for current array
for (cursor = 1; cursor <= ceilf(log2f(thread_width)); cursor++)
{
if ((threadIdx.x >= __float2int_rd(powf(2, cursor - 1))) && (threadIdx.x < thread_width))
{
temp_l1 = buffer_l1[threadIdx.y][threadIdx.x - __float2int_rd(powf(2, cursor - 1))];
temp_l2 = buffer_l2[threadIdx.y][threadIdx.x - __float2int_rd(powf(2, cursor - 1))];
temp_lx = buffer_lx[threadIdx.y][threadIdx.x - __float2int_rd(powf(2, cursor - 1))];
temp_ly = buffer_ly[threadIdx.y][threadIdx.x - __float2int_rd(powf(2, cursor - 1))];
}
__syncthreads();
if ((threadIdx.x >= __float2int_rd(powf(2, cursor - 1))) && (threadIdx.x < thread_width))
{
buffer_l1[threadIdx.y][threadIdx.x] += temp_l1;
buffer_l2[threadIdx.y][threadIdx.x] += temp_l2;
buffer_lx[threadIdx.y][threadIdx.x] += temp_lx;
buffer_ly[threadIdx.y][threadIdx.x] += temp_ly;
}
__syncthreads();
}
// Dealing with images whose width is larger than the buffer width
if (m > 0) {
buffer_l1[threadIdx.y][threadIdx.x] += last_l1;
buffer_l2[threadIdx.y][threadIdx.x] += last_l2;
buffer_lx[threadIdx.y][threadIdx.x] += last_lx;
buffer_ly[threadIdx.y][threadIdx.x] += last_ly;
}
// Copy from shared memory to global memory
l1_dev[index_in] = buffer_l1[threadIdx.y][threadIdx.x];
l2_dev[index_in] = buffer_l2[threadIdx.y][threadIdx.x];
lx_dev[index_in] = buffer_lx[threadIdx.y][threadIdx.x];
ly_dev[index_in] = buffer_ly[threadIdx.y][threadIdx.x];
}
}
}
__global__ void sum_col(float *l1_dev, float *l2_dev, float *lx_dev, float *ly_dev, int I_width, int I_height)
{
// Shared memory for four tables
__shared__ float buffer_l1[BLOCK_HEIGHT2][BLOCK_WIDTH2];
__shared__ float buffer_l2[BLOCK_HEIGHT2][BLOCK_WIDTH2];
__shared__ float buffer_lx[BLOCK_HEIGHT2][BLOCK_WIDTH2];
__shared__ float buffer_ly[BLOCK_HEIGHT2][BLOCK_WIDTH2];
int col = threadIdx.x + blockDim.x * blockIdx.x;
int row = threadIdx.y + blockDim.y * blockIdx.y;
int m, cursor;
// Temporal values for prefix sum
float temp_l1, temp_l2, temp_lx, temp_ly;
float last_l1, last_l2, last_lx, last_ly;
for (m = 0; m < (I_height / BLOCK_HEIGHT2 + (I_height%BLOCK_HEIGHT2>0)); m++)
{
int index_in = (blockDim.y * m + threadIdx.y) * I_width + col;
int thread_height = BLOCK_HEIGHT2;
if ((m == I_height / BLOCK_HEIGHT2) && I_height%BLOCK_HEIGHT2 > 0)
thread_height = I_height%BLOCK_HEIGHT2;
if ((index_in < I_width * I_height) && col < I_width)
{
// for array length longer than block width. Each element has to add the
// previous sum.
if (m > 0) {
last_l1 = buffer_l1[BLOCK_HEIGHT2 - 1][threadIdx.x];
last_l2 = buffer_l2[BLOCK_HEIGHT2 - 1][threadIdx.x];
last_lx = buffer_lx[BLOCK_HEIGHT2 - 1][threadIdx.x];
last_ly = buffer_ly[BLOCK_HEIGHT2 - 1][threadIdx.x];
}
// Copy from global memory to shared memory (? if buffer length longer than img)
buffer_l1[threadIdx.y][threadIdx.x] = l1_dev[index_in];
buffer_l2[threadIdx.y][threadIdx.x] = l2_dev[index_in];
buffer_lx[threadIdx.y][threadIdx.x] = lx_dev[index_in];
buffer_ly[threadIdx.y][threadIdx.x] = ly_dev[index_in];
// Prefix sum for current array
for (cursor = 1; cursor <= ceilf(log2f(thread_height)); cursor++) {
/*prefix sum*/
if (threadIdx.y >= __float2int_rd(powf(2, cursor - 1))) {
temp_l1 = buffer_l1[threadIdx.y - __float2int_rd(powf(2, cursor - 1))][threadIdx.x];
temp_l2 = buffer_l2[threadIdx.y - __float2int_rd(powf(2, cursor - 1))][threadIdx.x];
temp_lx = buffer_lx[threadIdx.y - __float2int_rd(powf(2, cursor - 1))][threadIdx.x];
temp_ly = buffer_ly[threadIdx.y - __float2int_rd(powf(2, cursor - 1))][threadIdx.x];
}
__syncthreads();
if (threadIdx.y >= __float2int_rd(powf(2, cursor - 1))) {
buffer_l1[threadIdx.y][threadIdx.x] += temp_l1;
buffer_l2[threadIdx.y][threadIdx.x] += temp_l2;
buffer_lx[threadIdx.y][threadIdx.x] += temp_lx;
buffer_ly[threadIdx.y][threadIdx.x] += temp_ly;
}
__syncthreads();
}
// Dealing with images whose height is larger than the buffer height
if (m > 0) {
buffer_l1[threadIdx.y][threadIdx.x] += last_l1;
buffer_l2[threadIdx.y][threadIdx.x] += last_l2;
buffer_lx[threadIdx.y][threadIdx.x] += last_lx;
buffer_ly[threadIdx.y][threadIdx.x] += last_ly;
}
// Copy from shared memory to global memory
l1_dev[index_in] = buffer_l1[threadIdx.y][threadIdx.x];
l2_dev[index_in] = buffer_l2[threadIdx.y][threadIdx.x];
lx_dev[index_in] = buffer_lx[threadIdx.y][threadIdx.x];
ly_dev[index_in] = buffer_ly[threadIdx.y][threadIdx.x];
}
}
}
__global__ void compute_feature(float vt1value, float vt2value, float vt3value, float vt4value, float *v1_dev, float *v2_dev, float *v3_dev, float *v4_dev, float *X_dev, float *l1_dev, float *l2_dev, float *lx_dev, float *ly_dev, int K, int M, int N) {
float S1value = 0;
float S2value = 0;
float Sxvalue = 0;
float Syvalue = 0;
float v1value = 0;
float v2value = 0;
float v3value = 0;
float v4value = 0;
int col = threadIdx.x + blockDim.x * blockIdx.x;
int row = threadIdx.y + blockDim.y * blockIdx.y;
if ((col <= (M - K)) && (row <= (N - K))) {
// Compute four sum of pixel values within every patch.
S1value = (l1_dev[(row + K - 1)*M + (col + K - 1)] - l1_dev[(row + K - 1)*M + (col)] - l1_dev[(row)*M + (col + K - 1)] + l1_dev[(row)*M + (col)]);
S2value = (l2_dev[(row + K - 1)*M + (col + K - 1)] - l2_dev[(row + K - 1)*M + (col)] - l2_dev[(row)*M + (col + K - 1)] + l2_dev[(row)*M + (col)]);
Sxvalue = (lx_dev[(row + K - 1)*M + (col + K - 1)] - lx_dev[(row + K - 1)*M + (col)] - lx_dev[(row)*M + (col + K - 1)] + lx_dev[(row)*M + (col)]);
Syvalue = (ly_dev[(row + K - 1)*M + (col + K - 1)] - ly_dev[(row + K - 1)*M + (col)] - ly_dev[(row)*M + (col + K - 1)] + ly_dev[(row)*M + (col)]);
// Compute four features for every patch and place them in right place.
v1value = S1value / K / K;
v2value = S2value / K / K - v1value*v1value;
v3value = 4.0 * (Sxvalue - (col + 1.0 * (K - 1) / 2) * S1value) / K / K / K;
v4value = 4.0 * (Syvalue - (row + 1.0 * (K - 1) / 2) * S1value) / K / K / K;
v1_dev[row * (M - K + 1) + col] = v1value;
v2_dev[row * (M - K + 1) + col] = v2value;
v3_dev[row * (M - K + 1) + col] = v3value;
v4_dev[row * (M - K + 1) + col] = v4value;
// Compute the square of Euclidean distance between the template and every patch and place the results in right place.
X_dev[row * (M - K + 1) + col] = powf(v1value - vt1value, 2) + powf(v2value - vt2value, 2) + powf(v3value - vt3value, 2) + powf(v4value - vt4value, 2);
}
}