-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Separated gradient into current gradient, accumulated_gradient, gradient_with_moment.

Change-Id: Ieaa22b49da90047960474fbaae822b5f99da01d2
- Loading branch information
Showing
32 changed files
with
633 additions
and
226 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,23 +1,29 @@ | ||
#ifndef OUTPUT_SIZE
#error "OUTPUT_SIZE must be defined"
#endif

#ifndef OUTPUT_SAMPLE_SIZE
#error "OUTPUT_SAMPLE_SIZE must be defined"
#endif

/// @brief Adds bias to the layer output and applies the activation function.
/// @param output Layer output, OUTPUT_SIZE elements; updated in place.
/// @param bias Per-neuron bias, OUTPUT_SAMPLE_SIZE elements
///             (read only when INCLUDE_BIAS > 0).
/// @details One thread per output element over a 1D grid; threads past
///          OUTPUT_SIZE exit via the bounds check.  Exactly one of the
///          ACTIVATION_* macros must be set to a positive value.
extern "C"
__global__ void apply_bias_with_activation(dtype *output, const dtype *bias) {
  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < OUTPUT_SIZE) {
    dtype y = output[idx];
#if INCLUDE_BIAS > 0
    // Output is laid out sample-major, so the neuron index is the offset
    // within one sample.
    y += bias[idx % OUTPUT_SAMPLE_SIZE];
#endif
#if ACTIVATION_LINEAR > 0
    output[idx] = y;
#elif ACTIVATION_TANH > 0
    // Scaled tanh; 0.6666 approximates 2/3 — kept as-is so outputs stay
    // bit-compatible with previously trained models.
    output[idx] = (dtype)1.7159 * tanh((dtype)0.6666 * y);
#elif ACTIVATION_RELU > 0
    // Softplus: log(1 + exp(y)).  log1p(exp(y)) is more accurate than
    // log(exp(y) + 1) when exp(y) is tiny (very negative y); for y > 15
    // softplus is indistinguishable from y and exp(y) could overflow
    // in single precision, so y is returned directly.
    output[idx] = y > 15 ? y : log1p(exp(y));
#elif ACTIVATION_STRICT_RELU > 0
    output[idx] = max(y, (dtype)0.0);
#else
#error "Unsupported activation"
#endif
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#include "gradient_descent_common.cu"


/// @brief Computes the bias gradient and applies the bias update.
/// @param err_output Backpropagated error, BATCH x Y.
/// @param bias Layer bias, Y elements; written when APPLY_GRADIENT > 0.
/// @param gradient Storage for the current (raw) gradient, Y elements.
/// @param accumulated_gradient Gradient accumulator
///        (behavior selected by ACCUMULATE_GRADIENT).
/// @param gradient_with_moment Accumulated gradient with moments.
/// @param lr Learning rate.
/// @param factor_l12 L1/L2 regularization factor.
/// @param l1_vs_l2 How much to prefer L1 over L2 (from [0, 1]).
/// @param moment Moment for the gradient.
/// @details Should be defined externally:
///          REDUCE_SIZE - size of the block for matrix reduce,
///          BATCH - minibatch size,
///          Y - output size.
extern "C"
__global__ void bias_update(const dtype *err_output,
                            dtype *bias,
                            dtype *gradient,
                            dtype *accumulated_gradient,
                            dtype *gradient_with_moment,
                            const dtype lr,
                            const dtype factor_l12,
                            const dtype l1_vs_l2,
                            const dtype moment) {

// Reduce err_output over the batch dimension, one column per block
// (A_COL selects column-wise reduction in matrix_reduce.cu).
#define A err_output
#define A_WIDTH Y
#define A_HEIGHT BATCH
#define A_COL

#include "matrix_reduce.cu"

#undef A_COL
#undef A_HEIGHT
#undef A_WIDTH
#undef A

// Stores the reduced gradient and updates the bias.
#include "bias_update.store_output.cu"
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#include "gradient_descent_common.cu"


#if USE_ORTHO > 0
#include "weights_ortho.cu"
#endif


/// @brief Applies a precomputed weight gradient to the weights.
/// @param _err_output Backpropagated error (not referenced here — presumably
///                    kept so all weights_update variants share a signature).
/// @param _input Forward input (not referenced here; same reason).
/// @param weights Weights, H * Y elements.
/// @param gradient Precomputed raw gradient, H * Y elements.
/// @param accumulated_gradient Gradient accumulator
///        (behavior selected by ACCUMULATE_GRADIENT).
/// @param gradient_with_moment Accumulated gradient with moments.
/// @param lr Learning rate.
/// @param factor_l12 L1/L2 regularization factor.
/// @param l1_vs_l2 How much to prefer L1 over L2 (from [0, 1]).
/// @param moment Moment for the gradient.
/// @details One thread per weight over a 1D grid; H and Y must be defined
///          externally.
extern "C"
__global__ void weights_update(const dtype *_err_output,
                               const dtype *_input,
                               dtype *weights,
                               const dtype *gradient,
                               dtype *accumulated_gradient,
                               dtype *gradient_with_moment,
                               const dtype lr,
                               const dtype factor_l12,
                               const dtype l1_vs_l2,
                               const dtype moment
#if USE_ORTHO > 0
                               , const dtype factor_ortho,
                               const dtype *col_sums
#endif
                               ) {
  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < (H * Y)) {
    // The included epilogue consumes `sum`, `idx` and `weights`.
    dtype sum = gradient[idx];
#include "weights_update.store_output.cu"
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
// Epilogue of bias_update(), included after matrix_reduce.cu.
// Expects in scope: tx/bx (thread/block ids from the reduce), AS[] (shared
// partial sums), and `sum` (this thread's partial) — see matrix_reduce.cu.
// Only the first thread of each block commits the result for neuron bx.
if (!tx) {
  sum += AS[0];
  dtype weight = bias[bx];
  dtype gd = -lr * (sum + gradient_step_l12(weight, factor_l12, l1_vs_l2));
  // gradient_descent.store_output.cu is written in terms of `weights`/`idx`;
  // alias them to this kernel's `bias`/`bx`.
  #define weights bias
  #define idx bx
  gradient[idx] = sum;
  #include "gradient_descent.store_output.cu"
  #undef idx
  #undef weights
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,47 +1,61 @@ | ||
#ifndef SAMPLE_SIZE
#error "SAMPLE_SIZE should be defined"
#endif
#ifndef MAX_MINIBATCH_SIZE
#error "MAX_MINIBATCH_SIZE should be defined"
#endif


/// @brief Copies shuffled samples (and, optionally, labels) into the
///        minibatch buffers.
/// @details One thread per minibatch element over a 1D grid covering
///          MAX_MINIBATCH_SIZE * SAMPLE_SIZE elements.  Samples at or past
///          `count` are zero-filled, with labels and indices set to -1.
extern "C"
__global__ void fill_minibatch_data_labels(
    const original_data_dtype *original_data,
    minibatch_data_dtype *minibatch_data,
    const int start_offset,
    const int count,
#if LABELS > 0
    const int *original_labels,
    int *minibatch_labels,
#endif
    const int *shuffled_indices,
    int *minibatch_indices) {

  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int sample_number = idx / SAMPLE_SIZE;
  int real_sample_number = sample_number < count ? shuffled_indices[start_offset + sample_number] : -1;

  int offs_in_sample = idx % SAMPLE_SIZE;
  int offs_in_data = real_sample_number * SAMPLE_SIZE + offs_in_sample;
  int offs_in_minibatch = sample_number * SAMPLE_SIZE + offs_in_sample;

  if (idx < (MAX_MINIBATCH_SIZE * SAMPLE_SIZE)) {
    // Zero-pad the tail of a partial minibatch; original_data is only read
    // when sample_number < count, so the negative offs_in_data of padding
    // threads is never dereferenced.
    minibatch_data[offs_in_minibatch] = sample_number < count ? (minibatch_data_dtype)original_data[offs_in_data] : 0;
#if LABELS > 0
    // All SAMPLE_SIZE threads of a sample store the same value here —
    // redundant but benign.
    minibatch_labels[sample_number] = sample_number < count ? original_labels[real_sample_number] : -1;
#endif
    minibatch_indices[sample_number] = real_sample_number;
  }
}
|
||
|
||
#if TARGET > 0
/// @brief Copies shuffled target vectors into the minibatch target buffer.
/// @details One thread per target element over a 1D grid covering
///          MAX_MINIBATCH_SIZE * TARGET_SIZE elements; targets of samples
///          at or past `count` are zero-filled.
extern "C"
__global__ void fill_minibatch_target(
    const original_target_dtype *original_target,
    minibatch_target_dtype *minibatch_target,
    const int start_offset,
    const int count,
    int *shuffled_indices) {

  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // FIX: the flat index must be decomposed with the per-sample stride
  // actually used for the offsets, TARGET_SIZE.  The committed version
  // divided/guarded by SAMPLE_SIZE and kept the old 2D expression
  // blockDim.y * blockIdx.y + threadIdx.y for offs_in_sample, which is
  // always 0 under a 1D launch — only element 0 of each target was copied.
  int sample_number = idx / TARGET_SIZE;
  int real_sample_number = sample_number < count ? shuffled_indices[start_offset + sample_number] : -1;

  int offs_in_sample = idx % TARGET_SIZE;
  int offs_in_target = real_sample_number * TARGET_SIZE + offs_in_sample;
  int offs_in_minibatch = sample_number * TARGET_SIZE + offs_in_sample;

  if (idx < (MAX_MINIBATCH_SIZE * TARGET_SIZE)) {
    // original_target is only read when sample_number < count, so the
    // negative offs_in_target of padding threads is never dereferenced.
    minibatch_target[offs_in_minibatch] = sample_number < count ? (minibatch_target_dtype)original_target[offs_in_target] : 0;
  }
}
#endif
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
// Shared epilogue for weight/bias updates, included from the update kernels.
// Expects in scope: idx (element index), weights (parameter array),
// weight (current parameter value, weights[idx]), gd (gradient step just
// computed for this element), accumulated_gradient, gradient_with_moment,
// moment.

// Fold gd into the accumulator according to the compile-time mode.
#if ACCUMULATE_GRADIENT == OP_STORE
accumulated_gradient[idx] = gd;
#elif ACCUMULATE_GRADIENT == OP_ADD
accumulated_gradient[idx] += gd;
#elif ACCUMULATE_GRADIENT == OP_FLUSH
// Consume everything accumulated so far and reset the accumulator.
gd += accumulated_gradient[idx];
accumulated_gradient[idx] = 0;
#endif

#ifndef USE_MOMENT
#error "USE_MOMENT should be defined"
#endif
#if USE_MOMENT > 0
// Classical momentum: new step = gd + moment * previous step.
gd += gradient_with_moment[idx] * moment;
gradient_with_moment[idx] = gd;
#endif

#if APPLY_GRADIENT > 0
weights[idx] = weight + gd;
#endif
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#ifndef _GRADIENT_DESCENT_COMMON_
#define _GRADIENT_DESCENT_COMMON_

#include "defines.cu"

#ifndef WEIGHTS_TRANSPOSED
#error "WEIGHTS_TRANSPOSED should be defined"
#endif

#ifndef ACCUMULATE_GRADIENT
#error "ACCUMULATE_GRADIENT should be defined"
#endif
// Gradient-accumulation modes (legal values of ACCUMULATE_GRADIENT);
// consumed by gradient_descent.store_output.cu.
#define OP_NONE 0
#define OP_STORE 1
#define OP_ADD 2
#define OP_FLUSH 3
#if (ACCUMULATE_GRADIENT >= 0) && (ACCUMULATE_GRADIENT <= 3)
// All Ok
#else
#error "Incorrect ACCUMULATE_GRADIENT"
#endif

#ifndef APPLY_GRADIENT
#error "APPLY_GRADIENT should be defined"
#endif


// Combined L1/L2 regularization step:
// factor * ((1 - l1_vs_l2) * weight + 0.5 * l1_vs_l2 * sign(weight)).
#define gradient_step_l12(weight, factor, l1_vs_l2) (factor * (((dtype)1.0 - l1_vs_l2) * weight + (dtype)0.5 * l1_vs_l2 * SIGN(weight)))


#endif  // _GRADIENT_DESCENT_COMMON_
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#include "defines.cu"


/// @brief Scales the backpropagated error by the activation derivative.
/// @details err_y *= 1.0 - exp(-y); one thread per element of the
///          BATCH x Y error matrix, 1D launch.
extern "C"
__global__ void err_y_update(dtype *err_y, const dtype *y) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < (Y * BATCH)) {
    err_y[i] *= (dtype)1.0 - exp(-y[i]);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#include "defines.cu"


/// Strict ReLU back propagation.
/// @brief Zeroes the backpropagated error wherever the forward output was
///        non-positive: err_y *= (y > 0) ? 1 : 0.
/// @details One thread per element of the BATCH x Y error matrix, 1D launch.
extern "C"
__global__ void err_y_update(dtype *err_y, const dtype *y) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < (Y * BATCH)) {
    if (y[i] <= 0) {
      err_y[i] = 0;
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#include "defines.cu"


/// @brief Scales the backpropagated error by the (scaled-tanh) activation
///        derivative approximation.
/// @details err_y *= y * y * (-0.388484177) + 1.14381894, where y is the
///          forward output; one thread per element of BATCH x Y, 1D launch.
extern "C"
__global__ void err_y_update(dtype *err_y, const dtype *y) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < (Y * BATCH)) {
    dtype out = y[i];
    err_y[i] *= out * out * (dtype)(-0.388484177) + (dtype)1.14381894;
  }
}
Oops, something went wrong.