Skip to content

Commit

Permalink
Separated gradient into current gradient, accumulated_gradient, gradient_with_moments
Browse files Browse the repository at this point in the history

Change-Id: Ieaa22b49da90047960474fbaae822b5f99da01d2
  • Loading branch information
ajkxyz committed Jan 15, 2015
1 parent 6b5bdd9 commit 535293c
Show file tree
Hide file tree
Showing 32 changed files with 633 additions and 226 deletions.
45 changes: 27 additions & 18 deletions all2all.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,21 +146,27 @@ def initialize(self, device, **kwargs):

def cuda_init(self):
dtype = self.input.dtype
self.gemm = (cublas.CUBLAS.sgemm if dtype == numpy.float32
else cublas.CUBLAS.dgemm)
self.gemm_ = (cublas.CUBLAS.sgemm if dtype == numpy.float32
else cublas.CUBLAS.dgemm)
self.np_one = numpy.ones(1, dtype=dtype)
self.np_zero = numpy.zeros(1, dtype=dtype)
if self.weights_transposed:
raise NotImplementedError("TODO(a.kazantsev): implement")
self._A_ = self.input.devmem
self._B_ = self.weights.devmem
self._transA = cublas.CUBLAS_OP_T
self._transB = cublas.CUBLAS_OP_N
self._rowsCountA = self.input.shape[0]
self._columnCountB = self.weights.shape[0]
else:
self.A = self.weights.devmem
self.B = self.input.devmem
self.transA = cublas.CUBLAS_OP_T
self.transB = cublas.CUBLAS_OP_N
self.rowsCountA = self.weights.shape[0]
self.columnCountB = self.input.shape[0]
self.commonSideLength = self.input.sample_size
self._A_ = self.weights.devmem
self._B_ = self.input.devmem
self._transA = cublas.CUBLAS_OP_T
self._transB = cublas.CUBLAS_OP_N
self._rowsCountA = self.weights.shape[0]
self._columnCountB = self.input.shape[0]
self._commonSideLength = self.input.sample_size
self.build_program({"OUTPUT_SAMPLE_SIZE": self.output.sample_size,
"OUTPUT_SIZE": self.output.size,
self.s_activation: 1,
"INCLUDE_BIAS": int(self.include_bias),
"Y": self.output.sample_size},
Expand All @@ -171,8 +177,10 @@ def cuda_init(self):
if self.include_bias:
self.assign_kernel("apply_bias_with_activation")
self.set_args(self.output, self.bias)
self._global_size_bias = (self.output.sample_size,
self.output.shape[0], 1)
block_size = self.device.suggest_block_size(self._kernel_)
self._global_size_bias = (
int(numpy.ceil(self.output.size / block_size)), 1, 1)
self._local_size_bias = (block_size, 1, 1)

def ocl_init(self):
output_shape = (self.output_shape.mem.shape[1:]
Expand Down Expand Up @@ -223,13 +231,14 @@ def cuda_run(self):
self.weights.unmap()
self.bias.unmap()

self.gemm(self.device.blas, self.transA, self.transB,
self.rowsCountA, self.columnCountB, self.commonSideLength,
self.np_one, self.A, self.B,
self.np_zero, self.output.devmem)
self.gemm_(
self.device.blas, self._transA, self._transB,
self._rowsCountA, self._columnCountB, self._commonSideLength,
self.np_one, self._A_, self._B_,
self.np_zero, self.output.devmem)

if self.include_bias:
self.execute_kernel(self._global_size_bias, (1, 1, 1))
self.execute_kernel(self._global_size_bias, self._local_size_bias)

def cpu_run(self):
"""Forward propagation from batch on CPU only.
Expand Down Expand Up @@ -331,7 +340,7 @@ class All2AllSoftmax(All2All):
def __init__(self, workflow, **kwargs):
super(All2AllSoftmax, self).__init__(workflow, **kwargs)
self.max_idx = Vector()
self.reduce_size = 64
self.reduce_size = 256

def init_unpickled(self):
super(All2AllSoftmax, self).init_unpickled()
Expand Down
38 changes: 22 additions & 16 deletions cuda/all2all/forward.cu
Original file line number Diff line number Diff line change
@@ -1,23 +1,29 @@
#ifndef OUTPUT_SIZE
#error "OUTPUT_SIZE must be defined"
#endif

#ifndef OUTPUT_SAMPLE_SIZE
#error "OUTPUT_SAMPLE_SIZE must be defined"
#endif

extern "C"
__global__ void apply_bias_with_activation(dtype *output, const dtype *bias) {
size_t idx = blockIdx.y * OUTPUT_SAMPLE_SIZE + blockIdx.x;
dtype y = output[idx];
#if INCLUDE_BIAS > 0
y += bias[blockIdx.x];
#endif
#if ACTIVATION_LINEAR > 0
output[idx] = y;
#elif ACTIVATION_TANH > 0
output[idx] = (dtype)1.7159 * tanh((dtype)0.6666 * y);
#elif ACTIVATION_RELU > 0
output[idx] = y > 15 ? y : log(exp(y) + 1);
#elif ACTIVATION_STRICT_RELU > 0
output[idx] = max(y, (dtype)0.0);
#else
#error "Unsupported activation"
#endif
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < OUTPUT_SIZE) {
dtype y = output[idx];
#if INCLUDE_BIAS > 0
y += bias[idx % OUTPUT_SAMPLE_SIZE];
#endif
#if ACTIVATION_LINEAR > 0
output[idx] = y;
#elif ACTIVATION_TANH > 0
output[idx] = (dtype)1.7159 * tanh((dtype)0.6666 * y);
#elif ACTIVATION_RELU > 0
output[idx] = y > 15 ? y : log(exp(y) + 1);
#elif ACTIVATION_STRICT_RELU > 0
output[idx] = max(y, (dtype)0.0);
#else
#error "Unsupported activation"
#endif
}
}
41 changes: 41 additions & 0 deletions cuda/all2all/gradient_descent/bias_update.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#include "gradient_descent_common.cu"


/// @brief Calculate gradient for bias update.
/// @param err_output Backpropagated error (BATCH rows of Y values).
/// @param bias Layer bias.
/// @param gradient Computed raw gradient (the reduced sum is stored here
///        before learning rate and regularization are applied).
/// @param accumulated_gradient Gradient accumulated across minibatches
///        according to the ACCUMULATE_GRADIENT mode
///        (see gradient_descent_common.cu).
/// @param gradient_with_moment Accumulated gradient with moments.
/// @param lr learning_rate.
/// @param factor_l12 lnorm_factor.
/// @param l1_vs_l2 how much to prefer l1 over l2 (from [0, 1]).
/// @param moment Moment for gradient.
/// @details Should be defined externally:
/// REDUCE_SIZE - size of the block for matrix reduce,
/// BATCH - minibatch size,
/// Y - output size.
extern "C"
__global__ void bias_update(const dtype *err_output,
dtype *bias,
dtype *gradient,
dtype *accumulated_gradient,
dtype *gradient_with_moment,
const dtype lr,
const dtype factor_l12,
const dtype l1_vs_l2,
const dtype moment) {

// Configure the textual include below: reduce err_output treated as a
// Y-wide, BATCH-tall matrix. A_COL presumably selects a column-wise
// (per-bias-element) sum over the batch — matrix_reduce.cu is not
// visible here, confirm there.
#define A err_output
#define A_WIDTH Y
#define A_HEIGHT BATCH
#define A_COL

#include "matrix_reduce.cu"

#undef A_COL
#undef A_HEIGHT
#undef A_WIDTH
#undef A

// Regularization, accumulation, moment and the final bias store happen
// in this fragment; it reads tx, bx, sum and AS[] left in scope by
// matrix_reduce.cu.
#include "bias_update.store_output.cu"
}
30 changes: 30 additions & 0 deletions cuda/all2all/gradient_descent/weights_update.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include "gradient_descent_common.cu"


#if USE_ORTHO > 0
#include "weights_ortho.cu"
#endif


/// @brief Applies a precomputed weights gradient, one thread per weight
///        element (1D grid covering H * Y elements).
/// @param _err_output Backpropagated error — not referenced in the
///        visible body; presumably consumed by
///        weights_update.store_output.cu or kept for a uniform kernel
///        signature (TODO confirm).
/// @param _input Layer input — same note as _err_output.
/// @param weights Layer weights (flat array of H * Y elements).
/// @param gradient Raw gradient for the current minibatch, computed
///        beforehand (read-only here).
/// @param accumulated_gradient Gradient accumulated across minibatches
///        according to ACCUMULATE_GRADIENT.
/// @param gradient_with_moment Accumulated gradient with moments.
/// @param lr learning_rate.
/// @param factor_l12 lnorm_factor.
/// @param l1_vs_l2 how much to prefer l1 over l2 (from [0, 1]).
/// @param moment Moment for gradient.
/// @param factor_ortho Orthogonalization factor (only with USE_ORTHO).
/// @param col_sums Per-column sums for orthogonalization (only with
///        USE_ORTHO).
/// @details Should be defined externally:
/// H, Y - dimensions of the weights matrix (H * Y elements total),
/// USE_ORTHO - whether to include the orthogonalization term.
extern "C"
__global__ void weights_update(const dtype *_err_output,
const dtype *_input,
dtype *weights,
const dtype *gradient,
dtype *accumulated_gradient,
dtype *gradient_with_moment,
const dtype lr,
const dtype factor_l12,
const dtype l1_vs_l2,
const dtype moment
#if USE_ORTHO > 0
, const dtype factor_ortho,
const dtype *col_sums
#endif
) {
// Flat element index; guard against the grid tail overshooting H * Y.
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < (H * Y)) {
dtype sum = gradient[idx];
// Regularization, accumulation, moment and the final weights store
// happen in the textual include (expects idx and sum in scope).
#include "weights_update.store_output.cu"
}
}
11 changes: 11 additions & 0 deletions cuda/bias_update.store_output.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Fragment textually included at the end of bias_update() after
// matrix_reduce.cu. Relies on names defined by that include: tx (thread
// index inside the reduce block), bx (block index == bias element
// index), sum (this thread's partial sum) and AS[] (shared partials) —
// matrix_reduce.cu is not visible here, confirm the exact contract
// there.
if (!tx) {
// Thread 0 of the block finalizes the reduction for this bias element.
sum += AS[0];
dtype weight = bias[bx];
// Gradient step: negated learning rate times (raw gradient + combined
// L1/L2 regularization term from gradient_descent_common.cu).
dtype gd = -lr * (sum + gradient_step_l12(weight, factor_l12, l1_vs_l2));
// gradient_descent.store_output.cu is written in terms of "weights" and
// "idx"; alias them to bias/bx for the duration of the include.
#define weights bias
#define idx bx
// Store the raw reduced gradient (before lr and regularization).
gradient[idx] = sum;
#include "gradient_descent.store_output.cu"
#undef idx
#undef weights
}
60 changes: 37 additions & 23 deletions cuda/fullbatch_loader.cu
Original file line number Diff line number Diff line change
@@ -1,47 +1,61 @@
#ifndef SAMPLE_SIZE
#error "SAMPLE_SIZE should be defined"
#endif
#ifndef MAX_MINIBATCH_SIZE
#error "MAX_MINIBATCH_SIZE should be defined"
#endif


extern "C"
__global__ void fill_minibatch_data_labels(
const original_data_dtype /* IN */ *original_data,
minibatch_data_dtype /* OUT */ *minibatch_data,
const int /* IN */ start_offset,
const int /* IN */ count,
const original_data_dtype *original_data,
minibatch_data_dtype *minibatch_data,
const int start_offset,
const int count,
#if LABELS > 0
const int /* IN */ *original_labels,
int /* OUT */ *minibatch_labels,
const int *original_labels,
int *minibatch_labels,
#endif
const int /* IN */ *shuffled_indices,
int /* OUT */ *minibatch_indices) {
const int *shuffled_indices,
int *minibatch_indices) {

int sample_number = blockDim.x * blockIdx.x + threadIdx.x;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int sample_number = idx / SAMPLE_SIZE;
int real_sample_number = sample_number < count ? shuffled_indices[start_offset + sample_number] : -1;

int offs_in_sample = blockDim.y * blockIdx.y + threadIdx.y;
int offs_in_sample = idx % SAMPLE_SIZE;
int offs_in_data = real_sample_number * SAMPLE_SIZE + offs_in_sample;
int offs_in_minibatch = sample_number * SAMPLE_SIZE + offs_in_sample;

minibatch_data[offs_in_minibatch] = sample_number < count ? (minibatch_data_dtype)original_data[offs_in_data] : 0;
#if LABELS > 0
minibatch_labels[sample_number] = sample_number < count ? original_labels[real_sample_number] : -1;
#endif
minibatch_indices[sample_number] = real_sample_number;
if (idx < (MAX_MINIBATCH_SIZE * SAMPLE_SIZE)) {
minibatch_data[offs_in_minibatch] = sample_number < count ? (minibatch_data_dtype)original_data[offs_in_data] : 0;
#if LABELS > 0
minibatch_labels[sample_number] = sample_number < count ? original_labels[real_sample_number] : -1;
#endif
minibatch_indices[sample_number] = real_sample_number;
}
}


#if TARGET > 0
extern "C"
__global__ void fill_minibatch_target(
const original_target_dtype /* IN */ *original_target,
minibatch_target_dtype /* OUT */ *minibatch_target,
const int /* IN */ start_offset,
const int /* IN */ count,
int /* IN */ *shuffled_indices) {

int sample_number = blockDim.x * blockIdx.x + threadIdx.x;
const original_target_dtype *original_target,
minibatch_target_dtype *minibatch_target,
const int start_offset,
const int count,
int *shuffled_indices) {

int idx = blockIdx.x * blockDim.x + threadIdx.x;
int sample_number = idx / SAMPLE_SIZE;
int real_sample_number = sample_number < count ? shuffled_indices[start_offset + sample_number] : -1;

int offs_in_sample = blockDim.y * blockIdx.y + threadIdx.y;
int offs_in_target = real_sample_number * TARGET_SIZE + offs_in_sample;
int offs_in_minibatch = sample_number * TARGET_SIZE + offs_in_sample;

minibatch_target[offs_in_minibatch] = sample_number < count ? (minibatch_target_dtype)original_target[offs_in_target] : 0;
if (idx < (MAX_MINIBATCH_SIZE * SAMPLE_SIZE)) {
minibatch_target[offs_in_minibatch] = sample_number < count ? (minibatch_target_dtype)original_target[offs_in_target] : 0;
}
}
#endif
20 changes: 20 additions & 0 deletions cuda/gradient_descent.store_output.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Fragment textually included by the *_update store-output fragments.
// Expects in scope: idx (element index), gd (gradient step for this
// element), weight (current value of weights[idx]), plus the kernel
// parameters accumulated_gradient, gradient_with_moment and moment.
// The OP_* constants come from gradient_descent_common.cu.

// Accumulation across minibatches:
//   OP_STORE - overwrite the accumulator with this step,
//   OP_ADD   - add this step to the accumulator,
//   OP_FLUSH - fold the accumulator into this step and reset it,
//   OP_NONE  - leave the accumulator untouched.
#if ACCUMULATE_GRADIENT == OP_STORE
accumulated_gradient[idx] = gd;
#elif ACCUMULATE_GRADIENT == OP_ADD
accumulated_gradient[idx] += gd;
#elif ACCUMULATE_GRADIENT == OP_FLUSH
gd += accumulated_gradient[idx];
accumulated_gradient[idx] = 0;
#endif

#ifndef USE_MOMENT
#error "USE_MOMENT should be defined"
#endif
// Classic momentum: blend in the previous step, remember the result.
#if USE_MOMENT > 0
gd += gradient_with_moment[idx] * moment;
gradient_with_moment[idx] = gd;
#endif

// Apply the step to the weight only when requested; with
// APPLY_GRADIENT == 0 the kernel only computes/accumulates gradients.
#if APPLY_GRADIENT > 0
weights[idx] = weight + gd;
#endif
31 changes: 31 additions & 0 deletions cuda/gradient_descent_common.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// Common definitions shared by the gradient-descent kernels.
// All configuration macros below are expected to be supplied as
// compile-time defines by the host-side program builder.
#ifndef _GRADIENT_DESCENT_COMMON_
#define _GRADIENT_DESCENT_COMMON_

#include "defines.cu"

#ifndef WEIGHTS_TRANSPOSED
#error "WEIGHTS_TRANSPOSED should be defined"
#endif

// Accumulation mode consumed by gradient_descent.store_output.cu.
#ifndef ACCUMULATE_GRADIENT
#error "ACCUMULATE_GRADIENT should be defined"
#endif
#define OP_NONE 0
#define OP_STORE 1
#define OP_ADD 2
#define OP_FLUSH 3
#if (ACCUMULATE_GRADIENT >= 0) && (ACCUMULATE_GRADIENT <= 3)
// All Ok
#else
#error "Incorrect ACCUMULATE_GRADIENT"
#endif

#ifndef APPLY_GRADIENT
#error "APPLY_GRADIENT should be defined"
#endif


// Combined L1/L2 regularization term for one weight:
// l1_vs_l2 == 0 gives the pure L2 derivative (weight itself),
// l1_vs_l2 == 1 gives the L1 derivative scaled by 0.5 (SIGN(weight));
// SIGN presumably comes from defines.cu — confirm there.
#define gradient_step_l12(weight, factor, l1_vs_l2) (factor * (((dtype)1.0 - l1_vs_l2) * weight + (dtype)0.5 * l1_vs_l2 * SIGN(weight)))


#endif  // _GRADIENT_DESCENT_COMMON_
12 changes: 12 additions & 0 deletions cuda/gradient_descent_relu.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#include "defines.cu"


/// @brief Multiplies the backpropagated error by the derivative of the
///        soft ReLU activation (forward pass computes y = log(1 + exp(x))).
/// @details Expressed through the forward output: err_y *= 1.0 - exp(-y).
/// Should be defined externally: Y - output size, BATCH - minibatch size.
/// Launched as a 1D grid with at least Y * BATCH threads.
extern "C"
__global__ void err_y_update(dtype *err_y, const dtype *y) {
  size_t i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i >= (Y * BATCH)) {
    return;  // grid tail past the data
  }
  err_y[i] *= (dtype)1.0 - exp(-y[i]);
}
13 changes: 13 additions & 0 deletions cuda/gradient_descent_strict_relu.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#include "defines.cu"


/// Strict ReLU back propagation.
/// @brief Zeroes the backpropagated error wherever the forward output
///        was clamped to zero.
/// @details err_y *= (y > 0) ? 1 : 0
/// Should be defined externally: Y - output size, BATCH - minibatch size.
/// Launched as a 1D grid with at least Y * BATCH threads.
extern "C"
__global__ void err_y_update(dtype *err_y, const dtype *y) {
  size_t i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i >= (Y * BATCH)) {
    return;  // grid tail past the data
  }
  if (y[i] <= 0) {
    err_y[i] = 0;
  }
}
13 changes: 13 additions & 0 deletions cuda/gradient_descent_tanh.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#include "defines.cu"


/// @brief Multiplies the backpropagated error by the derivative of the
///        scaled tanh activation (forward pass computes
///        1.7159 * tanh(0.6666 * x)), expressed through the output y.
/// @details err_y *= y * y * (-0.388484177) + 1.14381894
/// Should be defined externally: Y - output size, BATCH - minibatch size.
/// Launched as a 1D grid with at least Y * BATCH threads.
extern "C"
__global__ void err_y_update(dtype *err_y, const dtype *y) {
  size_t i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i >= (Y * BATCH)) {
    return;  // grid tail past the data
  }
  dtype out = y[i];
  err_y[i] *= out * out * (dtype)(-0.388484177) + (dtype)1.14381894;
}
Loading

0 comments on commit 535293c

Please sign in to comment.