-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Separated gradient into current gradient, accumulated_gradient, gradient_with_moment.

Change-Id: Ieaa22b49da90047960474fbaae822b5f99da01d2
- Loading branch information
Showing
32 changed files
with
633 additions
and
226 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,23 +1,29 @@ | ||
#ifndef OUTPUT_SIZE
#error "OUTPUT_SIZE must be defined"
#endif

#ifndef OUTPUT_SAMPLE_SIZE
#error "OUTPUT_SAMPLE_SIZE must be defined"
#endif

/// @brief Adds bias to the layer output and applies the activation function.
/// @param output Layer output, OUTPUT_SIZE elements; updated in place.
/// @param bias Per-neuron bias, OUTPUT_SAMPLE_SIZE elements
///             (read only when INCLUDE_BIAS > 0).
/// @details One thread per output element over a 1D grid; threads past
///          OUTPUT_SIZE exit via the bounds check.  Exactly one of the
///          ACTIVATION_* macros must be set to a positive value.
extern "C"
__global__ void apply_bias_with_activation(dtype *output, const dtype *bias) {
  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < OUTPUT_SIZE) {
    dtype y = output[idx];
#if INCLUDE_BIAS > 0
    // Output is laid out sample-major, so the neuron index is the offset
    // within one sample.
    y += bias[idx % OUTPUT_SAMPLE_SIZE];
#endif
#if ACTIVATION_LINEAR > 0
    output[idx] = y;
#elif ACTIVATION_TANH > 0
    // Scaled tanh; 0.6666 approximates 2/3 — kept as-is so outputs stay
    // bit-compatible with previously trained models.
    output[idx] = (dtype)1.7159 * tanh((dtype)0.6666 * y);
#elif ACTIVATION_RELU > 0
    // Softplus: log(1 + exp(y)).  log1p(exp(y)) is more accurate than
    // log(exp(y) + 1) when exp(y) is tiny (very negative y); for y > 15
    // softplus is indistinguishable from y and exp(y) could overflow
    // in single precision, so y is returned directly.
    output[idx] = y > 15 ? y : log1p(exp(y));
#elif ACTIVATION_STRICT_RELU > 0
    output[idx] = max(y, (dtype)0.0);
#else
#error "Unsupported activation"
#endif
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#include "gradient_descent_common.cu"


/// @brief Computes the bias gradient and applies the bias update.
/// @param err_output Backpropagated error, BATCH x Y.
/// @param bias Layer bias, Y elements; written when APPLY_GRADIENT > 0.
/// @param gradient Storage for the current (raw) gradient, Y elements.
/// @param accumulated_gradient Gradient accumulator
///        (behavior selected by ACCUMULATE_GRADIENT).
/// @param gradient_with_moment Accumulated gradient with moments.
/// @param lr Learning rate.
/// @param factor_l12 L1/L2 regularization factor.
/// @param l1_vs_l2 How much to prefer L1 over L2 (from [0, 1]).
/// @param moment Moment for the gradient.
/// @details Should be defined externally:
///          REDUCE_SIZE - size of the block for matrix reduce,
///          BATCH - minibatch size,
///          Y - output size.
extern "C"
__global__ void bias_update(const dtype *err_output,
                            dtype *bias,
                            dtype *gradient,
                            dtype *accumulated_gradient,
                            dtype *gradient_with_moment,
                            const dtype lr,
                            const dtype factor_l12,
                            const dtype l1_vs_l2,
                            const dtype moment) {

// Reduce err_output over the batch dimension, one column per block
// (A_COL selects column-wise reduction in matrix_reduce.cu).
#define A err_output
#define A_WIDTH Y
#define A_HEIGHT BATCH
#define A_COL

#include "matrix_reduce.cu"

#undef A_COL
#undef A_HEIGHT
#undef A_WIDTH
#undef A

// Stores the reduced gradient and updates the bias.
#include "bias_update.store_output.cu"
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#include "gradient_descent_common.cu"


#if USE_ORTHO > 0
#include "weights_ortho.cu"
#endif


/// @brief Applies a precomputed weight gradient to the weights.
/// @param _err_output Backpropagated error (not referenced here — presumably
///                    kept so all weights_update variants share a signature).
/// @param _input Forward input (not referenced here; same reason).
/// @param weights Weights, H * Y elements.
/// @param gradient Precomputed raw gradient, H * Y elements.
/// @param accumulated_gradient Gradient accumulator
///        (behavior selected by ACCUMULATE_GRADIENT).
/// @param gradient_with_moment Accumulated gradient with moments.
/// @param lr Learning rate.
/// @param factor_l12 L1/L2 regularization factor.
/// @param l1_vs_l2 How much to prefer L1 over L2 (from [0, 1]).
/// @param moment Moment for the gradient.
/// @details One thread per weight over a 1D grid; H and Y must be defined
///          externally.
extern "C"
__global__ void weights_update(const dtype *_err_output,
                               const dtype *_input,
                               dtype *weights,
                               const dtype *gradient,
                               dtype *accumulated_gradient,
                               dtype *gradient_with_moment,
                               const dtype lr,
                               const dtype factor_l12,
                               const dtype l1_vs_l2,
                               const dtype moment
#if USE_ORTHO > 0
                               , const dtype factor_ortho,
                               const dtype *col_sums
#endif
                               ) {
  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < (H * Y)) {
    // The included epilogue consumes `sum`, `idx` and `weights`.
    dtype sum = gradient[idx];
#include "weights_update.store_output.cu"
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
// Epilogue of bias_update(), included after matrix_reduce.cu.
// Expects in scope: tx/bx (thread/block ids from the reduce), AS[] (shared
// partial sums), and `sum` (this thread's partial) — see matrix_reduce.cu.
// Only the first thread of each block commits the result for neuron bx.
if (!tx) {
  sum += AS[0];
  dtype weight = bias[bx];
  dtype gd = -lr * (sum + gradient_step_l12(weight, factor_l12, l1_vs_l2));
  // gradient_descent.store_output.cu is written in terms of `weights`/`idx`;
  // alias them to this kernel's `bias`/`bx`.
  #define weights bias
  #define idx bx
  gradient[idx] = sum;
  #include "gradient_descent.store_output.cu"
  #undef idx
  #undef weights
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,47 +1,61 @@ | ||
#ifndef SAMPLE_SIZE
#error "SAMPLE_SIZE should be defined"
#endif
#ifndef MAX_MINIBATCH_SIZE
#error "MAX_MINIBATCH_SIZE should be defined"
#endif


/// @brief Copies shuffled samples (and, optionally, labels) into the
///        minibatch buffers.
/// @details One thread per minibatch element over a 1D grid covering
///          MAX_MINIBATCH_SIZE * SAMPLE_SIZE elements.  Samples at or past
///          `count` are zero-filled, with labels and indices set to -1.
extern "C"
__global__ void fill_minibatch_data_labels(
    const original_data_dtype *original_data,
    minibatch_data_dtype *minibatch_data,
    const int start_offset,
    const int count,
#if LABELS > 0
    const int *original_labels,
    int *minibatch_labels,
#endif
    const int *shuffled_indices,
    int *minibatch_indices) {

  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int sample_number = idx / SAMPLE_SIZE;
  int real_sample_number = sample_number < count ? shuffled_indices[start_offset + sample_number] : -1;

  int offs_in_sample = idx % SAMPLE_SIZE;
  int offs_in_data = real_sample_number * SAMPLE_SIZE + offs_in_sample;
  int offs_in_minibatch = sample_number * SAMPLE_SIZE + offs_in_sample;

  if (idx < (MAX_MINIBATCH_SIZE * SAMPLE_SIZE)) {
    // Zero-pad the tail of a partial minibatch; original_data is only read
    // when sample_number < count, so the negative offs_in_data of padding
    // threads is never dereferenced.
    minibatch_data[offs_in_minibatch] = sample_number < count ? (minibatch_data_dtype)original_data[offs_in_data] : 0;
#if LABELS > 0
    // All SAMPLE_SIZE threads of a sample store the same value here —
    // redundant but benign.
    minibatch_labels[sample_number] = sample_number < count ? original_labels[real_sample_number] : -1;
#endif
    minibatch_indices[sample_number] = real_sample_number;
  }
}
|
||
|
||
#if TARGET > 0
/// @brief Copies shuffled target vectors into the minibatch target buffer.
/// @details One thread per target element over a 1D grid covering
///          MAX_MINIBATCH_SIZE * TARGET_SIZE elements; targets of samples
///          at or past `count` are zero-filled.
extern "C"
__global__ void fill_minibatch_target(
    const original_target_dtype *original_target,
    minibatch_target_dtype *minibatch_target,
    const int start_offset,
    const int count,
    int *shuffled_indices) {

  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // FIX: the flat index must be decomposed with the per-sample stride
  // actually used for the offsets, TARGET_SIZE.  The committed version
  // divided/guarded by SAMPLE_SIZE and kept the old 2D expression
  // blockDim.y * blockIdx.y + threadIdx.y for offs_in_sample, which is
  // always 0 under a 1D launch — only element 0 of each target was copied.
  int sample_number = idx / TARGET_SIZE;
  int real_sample_number = sample_number < count ? shuffled_indices[start_offset + sample_number] : -1;

  int offs_in_sample = idx % TARGET_SIZE;
  int offs_in_target = real_sample_number * TARGET_SIZE + offs_in_sample;
  int offs_in_minibatch = sample_number * TARGET_SIZE + offs_in_sample;

  if (idx < (MAX_MINIBATCH_SIZE * TARGET_SIZE)) {
    // original_target is only read when sample_number < count, so the
    // negative offs_in_target of padding threads is never dereferenced.
    minibatch_target[offs_in_minibatch] = sample_number < count ? (minibatch_target_dtype)original_target[offs_in_target] : 0;
  }
}
#endif
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
// Shared epilogue for weight/bias updates, included from the update kernels.
// Expects in scope: idx (element index), weights (parameter array),
// weight (current parameter value, weights[idx]), gd (gradient step just
// computed for this element), accumulated_gradient, gradient_with_moment,
// moment.

// Fold gd into the accumulator according to the compile-time mode.
#if ACCUMULATE_GRADIENT == OP_STORE
accumulated_gradient[idx] = gd;
#elif ACCUMULATE_GRADIENT == OP_ADD
accumulated_gradient[idx] += gd;
#elif ACCUMULATE_GRADIENT == OP_FLUSH
// Consume everything accumulated so far and reset the accumulator.
gd += accumulated_gradient[idx];
accumulated_gradient[idx] = 0;
#endif

#ifndef USE_MOMENT
#error "USE_MOMENT should be defined"
#endif
#if USE_MOMENT > 0
// Classical momentum: new step = gd + moment * previous step.
gd += gradient_with_moment[idx] * moment;
gradient_with_moment[idx] = gd;
#endif

#if APPLY_GRADIENT > 0
weights[idx] = weight + gd;
#endif
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#ifndef _GRADIENT_DESCENT_COMMON_
#define _GRADIENT_DESCENT_COMMON_

#include "defines.cu"

#ifndef WEIGHTS_TRANSPOSED
#error "WEIGHTS_TRANSPOSED should be defined"
#endif

#ifndef ACCUMULATE_GRADIENT
#error "ACCUMULATE_GRADIENT should be defined"
#endif
// Gradient-accumulation modes (legal values of ACCUMULATE_GRADIENT);
// consumed by gradient_descent.store_output.cu.
#define OP_NONE 0
#define OP_STORE 1
#define OP_ADD 2
#define OP_FLUSH 3
#if (ACCUMULATE_GRADIENT >= 0) && (ACCUMULATE_GRADIENT <= 3)
// All Ok
#else
#error "Incorrect ACCUMULATE_GRADIENT"
#endif

#ifndef APPLY_GRADIENT
#error "APPLY_GRADIENT should be defined"
#endif


// Combined L1/L2 regularization step:
// factor * ((1 - l1_vs_l2) * weight + 0.5 * l1_vs_l2 * sign(weight)).
#define gradient_step_l12(weight, factor, l1_vs_l2) (factor * (((dtype)1.0 - l1_vs_l2) * weight + (dtype)0.5 * l1_vs_l2 * SIGN(weight)))


#endif  // _GRADIENT_DESCENT_COMMON_
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#include "defines.cu"


/// @brief Scales the backpropagated error by the activation derivative.
/// @details err_y *= 1.0 - exp(-y); one thread per element of the
///          BATCH x Y error matrix, 1D launch.
extern "C"
__global__ void err_y_update(dtype *err_y, const dtype *y) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < (Y * BATCH)) {
    err_y[i] *= (dtype)1.0 - exp(-y[i]);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#include "defines.cu"


/// Strict ReLU back propagation.
/// @brief Zeroes the backpropagated error wherever the forward output was
///        non-positive: err_y *= (y > 0) ? 1 : 0.
/// @details One thread per element of the BATCH x Y error matrix, 1D launch.
extern "C"
__global__ void err_y_update(dtype *err_y, const dtype *y) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < (Y * BATCH)) {
    if (y[i] <= 0) {
      err_y[i] = 0;
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#include "defines.cu"


/// @brief Scales the backpropagated error by the (scaled-tanh) activation
///        derivative approximation.
/// @details err_y *= y * y * (-0.388484177) + 1.14381894, where y is the
///          forward output; one thread per element of BATCH x Y, 1D launch.
extern "C"
__global__ void err_y_update(dtype *err_y, const dtype *y) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < (Y * BATCH)) {
    dtype out = y[i];
    err_y[i] *= out * out * (dtype)(-0.388484177) + (dtype)1.14381894;
  }
}
Oops, something went wrong.