Skip to content

Commit

Permalink
custom addinplace kernel and misc updates
Browse files Browse the repository at this point in the history
  • Loading branch information
patflick committed Jul 28, 2017
1 parent 70abeae commit 0df6d0b
Show file tree
Hide file tree
Showing 7 changed files with 35 additions and 6 deletions.
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
ROCM_PATH?= $(wildcard /opt/rocm)
HIP_PATH?= $(wildcard /opt/rocm/hip)
HIPCC=$(HIP_PATH)/bin/hipcc
INCLUDE_DIRS=-I$(HIP_PATH)/include -I$(ROCM_PATH)/include
INCLUDE_DIRS=-I$(HIP_PATH)/include -I$(ROCM_PATH)/include -I$(ROCM_PATH)/hipblas/include
LD_FLAGS=-L$(ROCM_PATH)/lib -L$(ROCM_PATH)/opencl/lib/x86_64 -lMIOpen -lOpenCL -lmiopengemm -lhipblas-hcc -lrocblas-hcc
TARGET=--amdgpu-target=gfx900
LAYER_TIMING=1

#HIPCC_FLAGS=-g -Wall $(CXXFLAGS) $(TARGET) $(INCLUDE_DIRS)
HIPCC_FLAGS=-g -O3 -Wall -DLAYER_TIMING=$(LAYER_TIMING) $(CXXFLAGS) $(TARGET) $(INCLUDE_DIRS)

HIPCC_FLAGS=-g -Wall $(CXXFLAGS) $(TARGET) $(INCLUDE_DIRS)

all: alexnet resnet benchmark_wino layerwise gputop

Expand Down
4 changes: 2 additions & 2 deletions alexnet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ void alexNet() {
/* classifier */
Sequential classifier(features.getOutputDesc());
// TODO Dropout
classifier.reshape(128, 256 * 6 * 6, 1, 1);
classifier.reshape(input_dim.n, 256 * 6 * 6, 1, 1);
classifier.addLinear(4096);
classifier.addReLU();
// TODO: Dropout
Expand All @@ -51,7 +51,7 @@ void alexNet() {
m.add(classifier);

BenchmarkLogger::new_session("alex_net");
BenchmarkLogger::benchmark(m);
BenchmarkLogger::benchmark(m, 50);
}


Expand Down
3 changes: 2 additions & 1 deletion benchmark_wino.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@ int main(int argc, char *argv[])

// batch_size, w, h, channels_in, channels_out, kernel_size, padding, stride
ConvLayerDesc l({128, 64, 64, 64, 128, 3, 1, 1});
//ConvLayerDesc l({128, 64, 64, 64, 128, 9, 0, 1});
TensorDesc input_dim(l.batch_size, l.channels_in, l.height, l.width);
Model m(input_dim);
m.emplace<ConvLayer>(l.channels_out, l.kernel_size, l.padding, l.stride);

// benchmark fwd
BenchmarkLogger::new_session("wino_conv");
BenchmarkLogger::fwd_layer_benchmark(m, 5000);
BenchmarkLogger::fwd_layer_benchmark(m, 100000);

return 0;
}
2 changes: 1 addition & 1 deletion layerwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ void benchmark_convlayers() {


int layer = 5;
int reps = 10;
int reps = 50;
BenchmarkLogger::new_session("conv_layers");
for (ConvLayerDesc& l : runs) {
std::stringstream ss;
Expand Down
3 changes: 3 additions & 0 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ int main(int argc, char *argv[])
// enable profiling
CHECK_MIO(miopenEnableProfiling(mio::handle(), true));

/*
TensorDesc input(32, 3, 8, 8);
Model m(input);
m.emplace<BatchNorm>();
Expand All @@ -37,6 +38,8 @@ int main(int argc, char *argv[])
for (int i = 0; i < 10; ++i) {
m.forward();
}
*/
check_add();

miopenDestroy(mio::handle());
return 0;
Expand Down
20 changes: 20 additions & 0 deletions multi_layers.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#ifndef MULTI_LAYERS_HPP
#define MULTI_LAYERS_HPP

#include <hip/hip_runtime.h>

#include "tensor.hpp"
#include "function.hpp"
#include "utils.hpp"
Expand Down Expand Up @@ -205,10 +207,28 @@ struct Model : public Sequential {


// implements x += y
/*
void add_inplace(Tensor& x, const Tensor& y) {
float alpha1 = 1.f, alpha2 = 1.f, beta = 0.f;
miopenOpTensor(mio::handle(), miopenTensorOpAdd, &alpha1, x.desc, x.data, &alpha2, y.desc, y.data, &beta, x.desc, x.data);
}
*/

// Element-wise in-place addition on the GPU: x[i] += y[i] for i in [0, N).
// Each thread begins at its global index and advances by the total number
// of launched threads, so any launch configuration covers arbitrary N.
__global__ void addinplace_kernel(hipLaunchParm lp, float* x, const float* y, size_t N) {
    size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    size_t total_threads = hipBlockDim_x * hipGridDim_x;

    for (size_t i = idx; i < N; i += total_threads) {
        x[i] += y[i];
    }
}

// Implements x += y by launching the custom element-wise kernel above.
// Both tensors must have the same byte size; elements are treated as float.
void add_inplace(Tensor& x, const Tensor& y) {
    // Validate sizes before doing any launch setup.
    assert(x.data_size == y.data_size);
    // Fixed launch configuration; the kernel's stride loop handles
    // element counts larger than blocks * threadsPerBlock.
    unsigned int blocks = 512;
    unsigned int threadsPerBlock = 256;
    // data_size is in bytes (the original code divided by 4);
    // sizeof(float) makes that element-size conversion explicit.
    size_t N = x.data_size / sizeof(float);
    hipLaunchKernel(addinplace_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                    (float*)x.data, (const float*)y.data, N);
}

struct ShortCutAdd : public Function {
// Implements Residual Shortcutting: y = F(x) + x
Expand Down
2 changes: 2 additions & 0 deletions utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,13 @@ struct BenchmarkLogger : public Timer {

using Timer::toc;
// Record one timing sample for layer `f`; `bwd` marks it as a backward
// pass. The whole body is compiled out unless LAYER_TIMING == 1.
void toc(Function& f, bool bwd) {
#if LAYER_TIMING == 1
// Drain all queued GPU work first so the measured duration is complete.
CHECK_HIP(hipDeviceSynchronize());
float elapsed = this->toc();
// The layer's operator<< supplies its display name for the log entry.
std::stringstream label;
label << f;
log_step(label.str(), bwd, elapsed);
#endif
}

void toc(const std::string& s, bool bwd) {
Expand Down

0 comments on commit 0df6d0b

Please sign in to comment.