From 0df6d0b34525d01195970fd08642d17bef6c8ab9 Mon Sep 17 00:00:00 2001
From: Patrick Flick <patrick.flick@gmail.com>
Date: Fri, 28 Jul 2017 14:59:44 -0400
Subject: [PATCH] custom addinplace kernel and misc updates

---
 Makefile           |  7 +++++--
 alexnet.cpp        |  4 ++--
 benchmark_wino.cpp |  3 ++-
 layerwise.cpp      |  2 +-
 main.cpp           |  3 +++
 multi_layers.hpp   | 20 ++++++++++++++++++++
 utils.hpp          |  2 ++
 7 files changed, 35 insertions(+), 6 deletions(-)
diff --git a/Makefile b/Makefile
index 0970446..13d6c5f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,14 @@
 ROCM_PATH?= $(wildcard /opt/rocm)
 HIP_PATH?= $(wildcard /opt/rocm/hip)
 HIPCC=$(HIP_PATH)/bin/hipcc
-INCLUDE_DIRS=-I$(HIP_PATH)/include -I$(ROCM_PATH)/include
+INCLUDE_DIRS=-I$(HIP_PATH)/include -I$(ROCM_PATH)/include -I$(ROCM_PATH)/hipblas/include
 LD_FLAGS=-L$(ROCM_PATH)/lib -L$(ROCM_PATH)/opencl/lib/x86_64 -lMIOpen -lOpenCL -lmiopengemm -lhipblas-hcc -lrocblas-hcc
 TARGET=--amdgpu-target=gfx900
+LAYER_TIMING=1
+
+#HIPCC_FLAGS=-g -Wall $(CXXFLAGS) $(TARGET) $(INCLUDE_DIRS)
+HIPCC_FLAGS=-g -O3 -Wall -DLAYER_TIMING=$(LAYER_TIMING) $(CXXFLAGS) $(TARGET) $(INCLUDE_DIRS)
 
-HIPCC_FLAGS=-g -Wall $(CXXFLAGS) $(TARGET) $(INCLUDE_DIRS)
 
 all: alexnet resnet benchmark_wino layerwise gputop
 
diff --git a/alexnet.cpp b/alexnet.cpp
index 7e5125d..2170aad 100644
--- a/alexnet.cpp
+++ b/alexnet.cpp
@@ -38,7 +38,7 @@ void alexNet() {
     /* classifier */
     Sequential classifier(features.getOutputDesc());
     // TODO Dropout
-    classifier.reshape(128, 256 * 6 * 6, 1, 1);
+    classifier.reshape(input_dim.n, 256 * 6 * 6, 1, 1);
     classifier.addLinear(4096);
     classifier.addReLU();
     // TODO: Dropout
@@ -51,7 +51,7 @@ void alexNet() {
     m.add(classifier);
 
     BenchmarkLogger::new_session("alex_net");
-    BenchmarkLogger::benchmark(m);
+    BenchmarkLogger::benchmark(m, 50);
 }
 
 
diff --git a/benchmark_wino.cpp b/benchmark_wino.cpp
index 04ace42..6cfa101 100644
--- a/benchmark_wino.cpp
+++ b/benchmark_wino.cpp
@@ -12,13 +12,14 @@ int main(int argc, char *argv[])
 
     // batch_size, w, h, channels_in, channels_out, kernel_size, padding, stride
     ConvLayerDesc l({128, 64, 64, 64, 128, 3, 1, 1});
+    //ConvLayerDesc l({128, 64, 64, 64, 128, 9, 0, 1});
     TensorDesc input_dim(l.batch_size, l.channels_in, l.height, l.width);
     Model m(input_dim);
     m.emplace<ConvLayer>(l.channels_out, l.kernel_size, l.padding, l.stride);
 
     // benchmark fwd
     BenchmarkLogger::new_session("wino_conv");
-    BenchmarkLogger::fwd_layer_benchmark(m, 5000);
+    BenchmarkLogger::fwd_layer_benchmark(m, 100000);
 
     return 0;
 }
diff --git a/layerwise.cpp b/layerwise.cpp
index 7fca287..f878223 100644
--- a/layerwise.cpp
+++ b/layerwise.cpp
@@ -25,7 +25,7 @@ void benchmark_convlayers() {
 
 
     int layer = 5;
-    int reps = 10;
+    int reps = 50;
     BenchmarkLogger::new_session("conv_layers");
     for (ConvLayerDesc& l : runs) {
         std::stringstream ss;
diff --git a/main.cpp b/main.cpp
index 683c1d3..e84182c 100644
--- a/main.cpp
+++ b/main.cpp
@@ -29,6 +29,7 @@ int main(int argc, char *argv[])
     // enable profiling
     CHECK_MIO(miopenEnableProfiling(mio::handle(), true));
 
+    /*
     TensorDesc input(32, 3, 8, 8);
     Model m(input);
     m.emplace<BatchNorm>();
@@ -37,6 +38,8 @@ int main(int argc, char *argv[])
     for (int i = 0; i < 10; ++i) {
         m.forward();
     }
+    */
+    check_add();
 
     miopenDestroy(mio::handle());
     return 0;
diff --git a/multi_layers.hpp b/multi_layers.hpp
index 5873b84..3d0f553 100644
--- a/multi_layers.hpp
+++ b/multi_layers.hpp
@@ -1,6 +1,8 @@
 #ifndef MULTI_LAYERS_HPP
 #define MULTI_LAYERS_HPP
 
+#include <hip/hip_runtime.h>
+
 #include "tensor.hpp"
 #include "function.hpp"
 #include "utils.hpp"
@@ -205,10 +207,28 @@ struct Model : public Sequential {
 
 
 // implements x += y
+/*
 void add_inplace(Tensor& x, const Tensor& y) {
     float alpha1 = 1.f, alpha2 = 1.f, beta = 0.f;
     miopenOpTensor(mio::handle(), miopenTensorOpAdd, &alpha1, x.desc, x.data, &alpha2, y.desc, y.data, &beta, x.desc, x.data);
 }
+*/
+
+__global__ void addinplace_kernel(hipLaunchParm lp, float* x, const float* y, size_t N) {
+    size_t offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+    size_t stride = hipBlockDim_x * hipGridDim_x;
+
+    for (size_t i = offset; i < N; i+= stride) {
+        x[i] = x[i] + y[i];
+    }
+}
+
+void add_inplace(Tensor& x, const Tensor& y) {
+    unsigned int blocks = 512;
+    unsigned int threadsPerBlock = 256;
+    assert(x.data_size == y.data_size);
+    hipLaunchKernel(addinplace_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0, (float*)x.data, (float*)y.data, x.data_size/4);
+}
 
 struct ShortCutAdd : public Function {
     // Implements Residual Shortcutting: y = F(x) + x
diff --git a/utils.hpp b/utils.hpp
index b82e51e..476f216 100644
--- a/utils.hpp
+++ b/utils.hpp
@@ -137,11 +137,13 @@ struct BenchmarkLogger : public Timer {
 
     using Timer::toc;
     void toc(Function& f, bool bwd) {
+#if LAYER_TIMING == 1
         CHECK_HIP(hipDeviceSynchronize());
         float dur = this->toc();
         std::stringstream ss;
         ss << f;
         log_step(ss.str(), bwd, dur);
+#endif
     }
 
     void toc(const std::string& s, bool bwd) {