From 0df6d0b34525d01195970fd08642d17bef6c8ab9 Mon Sep 17 00:00:00 2001 From: Patrick Flick Date: Fri, 28 Jul 2017 14:59:44 -0400 Subject: [PATCH] custom addinplace kernel and misc updates --- Makefile | 7 +++++-- alexnet.cpp | 4 ++-- benchmark_wino.cpp | 3 ++- layerwise.cpp | 2 +- main.cpp | 3 +++ multi_layers.hpp | 20 ++++++++++++++++++++ utils.hpp | 2 ++ 7 files changed, 35 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 0970446..13d6c5f 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,14 @@ ROCM_PATH?= $(wildcard /opt/rocm) HIP_PATH?= $(wildcard /opt/rocm/hip) HIPCC=$(HIP_PATH)/bin/hipcc -INCLUDE_DIRS=-I$(HIP_PATH)/include -I$(ROCM_PATH)/include +INCLUDE_DIRS=-I$(HIP_PATH)/include -I$(ROCM_PATH)/include -I$(ROCM_PATH)/hipblas/include LD_FLAGS=-L$(ROCM_PATH)/lib -L$(ROCM_PATH)/opencl/lib/x86_64 -lMIOpen -lOpenCL -lmiopengemm -lhipblas-hcc -lrocblas-hcc TARGET=--amdgpu-target=gfx900 +LAYER_TIMING=1 + +#HIPCC_FLAGS=-g -Wall $(CXXFLAGS) $(TARGET) $(INCLUDE_DIRS) +HIPCC_FLAGS=-g -O3 -Wall -DLAYER_TIMING=$(LAYER_TIMING) $(CXXFLAGS) $(TARGET) $(INCLUDE_DIRS) -HIPCC_FLAGS=-g -Wall $(CXXFLAGS) $(TARGET) $(INCLUDE_DIRS) all: alexnet resnet benchmark_wino layerwise gputop diff --git a/alexnet.cpp b/alexnet.cpp index 7e5125d..2170aad 100644 --- a/alexnet.cpp +++ b/alexnet.cpp @@ -38,7 +38,7 @@ void alexNet() { /* classifier */ Sequential classifier(features.getOutputDesc()); // TODO Dropout - classifier.reshape(128, 256 * 6 * 6, 1, 1); + classifier.reshape(input_dim.n, 256 * 6 * 6, 1, 1); classifier.addLinear(4096); classifier.addReLU(); // TODO: Dropout @@ -51,7 +51,7 @@ void alexNet() { m.add(classifier); BenchmarkLogger::new_session("alex_net"); - BenchmarkLogger::benchmark(m); + BenchmarkLogger::benchmark(m, 50); } diff --git a/benchmark_wino.cpp b/benchmark_wino.cpp index 04ace42..6cfa101 100644 --- a/benchmark_wino.cpp +++ b/benchmark_wino.cpp @@ -12,13 +12,14 @@ int main(int argc, char *argv[]) // batch_size, w, h, channels_in, channels_out, kernel_size, padding, stride ConvLayerDesc l({128, 64, 64, 64, 128, 3, 1, 1}); + //ConvLayerDesc l({128, 64, 64, 64, 128, 9, 0, 1}); TensorDesc input_dim(l.batch_size, l.channels_in, l.height, l.width); Model m(input_dim); m.emplace(l.channels_out, l.kernel_size, l.padding, l.stride); // benchmark fwd BenchmarkLogger::new_session("wino_conv"); - BenchmarkLogger::fwd_layer_benchmark(m, 5000); + BenchmarkLogger::fwd_layer_benchmark(m, 100000); return 0; } diff --git a/layerwise.cpp b/layerwise.cpp index 7fca287..f878223 100644 --- a/layerwise.cpp +++ b/layerwise.cpp @@ -25,7 +25,7 @@ void benchmark_convlayers() { int layer = 5; - int reps = 10; + int reps = 50; BenchmarkLogger::new_session("conv_layers"); for (ConvLayerDesc& l : runs) { std::stringstream ss; diff --git a/main.cpp b/main.cpp index 683c1d3..e84182c 100644 --- a/main.cpp +++ b/main.cpp @@ -29,6 +29,7 @@ int main(int argc, char *argv[]) // enable profiling CHECK_MIO(miopenEnableProfiling(mio::handle(), true)); + /* TensorDesc input(32, 3, 8, 8); Model m(input); m.emplace(); @@ -37,6 +38,8 @@ int main(int argc, char *argv[]) for (int i = 0; i < 10; ++i) { m.forward(); } + */ + check_add(); miopenDestroy(mio::handle()); return 0; diff --git a/multi_layers.hpp b/multi_layers.hpp index 5873b84..3d0f553 100644 --- a/multi_layers.hpp +++ b/multi_layers.hpp @@ -1,6 +1,8 @@ #ifndef MULTI_LAYERS_HPP #define MULTI_LAYERS_HPP +#include + #include "tensor.hpp" #include "function.hpp" #include "utils.hpp" @@ -205,10 +207,28 @@ struct Model : public Sequential { // implements x += y +/* void add_inplace(Tensor& x, const Tensor& y) { float alpha1 = 1.f, alpha2 = 1.f, beta = 0.f; miopenOpTensor(mio::handle(), miopenTensorOpAdd, &alpha1, x.desc, x.data, &alpha2, y.desc, y.data, &beta, x.desc, x.data); } +*/ + +__global__ void addinplace_kernel(hipLaunchParm lp, float* x, const float* y, size_t N) { + size_t offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + size_t stride = hipBlockDim_x * hipGridDim_x; + + for (size_t i = offset; i < N; i+= stride) { + x[i] = x[i] + y[i]; + } +} + +void add_inplace(Tensor& x, const Tensor& y) { + unsigned int blocks = 512; + unsigned int threadsPerBlock = 256; + assert(x.data_size == y.data_size); + hipLaunchKernel(addinplace_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0, (float*)x.data, (float*)y.data, x.data_size/4); +} struct ShortCutAdd : public Function { // Implements Residual Shortcutting: y = F(x) + x diff --git a/utils.hpp b/utils.hpp index b82e51e..476f216 100644 --- a/utils.hpp +++ b/utils.hpp @@ -137,11 +137,13 @@ struct BenchmarkLogger : public Timer { using Timer::toc; void toc(Function& f, bool bwd) { +#if LAYER_TIMING == 1 CHECK_HIP(hipDeviceSynchronize()); float dur = this->toc(); std::stringstream ss; ss << f; log_step(ss.str(), bwd, dur); +#endif } void toc(const std::string& s, bool bwd) {