Skip to content

Commit

Permalink
custom addinplace kernel and misc updates
Browse files Browse the repository at this point in the history
  • Loading branch information
patflick committed Jul 28, 2017
1 parent 70abeae commit 0df6d0b
Show file tree
Hide file tree
Showing 7 changed files with 35 additions and 6 deletions.
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
ROCM_PATH?= $(wildcard /opt/rocm)
HIP_PATH?= $(wildcard /opt/rocm/hip)
HIPCC=$(HIP_PATH)/bin/hipcc
INCLUDE_DIRS=-I$(HIP_PATH)/include -I$(ROCM_PATH)/include
INCLUDE_DIRS=-I$(HIP_PATH)/include -I$(ROCM_PATH)/include -I$(ROCM_PATH)/hipblas/include
LD_FLAGS=-L$(ROCM_PATH)/lib -L$(ROCM_PATH)/opencl/lib/x86_64 -lMIOpen -lOpenCL -lmiopengemm -lhipblas-hcc -lrocblas-hcc
TARGET=--amdgpu-target=gfx900
LAYER_TIMING=1

#HIPCC_FLAGS=-g -Wall $(CXXFLAGS) $(TARGET) $(INCLUDE_DIRS)
HIPCC_FLAGS=-g -O3 -Wall -DLAYER_TIMING=$(LAYER_TIMING) $(CXXFLAGS) $(TARGET) $(INCLUDE_DIRS)

HIPCC_FLAGS=-g -Wall $(CXXFLAGS) $(TARGET) $(INCLUDE_DIRS)

all: alexnet resnet benchmark_wino layerwise gputop

Expand Down
4 changes: 2 additions & 2 deletions alexnet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ void alexNet() {
/* classifier */
Sequential classifier(features.getOutputDesc());
// TODO Dropout
classifier.reshape(128, 256 * 6 * 6, 1, 1);
classifier.reshape(input_dim.n, 256 * 6 * 6, 1, 1);
classifier.addLinear(4096);
classifier.addReLU();
// TODO: Dropout
Expand All @@ -51,7 +51,7 @@ void alexNet() {
m.add(classifier);

BenchmarkLogger::new_session("alex_net");
BenchmarkLogger::benchmark(m);
BenchmarkLogger::benchmark(m, 50);
}


Expand Down
3 changes: 2 additions & 1 deletion benchmark_wino.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@ int main(int argc, char *argv[])

// batch_size, w, h, channels_in, channels_out, kernel_size, padding, stride
ConvLayerDesc l({128, 64, 64, 64, 128, 3, 1, 1});
//ConvLayerDesc l({128, 64, 64, 64, 128, 9, 0, 1});
TensorDesc input_dim(l.batch_size, l.channels_in, l.height, l.width);
Model m(input_dim);
m.emplace<ConvLayer>(l.channels_out, l.kernel_size, l.padding, l.stride);

// benchmark fwd
BenchmarkLogger::new_session("wino_conv");
BenchmarkLogger::fwd_layer_benchmark(m, 5000);
BenchmarkLogger::fwd_layer_benchmark(m, 100000);

return 0;
}
2 changes: 1 addition & 1 deletion layerwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ void benchmark_convlayers() {


int layer = 5;
int reps = 10;
int reps = 50;
BenchmarkLogger::new_session("conv_layers");
for (ConvLayerDesc& l : runs) {
std::stringstream ss;
Expand Down
3 changes: 3 additions & 0 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ int main(int argc, char *argv[])
// enable profiling
CHECK_MIO(miopenEnableProfiling(mio::handle(), true));

/*
TensorDesc input(32, 3, 8, 8);
Model m(input);
m.emplace<BatchNorm>();
Expand All @@ -37,6 +38,8 @@ int main(int argc, char *argv[])
for (int i = 0; i < 10; ++i) {
m.forward();
}
*/
check_add();

miopenDestroy(mio::handle());
return 0;
Expand Down
20 changes: 20 additions & 0 deletions multi_layers.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#ifndef MULTI_LAYERS_HPP
#define MULTI_LAYERS_HPP

#include <hip/hip_runtime.h>

#include "tensor.hpp"
#include "function.hpp"
#include "utils.hpp"
Expand Down Expand Up @@ -205,10 +207,28 @@ struct Model : public Sequential {


// implements x += y
/*
void add_inplace(Tensor& x, const Tensor& y) {
float alpha1 = 1.f, alpha2 = 1.f, beta = 0.f;
miopenOpTensor(mio::handle(), miopenTensorOpAdd, &alpha1, x.desc, x.data, &alpha2, y.desc, y.data, &beta, x.desc, x.data);
}
*/

// Element-wise in-place addition on the GPU: x[i] += y[i] for i in [0, N).
// Each thread begins at its global index and advances by the total number
// of launched threads, so any launch configuration covers arbitrary N.
__global__ void addinplace_kernel(hipLaunchParm lp, float* x, const float* y, size_t N) {
    size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    size_t total_threads = hipBlockDim_x * hipGridDim_x;

    for (size_t i = idx; i < N; i += total_threads) {
        x[i] += y[i];
    }
}

// Implements x += y by launching the custom element-wise kernel above.
// Both tensors must have the same byte size; elements are treated as float.
void add_inplace(Tensor& x, const Tensor& y) {
    // Validate sizes before doing any launch setup.
    assert(x.data_size == y.data_size);
    // Fixed launch configuration; the kernel's stride loop handles
    // element counts larger than blocks * threadsPerBlock.
    unsigned int blocks = 512;
    unsigned int threadsPerBlock = 256;
    // data_size is in bytes (the original code divided by 4);
    // sizeof(float) makes that element-size conversion explicit.
    size_t N = x.data_size / sizeof(float);
    hipLaunchKernel(addinplace_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                    (float*)x.data, (const float*)y.data, N);
}

struct ShortCutAdd : public Function {
// Implements Residual Shortcutting: y = F(x) + x
Expand Down
2 changes: 2 additions & 0 deletions utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,13 @@ struct BenchmarkLogger : public Timer {

using Timer::toc;
// Record one timing sample for layer `f`; `bwd` marks it as a backward
// pass. The whole body is compiled out unless LAYER_TIMING == 1.
void toc(Function& f, bool bwd) {
#if LAYER_TIMING == 1
// Drain all queued GPU work first so the measured duration is complete.
CHECK_HIP(hipDeviceSynchronize());
float elapsed = this->toc();
// The layer's operator<< supplies its display name for the log entry.
std::stringstream label;
label << f;
log_step(label.str(), bwd, elapsed);
#endif
}

void toc(const std::string& s, bool bwd) {
Expand Down

0 comments on commit 0df6d0b

Please sign in to comment.