Commit 487a0fb

beta 0.2.0.9
- fix quantization tool compiling on Windows
- fix converter compiling on Windows
- fix eltwise optimization on Windows
- separate sse & avx for Windows
- add LeakyReLU support for TensorFlow
- fix reshape, const for TensorFlow
- fix dimension format error for ONNX ops
- optimize Winograd, ReLU for OpenCL
- add fp16 availability & dimension size checks for OpenCL
- optimize GEMM for arm32
- fix ExpandDims shape calculation when inputs size == 1
liqing committed Sep 1, 2019
1 parent 27f45da commit 487a0fb
Showing 92 changed files with 1,367 additions and 851 deletions.
72 changes: 44 additions & 28 deletions CMakeLists.txt
@@ -117,14 +117,14 @@ if(SYSTEM.Linux)
endif()

if(WIN32)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /MT")
set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /MT")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /MT")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /MTd")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} /MT")
set(CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL} /MT")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MD")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MDd")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /MD")
set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /MD")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /MD")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /MDd")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} /MD")
set(CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL} /MD")
set(CMAKE_CXX_STANDARD 14)
elseif(SYSTEM.Android OR SYSTEM.Linux)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math")
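Note: the block above switches MSVC's runtime library from /MT (static CRT) to /MD (DLL CRT), which is what unblocks the converter and quantization tools on Windows: every module then shares one CRT, so heap ownership and CRT state can safely cross DLL boundaries. A minimal sketch of the hazard /MD avoids; the function below stands in for any API that hands heap memory across a DLL boundary and is not MNN code:

// crt_mismatch_sketch.cpp -- illustrative only, not MNN code.
#include <cstdlib>
#include <cstring>

// Imagine this function compiled into a DLL. Under /MT the DLL statically
// links its own CRT, so this allocation lives on the DLL's private heap.
extern "C" char* dll_strdup(const char* s) {
    char* p = static_cast<char*>(std::malloc(std::strlen(s) + 1));
    if (p) std::strcpy(p, s);
    return p;
}

int main() {
    char* msg = dll_strdup("hello");
    // Freeing across modules is safe only when both share one CRT heap:
    // guaranteed under /MD (one shared CRT DLL), undefined behavior when
    // the EXE and DLL each carry a private /MT CRT.
    std::free(msg);
    return 0;
}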
@@ -134,13 +134,13 @@ elseif()
endif()

if(MNN_DEBUG)
add_definitions(-DMNN_DEBUG)
add_definitions(-DMNN_DEBUG -DDEBUG)
if(MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DDEBUG /DEBUG")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG /DEBUG")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /DEBUG")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DEBUG")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DDEBUG -g")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -g")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
endif()
else()
if (MSVC)
@@ -195,7 +195,15 @@ endif()

if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)")
add_definitions(-DMNN_USE_SSE)
set (MNN.Source_DIR ${MNN.Source_DIR} ${MNN.Path}/backend/cpu/sse)
file(GLOB AVX_Source_Files ${MNN.Path}/backend/cpu/x86_x64/avx/*.cpp)
if(WIN32 OR MSVC)
set_source_files_properties(${AVX_Source_Files} PROPERTIES COMPILE_FLAGS /arch:AVX)
else()
set_source_files_properties(${AVX_Source_Files} PROPERTIES COMPILE_FLAGS -mavx)
endif()
set(MNN.Source_DIR ${MNN.Source_DIR} ${MNN.Path}/backend/cpu/x86_x64)
set(MNN.Source_DIR ${MNN.Source_DIR} ${MNN.Path}/backend/cpu/x86_x64/sse)
set(MNN.Source_DIR ${MNN.Source_DIR} ${MNN.Path}/backend/cpu/x86_x64/avx)
endif()

# *.c
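Note: this hunk is the "separate sse & avx" change from the commit message. Only the files under avx/ are compiled with /arch:AVX (MSVC) or -mavx (GCC/Clang), so only those translation units may emit AVX instructions; the usual reason for this split is that generic code can then select a path at runtime. A self-contained sketch of that dispatch pattern, with illustrative names and both variants defined inline rather than in separate files:

// dispatch_sketch.cpp -- illustrative names, not MNN's.
#include <cstdio>

// In a split build, dot_avx would live in avx/*.cpp (built with -mavx)
// and dot_sse in sse/*.cpp; here both are plain scalar stand-ins.
float dot_sse(const float* a, const float* b, int n) {
    float s = 0.f;
    for (int i = 0; i < n; ++i) s += a[i] * b[i];
    return s;
}
float dot_avx(const float* a, const float* b, int n) {
    return dot_sse(a, b, n);  // stand-in; a real version would use 256-bit vectors
}

float dot(const float* a, const float* b, int n) {
#if defined(__GNUC__) || defined(__clang__)
    if (__builtin_cpu_supports("avx")) return dot_avx(a, b, n);
#endif
    return dot_sse(a, b, n);  // safe baseline everywhere
}

int main() {
    float a[4] = {1, 2, 3, 4}, b[4] = {1, 1, 1, 1};
    std::printf("%f\n", dot(a, b, 4));  // 10.0
    return 0;
}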
@@ -292,36 +300,44 @@ if(SYSTEM.Linux)
target_link_libraries(MNN pthread)
endif()

set(MNN_DEPEND MNN)
if(NOT MNN_BUILD_SHARED_LIBS)
if (BUILD_IOS OR APPLE)
set(MNN_DEPEND -all_load, ${MNN_DEPEND}, -noall_load)
elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(MNN_DEPEND -Wl,--whole-archive ${MNN_DEPEND} -Wl,--no-whole-archive)
endif()
endif()

if(MNN_VULKAN)
set(MNN_DEPEND ${MNN_DEPEND} MNN_Vulkan)
list(APPEND MNN_DEPEND MNN_Vulkan)
add_subdirectory(${MNN.Path}/backend/vulkan)
message(STATUS "[*] linking MNN with Vulkan done")
endif()
if(MNN_OPENGL)
set(MNN_DEPEND ${MNN_DEPEND} MNN_GL)
list(APPEND MNN_DEPEND MNN_GL)
add_subdirectory(${MNN.Path}/backend/opengl)
message(STATUS "[*] linking MNN with OpenGL done")
endif()
if(MNN_OPENCL)
set(MNN_DEPEND ${MNN_DEPEND} MNN_CL)
list(APPEND MNN_DEPEND MNN_CL)
add_subdirectory(${MNN.Path}/backend/opencl)
message(STATUS "[*] linking MNN with OpenCL done")
endif()
if (MNN_ARM82)
set(MNN_DEPEND ${MNN_DEPEND} MNN_Arm82)
list(APPEND MNN_DEPEND MNN_Arm82)
add_subdirectory(${MNN.Path}/backend/arm82)
message(STATUS "[*] linking MNN with ARM 82 done")
endif()

if (MSVC OR WIN32)
target_link_options(MNN PRIVATE "/IGNORE:4049,4217")
foreach(DEPEND ${MNN_DEPEND})
target_link_options(MNN PRIVATE /WHOLEARCHIVE:$<TARGET_FILE:${DEPEND}>)
target_link_libraries(MNN PRIVATE ${DEPEND})
endforeach()
set(MNN_DEPEND MNN)
elseif (NOT MNN_BUILD_SHARED_LIBS)
if (BUILD_IOS OR APPLE)
set(MNN_DEPEND -all_load, MNN, -noall_load ${MNN_DEPEND})
elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(MNN_DEPEND -Wl,--whole-archive MNN -Wl,--no-whole-archive ${MNN_DEPEND})
endif()
else()
set(MNN_DEPEND MNN ${MNN_DEPEND})
endif()

if (BUILD_IOS OR APPLE)
else()
if(MNN_OPENMP)
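Note: the new Windows branch mirrors GNU's --whole-archive with MSVC's /WHOLEARCHIVE option. Both exist for the same reason: the backend libraries register their kernels through static initializers, and a linker drops any object in a static archive that nothing references, silently losing those registrations. A simplified sketch of the self-registration pattern being protected (stand-in registry, not MNN's actual API):

// registration_sketch.cpp -- simplified stand-in, not MNN's registry API.
#include <cstdio>
#include <map>
#include <string>

std::map<std::string, int>& registry() {
    static std::map<std::string, int> r;  // function-local static dodges init-order issues
    return r;
}

struct Register {
    Register(const char* name, int id) { registry()[name] = id; }
};

// In the real build a line like this sits in a backend .cpp archived into
// MNN_CL / MNN_Vulkan; if the linker drops that object, the initializer
// never runs and the backend silently disappears.
static Register gClBackend("opencl", 0);

int main() {
    std::printf("registered backends: %zu\n", registry().size());
    return 0;
}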
@@ -367,7 +383,7 @@ endif()
if(WIN32)
target_compile_definitions(MNN PRIVATE "-DBUILDING_DLL")
target_compile_definitions(MNN PUBLIC "-D_CRT_SECURE_NO_WARNINGS")
target_compile_options(MNN PUBLIC "/wd4244" "/wd4146" "/wd4018" "/wd4267" "/wd4996" "/wd4081")
target_compile_options(MNN PUBLIC "/wd4244" "/wd4146" "/wd4018" "/wd4267" "/wd4996" "/wd4081" "/wd4251")
endif()

if(SYSTEM.Android AND NOT MNN_BUILD_FOR_ANDROID_COMMAND)
2 changes: 1 addition & 1 deletion benchmark/benchmark.cpp
@@ -64,7 +64,7 @@ std::vector<Model> findModelFiles(const char* dir) {
Model m;
m.name = ffd.cFileName;
m.model_file = std::string(dir) + "\\" + m.name;
if(INVALID_FILE_ATTRIBUTES != GetFileAttributes(m.model_file.c_str()) || GetLastError() != ERROR_FILE_NOT_FOUND) {
if(INVALID_FILE_ATTRIBUTES != GetFileAttributes(m.model_file.c_str()) && GetLastError() != ERROR_FILE_NOT_FOUND) {
models.push_back(std::move(m));
}
} while (FindNextFile(hFind, &ffd) != 0);
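Note: with `||`, any GetFileAttributes failure other than a missing file (for example, access denied) still counted as success, so unreadable paths were added to the model list; with `&&`, the attributes query itself must succeed. The common Win32 idiom needs only the first clause, as in this sketch:

// exists_sketch.cpp (Win32) -- the minimal form of the corrected check.
#include <windows.h>
#include <cstdio>

bool fileExists(const char* path) {
    // A successful attributes query is the real signal; GetLastError()
    // only refines *why* a query failed.
    return GetFileAttributesA(path) != INVALID_FILE_ATTRIBUTES;
}

int main() {
    std::printf("%d\n", fileExists("C:\\Windows\\notepad.exe"));  // likely 1
    std::printf("%d\n", fileExists("C:\\no\\such\\file.mnn"));    // 0
    return 0;
}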
1 change: 1 addition & 0 deletions schema/default/MNN.fbs
@@ -240,6 +240,7 @@ table Op {
name: string;
outputIndexes: [int];
type: OpType;
defaultDimentionFormat : MNN_DATA_FORMAT = NHWC;
}

table TensorDescribe {
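Note: giving the new field a schema default (`= NHWC`) keeps previously serialized models loadable: FlatBuffers stores nothing for a defaulted scalar, and a reader of a buffer written before the field existed gets NHWC back from the generated accessor. A dependency-free stand-in for that behavior (not the flatc-generated code):

// default_field_sketch.cpp -- stand-in for FlatBuffers default semantics.
#include <cstdio>
#include <optional>

enum MNN_DATA_FORMAT { NCHW = 0, NC4HW4 = 1, NHWC = 2 };  // illustrative values

// Mimics a generated accessor: an absent field yields the schema default.
MNN_DATA_FORMAT defaultDimentionFormat(std::optional<MNN_DATA_FORMAT> stored) {
    return stored.value_or(NHWC);
}

int main() {
    std::printf("old model: %d\n", defaultDimentionFormat(std::nullopt));  // NHWC
    std::printf("new model: %d\n", defaultDimentionFormat(NCHW));          // NCHW
    return 0;
}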
86 changes: 46 additions & 40 deletions source/backend/cpu/CPUConvolutionDepthwise.cpp
@@ -330,11 +330,6 @@ ErrorCode CPUConvolutionDepthwise::Int8Execution::onResize(const std::vector<Ten
backend()->onAcquireBuffer(&mInputTempBuffer, Backend::DYNAMIC);
backend()->onReleaseBuffer(&mInputTempBuffer, Backend::DYNAMIC);

return result;
}

ErrorCode CPUConvolutionDepthwise::Int8Execution::onExecute(const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) {
auto layer = mCommon;
auto inputTensor = inputs[0];
auto outputTensor = outputs[0];
@@ -358,7 +353,7 @@ ErrorCode CPUConvolutionDepthwise::Int8Execution::onExecute(const std::vector<Te
int padX = mPadX;
int padY = mPadY;
int weight_z_step = kernel_height * kernel_width * gIntUnit;

// Compute Mid Rect
int l = 0, t = 0, r = dst_width, b = dst_height;
for (; l * strideX - padX < 0; l++) {
@@ -373,12 +368,12 @@ ErrorCode CPUConvolutionDepthwise::Int8Execution::onExecute(const std::vector<Te
for (; (b - 1) * strideY - padY + kernel_height * dilateY > src_height && b > t; b--) {
// do nothing
}

auto postFunction = getPostFunction();
for (int i=0; i<4; ++i) {
mQuanScale[i] = mQuan->quantScale();
}

auto runBasic = [=](float* dst_z, const int8_t* src_z, const int8_t* weight_dz, const float* alpha_z, int L, int T,
int R, int B) {
for (int dy = T; dy < B; ++dy) {
@@ -399,41 +394,52 @@ ErrorCode CPUConvolutionDepthwise::Int8Execution::onExecute(const std::vector<Te
}
}
};
for (int batchIndex = 0; batchIndex < inputTensor->batch(); ++batchIndex) {
const float* srcOrigin = inputTensor->host<float>() + batchIndex * src_z_step * dst_depth_quad;
float* dstOrigin = outputTensor->host<float>() + batchIndex * dst_z_step * dst_depth_quad;

MNN_CONCURRENCY_BEGIN(dz, dst_depth_quad) {
float* dst_z_float = dstOrigin + dst_z_step * dz;
const float* src_z_float = srcOrigin + src_z_step * dz;

auto dst_z = dst_z_float;
auto src_z = (int8_t*)mInputTempBuffer.buffer().host + dz * mInputTempBuffer.buffer().dim[0].stride;

MNNFloat2Int8(src_z_float, src_z, src_z_step / 4, mQuanScale, mQuan->aMin(), mQuan->aMax());

const float* bias_z = mBias.get() + gIntUnit * dz;
const float* alpha_z = mAlpha.get() + gIntUnit * dz;
const int8_t* weight_dz = mWeight.get() + dz * weight_z_step;
runBasic(dst_z, src_z, weight_dz, alpha_z, 0, 0, dst_width, t);
runBasic(dst_z, src_z, weight_dz, alpha_z, 0, b, dst_width, dst_height);
runBasic(dst_z, src_z, weight_dz, alpha_z, 0, t, l, b);
runBasic(dst_z, src_z, weight_dz, alpha_z, r, t, dst_width, b);
if (r > l) {
for (int dy = t; dy < b; ++dy) {
float* dst_y = dst_z + dy * dst_y_step;
int srcStartY = dy * strideY - padY;
auto src_dy = src_z + srcStartY * src_y_step;
MNNConvRunForLineDepthWiseInt8(dst_y + l * 4, src_dy + (l * strideX - padX) * 4, weight_dz, r - l,
strideX * 4, kernel_width, kernel_height, dilateX_step, dilateY_step,
alpha_z);
auto aMin = mQuan->aMin();
auto aMax = mQuan->aMax();
mRun = [=]() {
for (int batchIndex = 0; batchIndex < inputTensor->batch(); ++batchIndex) {
const float* srcOrigin = inputTensor->host<float>() + batchIndex * src_z_step * dst_depth_quad;
float* dstOrigin = outputTensor->host<float>() + batchIndex * dst_z_step * dst_depth_quad;

MNN_CONCURRENCY_BEGIN(dz, dst_depth_quad) {
float* dst_z_float = dstOrigin + dst_z_step * dz;
const float* src_z_float = srcOrigin + src_z_step * dz;

auto dst_z = dst_z_float;
auto src_z = (int8_t*)mInputTempBuffer.buffer().host + dz * mInputTempBuffer.buffer().dim[0].stride;

MNNFloat2Int8(src_z_float, src_z, src_z_step / 4, mQuanScale, aMin, aMax);

const float* bias_z = mBias.get() + gIntUnit * dz;
const float* alpha_z = mAlpha.get() + gIntUnit * dz;
const int8_t* weight_dz = mWeight.get() + dz * weight_z_step;
runBasic(dst_z, src_z, weight_dz, alpha_z, 0, 0, dst_width, t);
runBasic(dst_z, src_z, weight_dz, alpha_z, 0, b, dst_width, dst_height);
runBasic(dst_z, src_z, weight_dz, alpha_z, 0, t, l, b);
runBasic(dst_z, src_z, weight_dz, alpha_z, r, t, dst_width, b);
if (r > l) {
for (int dy = t; dy < b; ++dy) {
float* dst_y = dst_z + dy * dst_y_step;
int srcStartY = dy * strideY - padY;
auto src_dy = src_z + srcStartY * src_y_step;
MNNConvRunForLineDepthWiseInt8(dst_y + l * 4, src_dy + (l * strideX - padX) * 4, weight_dz, r - l,
strideX * 4, kernel_width, kernel_height, dilateX_step, dilateY_step,
alpha_z);
}
}

postFunction(dst_z_float, bias_z, dst_width * dst_height, 1);
}

postFunction(dst_z_float, bias_z, dst_width * dst_height, 1);
MNN_CONCURRENCY_END();
}
MNN_CONCURRENCY_END();
}
};
return result;
}

ErrorCode CPUConvolutionDepthwise::Int8Execution::onExecute(const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) {

mRun();
return NO_ERROR;
}

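Note: the refactor folds the whole int8 depthwise loop into a `std::function<void()> mRun` built during onResize, so onExecute reduces to a single call. Work that depends only on tensor shapes (rect bounds, strides, quantization limits) is done once per resize instead of once per inference. A stripped-down sketch of the pattern (simplified stand-in, not MNN's Execution interface):

// resize_cache_sketch.cpp -- simplified stand-in, not MNN's Execution API.
#include <cstdio>
#include <functional>

struct Execution {
    std::function<void()> mRun;

    void onResize(int width, int height) {
        int pixels = width * height;  // shape-derived work happens once...
        mRun = [pixels]() {           // ...and is captured for the hot path
            std::printf("processing %d pixels\n", pixels);
        };
    }
    void onExecute() { mRun(); }      // no per-call shape math
};

int main() {
    Execution e;
    e.onResize(224, 224);
    for (int i = 0; i < 3; ++i) e.onExecute();  // resize once, execute many times
    return 0;
}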
1 change: 1 addition & 0 deletions source/backend/cpu/CPUConvolutionDepthwise.hpp
@@ -78,6 +78,7 @@ class CPUConvolutionDepthwise : public Execution {

Tensor mInputTempBuffer;
const IDSTQuan *mQuan;
std::function<void()> mRun;
};

CPUConvolutionDepthwise(const Op *convOp, Backend *b);
2 changes: 1 addition & 1 deletion source/backend/cpu/CPUCosineSimilarity.cpp
@@ -23,7 +23,7 @@ ErrorCode CPUCosineSimilarity::onExecute(const std::vector<Tensor*>& inputs, con
const int batchStride = x1->stride(0);
const int channel = x1->channel();
const int channleStride = x1->stride(1);
const float eps = 1e-8;
const float eps = 1e-8f;
const auto x1DataPtr = x1->host<float>();
const auto x2DataPtr = x2->host<float>();
auto outputDataPtr = output->host<float>();
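Note: the `f` suffix matters for diagnostics: `1e-8` is a double literal narrowed to float at initialization, which MSVC flags (the C4244/C4305 truncation family) at this build's warning level, while `1e-8f` is already a float. Both round to the same stored value, as this sketch checks:

// literal_sketch.cpp -- the suffix changes the diagnostic, not the value.
int main() {
    const float a = 1e-8;     // double literal narrowed to float (MSVC warns)
    const float b = 1e-8f;    // float literal, no conversion
    return (a == b) ? 0 : 1;  // same stored value here; only the diagnostic differs
}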
5 changes: 5 additions & 0 deletions source/backend/cpu/CPUDetectionOutput.cpp
@@ -5,6 +5,11 @@
// Created by MNN on 2018/07/17.
// Copyright © 2018, Alibaba Group Holding Limited
//
/* When MSVC compiles this file for x86 Release, an internal compiler error is reported because of an MSVC bug.
   Reference: https://developercommunity.visualstudio.com/comments/535612/view.html */
#if defined(_MSC_VER) && defined(_M_IX86) && !defined(_DEBUG)
#pragma optimize("", off)
#endif

#include "CPUDetectionOutput.hpp"
#include <math.h>
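Note: `#pragma optimize("", off)` is MSVC's documented escape hatch for optimizer bugs; placed at the top of the file, as here, it disables optimization for every function that follows. When the miscompiled code can be isolated, the pragma can instead be scoped, as in this sketch:

// pragma_scope_sketch.cpp -- illustrative; the commit disables optimization
// for the whole file instead, which is simpler but blunter.
#if defined(_MSC_VER) && defined(_M_IX86) && !defined(_DEBUG)
#pragma optimize("", off)   // work around the x86 Release codegen ICE
#endif

int troublesome() { return 42; }  // stand-in for the miscompiled function

#if defined(_MSC_VER) && defined(_M_IX86) && !defined(_DEBUG)
#pragma optimize("", on)    // restore the command-line /O settings afterwards
#endif

int main() { return troublesome() == 42 ? 0 : 1; }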
37 changes: 24 additions & 13 deletions source/backend/cpu/CPUReshape.cpp
@@ -14,15 +14,16 @@

namespace MNN {

CPUReshape::CPUReshape(Backend *b) : MNN::Execution(b), mStorage(2) {
// nothing to do
CPUReshape::CPUReshape(Backend *b, MNN_DATA_FORMAT midFormat) : MNN::Execution(b), mStorage(2) {
mMidFormat = midFormat;
}

ErrorCode CPUReshape::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(1 == inputs.size() || 2 == inputs.size());
MNN_ASSERT(1 == outputs.size());

auto input = inputs[0];
auto output = outputs[0];
int totalSize = 1;

mWrapTensorForInput.buffer().type = inputs[0]->buffer().type;
@@ -31,9 +32,6 @@ ErrorCode CPUReshape::onResize(const std::vector<Tensor *> &inputs, const std::v
if (TensorUtils::getDescribe(input)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
return NO_ERROR;
}
TensorUtils::getDescribe(&mWrapTensorForInput)->dimensionFormat = MNN_DATA_FORMAT_NCHW;
TensorUtils::getDescribe(&mWrapTensorForOutput)->dimensionFormat = MNN_DATA_FORMAT_NCHW;

for (int i = 0; i < input->buffer().dimensions; ++i) {
totalSize *= input->buffer().dim[i].extent;
}
@@ -44,16 +42,29 @@ ErrorCode CPUReshape::onResize(const std::vector<Tensor *> &inputs, const std::v
mStorage.buffer().type = input->getType();
backend()->onAcquireBuffer(&mStorage, Backend::DYNAMIC);
backend()->onReleaseBuffer(&mStorage, Backend::DYNAMIC);

TensorUtils::copyShape(inputs[0], &mWrapTensorForInput);

mWrapTensorForInput.buffer().host = mStorage.buffer().host;
TensorUtils::setLinearLayout(&mWrapTensorForInput);

TensorUtils::copyShape(outputs[0], &mWrapTensorForOutput);
mWrapTensorForOutput.buffer().host = mStorage.buffer().host;
if (MNN_DATA_FORMAT_NHWC == mMidFormat) {
TensorUtils::getDescribe(&mWrapTensorForInput)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
TensorUtils::getDescribe(&mWrapTensorForOutput)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
mWrapTensorForInput.buffer().dimensions = 4;
mWrapTensorForOutput.buffer().dimensions = 4;
mWrapTensorForInput.setLength(0, input->batch());
mWrapTensorForInput.setLength(1, input->height());
mWrapTensorForInput.setLength(2, input->width());
mWrapTensorForInput.setLength(3, input->channel());
mWrapTensorForOutput.setLength(0, output->batch());
mWrapTensorForOutput.setLength(1, output->height());
mWrapTensorForOutput.setLength(2, output->width());
mWrapTensorForOutput.setLength(3, output->channel());
} else {
TensorUtils::getDescribe(&mWrapTensorForInput)->dimensionFormat = MNN_DATA_FORMAT_NCHW;
TensorUtils::getDescribe(&mWrapTensorForOutput)->dimensionFormat = MNN_DATA_FORMAT_NCHW;
TensorUtils::copyShape(inputs[0], &mWrapTensorForInput);
TensorUtils::copyShape(outputs[0], &mWrapTensorForOutput);
}
TensorUtils::setLinearLayout(&mWrapTensorForInput);
TensorUtils::setLinearLayout(&mWrapTensorForOutput);

return NO_ERROR;
}

Expand All @@ -78,7 +89,7 @@ class CPUReshapeCreator : public CPUBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const MNN::Op *op, Backend *backend) const override {
return new CPUReshape(backend);
return new CPUReshape(backend, op->main_as_Reshape()->dimType());
}
};

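Note: the mid-format parameter exists because a reshape only reinterprets linearly ordered memory, so the element order the source framework assumed (channel-last NHWC for TensorFlow, channel-major NCHW for Caffe) determines the result once the NC4HW4 input is unpacked. A small standalone demonstration (not MNN code) of how the two layouts linearize the same tensor differently:

// layout_sketch.cpp -- same elements, different linear order per layout.
#include <cstdio>

int main() {
    // Fill a C=2, H=2, W=1 tensor where element (c,h) has value 10*c + h,
    // then print the flat order each layout would hand to reshape([4]).
    const int C = 2, H = 2, W = 1;
    int nchw[4], nhwc[4];
    for (int c = 0; c < C; ++c)
        for (int h = 0; h < H; ++h)
            for (int w = 0; w < W; ++w) {
                int v = 10 * c + h;
                nchw[(c * H + h) * W + w] = v;  // channel-major order
                nhwc[(h * W + w) * C + c] = v;  // channel-last order
            }
    for (int i = 0; i < 4; ++i) std::printf("%d ", nchw[i]);  // 0 1 10 11
    std::printf("\n");
    for (int i = 0; i < 4; ++i) std::printf("%d ", nhwc[i]);  // 0 10 1 11
    std::printf("\n");
    return 0;
}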
3 changes: 2 additions & 1 deletion source/backend/cpu/CPUReshape.hpp
@@ -15,7 +15,7 @@
namespace MNN {
class CPUReshape : public Execution {
public:
CPUReshape(Backend *b);
CPUReshape(Backend *b, MNN_DATA_FORMAT midFormat);
virtual ~CPUReshape() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
@@ -24,6 +24,7 @@ class CPUReshape : public Execution {
Tensor mStorage;
Tensor mWrapTensorForInput;
Tensor mWrapTensorForOutput;
MNN_DATA_FORMAT mMidFormat;
};
} // namespace MNN
#endif /* CPUReshape_hpp */
2 changes: 1 addition & 1 deletion source/backend/cpu/CPUShape.cpp
@@ -17,7 +17,7 @@ ErrorCode CPUShape::onExecute(const std::vector<Tensor*>& inputs, const std::vec

auto& ib = inputs[0]->buffer();
int32_t* outData = outputs[0]->host<int32_t>();
if (TensorUtils::getDescribe(inputs[0])->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
if (TensorUtils::getDescribe(inputs[0])->dimensionFormat == MNN_DATA_FORMAT_NC4HW4 && TensorUtils::getDescribe(outputs[0])->dimensionFormat == MNN_DATA_FORMAT_NHWC) {
outData[0] = ib.dim[0].extent;
outData[1] = ib.dim[2].extent;
outData[2] = ib.dim[3].extent;
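Note: the extra clause makes the NHWC-ordered report conditional on the output actually being consumed as NHWC: an internally NC4HW4 tensor stores N,C,H,W extents, and only a graph expecting TensorFlow-style shapes should see them permuted to N,H,W,C. A toy sketch (illustrative, not MNN code) of the two report orders:

// shape_report_sketch.cpp -- how the reported dims depend on the output format.
#include <cstdio>

int main() {
    int dims[4] = {1, 8, 32, 32};  // stored as N, C, H, W extents
    bool reportNHWC = true;        // stand-in for the output's dimensionFormat check
    if (reportNHWC)
        std::printf("%d %d %d %d\n", dims[0], dims[2], dims[3], dims[1]);  // N H W C
    else
        std::printf("%d %d %d %d\n", dims[0], dims[1], dims[2], dims[3]);  // N C H W
    return 0;
}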
