Commit 487a0fb

beta 0.2.0.9
- fix quantization tool compiling on Windows
- fix converter compiling on Windows
- fix eltwise optimization on Windows
- separate sse & avx for Windows
- add LeakyReLU support for TensorFlow
- fix reshape, const for TensorFlow
- fix dimension format error for ONNX ops
- optimize Winograd, ReLU for OpenCL
- add fp16 availability & dimension size checks for OpenCL
- optimize GEMM for arm32
- fix ExpandDims shape calculation when inputs size == 1
liqing committed Sep 1, 2019
1 parent 27f45da commit 487a0fb
Showing 92 changed files with 1,367 additions and 851 deletions.
72 changes: 44 additions & 28 deletions CMakeLists.txt
@@ -117,14 +117,14 @@ if(SYSTEM.Linux)
endif()

if(WIN32)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /MT")
set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /MT")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /MT")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /MTd")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} /MT")
set(CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL} /MT")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MD")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MDd")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /MD")
set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /MD")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /MD")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /MDd")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} /MD")
set(CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL} /MD")
set(CMAKE_CXX_STANDARD 14)
elseif(SYSTEM.Android OR SYSTEM.Linux)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math")
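Note: the block above switches MSVC's runtime library from /MT (static CRT) to /MD (DLL CRT), which is what unblocks the converter and quantization tools on Windows: every module then shares one CRT, so heap ownership and CRT state can safely cross DLL boundaries. A minimal sketch of the hazard /MD avoids; the function below stands in for any API that hands heap memory across a DLL boundary and is not MNN code:

// crt_mismatch_sketch.cpp -- illustrative only, not MNN code.
#include <cstdlib>
#include <cstring>

// Imagine this function compiled into a DLL. Under /MT the DLL statically
// links its own CRT, so this allocation lives on the DLL's private heap.
extern "C" char* dll_strdup(const char* s) {
    char* p = static_cast<char*>(std::malloc(std::strlen(s) + 1));
    if (p) std::strcpy(p, s);
    return p;
}

int main() {
    char* msg = dll_strdup("hello");
    // Freeing across modules is safe only when both share one CRT heap:
    // guaranteed under /MD (one shared CRT DLL), undefined behavior when
    // the EXE and DLL each carry a private /MT CRT.
    std::free(msg);
    return 0;
}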
@@ -134,13 +134,13 @@ elseif()
endif()

if(MNN_DEBUG)
add_definitions(-DMNN_DEBUG)
add_definitions(-DMNN_DEBUG -DDEBUG)
if(MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DDEBUG /DEBUG")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG /DEBUG")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /DEBUG")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DEBUG")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DDEBUG -g")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -g")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
endif()
else()
if (MSVC)
@@ -195,7 +195,15 @@ endif()

if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)")
add_definitions(-DMNN_USE_SSE)
set (MNN.Source_DIR ${MNN.Source_DIR} ${MNN.Path}/backend/cpu/sse)
file(GLOB AVX_Source_Files ${MNN.Path}/backend/cpu/x86_x64/avx/*.cpp)
if(WIN32 OR MSVC)
set_source_files_properties(${AVX_Source_Files} PROPERTIES COMPILE_FLAGS /arch:AVX)
else()
set_source_files_properties(${AVX_Source_Files} PROPERTIES COMPILE_FLAGS -mavx)
endif()
set(MNN.Source_DIR ${MNN.Source_DIR} ${MNN.Path}/backend/cpu/x86_x64)
set(MNN.Source_DIR ${MNN.Source_DIR} ${MNN.Path}/backend/cpu/x86_x64/sse)
set(MNN.Source_DIR ${MNN.Source_DIR} ${MNN.Path}/backend/cpu/x86_x64/avx)
endif()

# *.c
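Note: this hunk is the "separate sse & avx" change from the commit message. Only the files under avx/ are compiled with /arch:AVX (MSVC) or -mavx (GCC/Clang), so only those translation units may emit AVX instructions; the usual reason for this split is that generic code can then select a path at runtime. A self-contained sketch of that dispatch pattern, with illustrative names and both variants defined inline rather than in separate files:

// dispatch_sketch.cpp -- illustrative names, not MNN's.
#include <cstdio>

// In a split build, dot_avx would live in avx/*.cpp (built with -mavx)
// and dot_sse in sse/*.cpp; here both are plain scalar stand-ins.
float dot_sse(const float* a, const float* b, int n) {
    float s = 0.f;
    for (int i = 0; i < n; ++i) s += a[i] * b[i];
    return s;
}
float dot_avx(const float* a, const float* b, int n) {
    return dot_sse(a, b, n);  // stand-in; a real version would use 256-bit vectors
}

float dot(const float* a, const float* b, int n) {
#if defined(__GNUC__) || defined(__clang__)
    if (__builtin_cpu_supports("avx")) return dot_avx(a, b, n);
#endif
    return dot_sse(a, b, n);  // safe baseline everywhere
}

int main() {
    float a[4] = {1, 2, 3, 4}, b[4] = {1, 1, 1, 1};
    std::printf("%f\n", dot(a, b, 4));  // 10.0
    return 0;
}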
@@ -292,36 +300,44 @@ if(SYSTEM.Linux)
target_link_libraries(MNN pthread)
endif()

set(MNN_DEPEND MNN)
if(NOT MNN_BUILD_SHARED_LIBS)
if (BUILD_IOS OR APPLE)
set(MNN_DEPEND -all_load, ${MNN_DEPEND}, -noall_load)
elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(MNN_DEPEND -Wl,--whole-archive ${MNN_DEPEND} -Wl,--no-whole-archive)
endif()
endif()

if(MNN_VULKAN)
set(MNN_DEPEND ${MNN_DEPEND} MNN_Vulkan)
list(APPEND MNN_DEPEND MNN_Vulkan)
add_subdirectory(${MNN.Path}/backend/vulkan)
message(STATUS "[*] linking MNN with Vulkan done")
endif()
if(MNN_OPENGL)
set(MNN_DEPEND ${MNN_DEPEND} MNN_GL)
list(APPEND MNN_DEPEND MNN_GL)
add_subdirectory(${MNN.Path}/backend/opengl)
message(STATUS "[*] linking MNN with OpenGL done")
endif()
if(MNN_OPENCL)
set(MNN_DEPEND ${MNN_DEPEND} MNN_CL)
list(APPEND MNN_DEPEND MNN_CL)
add_subdirectory(${MNN.Path}/backend/opencl)
message(STATUS "[*] linking MNN with OpenCL done")
endif()
if (MNN_ARM82)
set(MNN_DEPEND ${MNN_DEPEND} MNN_Arm82)
list(APPEND MNN_DEPEND MNN_Arm82)
add_subdirectory(${MNN.Path}/backend/arm82)
message(STATUS "[*] linking MNN with ARM 82 done")
endif()

if (MSVC OR WIN32)
target_link_options(MNN PRIVATE "/IGNORE:4049,4217")
foreach(DEPEND ${MNN_DEPEND})
target_link_options(MNN PRIVATE /WHOLEARCHIVE:$<TARGET_FILE:${DEPEND}>)
target_link_libraries(MNN PRIVATE ${DEPEND})
endforeach()
set(MNN_DEPEND MNN)
elseif (NOT MNN_BUILD_SHARED_LIBS)
if (BUILD_IOS OR APPLE)
set(MNN_DEPEND -all_load, MNN, -noall_load ${MNN_DEPEND})
elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(MNN_DEPEND -Wl,--whole-archive MNN -Wl,--no-whole-archive ${MNN_DEPEND})
endif()
else()
set(MNN_DEPEND MNN ${MNN_DEPEND})
endif()

if (BUILD_IOS OR APPLE)
else()
if(MNN_OPENMP)
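Note: the new Windows branch mirrors GNU's --whole-archive with MSVC's /WHOLEARCHIVE option. Both exist for the same reason: the backend libraries register their kernels through static initializers, and a linker drops any object in a static archive that nothing references, silently losing those registrations. A simplified sketch of the self-registration pattern being protected (stand-in registry, not MNN's actual API):

// registration_sketch.cpp -- simplified stand-in, not MNN's registry API.
#include <cstdio>
#include <map>
#include <string>

std::map<std::string, int>& registry() {
    static std::map<std::string, int> r;  // function-local static dodges init-order issues
    return r;
}

struct Register {
    Register(const char* name, int id) { registry()[name] = id; }
};

// In the real build a line like this sits in a backend .cpp archived into
// MNN_CL / MNN_Vulkan; if the linker drops that object, the initializer
// never runs and the backend silently disappears.
static Register gClBackend("opencl", 0);

int main() {
    std::printf("registered backends: %zu\n", registry().size());
    return 0;
}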
@@ -367,7 +383,7 @@ endif()
if(WIN32)
target_compile_definitions(MNN PRIVATE "-DBUILDING_DLL")
target_compile_definitions(MNN PUBLIC "-D_CRT_SECURE_NO_WARNINGS")
target_compile_options(MNN PUBLIC "/wd4244" "/wd4146" "/wd4018" "/wd4267" "/wd4996" "/wd4081")
target_compile_options(MNN PUBLIC "/wd4244" "/wd4146" "/wd4018" "/wd4267" "/wd4996" "/wd4081" "/wd4251")
endif()

if(SYSTEM.Android AND NOT MNN_BUILD_FOR_ANDROID_COMMAND)
2 changes: 1 addition & 1 deletion benchmark/benchmark.cpp
@@ -64,7 +64,7 @@ std::vector<Model> findModelFiles(const char* dir) {
Model m;
m.name = ffd.cFileName;
m.model_file = std::string(dir) + "\\" + m.name;
if(INVALID_FILE_ATTRIBUTES != GetFileAttributes(m.model_file.c_str()) || GetLastError() != ERROR_FILE_NOT_FOUND) {
if(INVALID_FILE_ATTRIBUTES != GetFileAttributes(m.model_file.c_str()) && GetLastError() != ERROR_FILE_NOT_FOUND) {
models.push_back(std::move(m));
}
} while (FindNextFile(hFind, &ffd) != 0);
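Note: with `||`, any GetFileAttributes failure other than a missing file (for example, access denied) still counted as success, so unreadable paths were added to the model list; with `&&`, the attributes query itself must succeed. The common Win32 idiom needs only the first clause, as in this sketch:

// exists_sketch.cpp (Win32) -- the minimal form of the corrected check.
#include <windows.h>
#include <cstdio>

bool fileExists(const char* path) {
    // A successful attributes query is the real signal; GetLastError()
    // only refines *why* a query failed.
    return GetFileAttributesA(path) != INVALID_FILE_ATTRIBUTES;
}

int main() {
    std::printf("%d\n", fileExists("C:\\Windows\\notepad.exe"));  // likely 1
    std::printf("%d\n", fileExists("C:\\no\\such\\file.mnn"));    // 0
    return 0;
}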
1 change: 1 addition & 0 deletions schema/default/MNN.fbs
@@ -240,6 +240,7 @@ table Op {
name: string;
outputIndexes: [int];
type: OpType;
defaultDimentionFormat : MNN_DATA_FORMAT = NHWC;
}

table TensorDescribe {
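Note: giving the new field a schema default (`= NHWC`) keeps previously serialized models loadable: FlatBuffers stores nothing for a defaulted scalar, and a reader of a buffer written before the field existed gets NHWC back from the generated accessor. A dependency-free stand-in for that behavior (not the flatc-generated code):

// default_field_sketch.cpp -- stand-in for FlatBuffers default semantics.
#include <cstdio>
#include <optional>

enum MNN_DATA_FORMAT { NCHW = 0, NC4HW4 = 1, NHWC = 2 };  // illustrative values

// Mimics a generated accessor: an absent field yields the schema default.
MNN_DATA_FORMAT defaultDimentionFormat(std::optional<MNN_DATA_FORMAT> stored) {
    return stored.value_or(NHWC);
}

int main() {
    std::printf("old model: %d\n", defaultDimentionFormat(std::nullopt));  // NHWC
    std::printf("new model: %d\n", defaultDimentionFormat(NCHW));          // NCHW
    return 0;
}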
86 changes: 46 additions & 40 deletions source/backend/cpu/CPUConvolutionDepthwise.cpp
@@ -330,11 +330,6 @@ ErrorCode CPUConvolutionDepthwise::Int8Execution::onResize(const std::vector<Ten
backend()->onAcquireBuffer(&mInputTempBuffer, Backend::DYNAMIC);
backend()->onReleaseBuffer(&mInputTempBuffer, Backend::DYNAMIC);

return result;
}

ErrorCode CPUConvolutionDepthwise::Int8Execution::onExecute(const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) {
auto layer = mCommon;
auto inputTensor = inputs[0];
auto outputTensor = outputs[0];
@@ -358,7 +353,7 @@ ErrorCode CPUConvolutionDepthwise::Int8Execution::onExecute(const std::vector<Te
int padX = mPadX;
int padY = mPadY;
int weight_z_step = kernel_height * kernel_width * gIntUnit;

// Compute Mid Rect
int l = 0, t = 0, r = dst_width, b = dst_height;
for (; l * strideX - padX < 0; l++) {
@@ -373,12 +368,12 @@ ErrorCode CPUConvolutionDepthwise::Int8Execution::onExecute(const std::vector<Te
for (; (b - 1) * strideY - padY + kernel_height * dilateY > src_height && b > t; b--) {
// do nothing
}

auto postFunction = getPostFunction();
for (int i=0; i<4; ++i) {
mQuanScale[i] = mQuan->quantScale();
}

auto runBasic = [=](float* dst_z, const int8_t* src_z, const int8_t* weight_dz, const float* alpha_z, int L, int T,
int R, int B) {
for (int dy = T; dy < B; ++dy) {
@@ -399,41 +394,52 @@ ErrorCode CPUConvolutionDepthwise::Int8Execution::onExecute(const std::vector<Te
}
}
};
for (int batchIndex = 0; batchIndex < inputTensor->batch(); ++batchIndex) {
const float* srcOrigin = inputTensor->host<float>() + batchIndex * src_z_step * dst_depth_quad;
float* dstOrigin = outputTensor->host<float>() + batchIndex * dst_z_step * dst_depth_quad;

MNN_CONCURRENCY_BEGIN(dz, dst_depth_quad) {
float* dst_z_float = dstOrigin + dst_z_step * dz;
const float* src_z_float = srcOrigin + src_z_step * dz;

auto dst_z = dst_z_float;
auto src_z = (int8_t*)mInputTempBuffer.buffer().host + dz * mInputTempBuffer.buffer().dim[0].stride;

MNNFloat2Int8(src_z_float, src_z, src_z_step / 4, mQuanScale, mQuan->aMin(), mQuan->aMax());

const float* bias_z = mBias.get() + gIntUnit * dz;
const float* alpha_z = mAlpha.get() + gIntUnit * dz;
const int8_t* weight_dz = mWeight.get() + dz * weight_z_step;
runBasic(dst_z, src_z, weight_dz, alpha_z, 0, 0, dst_width, t);
runBasic(dst_z, src_z, weight_dz, alpha_z, 0, b, dst_width, dst_height);
runBasic(dst_z, src_z, weight_dz, alpha_z, 0, t, l, b);
runBasic(dst_z, src_z, weight_dz, alpha_z, r, t, dst_width, b);
if (r > l) {
for (int dy = t; dy < b; ++dy) {
float* dst_y = dst_z + dy * dst_y_step;
int srcStartY = dy * strideY - padY;
auto src_dy = src_z + srcStartY * src_y_step;
MNNConvRunForLineDepthWiseInt8(dst_y + l * 4, src_dy + (l * strideX - padX) * 4, weight_dz, r - l,
strideX * 4, kernel_width, kernel_height, dilateX_step, dilateY_step,
alpha_z);
auto aMin = mQuan->aMin();
auto aMax = mQuan->aMax();
mRun = [=]() {
for (int batchIndex = 0; batchIndex < inputTensor->batch(); ++batchIndex) {
const float* srcOrigin = inputTensor->host<float>() + batchIndex * src_z_step * dst_depth_quad;
float* dstOrigin = outputTensor->host<float>() + batchIndex * dst_z_step * dst_depth_quad;

MNN_CONCURRENCY_BEGIN(dz, dst_depth_quad) {
float* dst_z_float = dstOrigin + dst_z_step * dz;
const float* src_z_float = srcOrigin + src_z_step * dz;

auto dst_z = dst_z_float;
auto src_z = (int8_t*)mInputTempBuffer.buffer().host + dz * mInputTempBuffer.buffer().dim[0].stride;

MNNFloat2Int8(src_z_float, src_z, src_z_step / 4, mQuanScale, aMin, aMax);

const float* bias_z = mBias.get() + gIntUnit * dz;
const float* alpha_z = mAlpha.get() + gIntUnit * dz;
const int8_t* weight_dz = mWeight.get() + dz * weight_z_step;
runBasic(dst_z, src_z, weight_dz, alpha_z, 0, 0, dst_width, t);
runBasic(dst_z, src_z, weight_dz, alpha_z, 0, b, dst_width, dst_height);
runBasic(dst_z, src_z, weight_dz, alpha_z, 0, t, l, b);
runBasic(dst_z, src_z, weight_dz, alpha_z, r, t, dst_width, b);
if (r > l) {
for (int dy = t; dy < b; ++dy) {
float* dst_y = dst_z + dy * dst_y_step;
int srcStartY = dy * strideY - padY;
auto src_dy = src_z + srcStartY * src_y_step;
MNNConvRunForLineDepthWiseInt8(dst_y + l * 4, src_dy + (l * strideX - padX) * 4, weight_dz, r - l,
strideX * 4, kernel_width, kernel_height, dilateX_step, dilateY_step,
alpha_z);
}
}

postFunction(dst_z_float, bias_z, dst_width * dst_height, 1);
}

postFunction(dst_z_float, bias_z, dst_width * dst_height, 1);
MNN_CONCURRENCY_END();
}
MNN_CONCURRENCY_END();
}
};
return result;
}

ErrorCode CPUConvolutionDepthwise::Int8Execution::onExecute(const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) {

mRun();
return NO_ERROR;
}

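Note: the refactor folds the whole int8 depthwise loop into a `std::function<void()> mRun` built during onResize, so onExecute reduces to a single call. Work that depends only on tensor shapes (rect bounds, strides, quantization limits) is done once per resize instead of once per inference. A stripped-down sketch of the pattern (simplified stand-in, not MNN's Execution interface):

// resize_cache_sketch.cpp -- simplified stand-in, not MNN's Execution API.
#include <cstdio>
#include <functional>

struct Execution {
    std::function<void()> mRun;

    void onResize(int width, int height) {
        int pixels = width * height;  // shape-derived work happens once...
        mRun = [pixels]() {           // ...and is captured for the hot path
            std::printf("processing %d pixels\n", pixels);
        };
    }
    void onExecute() { mRun(); }      // no per-call shape math
};

int main() {
    Execution e;
    e.onResize(224, 224);
    for (int i = 0; i < 3; ++i) e.onExecute();  // resize once, execute many times
    return 0;
}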
1 change: 1 addition & 0 deletions source/backend/cpu/CPUConvolutionDepthwise.hpp
@@ -78,6 +78,7 @@ class CPUConvolutionDepthwise : public Execution {

Tensor mInputTempBuffer;
const IDSTQuan *mQuan;
std::function<void()> mRun;
};

CPUConvolutionDepthwise(const Op *convOp, Backend *b);
2 changes: 1 addition & 1 deletion source/backend/cpu/CPUCosineSimilarity.cpp
@@ -23,7 +23,7 @@ ErrorCode CPUCosineSimilarity::onExecute(const std::vector<Tensor*>& inputs, con
const int batchStride = x1->stride(0);
const int channel = x1->channel();
const int channleStride = x1->stride(1);
const float eps = 1e-8;
const float eps = 1e-8f;
const auto x1DataPtr = x1->host<float>();
const auto x2DataPtr = x2->host<float>();
auto outputDataPtr = output->host<float>();
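Note: the `f` suffix matters for diagnostics: `1e-8` is a double literal narrowed to float at initialization, which MSVC flags (the C4244/C4305 truncation family) at this build's warning level, while `1e-8f` is already a float. Both round to the same stored value, as this sketch checks:

// literal_sketch.cpp -- the suffix changes the diagnostic, not the value.
int main() {
    const float a = 1e-8;     // double literal narrowed to float (MSVC warns)
    const float b = 1e-8f;    // float literal, no conversion
    return (a == b) ? 0 : 1;  // same stored value here; only the diagnostic differs
}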
5 changes: 5 additions & 0 deletions source/backend/cpu/CPUDetectionOutput.cpp
@@ -5,6 +5,11 @@
// Created by MNN on 2018/07/17.
// Copyright © 2018, Alibaba Group Holding Limited
//
/* When MSVC compiles this file for x86 Release, an internal compiler error is reported because of an MSVC bug.
   Reference: https://developercommunity.visualstudio.com/comments/535612/view.html */
#if defined(_MSC_VER) && defined(_M_IX86) && !defined(_DEBUG)
#pragma optimize("", off)
#endif

#include "CPUDetectionOutput.hpp"
#include <math.h>
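Note: `#pragma optimize("", off)` is MSVC's documented escape hatch for optimizer bugs; placed at the top of the file, as here, it disables optimization for every function that follows. When the miscompiled code can be isolated, the pragma can instead be scoped, as in this sketch:

// pragma_scope_sketch.cpp -- illustrative; the commit disables optimization
// for the whole file instead, which is simpler but blunter.
#if defined(_MSC_VER) && defined(_M_IX86) && !defined(_DEBUG)
#pragma optimize("", off)   // work around the x86 Release codegen ICE
#endif

int troublesome() { return 42; }  // stand-in for the miscompiled function

#if defined(_MSC_VER) && defined(_M_IX86) && !defined(_DEBUG)
#pragma optimize("", on)    // restore the command-line /O settings afterwards
#endif

int main() { return troublesome() == 42 ? 0 : 1; }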
37 changes: 24 additions & 13 deletions source/backend/cpu/CPUReshape.cpp
@@ -14,15 +14,16 @@

namespace MNN {

CPUReshape::CPUReshape(Backend *b) : MNN::Execution(b), mStorage(2) {
// nothing to do
CPUReshape::CPUReshape(Backend *b, MNN_DATA_FORMAT midFormat) : MNN::Execution(b), mStorage(2) {
mMidFormat = midFormat;
}

ErrorCode CPUReshape::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(1 == inputs.size() || 2 == inputs.size());
MNN_ASSERT(1 == outputs.size());

auto input = inputs[0];
auto output = outputs[0];
int totalSize = 1;

mWrapTensorForInput.buffer().type = inputs[0]->buffer().type;
@@ -31,9 +32,6 @@ ErrorCode CPUReshape::onResize(const std::vector<Tensor *> &inputs, const std::v
if (TensorUtils::getDescribe(input)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
return NO_ERROR;
}
TensorUtils::getDescribe(&mWrapTensorForInput)->dimensionFormat = MNN_DATA_FORMAT_NCHW;
TensorUtils::getDescribe(&mWrapTensorForOutput)->dimensionFormat = MNN_DATA_FORMAT_NCHW;

for (int i = 0; i < input->buffer().dimensions; ++i) {
totalSize *= input->buffer().dim[i].extent;
}
@@ -44,16 +42,29 @@ ErrorCode CPUReshape::onResize(const std::vector<Tensor *> &inputs, const std::v
mStorage.buffer().type = input->getType();
backend()->onAcquireBuffer(&mStorage, Backend::DYNAMIC);
backend()->onReleaseBuffer(&mStorage, Backend::DYNAMIC);

TensorUtils::copyShape(inputs[0], &mWrapTensorForInput);

mWrapTensorForInput.buffer().host = mStorage.buffer().host;
TensorUtils::setLinearLayout(&mWrapTensorForInput);

TensorUtils::copyShape(outputs[0], &mWrapTensorForOutput);
mWrapTensorForOutput.buffer().host = mStorage.buffer().host;
if (MNN_DATA_FORMAT_NHWC == mMidFormat) {
TensorUtils::getDescribe(&mWrapTensorForInput)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
TensorUtils::getDescribe(&mWrapTensorForOutput)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
mWrapTensorForInput.buffer().dimensions = 4;
mWrapTensorForOutput.buffer().dimensions = 4;
mWrapTensorForInput.setLength(0, input->batch());
mWrapTensorForInput.setLength(1, input->height());
mWrapTensorForInput.setLength(2, input->width());
mWrapTensorForInput.setLength(3, input->channel());
mWrapTensorForOutput.setLength(0, output->batch());
mWrapTensorForOutput.setLength(1, output->height());
mWrapTensorForOutput.setLength(2, output->width());
mWrapTensorForOutput.setLength(3, output->channel());
} else {
TensorUtils::getDescribe(&mWrapTensorForInput)->dimensionFormat = MNN_DATA_FORMAT_NCHW;
TensorUtils::getDescribe(&mWrapTensorForOutput)->dimensionFormat = MNN_DATA_FORMAT_NCHW;
TensorUtils::copyShape(inputs[0], &mWrapTensorForInput);
TensorUtils::copyShape(outputs[0], &mWrapTensorForOutput);
}
TensorUtils::setLinearLayout(&mWrapTensorForInput);
TensorUtils::setLinearLayout(&mWrapTensorForOutput);

return NO_ERROR;
}

Expand All @@ -78,7 +89,7 @@ class CPUReshapeCreator : public CPUBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const MNN::Op *op, Backend *backend) const override {
return new CPUReshape(backend);
return new CPUReshape(backend, op->main_as_Reshape()->dimType());
}
};

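Note: the mid-format parameter exists because a reshape only reinterprets linearly ordered memory, so the element order the source framework assumed (channel-last NHWC for TensorFlow, channel-major NCHW for Caffe) determines the result once the NC4HW4 input is unpacked. A small standalone demonstration (not MNN code) of how the two layouts linearize the same tensor differently:

// layout_sketch.cpp -- same elements, different linear order per layout.
#include <cstdio>

int main() {
    // Fill a C=2, H=2, W=1 tensor where element (c,h) has value 10*c + h,
    // then print the flat order each layout would hand to reshape([4]).
    const int C = 2, H = 2, W = 1;
    int nchw[4], nhwc[4];
    for (int c = 0; c < C; ++c)
        for (int h = 0; h < H; ++h)
            for (int w = 0; w < W; ++w) {
                int v = 10 * c + h;
                nchw[(c * H + h) * W + w] = v;  // channel-major order
                nhwc[(h * W + w) * C + c] = v;  // channel-last order
            }
    for (int i = 0; i < 4; ++i) std::printf("%d ", nchw[i]);  // 0 1 10 11
    std::printf("\n");
    for (int i = 0; i < 4; ++i) std::printf("%d ", nhwc[i]);  // 0 10 1 11
    std::printf("\n");
    return 0;
}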
3 changes: 2 additions & 1 deletion source/backend/cpu/CPUReshape.hpp
@@ -15,7 +15,7 @@
namespace MNN {
class CPUReshape : public Execution {
public:
CPUReshape(Backend *b);
CPUReshape(Backend *b, MNN_DATA_FORMAT midFormat);
virtual ~CPUReshape() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
@@ -24,6 +24,7 @@ class CPUReshape : public Execution {
Tensor mStorage;
Tensor mWrapTensorForInput;
Tensor mWrapTensorForOutput;
MNN_DATA_FORMAT mMidFormat;
};
} // namespace MNN
#endif /* CPUReshape_hpp */
2 changes: 1 addition & 1 deletion source/backend/cpu/CPUShape.cpp
@@ -17,7 +17,7 @@ ErrorCode CPUShape::onExecute(const std::vector<Tensor*>& inputs, const std::vec

auto& ib = inputs[0]->buffer();
int32_t* outData = outputs[0]->host<int32_t>();
if (TensorUtils::getDescribe(inputs[0])->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
if (TensorUtils::getDescribe(inputs[0])->dimensionFormat == MNN_DATA_FORMAT_NC4HW4 && TensorUtils::getDescribe(outputs[0])->dimensionFormat == MNN_DATA_FORMAT_NHWC) {
outData[0] = ib.dim[0].extent;
outData[1] = ib.dim[2].extent;
outData[2] = ib.dim[3].extent;
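Note: the extra clause makes the NHWC-ordered report conditional on the output actually being consumed as NHWC: an internally NC4HW4 tensor stores N,C,H,W extents, and only a graph expecting TensorFlow-style shapes should see them permuted to N,H,W,C. A toy sketch (illustrative, not MNN code) of the two report orders:

// shape_report_sketch.cpp -- how the reported dims depend on the output format.
#include <cstdio>

int main() {
    int dims[4] = {1, 8, 32, 32};  // stored as N, C, H, W extents
    bool reportNHWC = true;        // stand-in for the output's dimensionFormat check
    if (reportNHWC)
        std::printf("%d %d %d %d\n", dims[0], dims[2], dims[3], dims[1]);  // N H W C
    else
        std::printf("%d %d %d %d\n", dims[0], dims[1], dims[2], dims[3]);  // N C H W
    return 0;
}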
