alibaba · jxt1234 · Jul 5, 2023
diff --git a/docs/compile/cmake.md b/docs/compile/cmake.md
@@ -44,6 +44,7 @@ MNN使用CMake构建项目，CMake中的宏定义列表如下：
 | MNN_CUDA             | 是否构建`Cuda`后端，默认为`OFF` |
 | MNN_CUDA_PROFILE     | 是否打开CUDA profile工具，默认为`OFF` |
 | MNN_CUDA_QUANT       | 是否打开CUDA 量化文件编译，默认为`OFF` |
+| MNN_CUDA_BF16        | 是否打开CUDA Bf16文件编译，默认为`OFF` |
 | MNN_TENSORRT         | 是否构建`TensorRT`后端，默认为`OFF` |
 | MNN_COREML           | 是否构建`CoreML`后端，默认为`OFF` |
 | MNN_NNAPI            | 是否构建`NNAPI`后端，默认为`OFF`  |

diff --git a/docs/index.rst b/docs/index.rst
@@ -47,6 +47,7 @@
    :maxdepth: 1
    :caption: 表达式
    :name: expr
+
    inference/expr
 
 .. toctree::

diff --git a/include/MNN/MNNDefine.h b/include/MNN/MNNDefine.h
@@ -68,7 +68,7 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
 #define STR_IMP(x) #x
 #define STR(x) STR_IMP(x)
 #define MNN_VERSION_MAJOR 2
-#define MNN_VERSION_MINOR 5
-#define MNN_VERSION_PATCH 3
+#define MNN_VERSION_MINOR 6
+#define MNN_VERSION_PATCH 0
 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
 #endif /* MNNDefine_h */
diff --git a/project/ios/MNN.xcodeproj/project.pbxproj b/project/ios/MNN.xcodeproj/project.pbxproj
@@ -763,6 +763,8 @@
 		C4F906B327688C3A0026B847 /* NMSModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = C4F906B127688C3A0026B847 /* NMSModule.hpp */; };
 		C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C4F906B227688C3A0026B847 /* NMSModule.cpp */; };
 		C4FB6CB22769DF0800963B07 /* GeometryCumSum.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C4FB6CB12769DF0800963B07 /* GeometryCumSum.cpp */; };
+		CE125CC82A52BF6B003698C9 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */; };
+		CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */; };
 		CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; };
 		CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */; };
 		CE9AFED728E54E3300566949 /* CPUInterp3D.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */; };
@@ -785,9 +787,7 @@
 		CEDB211C2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn in Resources */ = {isa = PBXBuildFile; fileRef = CEDB211B2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn */; };
 		CEDB211D284706F900AE9DC4 /* MNN.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0F1465B71FA18D1000F9860A /* MNN.framework */; };
 		CEDB211E2847070600AE9DC4 /* MNN.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0F1465B71FA18D1000F9860A /* MNN.framework */; };
-		CEE9B9522A3AA4C4006438F2 /* MNNBilinearSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */; };
 		CEE9B9532A3AA4C4006438F2 /* MNNCubicLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */; };
-		CEE9B9542A3AA4C4006438F2 /* MNNBilinearLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */; };
 		CEE9B9552A3AA4C4006438F2 /* MNNCubicSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */; };
 		CEE9B95A2A3AA4D4006438F2 /* MNNCubicLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9562A3AA4D4006438F2 /* MNNCubicLineC16.S */; };
 		CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */; };
@@ -1590,6 +1590,8 @@
 		C4F906B127688C3A0026B847 /* NMSModule.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = NMSModule.hpp; sourceTree = "<group>"; };
 		C4F906B227688C3A0026B847 /* NMSModule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = NMSModule.cpp; sourceTree = "<group>"; };
 		C4FB6CB12769DF0800963B07 /* GeometryCumSum.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryCumSum.cpp; sourceTree = "<group>"; };
+		CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = "<group>"; };
+		CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
 		CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = "<group>"; };
 		CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUInterp3D.cpp; sourceTree = "<group>"; };
 		CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUInterp3D.hpp; sourceTree = "<group>"; };
@@ -1614,9 +1616,7 @@
 		CEDB21172846D58200AE9DC4 /* testcat.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; name = testcat.jpg; path = ../../../demo/model/MobileNet/testcat.jpg; sourceTree = "<group>"; };
 		CEDB21182846D58200AE9DC4 /* synset_words.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; name = synset_words.txt; path = ../../../demo/model/MobileNet/synset_words.txt; sourceTree = "<group>"; };
 		CEDB211B2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn */ = {isa = PBXFileReference; lastKnownFileType = file; name = mobilenet_v2.caffe.mnn; path = ../../../resource/model/MobileNet/v2/mobilenet_v2.caffe.mnn; sourceTree = "<group>"; };
-		CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC16.S; sourceTree = "<group>"; };
 		CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicLineC16.S; sourceTree = "<group>"; };
-		CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC16.S; sourceTree = "<group>"; };
 		CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicSampleC16.S; sourceTree = "<group>"; };
 		CEE9B9562A3AA4D4006438F2 /* MNNCubicLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicLineC16.S; sourceTree = "<group>"; };
 		CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
@@ -2501,8 +2501,8 @@
 		92FF013A23AA0B4E00AC97F6 /* arm32 */ = {
 			isa = PBXGroup;
 			children = (
-				CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */,
-				CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */,
+				CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */,
+				CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */,
 				CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */,
 				CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */,
 				950B28DF29F627E00002F454 /* MNNBinaryAddInt8.S */,
@@ -3356,6 +3356,7 @@
 				950B28ED29F627F70002F454 /* MNNBinaryMulInt8.S in Sources */,
 				481FA853259C27E00047F01F /* ShapeTensorArray.cpp in Sources */,
 				6A131E3F25823349002EC3D6 /* PluginShapeInference.cpp in Sources */,
+				CE125CC82A52BF6B003698C9 /* MNNBilinearSampleC8.S in Sources */,
 				92FF025723AA0B5A00AC97F6 /* CPUQuanConvolutionDepthwise.cpp in Sources */,
 				48034563254157CE004738E3 /* MNNNV21ToBGRAUnit.S in Sources */,
 				48FA474823AA127B00172C3B /* Expr.cpp in Sources */,
@@ -3375,7 +3376,6 @@
 				48747D61245D9E33000B9709 /* ConvertUtils.cpp in Sources */,
 				92FF043B23AA0B7100AC97F6 /* ShapeDetectionPostProcess.cpp in Sources */,
 				48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */,
-				CEE9B9522A3AA4C4006438F2 /* MNNBilinearSampleC16.S in Sources */,
 				48C84B9A250F720C00EE7666 /* CPULayerNorm.cpp in Sources */,
 				4DF87C4A2887D3560003E2D4 /* calib3d.cpp in Sources */,
 				48F34734273A7C8400C45394 /* ImageProcessFunction.cpp in Sources */,
@@ -3515,6 +3515,7 @@
 				CECF8C7D299CAD9400D3875B /* md5.c in Sources */,
 				92FF041923AA0B7100AC97F6 /* ShapeQuantizedMaxPool.cpp in Sources */,
 				92FF038A23AA0B5A00AC97F6 /* CPURange.cpp in Sources */,
+				CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */,
 				92FF03A123AA0B5A00AC97F6 /* Int8FunctionsOpt.cpp in Sources */,
 				92FF026523AA0B5A00AC97F6 /* CPUQuantizedAvgPool.cpp in Sources */,
 				92FF029423AA0B5A00AC97F6 /* CPUMatMul.cpp in Sources */,
@@ -3555,7 +3556,6 @@
 				482BFBD028351BA1009210E4 /* AllShader.cpp in Sources */,
 				92FF04BA23AA0BFB00AC97F6 /* WrapExecution.cpp in Sources */,
 				11A01A06258785EA00745FA7 /* MNNVectorTop1Int32.S in Sources */,
-				CEE9B9542A3AA4C4006438F2 /* MNNBilinearLineC16.S in Sources */,
 				48FB9DC124A8445A008E1A2D /* MNNAxByClampBroadcastC4.S in Sources */,
 				EBD4842F2485FF660083CE95 /* Arm82Interp.cpp in Sources */,
 				4819FB3B24C69E680050BD09 /* GeometrySpatialProduct.cpp in Sources */,

diff --git a/pymnn/src/util.h b/pymnn/src/util.h
@@ -107,13 +107,23 @@ inline int64_t unpackLong(PyObject* obj) {
   }
   return (int64_t)value;
 }
+inline double unpackDoubleOrLong(PyObject* obj) { // Read a Python integer or float object as a C double.
+    if (PyLong_Check(obj)
+#if PY_MAJOR_VERSION < 3
+    || PyInt_Check(obj) // PyInt_Check is only consulted when building against Python 2.
+#endif
+    ) {
+        return static_cast<double>(unpackLong(obj)); // Convert straight to double; going through float would round integers above 2^24.
+    }
+    return unpackDouble(obj); // Anything that is not an integer object is delegated to unpackDouble.
+}
 inline void store_scalar(void* data, int dtype, PyObject* obj) {
   switch (dtype) {
     case 4: *(uint8_t*)data = (uint8_t)unpackLong(obj); break;
     case 3: *(int32_t*)data = (int32_t)unpackLong(obj); break;
     case 9: *(int64_t*)data = unpackLong(obj); break;
-    case 1: *(float*)data = (float)unpackDouble(obj); break;
-    case 2: *(double*)data = (double)unpackDouble(obj); break;
+    case 1: *(float*)data = (float)unpackDoubleOrLong(obj); break;
+    case 2: *(double*)data = (double)unpackDoubleOrLong(obj); break;
     case 6: *(int8_t*)data = (int8_t)unpackLong(obj); break;
     default: PyMNN_ERROR_LOG("store_scalar: invalid type");
   }

diff --git a/source/backend/cpu/BinaryUtils.hpp b/source/backend/cpu/BinaryUtils.hpp
@@ -330,7 +330,7 @@ void execute(void* outputRaw, const void* inputRaw0, const void* inputRaw1, int
 }
 
 template<typename Tin, typename Tout, typename Func>
-void executeInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
+void executeInt8 (int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const int8_t* inputOffset0, const int8_t* inputOffset1, const int8_t* outputOffset, size_t elementSize, size_t needBroadcast) {
     Func f;
     int size = elementSize;
 #ifdef MNN_USE_NEON
@@ -355,19 +355,19 @@ void executeInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* input
 #endif
     for (int i = 0; i < size; ++i) {
         if (needBroadcast == 0) {
-            inp0 = (inputData0[0]- zeroPoint) * inputScale0[0];
-            inp1 = (inputData1[i]- zeroPoint) * inputScale1[0];
+            inp0 = (inputData0[0]- zeroPoint - inputOffset0[0]) * inputScalesFp32[0];
+            inp1 = (inputData1[i]- zeroPoint - inputOffset1[0]) * inputScalesFp32[1];
             output = f(inp0, inp1);
         } else if (needBroadcast == 1) {
-            inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
-            inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
+            inp0 = (inputData0[i] - zeroPoint - inputOffset0[0]) * inputScalesFp32[0];
+            inp1 = (inputData1[0] - zeroPoint - inputOffset1[0]) * inputScalesFp32[1];
             output = f(inp0, inp1);
         } else {
-            inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
-            inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
+            inp0 = (inputData0[i] - zeroPoint - inputOffset0[0]) * inputScalesFp32[0];
+            inp1 = (inputData1[i] - zeroPoint - inputOffset1[0]) * inputScalesFp32[1];
             output = f(inp0, inp1);
         }
-        int value = (int)roundf(output * outputScale[0]) + zeroPoint;
+        int value = (int)roundf(output * inputScalesFp32[2]) + zeroPoint + outputOffset[0];
         if (value > maxValue) {
             value = maxValue;
         }

diff --git a/source/backend/cpu/CPUBinaryInt8.cpp b/source/backend/cpu/CPUBinaryInt8.cpp
@@ -16,8 +16,6 @@
 #include "BinaryUtils.hpp"
 #include "math/Vec.hpp"
 
-using Vec16 = MNN::Math::Vec<int8_t, 16>;
-
 namespace MNN {
 
 ErrorCode CPUBinaryInt8::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
@@ -37,22 +35,24 @@ ErrorCode CPUBinaryInt8::onResize(const std::vector<Tensor*>& inputs, const std:
 
     auto core = static_cast<CPUBackend*>(backend())->functions();
 
-    mInputQuant0.resize(core->pack); // prepare for arm neon. float32x4
-    mInputQuant1.resize(core->pack);
-    mOutputQuant.resize(core->pack);
-    std::fill(mInputQuant0.begin(), mInputQuant0.end(), TensorUtils::getDescribe(inputs[0])->quantAttr->scale);
-    std::fill(mInputQuant1.begin(), mInputQuant1.end(), TensorUtils::getDescribe(inputs[1])->quantAttr->scale);
+    mInputOffset0.resize(1);
+    mInputOffset1.resize(1);
+    mOutputOffset.resize(1);
+    mQuantScalesInt32.resize(2); // When use int32 scales computing, output scale is needless.
+    mQuantScalesFp32.resize(3);
+    mQuantScalesInt32[0] = TensorUtils::getDescribe(inputs[0])->quantAttr->scale * (1 << 16);
+    mQuantScalesInt32[1] = TensorUtils::getDescribe(inputs[1])->quantAttr->scale * (1 << 16);
+    mQuantScalesFp32[0] =  TensorUtils::getDescribe(inputs[0])->quantAttr->scale;
+    mQuantScalesFp32[1] =  TensorUtils::getDescribe(inputs[1])->quantAttr->scale;
     if (TensorUtils::getDescribe(outputs[0])->quantAttr->scale != 0) {
-        std::fill(mOutputQuant.begin(), mOutputQuant.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale);
+        mQuantScalesFp32[2] = 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale;
     } else {
-        std::fill(mOutputQuant.begin(), mOutputQuant.end(), 0);
+        mQuantScalesFp32[2] = 0;
     }
-
+    mInputOffset0[0] = (int8_t)TensorUtils::getDescribe(inputs[0])->quantAttr->zero;
+    mInputOffset1[0] = (int8_t)TensorUtils::getDescribe(inputs[1])->quantAttr->zero;
+    mOutputOffset[0] = (int8_t)TensorUtils::getDescribe(outputs[0])->quantAttr->zero;
 
-    if(mActivationType == 1 && outputs[0]->getType().code == halide_type_float) {
-        mActivationExe.reset(new CPURelu(backend(), 0.0));
-        mActivationExe->onResize(outputs, outputs);
-    }
     return NO_ERROR;
 }
 
@@ -79,27 +79,24 @@ ErrorCode CPUBinaryInt8::onExecute(const std::vector<Tensor*>& inputs, const std
         if (realSize > 0) {
             auto inp0 = input0Ptr + start * inpBytes;
             auto inp1 = input1Ptr + start * inpBytes;
-            auto scale0 = mInputQuant0.data() + start;
-            auto scale1 = mInputQuant1.data() + start;
-            auto scaleDst = mOutputQuant.data() + start;
+            auto offset0 = mInputOffset0.data();
+            auto offset1 = mInputOffset1.data();
+            auto offsetDst = mOutputOffset.data();
             if (mNeedBroadcastIndex == 0) {
                 inp0 = input0Ptr;
             } else if (mNeedBroadcastIndex == 1) {
                 inp1 = input1Ptr;
             }
             auto out = outputPtr + start * outBytes;
 #ifdef MNN_USE_NEON
-            mProc(out, inp0, inp1, scale0, scale1, scaleDst, realSize / 4, mNeedBroadcastIndex);
+            mProc(out, inp0, inp1, mQuantScalesInt32.data(), mQuantScalesFp32.data(), offset0, offset1, offsetDst, realSize / 4, mNeedBroadcastIndex);
 #else
-             mProc(out, inp0, inp1, scale0, scale1, scaleDst, realSize, mNeedBroadcastIndex);
+            mProc(out, inp0, inp1, mQuantScalesInt32.data(), mQuantScalesFp32.data(), offset0, offset1, offsetDst, realSize, mNeedBroadcastIndex);
 #endif
         }
     }
     MNN_CONCURRENCY_END();
-
-    if(mActivationType == 1 && output->getType().code == halide_type_float) {
-        mActivationExe->onExecute(outputs, outputs);;
-    }
+
     return NO_ERROR;
 }
 

diff --git a/source/backend/cpu/CPUBinaryInt8.hpp b/source/backend/cpu/CPUBinaryInt8.hpp
@@ -31,9 +31,11 @@ class CPUBinaryInt8 : public Execution {
     int mTotalSize;
     int mActivationType = 0;
     std::shared_ptr<Execution> mActivationExe;
-    std::vector<float> mInputQuant0;
-    std::vector<float> mInputQuant1;
-    std::vector<float> mOutputQuant;
+    std::vector<ssize_t> mQuantScalesInt32; // input0 and input1
+    std::vector<float> mQuantScalesFp32;  // input0, input1 and output
+    std::vector<int8_t> mInputOffset0;
+    std::vector<int8_t> mInputOffset1;
+    std::vector<int8_t> mOutputOffset;
 };
 } // namespace MNN
 #endif /* CPUBinary_hpp */