From f85253dbe5981c441c8c11f16a31ca03f3a2dc99 Mon Sep 17 00:00:00 2001
From: Milos Puzovic
Date: Wed, 28 Sep 2022 17:20:26 +0100
Subject: [PATCH] Fixes to calls to arm_gemm kernels to get correct accuracy
 when running in fast math mode

---
 docker/tensorflow-aarch64/CHANGELOG.md        |  4 +
 .../acl_fixed_format_kernels_striding.patch   | 17 +++-
 .../onednn_acl_depthwise_convolution.patch    | 63 ++++++++--------
 .../onednn_acl_fixed_format_kernels.patch     | 74 +++++++++++++++--
 .../tensorflow-aarch64/patches/tf_acl.patch   |  9 ++-
 .../scripts/build-tensorflow.sh               |  4 +
 6 files changed, 122 insertions(+), 49 deletions(-)

diff --git a/docker/tensorflow-aarch64/CHANGELOG.md b/docker/tensorflow-aarch64/CHANGELOG.md
index 12468b4e..da129b6f 100644
--- a/docker/tensorflow-aarch64/CHANGELOG.md
+++ b/docker/tensorflow-aarch64/CHANGELOG.md
@@ -14,6 +14,10 @@ where `YY` is the year, and `MM` the month of the increment.
 ### Removed
 
 ### Fixed
+- oneDNN uses indirect convolution in fast math mode when input channels require padding.
+- ACL uses correct predicate when reading bias in the merge phase of SVE kernels.
+- Convolution and matmul oneDNN primitives only cast to BF16 if selected kernel is in fast math mode.
+- Correct calculation of multi-stride when fastest changing dimension is padded in fast math mode.
 
 ## [r22.09] 2022-09-16
 https://github.com/ARM-software/Tool-Solutions/tree/tensorflow-pytorch-aarch64--r22.09/docker/tensorflow-aarch64

diff --git a/docker/tensorflow-aarch64/patches/acl_fixed_format_kernels_striding.patch b/docker/tensorflow-aarch64/patches/acl_fixed_format_kernels_striding.patch
index d496c965..b3f38b04 100644
--- a/docker/tensorflow-aarch64/patches/acl_fixed_format_kernels_striding.patch
+++ b/docker/tensorflow-aarch64/patches/acl_fixed_format_kernels_striding.patch
@@ -16,7 +16,7 @@
 *******************************************************************************
 
 diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
-index 77da83070..9b811b4d0 100644
+index 77da83070..44aef28c6 100644
 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
 +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
 @@ -506,7 +506,7 @@ void Fallback::run(ITensorPack &tensors)
@@ -28,18 +28,21 @@ index 77da83070..9b811b4d0 100644
  const int tensor_width = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)];
  int tensor_channels = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)];
  const int interleave_by = arm_compute::interleave_by(wf);
-@@ -526,8 +526,14 @@ void Fallback::run(ITensorPack &tensors)
- }
- else if(multi_stride_b == 0 || (ldb == tensor_width && multi_stride_b == tensor_height * tensor_width))
+@@ -528,6 +528,17 @@ void Fallback::run(ITensorPack &tensors)
  {
-+ if(tensor_width % interleave_by != 0) {
-+ multi_stride_b = arm_gemm::iceildiv(tensor_width, interleave_by) * interleave_by * tensor_height;
-+ }
  // In this case dimension that is packed is only height
  // so we need to stride only height by interleave_by
 + if(tensor_height % blocked_by != 0) {
 + tensor_height = arm_gemm::iceildiv(tensor_height, blocked_by) * blocked_by;
++ if(multi_stride_b != 0) {
++ multi_stride_b = tensor_height * tensor_width;
++ }
++ }
++
++ if(tensor_width % interleave_by != 0) {
++ multi_stride_b = arm_gemm::iceildiv(tensor_width, interleave_by) * interleave_by * tensor_height;
 + }
++ ldb = interleave_by * tensor_height;
  }
  else
diff --git a/docker/tensorflow-aarch64/patches/onednn_acl_depthwise_convolution.patch b/docker/tensorflow-aarch64/patches/onednn_acl_depthwise_convolution.patch
index bbf8822b..78cf4e23 100644
--- a/docker/tensorflow-aarch64/patches/onednn_acl_depthwise_convolution.patch
+++ b/docker/tensorflow-aarch64/patches/onednn_acl_depthwise_convolution.patch
@@ -14,9 +14,9 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 *******************************************************************************
- 
+
 diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
-index ba5a49c6d..b7caf1479 100644
+index 6a840b973..47a3a4a92 100644
 --- a/src/cpu/aarch64/acl_convolution_utils.cpp
 +++ b/src/cpu/aarch64/acl_convolution_utils.cpp
 @@ -54,10 +54,12 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
@@ -73,9 +73,9 @@ index ba5a49c6d..b7caf1479 100644
  acp.weights_info = arm_compute::WeightsInfo(
  false,
  kw,
-@@ -283,6 +297,10 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
- memory_desc_t &bias_md, const convolution_desc_t &cd,
+@@ -297,6 +311,10 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
  const primitive_attr_t &attr) {
+ acp.is_indirect = false;
 + if(weights_md.ndims != 4) {
 + return status::unimplemented;
 + }
 
  // General Compute Library checks, memory tags are also set there
  CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
-@@ -309,7 +327,8 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
- const primitive_attr_t &attr) {
+@@ -325,7 +343,8 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
+ auto math_mode = get_fpmath_mode();
  // Indirect convolution results in slowdown for low thread count or 1x1
  // kernels, so fall back to GEMM-based convolution in these cases
 - if (one_of(true, weights_md.dims[2] == 1, // kh
 + if (one_of(true, weights_md.ndims != 4,
 + weights_md.dims[2] == 1, // kh
  weights_md.dims[3] == 1, // kw
- dnnl_get_max_threads() < 28)) {
+ (!math_mode && dnnl_get_max_threads() < 28))) {
  return status::unimplemented;
-@@ -334,6 +353,26 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
+@@ -350,6 +369,27 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
  return status::success;
 }
 
 +status_t init_conf_depthwise(acl_conv_conf_t &acp, memory_desc_t &src_md,
 + memory_desc_t &weights_md, memory_desc_t &dst_md,
 + memory_desc_t &bias_md, const convolution_desc_t &cd,
 + const primitive_attr_t &attr) {
++ acp.is_indirect = false;
 + // We need to make sure that number of dimensions for weights is either 5 or 3
 + if(weights_md.ndims != 5)
 + return status::unimplemented;
 
 status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
 memory_desc_t &weights_md, memory_desc_t &dst_md,
 memory_desc_t &bias_md, const convolution_desc_t &cd,
-@@ -342,7 +381,8 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
+@@ -359,7 +399,8 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
  // Under these conditions, fallback to faster GEMM-based convolution
  // unless the user explicitly specifies Winograd algorithm
  // clang-format off
 - if (one_of(true,
 + if (one_of(true, weights_md.ndims != 4,
  src_md.dims[1] < 64, // ic
  dst_md.dims[1] < 64, // oc
 diff --git a/src/cpu/aarch64/acl_convolution_utils.hpp b/src/cpu/aarch64/acl_convolution_utils.hpp
-index 3e56245fa..809dc13c9 100644
+index 44dc8eecb..7eae5cbb1 100644
 --- a/src/cpu/aarch64/acl_convolution_utils.hpp
 +++ b/src/cpu/aarch64/acl_convolution_utils.hpp
-@@ -66,6 +66,11 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
+@@ -67,6 +67,11 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
  memory_desc_t &bias_md, const convolution_desc_t &cd,
  const primitive_attr_t &attr);
 
 +status_t init_conf_depthwise(acl_conv_conf_t &acp, memory_desc_t &src_md,
 + memory_desc_t &weights_md, memory_desc_t &dst_md,
 + memory_desc_t &bias_md, const convolution_desc_t &cd,
 + const primitive_attr_t &attr);
 +
 status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
 memory_desc_t &weights_md, memory_desc_t &dst_md,
 memory_desc_t &bias_md, const convolution_desc_t &cd,
+diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
+index 4142dbc7e..1800aaf58 100644
+--- a/src/cpu/cpu_convolution_list.cpp
++++ b/src/cpu/cpu_convolution_list.cpp
+@@ -65,6 +65,7 @@ using namespace dnnl::impl::cpu::x64;
+ #if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL
+ #include "cpu/aarch64/acl_gemm_convolution.hpp"
+ #include "cpu/aarch64/acl_indirect_gemm_convolution.hpp"
++#include "cpu/aarch64/acl_depthwise_convolution.hpp"
+ #include "cpu/aarch64/acl_winograd_convolution.hpp"
+ #endif
+ using namespace dnnl::impl::cpu::aarch64;
+@@ -104,6 +105,7 @@ const std::map> &impl_list_map()
+ CPU_INSTANCE_AARCH64(jit_sve_512_dw_convolution_fwd_t)
+ CPU_INSTANCE_AARCH64(jit_sve_512_1x1_convolution_fwd_f32_t)
+ CPU_INSTANCE_AARCH64(jit_sve_512_convolution_fwd_t)
++ CPU_INSTANCE_AARCH64_ACL(acl_depthwise_convolution_fwd_t)
+ CPU_INSTANCE_AARCH64_ACL(acl_indirect_gemm_convolution_fwd_t)
+ CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t)
+ CPU_INSTANCE(gemm_convolution_fwd_t)
 diff --git a/src/cpu/aarch64/acl_depthwise_convolution.cpp b/src/cpu/aarch64/acl_depthwise_convolution.cpp
 new file mode 100644
 index 000000000..1beb8b8af
@@ -337,23 +358,3 @@ index 000000000..97e0a3a30
 +} // namespace dnnl
 +
 +#endif // CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP
-diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
-index 4142dbc7e..1800aaf58 100644
---- a/src/cpu/cpu_convolution_list.cpp
-+++ b/src/cpu/cpu_convolution_list.cpp
-@@ -65,6 +65,7 @@ using namespace dnnl::impl::cpu::x64;
- #if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL
- #include "cpu/aarch64/acl_gemm_convolution.hpp"
- #include "cpu/aarch64/acl_indirect_gemm_convolution.hpp"
-+#include "cpu/aarch64/acl_depthwise_convolution.hpp"
- #include "cpu/aarch64/acl_winograd_convolution.hpp"
- #endif
- using namespace dnnl::impl::cpu::aarch64;
-@@ -104,6 +105,7 @@ const std::map> &impl_list_map()
- CPU_INSTANCE_AARCH64(jit_sve_512_dw_convolution_fwd_t)
- CPU_INSTANCE_AARCH64(jit_sve_512_1x1_convolution_fwd_f32_t)
- CPU_INSTANCE_AARCH64(jit_sve_512_convolution_fwd_t)
-+ CPU_INSTANCE_AARCH64_ACL(acl_depthwise_convolution_fwd_t)
- CPU_INSTANCE_AARCH64_ACL(acl_indirect_gemm_convolution_fwd_t)
- CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t)
- CPU_INSTANCE(gemm_convolution_fwd_t)

diff --git a/docker/tensorflow-aarch64/patches/onednn_acl_fixed_format_kernels.patch b/docker/tensorflow-aarch64/patches/onednn_acl_fixed_format_kernels.patch
index 6cde48d9..02b25db9 100644
--- a/docker/tensorflow-aarch64/patches/onednn_acl_fixed_format_kernels.patch
+++ b/docker/tensorflow-aarch64/patches/onednn_acl_fixed_format_kernels.patch
@@ -14,12 +14,12 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 *******************************************************************************
- 
+
 diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
-index c46d69757..ba5a49c6d 100644
+index c46d69757..6a840b973 100644
 --- a/src/cpu/aarch64/acl_convolution_utils.cpp
 +++ b/src/cpu/aarch64/acl_convolution_utils.cpp
-@@ -212,6 +212,69 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
+@@ -212,6 +212,82 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
  arm_compute::QuantizationInfo(1.0f / scales[0], 0));
  }
 
 +
 + int interleaved_by = arm_compute::interleave_by(expected_weight_format);
 + int block_by = arm_compute::block_by(expected_weight_format);
 +
++ bool is_fast_math_kernel = arm_compute::is_fixed_format_fast_math(expected_weight_format);
++ if(!is_fast_math_kernel) {
++ // FP32 kernel is faster than BF16
++ acp.fast_math = false;
++ }
++
 + memory_desc_t want_wei_md = weights_md;
 +
 + int ic_multiply = ic;
 + if(ic % block_by != 0) {
 + ic_multiply = utils::div_up(ic, block_by) * block_by;
 + // Also we need to set padded dimensions as well
 + want_wei_md.padded_dims[1] = ic_multiply;
++ } else {
++ // If we do not need to pad input channels for fast math mode
++ // then it would be faster to run convolution with im2row
++ // instead of using indirect buffer
++ if(acp.fast_math && acp.is_indirect) {
++ return status::unimplemented;
++ }
 + }
 + if(oc % interleaved_by != 0) {
 + int padded_dim = utils::div_up(oc, interleaved_by) * interleaved_by;
 + want_wei_md.format_desc.blocking.inner_blks[1] = block_by;
 + }
 +
-+ if(acp.fast_math) {
++ if(is_fast_math_kernel) {
 + // If it is fast math mode we need weights in BFloat16
 + want_wei_md.data_type = dnnl_bf16;
 + }
 
  return status::success;
 }
+@@ -219,6 +295,7 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
+ memory_desc_t &weights_md, memory_desc_t &dst_md,
+ memory_desc_t &bias_md, const convolution_desc_t &cd,
+ const primitive_attr_t &attr) {
++ acp.is_indirect = false;
+
+ // General Compute Library checks, memory tags are also set there
+ CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
+@@ -244,11 +321,13 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
+ memory_desc_t &weights_md, memory_desc_t &dst_md,
+ memory_desc_t &bias_md, const convolution_desc_t &cd,
+ const primitive_attr_t &attr) {
++ acp.is_indirect = true;
++ auto math_mode = get_fpmath_mode();
+ // Indirect convolution results in slowdown for low thread count or 1x1
+ // kernels, so fall back to GEMM-based convolution in these cases
+ if (one_of(true, weights_md.dims[2] == 1, // kh
+ weights_md.dims[3] == 1, // kw
+- dnnl_get_max_threads() < 28)) {
++ (!math_mode && dnnl_get_max_threads() < 28))) {
+ return status::unimplemented;
+ }
+
+@@ -275,6 +354,7 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
+ memory_desc_t &weights_md, memory_desc_t &dst_md,
+ memory_desc_t &bias_md, const convolution_desc_t &cd,
+ const primitive_attr_t &attr) {
++ acp.is_indirect = false;
+
+ // Under these conditions, fallback to faster GEMM-based convolution
+ // unless the user explicitly specifies Winograd algorithm
+diff --git a/src/cpu/aarch64/acl_convolution_utils.hpp b/src/cpu/aarch64/acl_convolution_utils.hpp
+index 3e56245fa..44dc8eecb 100644
+--- a/src/cpu/aarch64/acl_convolution_utils.hpp
++++ b/src/cpu/aarch64/acl_convolution_utils.hpp
+@@ -43,6 +43,7 @@ struct acl_conv_conf_t {
+ // If this is true, the result of the convolution goes into a temporarily
+ // allocated ACL tensor to be accumulated into the oneDNN dst during postops
+ bool use_dst_acc;
++ bool is_indirect;
+ arm_compute::TensorInfo src_info;
+ arm_compute::TensorInfo wei_info;
+ arm_compute::TensorInfo bia_info;
 diff --git a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp b/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
 index bcf031a77..4ddc8cf91 100644
 --- a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
 +++ b/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
 index c5e507085..15ea61173 100644
 // Validate fully connected layer manually to check for return status
 ACL_CHECK_VALID(arm_compute::NEFullyConnectedLayer::validate(
 diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
-index 679baec3a..2c0b46978 100644
+index 679baec3a..4aa219376 100644
 --- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
 +++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
 @@ -66,15 +66,12 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
  amp.src_info = arm_compute::TensorInfo(
  arm_compute::TensorShape(K, M, 1, src_batch), 1,
-@@ -103,6 +100,121 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
+@@ -103,6 +100,125 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
  ACL_CHECK_VALID(arm_compute::NETranspose::validate(
  &amp.wei_acc_info, &amp.wei_info));
 
 + // as returned by interleave_by from the expected strides
 + int interleaved_by = arm_compute::interleave_by(expected_weight_format);
 + int block_by = arm_compute::block_by(expected_weight_format);
++ bool is_fast_math_kernel = arm_compute::is_fixed_format_fast_math(expected_weight_format);
++ if(!is_fast_math_kernel) {
++ amp.gemm_info.set_fast_math(false);
++ }
 +
 + int blocked_first_dimension = -1;
 + int blocked_second_dimension = -1;
 + want_wei_md.format_desc.blocking.inner_blks[1] = block_by;
 + }
 +
-+ if(is_fastmath_enabled) {
++ if(is_fast_math_kernel) {
 + want_wei_md.data_type = dnnl_bf16;
 + }
 +
diff --git a/docker/tensorflow-aarch64/patches/tf_acl.patch b/docker/tensorflow-aarch64/patches/tf_acl.patch
index 78770ed5..2db3d214 100644
--- a/docker/tensorflow-aarch64/patches/tf_acl.patch
+++ b/docker/tensorflow-aarch64/patches/tf_acl.patch
@@ -14,8 +14,9 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 *******************************************************************************
+
 diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
-index e255103df98..b95f20b6232 100644
+index e255103df98..1b0bb41201f 100644
 --- a/tensorflow/workspace2.bzl
 +++ b/tensorflow/workspace2.bzl
 @@ -192,18 +192,19 @@ def _tf_repositories():
+ strip_prefix = "oneDNN-80d45b77f1a031a99d628b27aeea45b05f16b8b5",
+ urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/80d45b77f1a031a99d628b27aeea45b05f16b8b5.tar.gz"),
 )
 
 tf_http_archive(
 name = "compute_library",
- sha256 = "94e2e9ff87c261a9c9987bc9024c449c48014f7fe707311bdfa76b87f3dda5c5",
 build_file = "//third_party/compute_library:BUILD",
- patch_file = ["//third_party/compute_library:compute_library.patch", "//third_party/compute_library:activation_func_correct_args.patch"],
- urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/v22.05.tar.gz"),
+ patch_file = ["//third_party/compute_library:compute_library.patch", "//third_party/compute_library:acl_fixed_format_kernels_striding.patch", "//third_party/compute_library:acl_depthwise_updateable_weights.patch", "//third_party/compute_library:acl_fixup_SVE_merges.patch"],
+ urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/v22.08.tar.gz"),
 )
 
 tf_http_archive(

diff --git a/docker/tensorflow-aarch64/scripts/build-tensorflow.sh b/docker/tensorflow-aarch64/scripts/build-tensorflow.sh
index 233cfc7a..0a5b52ba 100755
--- a/docker/tensorflow-aarch64/scripts/build-tensorflow.sh
+++ b/docker/tensorflow-aarch64/scripts/build-tensorflow.sh
@@ -100,6 +100,10 @@ if [[ $ONEDNN_BUILD ]]; then
 mv ../acl_fixed_format_kernels_striding.patch ./third_party/compute_library/.
 mv ../acl_depthwise_updateable_weights.patch ./third_party/compute_library/.
 
+ # SVE merge fixup
+ wget "https://git.mlplatform.org/ml/ComputeLibrary.git/patch/?id=ce79ac6297e6eb2407abd24846b8504dee43770f" -O ../acl_fixup_SVE_merges.patch
+ mv ../acl_fixup_SVE_merges.patch ./third_party/compute_library/.
+
 # Patch to cap the number of threads for ACL primitives
 mv ../onednn_acl_threadcap.patch ./third_party/mkl_dnn/.
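
Note on the stride fix in acl_fixed_format_kernels_striding.patch: the sketch below is a minimal standalone model of the arithmetic, not code from the patch. round_up, the sample dimensions and the printed output are illustrative assumptions; iceildiv, interleave_by(wf) and blocked_by stand in for the arm_gemm/ACL helpers of the same names used above.

#include <cstdio>

// Round x up to the next multiple of m, as arm_gemm::iceildiv(x, m) * m does.
static int round_up(int x, int m) { return ((x + m - 1) / m) * m; }

int main() {
    // Hypothetical fixed-format kernel parameters (stand-ins for
    // arm_compute::interleave_by(wf) and the kernel's blocking factor).
    const int interleave_by = 8;
    const int blocked_by = 4;

    // Hypothetical B-tensor dimensions; width is the fastest changing one.
    int tensor_height = 30;  // not a multiple of blocked_by -> padded
    int tensor_width = 20;   // not a multiple of interleave_by -> padded
    long multi_stride_b = (long)tensor_height * tensor_width;

    // 1) Pad the height to a multiple of blocked_by and keep the
    //    batch stride consistent with the padded size.
    if (tensor_height % blocked_by != 0) {
        tensor_height = round_up(tensor_height, blocked_by);
        if (multi_stride_b != 0)
            multi_stride_b = (long)tensor_height * tensor_width;
    }

    // 2) If the fastest changing dimension (width) is padded to a multiple
    //    of interleave_by, the stride between consecutive matrices must be
    //    computed from the padded width, which is what the patch corrects.
    if (tensor_width % interleave_by != 0)
        multi_stride_b = (long)round_up(tensor_width, interleave_by) * tensor_height;

    // 3) The leading dimension of the packed B matrix steps over whole
    //    interleaved blocks of rows.
    long ldb = (long)interleave_by * tensor_height;

    printf("height=%d ldb=%ld multi_stride_b=%ld\n",
            tensor_height, ldb, multi_stride_b);
    return 0;
}

With these sample values the height pads 30 -> 32 and the width pads 20 -> 24, giving multi_stride_b = 24 * 32 = 768 rather than the unpadded 640: skipping step 2 is exactly the kind of under-sized stride in fast math mode that the commit's CHANGELOG entry describes.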