Fixes to calls to arm_gemm kernels to get correct accuracy when runni… #150

Merged
merged 1 commit on Sep 28, 2022
4 changes: 4 additions & 0 deletions docker/tensorflow-aarch64/CHANGELOG.md
@@ -14,6 +14,10 @@ where `YY` is the year, and `MM` the month of the increment.
### Removed

### Fixed
- oneDNN uses indirect convolution in fast math mode when input channels require padding.
- ACL uses correct predicate when reading bias in the merge phase of SVE kernels.
- Convolution and matmul oneDNN primitives only cast to BF16 if selected kernel is in fast math mode.
- Correct calculation of multi-stride when fastest changing dimension is padded in fast math mode.

## [r22.09] 2022-09-16
https://github.com/ARM-software/Tool-Solutions/tree/tensorflow-pytorch-aarch64--r22.09/docker/tensorflow-aarch64
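For context on the BF16 changelog item above: the cast is now keyed off the kernel that was actually selected, not merely the requested fpmath mode. A minimal sketch of that decision, using illustrative stand-ins rather than oneDNN's real types:

    #include <cassert>

    // Illustrative stand-ins; not oneDNN's real types.
    enum class WeightFormat { fp32_blocked, bf16_fast_math };
    enum class DataType { f32, bf16 };

    // Cast weights to BF16 only when the selected kernel is a fast-math
    // kernel, not merely because fast math was requested.
    static DataType weights_type_for(WeightFormat selected) {
        return selected == WeightFormat::bf16_fast_math ? DataType::bf16
                                                        : DataType::f32;
    }

    int main() {
        // Fast math requested but an FP32 kernel chosen: weights stay f32.
        assert(weights_type_for(WeightFormat::fp32_blocked) == DataType::f32);
        assert(weights_type_for(WeightFormat::bf16_fast_math) == DataType::bf16);
        return 0;
    }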
docker/tensorflow-aarch64/patches/acl_fixed_format_kernels_striding.patch
@@ -16,7 +16,7 @@
*******************************************************************************

diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 77da83070..9b811b4d0 100644
index 77da83070..44aef28c6 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -506,7 +506,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
@@ -28,18 +28,21 @@ index 77da83070..9b811b4d0 100644
const int tensor_width = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)];
int tensor_channels = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)];
const int interleave_by = arm_compute::interleave_by(wf);
@@ -526,8 +526,14 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
}
else if(multi_stride_b == 0 || (ldb == tensor_width && multi_stride_b == tensor_height * tensor_width))
@@ -528,6 +528,17 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
{
+ if(tensor_width % interleave_by != 0) {
+ multi_stride_b = arm_gemm::iceildiv(tensor_width, interleave_by) * interleave_by * tensor_height;
+ }
// In this case dimension that is packed is only height
// so we need to stride only height by interleave_by
+ if(tensor_height % blocked_by != 0) {
+ tensor_height = arm_gemm::iceildiv(tensor_height, blocked_by) * blocked_by;
+ if(multi_stride_b != 0) {
+ multi_stride_b = tensor_height * tensor_width;
+ }
+ }
+
+ if(tensor_width % interleave_by != 0) {
+ multi_stride_b = arm_gemm::iceildiv(tensor_width, interleave_by) * interleave_by * tensor_height;
+ }
+
ldb = interleave_by * tensor_height;
}
else
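The arithmetic behind this fix: when the blocked (height) or interleaved (width) dimension is not a multiple of its block size, the stride between matrices must be computed from the rounded-up extent. A small worked example, assuming arm_gemm::iceildiv is plain ceiling division:

    #include <cassert>

    // Ceiling division, matching the assumed semantics of arm_gemm::iceildiv.
    static int iceildiv(int a, int b) { return (a + b - 1) / b; }

    int main() {
        int tensor_height = 17, tensor_width = 30;
        const int blocked_by = 4, interleave_by = 8;

        // Height padded to a multiple of blocked_by: 17 -> 20.
        tensor_height = iceildiv(tensor_height, blocked_by) * blocked_by;
        int multi_stride_b = tensor_height * tensor_width;

        // Width padded to a multiple of interleave_by: 30 -> 32.
        if (tensor_width % interleave_by != 0)
            multi_stride_b = iceildiv(tensor_width, interleave_by)
                    * interleave_by * tensor_height;

        const int ldb = interleave_by * tensor_height;
        assert(tensor_height == 20 && multi_stride_b == 640 && ldb == 160);
        return 0;
    }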
@@ -14,9 +14,9 @@
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************

diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
index ba5a49c6d..b7caf1479 100644
index 6a840b973..47a3a4a92 100644
--- a/src/cpu/aarch64/acl_convolution_utils.cpp
+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -54,10 +54,12 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
@@ -73,9 +73,9 @@ index ba5a49c6d..b7caf1479 100644
acp.weights_info = arm_compute::WeightsInfo(
false,
kw,
@@ -283,6 +297,10 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
@@ -297,6 +311,10 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
const primitive_attr_t &attr) {
acp.is_indirect = false;

+ if(weights_md.ndims != 4) {
+ return status::unimplemented;
@@ -84,24 +84,25 @@ index ba5a49c6d..b7caf1479 100644
// General Compute Library checks, memory tags are also set there
CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));

@@ -309,7 +327,8 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
const primitive_attr_t &attr) {
@@ -325,7 +343,8 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
auto math_mode = get_fpmath_mode();
// Indirect convolution results in slowdown for low thread count or 1x1
// kernels, so fall back to GEMM-based convolution in these cases
- if (one_of(true, weights_md.dims[2] == 1, // kh
+ if (one_of(true, weights_md.ndims != 4,
+ weights_md.dims[2] == 1, // kh
weights_md.dims[3] == 1, // kw
dnnl_get_max_threads() < 28)) {
(!math_mode && dnnl_get_max_threads() < 28))) {
return status::unimplemented;
@@ -334,6 +353,26 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
@@ -350,6 +369,27 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
return status::success;
}

+status_t init_conf_depthwise(acl_conv_conf_t &acp, memory_desc_t &src_md,
+ memory_desc_t &weights_md, memory_desc_t &dst_md,
+ memory_desc_t &bias_md, const convolution_desc_t &cd,
+ const primitive_attr_t &attr) {
+ acp.is_indirect = false;
+ // We need to make sure that the number of dimensions for weights is either 5 or 3
+ if(weights_md.ndims != 5)
+ return status::unimplemented;
@@ -121,7 +122,7 @@ index ba5a49c6d..b7caf1479 100644
status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
@@ -342,7 +381,8 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
@@ -359,7 +399,8 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
// Under these conditions, fallback to faster GEMM-based convolution
// unless the user explicitly specifies Winograd algorithm
// clang-format off
@@ -132,10 +133,10 @@ index ba5a49c6d..b7caf1479 100644
src_md.dims[1] < 64, // ic
dst_md.dims[1] < 64, // oc
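Condensing the fallback conditions added across these hunks: indirect convolution is skipped for non-4D weights and 1x1 kernels, and, unless fast math is enabled, for low thread counts where im2row GEMM wins. A sketch of that predicate (semantics inferred from the diff; the 28-thread threshold is from the source):

    #include <cassert>

    // Inferred dispatch heuristic for indirect GEMM convolution.
    static bool use_indirect_gemm(
            int ndims, int kh, int kw, int threads, bool fast_math) {
        if (ndims != 4) return false;                 // weights must be 4D
        if (kh == 1 || kw == 1) return false;         // 1x1 kernels: use GEMM
        if (!fast_math && threads < 28) return false; // low thread count
        return true;
    }

    int main() {
        assert(!use_indirect_gemm(4, 1, 3, 64, false)); // 1x1 in one dim
        assert(!use_indirect_gemm(4, 3, 3, 8, false));  // few threads
        assert(use_indirect_gemm(4, 3, 3, 8, true));    // fast math lifts gate
        return 0;
    }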
diff --git a/src/cpu/aarch64/acl_convolution_utils.hpp b/src/cpu/aarch64/acl_convolution_utils.hpp
index 3e56245fa..809dc13c9 100644
index 44dc8eecb..7eae5cbb1 100644
--- a/src/cpu/aarch64/acl_convolution_utils.hpp
+++ b/src/cpu/aarch64/acl_convolution_utils.hpp
@@ -66,6 +66,11 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
@@ -67,6 +67,11 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr);

@@ -147,6 +148,26 @@ index 3e56245fa..809dc13c9 100644
status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
index 4142dbc7e..1800aaf58 100644
--- a/src/cpu/cpu_convolution_list.cpp
+++ b/src/cpu/cpu_convolution_list.cpp
@@ -65,6 +65,7 @@ using namespace dnnl::impl::cpu::x64;
#if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL
#include "cpu/aarch64/acl_gemm_convolution.hpp"
#include "cpu/aarch64/acl_indirect_gemm_convolution.hpp"
+#include "cpu/aarch64/acl_depthwise_convolution.hpp"
#include "cpu/aarch64/acl_winograd_convolution.hpp"
#endif
using namespace dnnl::impl::cpu::aarch64;
@@ -104,6 +105,7 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
CPU_INSTANCE_AARCH64(jit_sve_512_dw_convolution_fwd_t)
CPU_INSTANCE_AARCH64(jit_sve_512_1x1_convolution_fwd_f32_t)
CPU_INSTANCE_AARCH64(jit_sve_512_convolution_fwd_t<f32>)
+ CPU_INSTANCE_AARCH64_ACL(acl_depthwise_convolution_fwd_t)
CPU_INSTANCE_AARCH64_ACL(acl_indirect_gemm_convolution_fwd_t)
CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t<f32>)
CPU_INSTANCE(gemm_convolution_fwd_t)
diff --git a/src/cpu/aarch64/acl_depthwise_convolution.cpp b/src/cpu/aarch64/acl_depthwise_convolution.cpp
new file mode 100644
index 000000000..1beb8b8af
@@ -337,23 +358,3 @@ index 000000000..97e0a3a30
+} // namespace dnnl
+
+#endif // CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP
diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
index 4142dbc7e..1800aaf58 100644
--- a/src/cpu/cpu_convolution_list.cpp
+++ b/src/cpu/cpu_convolution_list.cpp
@@ -65,6 +65,7 @@ using namespace dnnl::impl::cpu::x64;
#if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL
#include "cpu/aarch64/acl_gemm_convolution.hpp"
#include "cpu/aarch64/acl_indirect_gemm_convolution.hpp"
+#include "cpu/aarch64/acl_depthwise_convolution.hpp"
#include "cpu/aarch64/acl_winograd_convolution.hpp"
#endif
using namespace dnnl::impl::cpu::aarch64;
@@ -104,6 +105,7 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
CPU_INSTANCE_AARCH64(jit_sve_512_dw_convolution_fwd_t)
CPU_INSTANCE_AARCH64(jit_sve_512_1x1_convolution_fwd_f32_t)
CPU_INSTANCE_AARCH64(jit_sve_512_convolution_fwd_t<f32>)
+ CPU_INSTANCE_AARCH64_ACL(acl_depthwise_convolution_fwd_t)
CPU_INSTANCE_AARCH64_ACL(acl_indirect_gemm_convolution_fwd_t)
CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t<f32>)
CPU_INSTANCE(gemm_convolution_fwd_t)
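The registration order above matters: oneDNN walks this list top to bottom and uses the first implementation whose init succeeds, so the ACL depthwise primitive is tried before indirect and plain GEMM. A toy model of that first-fit protocol, with illustrative names:

    #include <functional>
    #include <iostream>
    #include <utility>
    #include <vector>

    // First-fit dispatch: try candidates in order until one succeeds.
    enum class status { success, unimplemented };

    int main() {
        std::vector<std::pair<const char *, std::function<status()>>> impls = {
                {"acl_depthwise", [] { return status::unimplemented; }},
                {"acl_indirect_gemm", [] { return status::unimplemented; }},
                {"acl_gemm", [] { return status::success; }},
        };
        for (auto &[name, init] : impls)
            if (init() == status::success) {
                std::cout << "selected: " << name << "\n";
                break;
            }
        return 0;
    }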
@@ -14,12 +14,12 @@
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************

diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
index c46d69757..ba5a49c6d 100644
index c46d69757..6a840b973 100644
--- a/src/cpu/aarch64/acl_convolution_utils.cpp
+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -212,6 +212,69 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
@@ -212,6 +212,82 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
arm_compute::QuantizationInfo(1.0f / scales[0], 0));
}

@@ -50,13 +50,26 @@ index c46d69757..ba5a49c6d 100644
+ int interleaved_by = arm_compute::interleave_by(expected_weight_format);
+ int block_by = arm_compute::block_by(expected_weight_format);
+
+ bool is_fast_math_kernel = arm_compute::is_fixed_format_fast_math(expected_weight_format);
+ if(!is_fast_math_kernel) {
+ // FP32 kernel is faster than BF16
+ acp.fast_math = false;
+ }
+
+ memory_desc_t want_wei_md = weights_md;
+
+ int ic_multiply = ic;
+ if(ic % block_by != 0) {
+ ic_multiply = utils::div_up(ic, block_by) * block_by;
+ // We also need to set the padded dimensions
+ want_wei_md.padded_dims[1] = ic_multiply;
+ } else {
+ // If we do not need to pad input channels for fast math mode
+ // then it would be faster to run convolution with im2row
+ // instead of using indirect buffer
+ if(acp.fast_math && acp.is_indirect) {
+ return status::unimplemented;
+ }
+ }
+ if(oc % interleaved_by != 0) {
+ int padded_dim = utils::div_up(oc, interleaved_by) * interleaved_by;
@@ -79,7 +92,7 @@ index c46d69757..ba5a49c6d 100644
+ want_wei_md.format_desc.blocking.inner_blks[1] = block_by;
+ }
+
+ if(acp.fast_math) {
+ if(is_fast_math_kernel) {
+ // If it is fast math mode we need weights in BFloat16
+ want_wei_md.data_type = dnnl_bf16;
+ }
@@ -89,6 +102,49 @@ index c46d69757..ba5a49c6d 100644
return status::success;
}

@@ -219,6 +295,7 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr) {
+ acp.is_indirect = false;

// General Compute Library checks, memory tags are also set there
CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
@@ -244,11 +321,13 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr) {
+ acp.is_indirect = true;
+ auto math_mode = get_fpmath_mode();
// Indirect convolution results in slowdown for low thread count or 1x1
// kernels, so fall back to GEMM-based convolution in these cases
if (one_of(true, weights_md.dims[2] == 1, // kh
weights_md.dims[3] == 1, // kw
- dnnl_get_max_threads() < 28)) {
+ (!math_mode && dnnl_get_max_threads() < 28))) {
return status::unimplemented;
}

@@ -275,6 +354,7 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr) {
+ acp.is_indirect = false;

// Under these conditions, fallback to faster GEMM-based convolution
// unless the user explicitly specifies Winograd algorithm
diff --git a/src/cpu/aarch64/acl_convolution_utils.hpp b/src/cpu/aarch64/acl_convolution_utils.hpp
index 3e56245fa..44dc8eecb 100644
--- a/src/cpu/aarch64/acl_convolution_utils.hpp
+++ b/src/cpu/aarch64/acl_convolution_utils.hpp
@@ -43,6 +43,7 @@ struct acl_conv_conf_t {
// If this is true, the result of the convolution goes into a temporarily
// allocated ACL tensor to be accumulated into the oneDNN dst during postops
bool use_dst_acc;
+ bool is_indirect;
arm_compute::TensorInfo src_info;
arm_compute::TensorInfo wei_info;
arm_compute::TensorInfo bia_info;
diff --git a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp b/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
index bcf031a77..4ddc8cf91 100644
--- a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
@@ -258,7 +314,7 @@ index c5e507085..15ea61173 100644
// Validate fully connected layer manually to check for return status
ACL_CHECK_VALID(arm_compute::NEFullyConnectedLayer::validate(
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
index 679baec3a..2c0b46978 100644
index 679baec3a..4aa219376 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
@@ -66,15 +66,12 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
@@ -279,7 +335,7 @@ index 679baec3a..2c0b46978 100644

amp.src_info = arm_compute::TensorInfo(
arm_compute::TensorShape(K, M, 1, src_batch), 1,
@@ -103,6 +100,121 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
@@ -103,6 +100,125 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
ACL_CHECK_VALID(arm_compute::NETranspose::validate(
&amp.wei_acc_info, &amp.wei_info));

@@ -310,6 +366,10 @@ index 679baec3a..2c0b46978 100644
+ // as returned by interleave by from expecting strides
+ int interleaved_by = arm_compute::interleave_by(expected_weight_format);
+ int block_by = arm_compute::block_by(expected_weight_format);
+ bool is_fast_math_kernel = arm_compute::is_fixed_format_fast_math(expected_weight_format);
+ if(!is_fast_math_kernel) {
+ amp.gemm_info.set_fast_math(false);
+ }
+
+ int blocked_first_dimension = -1;
+ int blocked_second_dimension = -1;
@@ -392,7 +452,7 @@ index 679baec3a..2c0b46978 100644
+ want_wei_md.format_desc.blocking.inner_blks[1] = block_by;
+ }
+
+ if(is_fastmath_enabled) {
+ if(is_fast_math_kernel) {
+ want_wei_md.data_type = dnnl_bf16;
+ }
+
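The matmul changes mirror the convolution ones: each weight dimension is padded up to the multiple the fixed-format kernel expects, and the inner blocking is recorded on the descriptor. A self-contained sketch of that bookkeeping; the struct is a stand-in, not oneDNN's real memory_desc layout:

    #include <cassert>

    static int round_up(int v, int m) { return ((v + m - 1) / m) * m; }

    // Stand-in for the padded-dims + inner-blocking fields set above.
    struct WeightsDescSketch {
        int padded_n, padded_k;       // logical dims rounded up for the kernel
        int inner_blk_n, inner_blk_k; // blocking recorded in inner_blks
    };

    static WeightsDescSketch pad_for_fixed_format(
            int n, int k, int interleave_by, int block_by) {
        return {round_up(n, interleave_by), round_up(k, block_by),
                interleave_by, block_by};
    }

    int main() {
        // N = 30 interleaved by 8 -> 32; K = 17 blocked by 4 -> 20.
        const WeightsDescSketch d = pad_for_fixed_format(30, 17, 8, 4);
        assert(d.padded_n == 32 && d.padded_k == 20);
        return 0;
    }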
9 changes: 5 additions & 4 deletions docker/tensorflow-aarch64/patches/tf_acl.patch
@@ -14,8 +14,9 @@
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************

diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index e255103df98..b95f20b6232 100644
index e255103df98..1b0bb41201f 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -192,18 +192,19 @@ def _tf_repositories():
@@ -30,7 +31,7 @@ index e255103df98..b95f20b6232 100644
+ strip_prefix = "oneDNN-80d45b77f1a031a99d628b27aeea45b05f16b8b5",
+ urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/80d45b77f1a031a99d628b27aeea45b05f16b8b5.tar.gz"),
)

tf_http_archive(
name = "compute_library",
- sha256 = "94e2e9ff87c261a9c9987bc9024c449c48014f7fe707311bdfa76b87f3dda5c5",
@@ -40,8 +41,8 @@ index e255103df98..b95f20b6232 100644
build_file = "//third_party/compute_library:BUILD",
- patch_file = ["//third_party/compute_library:compute_library.patch", "//third_party/compute_library:activation_func_correct_args.patch"],
- urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/v22.05.tar.gz"),
+ patch_file = ["//third_party/compute_library:compute_library.patch", "//third_party/compute_library:acl_fixed_format_kernels_striding.patch", "//third_party/compute_library:acl_depthwise_updateable_weights.patch"],
+ patch_file = ["//third_party/compute_library:compute_library.patch", "//third_party/compute_library:acl_fixed_format_kernels_striding.patch", "//third_party/compute_library:acl_depthwise_updateable_weights.patch", "//third_party/compute_library:acl_fixup_SVE_merges.patch"],
+ urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/v22.08.tar.gz"),
)

tf_http_archive(
4 changes: 4 additions & 0 deletions docker/tensorflow-aarch64/scripts/build-tensorflow.sh
@@ -100,6 +100,10 @@ if [[ $ONEDNN_BUILD ]]; then
mv ../acl_fixed_format_kernels_striding.patch ./third_party/compute_library/.
mv ../acl_depthwise_updateable_weights.patch ./third_party/compute_library/.

# SVE merge fixup
wget "https://git.mlplatform.org/ml/ComputeLibrary.git/patch/?id=ce79ac6297e6eb2407abd24846b8504dee43770f" -O ../acl_fixup_SVE_merges.patch
mv ../acl_fixup_SVE_merges.patch ./third_party/compute_library/.

# Patch to cap the number of threads for ACL primitives
mv ../onednn_acl_threadcap.patch ./third_party/mkl_dnn/.
