divert calls to oneDNN's gemm_api into ACL

ARM-software · Apr 6, 2023 · bbcfdbc · bbcfdbc
1 parent 8208b29
commit bbcfdbc
Show file tree

Hide file tree

Showing 3 changed files with 104 additions and 0 deletions.
diff --git a/docker/tensorflow-aarch64/Dockerfile b/docker/tensorflow-aarch64/Dockerfile
@@ -300,6 +300,7 @@ ENV PATH="$PACKAGE_DIR/bazel:$PATH"
 COPY patches/tf_acl.patch $PACKAGE_DIR/.
 COPY patches/tf_recursive_scheduler.patch $PACKAGE_DIR/.
 COPY patches/tf_dispatch_with_heuristics.patch $PACKAGE_DIR/.
+COPY patches/tf_use_acl_instead_of_gemm_api.patch $PACKAGE_DIR/.
 COPY patches/compute_library.patch $PACKAGE_DIR/.
 COPY patches/acl_openmp_fix.patch $PACKAGE_DIR/.
 COPY patches/acl_fixed_format_kernels_striding.patch $PACKAGE_DIR/.

diff --git a/docker/tensorflow-aarch64/patches/tf_use_acl_instead_of_gemm_api.patch b/docker/tensorflow-aarch64/patches/tf_use_acl_instead_of_gemm_api.patch
@@ -0,0 +1,101 @@
+ *******************************************************************************
+ Copyright 2023 Arm Limited and affiliates.
+ SPDX-License-Identifier: Apache-2.0
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ *******************************************************************************
+diff --git a/tensorflow/core/kernels/mkl/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl/mkl_batch_matmul_op.cc
+index 46f65f31263..735ece1b364 100644
+--- a/tensorflow/core/kernels/mkl/mkl_batch_matmul_op.cc
++++ b/tensorflow/core/kernels/mkl/mkl_batch_matmul_op.cc
+@@ -49,8 +49,23 @@ template <typename Device, typename Tlhs, typename Trhs, typename Toutput,
+ class BatchMatMulMkl : public OpKernel {
+  public:
+   explicit BatchMatMulMkl(OpKernelConstruction* context) : OpKernel(context) {
+-    OP_REQUIRES_OK(context, context->GetAttr("adj_x", &adj_x_));
+-    OP_REQUIRES_OK(context, context->GetAttr("adj_y", &adj_y_));
++    if (context && context->HasAttr("transpose_a")) {
++      // This is needed for using BatchMatMulMkl as the super class of
++      // MklMatMulOp (below) whose context has a transpose_a attribute which is
++      // effectively the same as adj_x_
++      OP_REQUIRES_OK(context, context->GetAttr("transpose_a", &adj_x_));
++    } else {
++      OP_REQUIRES_OK(context, context->GetAttr("adj_x", &adj_x_));
++    }
++
++    if (context && context->HasAttr("transpose_b")) {
++      // This is needed for using BatchMatMulMkl as the super class of
++      // MklMatMulOp (below) whose context has a transpose_b attribute which is
++      // effectively the same as adj_y_
++      OP_REQUIRES_OK(context, context->GetAttr("transpose_b", &adj_y_));
++    } else {
++      OP_REQUIRES_OK(context, context->GetAttr("adj_y", &adj_y_));
++    }
+   }
+
+   virtual ~BatchMatMulMkl() {}
+@@ -307,6 +322,32 @@ class FusedBatchMatMulMkl
+   }
+ };
+
++// Direct calls for MklMatMulOp to BatchMatMulMkl for aarch64,
++// because the Arm Compute Library does not provide a BLAS SGEMM
++// interface, which is what MklMatMulOp calls by default.
++#ifdef DNNL_AARCH64_USE_ACL
++template <typename Device, typename T, bool USE_CUBLAS>
++class MklMatMulOp : public BatchMatMulMkl<Device, T, T, T, USE_CUBLAS> {
++ public:
++  explicit MklMatMulOp(OpKernelConstruction* ctx)
++      : BatchMatMulMkl<Device, T, T, T, false>(ctx) {}
++
++  virtual ~MklMatMulOp() {}
++};
++
++#define REGISTER_MATMUL_MKL(TYPE)                         \
++  REGISTER_KERNEL_BUILDER(                                \
++      Name("_MklMatMul")                                  \
++          .Device(DEVICE_CPU)                             \
++          .TypeConstraint<TYPE>("T")                      \
++          .Label(mkl_op_registry::kMklNameChangeOpLabel), \
++      MklMatMulOp<CPUDevice, TYPE, false /* cublas, ignored for CPU */>);
++
++TF_CALL_float(REGISTER_MATMUL_MKL);
++TF_CALL_bfloat16(REGISTER_MATMUL_MKL);
++
++#endif  // DNNL_AARCH64_USE_ACL
++
+ #define REGISTER_BATCH_MATMUL_MKL(TYPE)                                       \
+   REGISTER_KERNEL_BUILDER(Name("_MklBatchMatMul")                             \
+                               .Device(DEVICE_CPU)                             \
+diff --git a/tensorflow/core/kernels/mkl/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl/mkl_matmul_op.cc
+index 03d6bd7d0cc..ca715820d2b 100644
+--- a/tensorflow/core/kernels/mkl/mkl_matmul_op.cc
++++ b/tensorflow/core/kernels/mkl/mkl_matmul_op.cc
+@@ -23,7 +23,10 @@ limitations under the License.
+ // and when it is undefined at build time, this file becomes an empty
+ // compilation unit
+
+-#if defined(INTEL_MKL)
++// We do not want to use this kernel for aarch64 because the
++// Arm Compute Library does not provide a BLAS SGEMM
++// interface, which is what MklMatMulOp calls by default.
++#if !defined(DNNL_AARCH64_USE_ACL)
+
+ #include "dnnl.hpp"
+ #include "tensorflow/core/framework/op.h"
+@@ -204,4 +207,4 @@ class MklMatMulOp : public OpKernel {
+ TF_CALL_float(REGISTER_CPU);
+ TF_CALL_bfloat16(REGISTER_CPU);
+ }  // namespace tensorflow
+-#endif  // INTEL_MKL
++#endif  // !DNNL_AARCH64_USE_ACL
diff --git a/docker/tensorflow-aarch64/scripts/build-tensorflow.sh b/docker/tensorflow-aarch64/scripts/build-tensorflow.sh
@@ -90,6 +90,8 @@ if [[ $ONEDNN_BUILD ]]; then
         # Path TensorFlow to use heuristics to decide whether to rewrite
         # node in a graph to use oneDNN primitive or Eigen
         patch -p1 < ../tf_dispatch_with_heuristics.patch
+        # Patch to divert calls to oneDNN's gemm_api into ACL
+        patch -p1 < ../tf_use_acl_instead_of_gemm_api.patch
 
         #  Recursive scheduler patch
         patch -p1 < ../tf_recursive_scheduler.patch