Fixes to calls to arm_gemm kernels to get correct accuracy when runni… #150

Merged
merged 1 commit on Sep 28, 2022
4 changes: 4 additions & 0 deletions docker/tensorflow-aarch64/CHANGELOG.md
@@ -14,6 +14,10 @@ where `YY` is the year, and `MM` the month of the increment.
### Removed

### Fixed
- oneDNN uses indirect convolution in fast math mode when input channels require padding.
- ACL uses correct predicate when reading bias in the merge phase of SVE kernels.
- Convolution and matmul oneDNN primitives only cast to BF16 if selected kernel is in fast math mode.
- Correct calculation of multi-stride when fastest changing dimension is padded in fast math mode.

## [r22.09] 2022-09-16
https://github.com/ARM-software/Tool-Solutions/tree/tensorflow-pytorch-aarch64--r22.09/docker/tensorflow-aarch64
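For context on the BF16 changelog item above: the cast is now keyed off the kernel that was actually selected, not merely the requested fpmath mode. A minimal sketch of that decision, using illustrative stand-ins rather than oneDNN's real types:

    #include <cassert>

    // Illustrative stand-ins; not oneDNN's real types.
    enum class WeightFormat { fp32_blocked, bf16_fast_math };
    enum class DataType { f32, bf16 };

    // Cast weights to BF16 only when the selected kernel is a fast-math
    // kernel, not merely because fast math was requested.
    static DataType weights_type_for(WeightFormat selected) {
        return selected == WeightFormat::bf16_fast_math ? DataType::bf16
                                                        : DataType::f32;
    }

    int main() {
        // Fast math requested but an FP32 kernel chosen: weights stay f32.
        assert(weights_type_for(WeightFormat::fp32_blocked) == DataType::f32);
        assert(weights_type_for(WeightFormat::bf16_fast_math) == DataType::bf16);
        return 0;
    }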
docker/tensorflow-aarch64/patches/acl_fixed_format_kernels_striding.patch
@@ -16,7 +16,7 @@
*******************************************************************************

diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 77da83070..9b811b4d0 100644
index 77da83070..44aef28c6 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -506,7 +506,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
@@ -28,18 +28,21 @@ index 77da83070..9b811b4d0 100644
const int tensor_width = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)];
int tensor_channels = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)];
const int interleave_by = arm_compute::interleave_by(wf);
@@ -526,8 +526,14 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
}
else if(multi_stride_b == 0 || (ldb == tensor_width && multi_stride_b == tensor_height * tensor_width))
@@ -528,6 +528,17 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
{
+ if(tensor_width % interleave_by != 0) {
+ multi_stride_b = arm_gemm::iceildiv(tensor_width, interleave_by) * interleave_by * tensor_height;
+ }
// In this case dimension that is packed is only height
// so we need to stride only height by interleave_by
+ if(tensor_height % blocked_by != 0) {
+ tensor_height = arm_gemm::iceildiv(tensor_height, blocked_by) * blocked_by;
+ if(multi_stride_b != 0) {
+ multi_stride_b = tensor_height * tensor_width;
+ }
+ }
+
+ if(tensor_width % interleave_by != 0) {
+ multi_stride_b = arm_gemm::iceildiv(tensor_width, interleave_by) * interleave_by * tensor_height;
+ }
+
ldb = interleave_by * tensor_height;
}
else
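The arithmetic behind this fix: when the blocked (height) or interleaved (width) dimension is not a multiple of its block size, the stride between matrices must be computed from the rounded-up extent. A small worked example, assuming arm_gemm::iceildiv is plain ceiling division:

    #include <cassert>

    // Ceiling division, matching the assumed semantics of arm_gemm::iceildiv.
    static int iceildiv(int a, int b) { return (a + b - 1) / b; }

    int main() {
        int tensor_height = 17, tensor_width = 30;
        const int blocked_by = 4, interleave_by = 8;

        // Height padded to a multiple of blocked_by: 17 -> 20.
        tensor_height = iceildiv(tensor_height, blocked_by) * blocked_by;
        int multi_stride_b = tensor_height * tensor_width;

        // Width padded to a multiple of interleave_by: 30 -> 32.
        if (tensor_width % interleave_by != 0)
            multi_stride_b = iceildiv(tensor_width, interleave_by)
                    * interleave_by * tensor_height;

        const int ldb = interleave_by * tensor_height;
        assert(tensor_height == 20 && multi_stride_b == 640 && ldb == 160);
        return 0;
    }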
@@ -14,9 +14,9 @@
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************

diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
index ba5a49c6d..b7caf1479 100644
index 6a840b973..47a3a4a92 100644
--- a/src/cpu/aarch64/acl_convolution_utils.cpp
+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -54,10 +54,12 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
@@ -73,9 +73,9 @@ index ba5a49c6d..b7caf1479 100644
acp.weights_info = arm_compute::WeightsInfo(
false,
kw,
@@ -283,6 +297,10 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
@@ -297,6 +311,10 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
const primitive_attr_t &attr) {
acp.is_indirect = false;

+ if(weights_md.ndims != 4) {
+ return status::unimplemented;
@@ -84,24 +84,25 @@ index ba5a49c6d..b7caf1479 100644
// General Compute Library checks, memory tags are also set there
CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));

@@ -309,7 +327,8 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
const primitive_attr_t &attr) {
@@ -325,7 +343,8 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
auto math_mode = get_fpmath_mode();
// Indirect convolution results in slowdown for low thread count or 1x1
// kernels, so fall back to GEMM-based convolution in these cases
- if (one_of(true, weights_md.dims[2] == 1, // kh
+ if (one_of(true, weights_md.ndims != 4,
+ weights_md.dims[2] == 1, // kh
weights_md.dims[3] == 1, // kw
dnnl_get_max_threads() < 28)) {
(!math_mode && dnnl_get_max_threads() < 28))) {
return status::unimplemented;
@@ -334,6 +353,26 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
@@ -350,6 +369,27 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
return status::success;
}

+status_t init_conf_depthwise(acl_conv_conf_t &acp, memory_desc_t &src_md,
+ memory_desc_t &weights_md, memory_desc_t &dst_md,
+ memory_desc_t &bias_md, const convolution_desc_t &cd,
+ const primitive_attr_t &attr) {
+ acp.is_indirect = false;
+ // We need to make sure that the number of dimensions for weights is either 5 or 3
+ if(weights_md.ndims != 5)
+ return status::unimplemented;
@@ -121,7 +122,7 @@ index ba5a49c6d..b7caf1479 100644
status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
@@ -342,7 +381,8 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
@@ -359,7 +399,8 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
// Under these conditions, fallback to faster GEMM-based convolution
// unless the user explicitly specifies Winograd algorithm
// clang-format off
@@ -132,10 +133,10 @@ index ba5a49c6d..b7caf1479 100644
src_md.dims[1] < 64, // ic
dst_md.dims[1] < 64, // oc
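Condensing the fallback conditions added across these hunks: indirect convolution is skipped for non-4D weights and 1x1 kernels, and, unless fast math is enabled, for low thread counts where im2row GEMM wins. A sketch of that predicate (semantics inferred from the diff; the 28-thread threshold is from the source):

    #include <cassert>

    // Inferred dispatch heuristic for indirect GEMM convolution.
    static bool use_indirect_gemm(
            int ndims, int kh, int kw, int threads, bool fast_math) {
        if (ndims != 4) return false;                 // weights must be 4D
        if (kh == 1 || kw == 1) return false;         // 1x1 kernels: use GEMM
        if (!fast_math && threads < 28) return false; // low thread count
        return true;
    }

    int main() {
        assert(!use_indirect_gemm(4, 1, 3, 64, false)); // 1x1 in one dim
        assert(!use_indirect_gemm(4, 3, 3, 8, false));  // few threads
        assert(use_indirect_gemm(4, 3, 3, 8, true));    // fast math lifts gate
        return 0;
    }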
diff --git a/src/cpu/aarch64/acl_convolution_utils.hpp b/src/cpu/aarch64/acl_convolution_utils.hpp
index 3e56245fa..809dc13c9 100644
index 44dc8eecb..7eae5cbb1 100644
--- a/src/cpu/aarch64/acl_convolution_utils.hpp
+++ b/src/cpu/aarch64/acl_convolution_utils.hpp
@@ -66,6 +66,11 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
@@ -67,6 +67,11 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr);

@@ -147,6 +148,26 @@ index 3e56245fa..809dc13c9 100644
status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
index 4142dbc7e..1800aaf58 100644
--- a/src/cpu/cpu_convolution_list.cpp
+++ b/src/cpu/cpu_convolution_list.cpp
@@ -65,6 +65,7 @@ using namespace dnnl::impl::cpu::x64;
#if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL
#include "cpu/aarch64/acl_gemm_convolution.hpp"
#include "cpu/aarch64/acl_indirect_gemm_convolution.hpp"
+#include "cpu/aarch64/acl_depthwise_convolution.hpp"
#include "cpu/aarch64/acl_winograd_convolution.hpp"
#endif
using namespace dnnl::impl::cpu::aarch64;
@@ -104,6 +105,7 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
CPU_INSTANCE_AARCH64(jit_sve_512_dw_convolution_fwd_t)
CPU_INSTANCE_AARCH64(jit_sve_512_1x1_convolution_fwd_f32_t)
CPU_INSTANCE_AARCH64(jit_sve_512_convolution_fwd_t<f32>)
+ CPU_INSTANCE_AARCH64_ACL(acl_depthwise_convolution_fwd_t)
CPU_INSTANCE_AARCH64_ACL(acl_indirect_gemm_convolution_fwd_t)
CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t<f32>)
CPU_INSTANCE(gemm_convolution_fwd_t)
diff --git a/src/cpu/aarch64/acl_depthwise_convolution.cpp b/src/cpu/aarch64/acl_depthwise_convolution.cpp
new file mode 100644
index 000000000..1beb8b8af
@@ -337,23 +358,3 @@ index 000000000..97e0a3a30
+} // namespace dnnl
+
+#endif // CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP
diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
index 4142dbc7e..1800aaf58 100644
--- a/src/cpu/cpu_convolution_list.cpp
+++ b/src/cpu/cpu_convolution_list.cpp
@@ -65,6 +65,7 @@ using namespace dnnl::impl::cpu::x64;
#if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL
#include "cpu/aarch64/acl_gemm_convolution.hpp"
#include "cpu/aarch64/acl_indirect_gemm_convolution.hpp"
+#include "cpu/aarch64/acl_depthwise_convolution.hpp"
#include "cpu/aarch64/acl_winograd_convolution.hpp"
#endif
using namespace dnnl::impl::cpu::aarch64;
@@ -104,6 +105,7 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
CPU_INSTANCE_AARCH64(jit_sve_512_dw_convolution_fwd_t)
CPU_INSTANCE_AARCH64(jit_sve_512_1x1_convolution_fwd_f32_t)
CPU_INSTANCE_AARCH64(jit_sve_512_convolution_fwd_t<f32>)
+ CPU_INSTANCE_AARCH64_ACL(acl_depthwise_convolution_fwd_t)
CPU_INSTANCE_AARCH64_ACL(acl_indirect_gemm_convolution_fwd_t)
CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t<f32>)
CPU_INSTANCE(gemm_convolution_fwd_t)
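The registration order above matters: oneDNN walks this list top to bottom and uses the first implementation whose init succeeds, so the ACL depthwise primitive is tried before indirect and plain GEMM. A toy model of that first-fit protocol, with illustrative names:

    #include <functional>
    #include <iostream>
    #include <utility>
    #include <vector>

    // First-fit dispatch: try candidates in order until one succeeds.
    enum class status { success, unimplemented };

    int main() {
        std::vector<std::pair<const char *, std::function<status()>>> impls = {
                {"acl_depthwise", [] { return status::unimplemented; }},
                {"acl_indirect_gemm", [] { return status::unimplemented; }},
                {"acl_gemm", [] { return status::success; }},
        };
        for (auto &[name, init] : impls)
            if (init() == status::success) {
                std::cout << "selected: " << name << "\n";
                break;
            }
        return 0;
    }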
@@ -14,12 +14,12 @@
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************

diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
index c46d69757..ba5a49c6d 100644
index c46d69757..6a840b973 100644
--- a/src/cpu/aarch64/acl_convolution_utils.cpp
+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -212,6 +212,69 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
@@ -212,6 +212,82 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
arm_compute::QuantizationInfo(1.0f / scales[0], 0));
}

@@ -50,13 +50,26 @@ index c46d69757..ba5a49c6d 100644
+ int interleaved_by = arm_compute::interleave_by(expected_weight_format);
+ int block_by = arm_compute::block_by(expected_weight_format);
+
+ bool is_fast_math_kernel = arm_compute::is_fixed_format_fast_math(expected_weight_format);
+ if(!is_fast_math_kernel) {
+ // FP32 kernel is faster than BF16
+ acp.fast_math = false;
+ }
+
+ memory_desc_t want_wei_md = weights_md;
+
+ int ic_multiply = ic;
+ if(ic % block_by != 0) {
+ ic_multiply = utils::div_up(ic, block_by) * block_by;
+ // We also need to set the padded dimensions
+ want_wei_md.padded_dims[1] = ic_multiply;
+ } else {
+ // If we do not need to pad input channels for fast math mode
+ // then it would be faster to run convolution with im2row
+ // instead of using indirect buffer
+ if(acp.fast_math && acp.is_indirect) {
+ return status::unimplemented;
+ }
+ }
+ if(oc % interleaved_by != 0) {
+ int padded_dim = utils::div_up(oc, interleaved_by) * interleaved_by;
@@ -79,7 +92,7 @@ index c46d69757..ba5a49c6d 100644
+ want_wei_md.format_desc.blocking.inner_blks[1] = block_by;
+ }
+
+ if(acp.fast_math) {
+ if(is_fast_math_kernel) {
+ // If it is fast math mode we need weights in BFloat16
+ want_wei_md.data_type = dnnl_bf16;
+ }
@@ -89,6 +102,49 @@ index c46d69757..ba5a49c6d 100644
return status::success;
}

@@ -219,6 +295,7 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr) {
+ acp.is_indirect = false;

// General Compute Library checks, memory tags are also set there
CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
@@ -244,11 +321,13 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr) {
+ acp.is_indirect = true;
+ auto math_mode = get_fpmath_mode();
// Indirect convolution results in slowdown for low thread count or 1x1
// kernels, so fall back to GEMM-based convolution in these cases
if (one_of(true, weights_md.dims[2] == 1, // kh
weights_md.dims[3] == 1, // kw
- dnnl_get_max_threads() < 28)) {
+ (!math_mode && dnnl_get_max_threads() < 28))) {
return status::unimplemented;
}

@@ -275,6 +354,7 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr) {
+ acp.is_indirect = false;

// Under these conditions, fallback to faster GEMM-based convolution
// unless the user explicitly specifies Winograd algorithm
diff --git a/src/cpu/aarch64/acl_convolution_utils.hpp b/src/cpu/aarch64/acl_convolution_utils.hpp
index 3e56245fa..44dc8eecb 100644
--- a/src/cpu/aarch64/acl_convolution_utils.hpp
+++ b/src/cpu/aarch64/acl_convolution_utils.hpp
@@ -43,6 +43,7 @@ struct acl_conv_conf_t {
// If this is true, the result of the convolution goes into a temporarily
// allocated ACL tensor to be accumulated into the oneDNN dst during postops
bool use_dst_acc;
+ bool is_indirect;
arm_compute::TensorInfo src_info;
arm_compute::TensorInfo wei_info;
arm_compute::TensorInfo bia_info;
diff --git a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp b/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
index bcf031a77..4ddc8cf91 100644
--- a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
@@ -258,7 +314,7 @@ index c5e507085..15ea61173 100644
// Validate fully connected layer manually to check for return status
ACL_CHECK_VALID(arm_compute::NEFullyConnectedLayer::validate(
diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
index 679baec3a..2c0b46978 100644
index 679baec3a..4aa219376 100644
--- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
@@ -66,15 +66,12 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
@@ -279,7 +335,7 @@ index 679baec3a..2c0b46978 100644

amp.src_info = arm_compute::TensorInfo(
arm_compute::TensorShape(K, M, 1, src_batch), 1,
@@ -103,6 +100,121 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
@@ -103,6 +100,125 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
ACL_CHECK_VALID(arm_compute::NETranspose::validate(
&amp.wei_acc_info, &amp.wei_info));

@@ -310,6 +366,10 @@ index 679baec3a..2c0b46978 100644
+ // as returned by interleave by from expecting strides
+ int interleaved_by = arm_compute::interleave_by(expected_weight_format);
+ int block_by = arm_compute::block_by(expected_weight_format);
+ bool is_fast_math_kernel = arm_compute::is_fixed_format_fast_math(expected_weight_format);
+ if(!is_fast_math_kernel) {
+ amp.gemm_info.set_fast_math(false);
+ }
+
+ int blocked_first_dimension = -1;
+ int blocked_second_dimension = -1;
@@ -392,7 +452,7 @@ index 679baec3a..2c0b46978 100644
+ want_wei_md.format_desc.blocking.inner_blks[1] = block_by;
+ }
+
+ if(is_fastmath_enabled) {
+ if(is_fast_math_kernel) {
+ want_wei_md.data_type = dnnl_bf16;
+ }
+
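The matmul changes mirror the convolution ones: each weight dimension is padded up to the multiple the fixed-format kernel expects, and the inner blocking is recorded on the descriptor. A self-contained sketch of that bookkeeping; the struct is a stand-in, not oneDNN's real memory_desc layout:

    #include <cassert>

    static int round_up(int v, int m) { return ((v + m - 1) / m) * m; }

    // Stand-in for the padded-dims + inner-blocking fields set above.
    struct WeightsDescSketch {
        int padded_n, padded_k;       // logical dims rounded up for the kernel
        int inner_blk_n, inner_blk_k; // blocking recorded in inner_blks
    };

    static WeightsDescSketch pad_for_fixed_format(
            int n, int k, int interleave_by, int block_by) {
        return {round_up(n, interleave_by), round_up(k, block_by),
                interleave_by, block_by};
    }

    int main() {
        // N = 30 interleaved by 8 -> 32; K = 17 blocked by 4 -> 20.
        const WeightsDescSketch d = pad_for_fixed_format(30, 17, 8, 4);
        assert(d.padded_n == 32 && d.padded_k == 20);
        return 0;
    }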
9 changes: 5 additions & 4 deletions docker/tensorflow-aarch64/patches/tf_acl.patch
@@ -14,8 +14,9 @@
See the License for the specific language governing permissions and
limitations under the License.
*******************************************************************************

diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index e255103df98..b95f20b6232 100644
index e255103df98..1b0bb41201f 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -192,18 +192,19 @@ def _tf_repositories():
@@ -30,7 +31,7 @@ index e255103df98..b95f20b6232 100644
+ strip_prefix = "oneDNN-80d45b77f1a031a99d628b27aeea45b05f16b8b5",
+ urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/80d45b77f1a031a99d628b27aeea45b05f16b8b5.tar.gz"),
)

tf_http_archive(
name = "compute_library",
- sha256 = "94e2e9ff87c261a9c9987bc9024c449c48014f7fe707311bdfa76b87f3dda5c5",
@@ -40,8 +41,8 @@ index e255103df98..b95f20b6232 100644
build_file = "//third_party/compute_library:BUILD",
- patch_file = ["//third_party/compute_library:compute_library.patch", "//third_party/compute_library:activation_func_correct_args.patch"],
- urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/v22.05.tar.gz"),
+ patch_file = ["//third_party/compute_library:compute_library.patch", "//third_party/compute_library:acl_fixed_format_kernels_striding.patch", "//third_party/compute_library:acl_depthwise_updateable_weights.patch"],
+ patch_file = ["//third_party/compute_library:compute_library.patch", "//third_party/compute_library:acl_fixed_format_kernels_striding.patch", "//third_party/compute_library:acl_depthwise_updateable_weights.patch", "//third_party/compute_library:acl_fixup_SVE_merges.patch"],
+ urls = tf_mirror_urls("https://github.com/ARM-software/ComputeLibrary/archive/v22.08.tar.gz"),
)

tf_http_archive(
4 changes: 4 additions & 0 deletions docker/tensorflow-aarch64/scripts/build-tensorflow.sh
@@ -100,6 +100,10 @@ if [[ $ONEDNN_BUILD ]]; then
mv ../acl_fixed_format_kernels_striding.patch ./third_party/compute_library/.
mv ../acl_depthwise_updateable_weights.patch ./third_party/compute_library/.

# SVE merge fixup
wget "https://git.mlplatform.org/ml/ComputeLibrary.git/patch/?id=ce79ac6297e6eb2407abd24846b8504dee43770f" -O ../acl_fixup_SVE_merges.patch
mv ../acl_fixup_SVE_merges.patch ./third_party/compute_library/.

# Patch to cap the number of threads for ACL primitives
mv ../onednn_acl_threadcap.patch ./third_party/mkl_dnn/.
