From 93465cf9e34327c526470e807831dca600d26603 Mon Sep 17 00:00:00 2001 From: Aleksei Khomenko Date: Wed, 11 Sep 2024 14:00:22 +0200 Subject: [PATCH 1/9] fix(samples): replace `CL/sycl.hpp` with `sycl/sycl.hpp` (#2893) --- samples/oneapi/dpc/ccl/sources/knn_bf_distr_ccl.cpp | 2 +- samples/oneapi/dpc/mpi/sources/knn_bf_distr_mpi.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/oneapi/dpc/ccl/sources/knn_bf_distr_ccl.cpp b/samples/oneapi/dpc/ccl/sources/knn_bf_distr_ccl.cpp index 19a9cd06f67..8183638097b 100644 --- a/samples/oneapi/dpc/ccl/sources/knn_bf_distr_ccl.cpp +++ b/samples/oneapi/dpc/ccl/sources/knn_bf_distr_ccl.cpp @@ -14,7 +14,7 @@ * limitations under the License. *******************************************************************************/ -#include +#include #include #include diff --git a/samples/oneapi/dpc/mpi/sources/knn_bf_distr_mpi.cpp b/samples/oneapi/dpc/mpi/sources/knn_bf_distr_mpi.cpp index 19ccf8df5ee..1234a6d30fe 100644 --- a/samples/oneapi/dpc/mpi/sources/knn_bf_distr_mpi.cpp +++ b/samples/oneapi/dpc/mpi/sources/knn_bf_distr_mpi.cpp @@ -14,7 +14,7 @@ * limitations under the License. *******************************************************************************/ -#include +#include #include #include From 3416bc90f75ac0d1236f6055deeed94cee8f9a27 Mon Sep 17 00:00:00 2001 From: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Date: Thu, 12 Sep 2024 07:27:58 -0700 Subject: [PATCH 2/9] fix: resolve omp simd error in mdd (#2894) * fix: resolve omp simd error in mdd * updating makefile instead * simplify slightly --- dev/make/compiler_definitions/icx.mkl.32e.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/make/compiler_definitions/icx.mkl.32e.mk b/dev/make/compiler_definitions/icx.mkl.32e.mk index af18b3fe0ec..e61e6bc39bc 100644 --- a/dev/make/compiler_definitions/icx.mkl.32e.mk +++ b/dev/make/compiler_definitions/icx.mkl.32e.mk @@ -33,7 +33,7 @@ CORE.SERV.COMPILER.icx = generic COMPILER.lnx.icx = icx -m64 \ -Werror -Wreturn-type -qopenmp-simd -COMPILER.win.icx = icx $(if $(MSVC_RT_is_release),-MD, -MDd) -nologo -WX -Qopenmp-simd -Wno-deprecated-declarations +COMPILER.win.icx = icx $(if $(MSVC_RT_is_release),-MD -Qopenmp-simd, -MDd) -nologo -WX -Wno-deprecated-declarations link.dynamic.lnx.icx = icx -m64 -no-intel-lib From c27a8fc57c84549c9f823347265ae7f6d5e45ffc Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Mon, 16 Sep 2024 16:49:56 +0100 Subject: [PATCH 3/9] Change default COMPILER to icx in makefile (#2898) --- dev/make/function_definitions/32e.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/make/function_definitions/32e.mk b/dev/make/function_definitions/32e.mk index 41dfbb96fe9..cfee5fc4a11 100644 --- a/dev/make/function_definitions/32e.mk +++ b/dev/make/function_definitions/32e.mk @@ -20,7 +20,7 @@ ifeq ($(filter mkl ref,$(BACKEND_CONFIG)),) endif COMPILERs = icc icx gnu clang vc -COMPILER ?= icc +COMPILER ?= icx CPUs := sse2 sse42 avx2 avx512 CPUs.files := nrh neh hsw skx From 00dacc9bef307d2695e138412263831c42499eb1 Mon Sep 17 00:00:00 2001 From: olegkkruglov <102592747+olegkkruglov@users.noreply.github.com> Date: Tue, 17 Sep 2024 05:50:27 -0700 Subject: [PATCH 4/9] Add DEPENDENTLOAD flag to vc linker on Windows (#2896) --- dev/make/compiler_definitions/vc.mkl.32e.mk | 2 ++ examples/cmake/setup_examples.cmake | 3 +++ samples/cmake/setup_samples.cmake | 3 +++ 3 files changed, 8 insertions(+) diff --git a/dev/make/compiler_definitions/vc.mkl.32e.mk b/dev/make/compiler_definitions/vc.mkl.32e.mk index de73bd6cb5a..975dc788fb9 100644 --- a/dev/make/compiler_definitions/vc.mkl.32e.mk +++ b/dev/make/compiler_definitions/vc.mkl.32e.mk @@ -30,6 +30,8 @@ CORE.SERV.COMPILER.vc = generic # Disable C4661 because of false positives COMPILER.win.vc = cl $(if $(MSVC_RT_is_release),-MD, -MDd) -nologo -EHsc -wd4661 -WX +link.dynamic.win.vc = /DEPENDENTLOADFLAG:0x2000 + p4_OPT.vc = mc3_OPT.vc = avx2_OPT.vc = -arch:AVX2 diff --git a/examples/cmake/setup_examples.cmake b/examples/cmake/setup_examples.cmake index 3f2058417c1..7d1fb3e5d50 100644 --- a/examples/cmake/setup_examples.cmake +++ b/examples/cmake/setup_examples.cmake @@ -123,6 +123,9 @@ function (add_examples examples_paths) endif() target_compile_options(${example} PRIVATE ${ONEDAL_CUSTOM_COMPILE_OPTIONS}) target_link_options(${example} PRIVATE ${ONEDAL_CUSTOM_LINK_OPTIONS}) + if(WIN32 AND "${ONEDAL_LINK}" STREQUAL "dynamic" AND CMAKE_CXX_COMPILER_ID MATCHES "MSVC|IntelLLVM") + target_link_options(${example} PRIVATE /DEPENDENTLOADFLAG:0x2000) + endif() set_target_properties(${example} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/_cmake_results/${CPU_ARCHITECTURE}_${LINK_TYPE}") endforeach() set_common_compiler_options() diff --git a/samples/cmake/setup_samples.cmake b/samples/cmake/setup_samples.cmake index dd7d3cc758b..abf480f02e9 100644 --- a/samples/cmake/setup_samples.cmake +++ b/samples/cmake/setup_samples.cmake @@ -108,6 +108,9 @@ function(add_samples samples_paths) target_compile_options(${sample} PRIVATE ${ONEDAL_CUSTOM_COMPILE_OPTIONS}) target_link_options(${sample} PRIVATE ${ONEDAL_CUSTOM_LINK_OPTIONS}) + if(WIN32 AND "${ONEDAL_LINK}" STREQUAL "dynamic" AND CMAKE_CXX_COMPILER_ID MATCHES "MSVC|IntelLLVM") + target_link_options(${sample} PRIVATE /DEPENDENTLOADFLAG:0x2000) + endif() set_target_properties(${sample} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/_cmake_results/${CPU_ARCHITECTURE}_${LINK_TYPE}") add_custom_target(run_${sample} From b6b6a65a59b46d6770198abfc11cfa476c1338a2 Mon Sep 17 00:00:00 2001 From: david-cortes-intel Date: Thu, 19 Sep 2024 13:08:44 +0200 Subject: [PATCH 5/9] increase timeout for failing jobs (#2900) --- .ci/pipeline/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/pipeline/ci.yml b/.ci/pipeline/ci.yml index a0620f66e9d..2f768b8b801 100755 --- a/.ci/pipeline/ci.yml +++ b/.ci/pipeline/ci.yml @@ -620,7 +620,7 @@ jobs: conda activate CB source $(Pipeline.Workspace)/daal/latest/env/vars.sh ./sklearnex/conda-recipe/run_test.sh - timeoutInMinutes: 15 + timeoutInMinutes: 20 displayName: sklearnex test - script: | source /usr/share/miniconda/etc/profile.d/conda.sh From 9b315cf8124c700f79bdc3007aac103df3b831bf Mon Sep 17 00:00:00 2001 From: Samir Nasibli Date: Thu, 19 Sep 2024 23:32:38 +0200 Subject: [PATCH 6/9] FIX: `Kmeans` timeouts issue on GPU for sparse inputs (#2906) --- cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp b/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp index dba39bb9d01..228e9e01863 100644 --- a/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp +++ b/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp @@ -371,7 +371,7 @@ sycl::event handle_empty_clusters(const dal::backend::context_gpu& ctx, auto event = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(range, [=](auto it) { - const auto local_id = it.get_local_id(1); + const auto local_id = it.get_local_id()[1]; for (std::int64_t cluster_id = rank; cluster_id < num_clusters; cluster_id += rank_count) { // no need to handle non-empty clusters From 488c4006a5a2f29c272c1c48e71b7a3a2ece5a61 Mon Sep 17 00:00:00 2001 From: Victoriya Fedotova Date: Fri, 20 Sep 2024 12:55:02 +0200 Subject: [PATCH 7/9] Add a chapter about threading layer into the docs (#2848) - Added a chapter about internal threading primitives into oneDAL documentation - Improved commenting in the header specifying those primitives APIs (threading.h) --- CONTRIBUTING.md | 6 + cpp/daal/src/threading/threading.cpp | 6 +- cpp/daal/src/threading/threading.h | 333 ++++++++++++++---- docs/source/contribution/threading.rst | 166 +++++++++ .../threading/dot-parallel-init-tls.rst | 30 ++ .../dot-parallel-partial-compute.rst | 40 +++ .../threading/dot-parallel-reduction.rst | 25 ++ .../includes/threading/dot-parallel.rst | 63 ++++ .../includes/threading/dot-sequential.rst | 25 ++ .../threading/dot-static-parallel.rst | 64 ++++ .../includes/threading/nested-parallel-ls.rst | 53 +++ .../includes/threading/nested-parallel.rst | 54 +++ .../threading/sum-parallel-by-blocks.rst | 32 ++ .../includes/threading/sum-parallel.rst | 26 ++ .../includes/threading/sum-sequential.rst | 23 ++ docs/source/index-toc.rst | 9 +- 16 files changed, 875 insertions(+), 80 deletions(-) create mode 100644 docs/source/contribution/threading.rst create mode 100644 docs/source/includes/threading/dot-parallel-init-tls.rst create mode 100644 docs/source/includes/threading/dot-parallel-partial-compute.rst create mode 100644 docs/source/includes/threading/dot-parallel-reduction.rst create mode 100644 docs/source/includes/threading/dot-parallel.rst create mode 100644 docs/source/includes/threading/dot-sequential.rst create mode 100644 docs/source/includes/threading/dot-static-parallel.rst create mode 100644 docs/source/includes/threading/nested-parallel-ls.rst create mode 100644 docs/source/includes/threading/nested-parallel.rst create mode 100644 docs/source/includes/threading/sum-parallel-by-blocks.rst create mode 100644 docs/source/includes/threading/sum-parallel.rst create mode 100644 docs/source/includes/threading/sum-sequential.rst diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2644e0f06fe..d3e45c45a9d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -83,6 +83,12 @@ editorconfig-checker For your convenience we also added [coding guidelines](http://oneapi-src.github.io/oneDAL/contribution/coding_guide.html) with examples and detailed descriptions of the coding style oneDAL follows. We encourage you to consult them when writing your code. +## Custom Components + +### Threading Layer + +In the source code of the algorithms, oneDAL does not use threading primitives directly. All the threading primitives used within oneDAL form are called the [threading layer](http://oneapi-src.github.io/oneDAL/contribution/threading.html). Contributors should leverage the primitives from the layer to implement parallel algorithms. + ## Documentation Guidelines oneDAL uses `Doxygen` for inline comments in public header files that are used to build the API reference and `reStructuredText` for the Developer Guide. See [oneDAL documentation](https://oneapi-src.github.io/oneDAL/) for reference. diff --git a/cpp/daal/src/threading/threading.cpp b/cpp/daal/src/threading/threading.cpp index 15c39368238..3b280229ee7 100644 --- a/cpp/daal/src/threading/threading.cpp +++ b/cpp/daal/src/threading/threading.cpp @@ -103,7 +103,7 @@ DAAL_EXPORT size_t _setNumberOfThreads(const size_t numThreads, void ** globalCo return 1; } -DAAL_EXPORT void _daal_threader_for(int n, int threads_request, const void * a, daal::functype func) +DAAL_EXPORT void _daal_threader_for(int n, int reserved, const void * a, daal::functype func) { if (daal::threader_env()->getNumberOfThreads() > 1) { @@ -160,7 +160,7 @@ DAAL_EXPORT void _daal_threader_for_blocked_size(size_t n, size_t block, const v } } -DAAL_EXPORT void _daal_threader_for_simple(int n, int threads_request, const void * a, daal::functype func) +DAAL_EXPORT void _daal_threader_for_simple(int n, int reserved, const void * a, daal::functype func) { if (daal::threader_env()->getNumberOfThreads() > 1) { @@ -318,7 +318,7 @@ DAAL_PARALLEL_SORT_IMPL(daal::IdxValType, pair_fp64_uint64) #undef DAAL_PARALLEL_SORT_IMPL -DAAL_EXPORT void _daal_threader_for_blocked(int n, int threads_request, const void * a, daal::functype2 func) +DAAL_EXPORT void _daal_threader_for_blocked(int n, int reserved, const void * a, daal::functype2 func) { if (daal::threader_env()->getNumberOfThreads() > 1) { diff --git a/cpp/daal/src/threading/threading.h b/cpp/daal/src/threading/threading.h index 0b4a9881b97..ca8661f2203 100644 --- a/cpp/daal/src/threading/threading.h +++ b/cpp/daal/src/threading/threading.h @@ -198,107 +198,196 @@ inline size_t setNumberOfThreads(const size_t numThreads, void ** globalControl) template inline void threader_func(int i, const void * a) { - const F & lambda = *static_cast(a); - lambda(i); + const F & func = *static_cast(a); + func(i); } template inline void static_threader_func(size_t i, size_t tid, const void * a) { - const F & lambda = *static_cast(a); - lambda(i, tid); + const F & func = *static_cast(a); + func(i, tid); } template inline void threader_func_b(int i0, int in, const void * a) { - const F & lambda = *static_cast(a); - lambda(i0, in); + const F & func = *static_cast(a); + func(i0, in); } template inline void threader_func_break(int i, bool & needBreak, const void * a) { - const F & lambda = *static_cast(a); - lambda(i, needBreak); + const F & func = *static_cast(a); + func(i, needBreak); } +/// Pass a function to be executed in a for loop to the threading layer. +/// The maximal number of iterations in the loop is `2^31 - 1 (INT32_MAX)`. +/// The default scheduling of the threading layer is used to assign +/// the iterations of the loop to threads. +/// Data dependencies between the iterations are allowed, but may requre the use +/// of synchronization primitives. +/// +/// @tparam F Callable object of type `[/* captures */](int i) -> void`, +/// where `i` is the loop's iteration index, `0 <= i < n`. +/// +/// @param[in] n Number of iterations in the for loop. +/// @param[in] reserved Parameter reserved for the future. Currently unused. +/// @param[in] func Callable object that defines the loop body. template -inline void threader_for(int n, int threads_request, const F & lambda) +inline void threader_for(int n, int reserved, const F & func) { - const void * a = static_cast(&lambda); + const void * a = static_cast(&func); - _daal_threader_for(n, threads_request, a, threader_func); + _daal_threader_for(n, reserved, a, threader_func); } +/// Pass a function to be executed in a for loop to the threading layer. +/// The maximal number of iterations in the loop is `2^63 - 1 (INT64_MAX)`. +/// The default scheduling of the threading layer is used to assign +/// the iterations of the loop to threads. +/// The iterations of the loop should be logically independent. +/// Data dependencies between the iterations are allowed, but may requre the use +/// of synchronization primitives. +/// +/// @tparam F Callable object of type `[/* captures */](int64_t i) -> void`, +/// where `i` is the loop's iteration index, `0 <= i < n`. +/// +/// @param[in] n Number of iterations in the for loop. +/// @param[in] func Callable object that defines the loop body. template -inline void threader_for_int64(int64_t n, const F & lambda) +inline void threader_for_int64(int64_t n, const F & func) { - const void * a = static_cast(&lambda); + const void * a = static_cast(&func); _daal_threader_for_int64(n, a, threader_func); } +/// Pass a function to be executed in a for loop to the threading layer. +/// The maximal number of iterations in the loop is 2^31 - 1. +/// +/// The specifics of this loop comparing to `threader_for` is that the iteration space +/// of the loop is always chunked with chunk size 1. +/// This means the threading layer tries to assign consecutive iterations to +/// different threads, if possible. +/// In case of oneTBB threading backend this means that `simple_partitioner` +/// (https://oneapi-src.github.io/oneTBB/main/tbb_userguide/Partitioner_Summary.html) +/// with chunk size 1 is used to produce iteration to threads mappings. +/// +/// Data dependencies between the iterations are allowed, but may requre the use +/// of synchronization primitives. +/// +/// @tparam F Callable object of type `[/* captures */](int i) -> void`, +/// where `i` is the loop's iteration index, `0 <= i < n`. +/// +/// @param[in] n Number of iterations in the for loop. +/// @param[in] reserved Parameter reserved for the future. Currently unused. +/// @param[in] func Callable object that defines iteration's body. template -inline void threader_for_simple(int n, int threads_request, const F & lambda) +inline void threader_for_simple(int n, int reserved, const F & func) { - const void * a = static_cast(&lambda); + const void * a = static_cast(&func); - _daal_threader_for_simple(n, threads_request, a, threader_func); + _daal_threader_for_simple(n, reserved, a, threader_func); } template -inline void threader_for_int32ptr(const int * begin, const int * end, const F & lambda) +inline void threader_for_int32ptr(const int * begin, const int * end, const F & func) { - const void * a = static_cast(&lambda); + const void * a = static_cast(&func); _daal_threader_for_int32ptr(begin, end, a, threader_func); } +/// Execute the for loop defined by the input parameters in parallel. +/// The maximal number of iterations in the loop is `SIZE_MAX` in C99 standard. +/// +/// The work is scheduled statically across threads. +/// This means that the work is always scheduled in the same way across the threads: +/// each thread processes the same set of iterations on each invocation of this loop. +/// +/// It is recommended to use this parallel loop if each iteration of the loop +/// performs equal amount of work. +/// +/// Let `t` be the number of threads available to oneDAL. The number of iterations +/// processed by each threads (except maybe the last one) is computed as: +/// `nI = (n + t - 1) / t` +/// +/// Here is how the work is split across the threads: +/// The 1st thread executes iterations `0, ..., nI - 1`; +/// the 2nd thread executes iterations `nI, ..., 2 * nI - 1`; +/// ... +/// the `t`-th thread executes iterations `(t - 1) * nI, ..., n - 1`. +/// +/// @tparam F Callable object of type `[/* captures */](size_t i, size_t tid) -> void`, +/// where +/// `i` is the loop's iteration index, `0 <= i < n`; +/// `tid` is the index of the thread, `0 <= tid < t`. +/// +/// @param[in] n Number of iterations in the for loop. +/// @param[in] func Callable object that defines iteration's body. template -inline void static_threader_for(size_t n, const F & lambda) +inline void static_threader_for(size_t n, const F & func) { - const void * a = static_cast(&lambda); + const void * a = static_cast(&func); _daal_static_threader_for(n, a, static_threader_func); } +/// Pass a function to be executed in a for loop to the threading layer. +/// The maximal number of iterations in the loop is `2^31 - 1 INT32_MAX`. +/// The default scheduling of the threading layer is used to assign +/// the iterations of the loop to threads. +/// +/// @tparam F Callable object of type `[/* captures */](int beginRange, int endRange) -> void` +/// where +/// `beginRange` is the starting index of the loop iterations block to be +/// processed by a thread, `0 <= beginRange < n`; +/// `endRange` is the index after the end of the loop's iterations block to be +/// processed by a thread, `beginRange < endRange <= n`; +/// +/// @param[in] n Number of iterations in the for loop. +/// @param[in] reserved Parameter reserved for the future. Currently unused. +/// @param[in] func Callable object that processes the block of loop's iterations +/// `[beginRange, endRange)`. template -inline void threader_for_blocked(int n, int threads_request, const F & lambda) +inline void threader_for_blocked(int n, int reserved, const F & func) { - const void * a = static_cast(&lambda); + const void * a = static_cast(&func); - _daal_threader_for_blocked(n, threads_request, a, threader_func_b); + _daal_threader_for_blocked(n, reserved, a, threader_func_b); } template -inline void threader_for_optional(int n, int threads_request, const F & lambda) +inline void threader_for_optional(int n, int threads_request, const F & func) { - const void * a = static_cast(&lambda); + const void * a = static_cast(&func); _daal_threader_for_optional(n, threads_request, a, threader_func); } template -inline void threader_for_break(int n, int threads_request, const F & lambda) +inline void threader_for_break(int n, int threads_request, const F & func) { - const void * a = static_cast(&lambda); + const void * a = static_cast(&func); _daal_threader_for_break(n, threads_request, a, threader_func_break); } -template +template inline void * tls_func(const void * a) { - const lambdaType & lambda = *static_cast(a); - return lambda(); + const callableType & func = *static_cast(a); + return func(); } -template +template inline void tls_reduce_func(void * v, const void * a) { - const lambdaType & lambda = *static_cast(a); - lambda((F)v); + const callableType & func = *static_cast(a); + func((F)v); } struct tlsBase @@ -313,32 +402,49 @@ class tls_deleter : public tlsBase virtual void del(void * a) = 0; }; -template +template class tls_deleter_ : public tls_deleter { public: virtual ~tls_deleter_() {} - virtual void del(void * a) { delete static_cast(a); } + virtual void del(void * a) { delete static_cast(a); } }; +/// Thread-local storage (TLS). +/// Can change its local variable after a nested parallel constructs. +/// @note Thread-local storage in nested parallel regions is, in general, not thread local. +/// The use of nested parallelism should be avoided if possible, otherwise extra care +/// must be taken with thread-local values. +/// +/// @tparam F Type of the data located in the storage template class tls : public tlsBase { public: - template - explicit tls(const lambdaType & lambda) + /// Initialize thread-local storage + /// + /// @tparam callableType Callable object of type `[/* captures */]() -> F` + /// + /// @param func Callable object that initializes a thread-local storage + template + explicit tls(const callableType & func) { - lambdaType * locall = new lambdaType(lambda); - d = new tls_deleter_(); + callableType * localfunc = new callableType(func); + d = new tls_deleter_(); - //const void* ac = static_cast(&lambda); - const void * ac = static_cast(locall); + //const void* ac = static_cast(&func); + const void * ac = static_cast(localfunc); void * a = const_cast(ac); voidLambda = a; - tlsPtr = _daal_get_tls_ptr(a, tls_func); + tlsPtr = _daal_get_tls_ptr(a, tls_func); } + /// Destroys the memory associated with a thread-local storage + /// + /// @note TLS does not release the memory allocated by a callable object + /// provided to the constructor. + /// Developers are responsible for deletion of that memory. virtual ~tls() { d->del(voidLambda); @@ -346,26 +452,43 @@ class tls : public tlsBase _daal_del_tls_ptr(tlsPtr); } + /// Access a local data of a thread by value + /// + /// @return When first invoked by a thread, a callable object provided to the constructor is + /// called to initialize the local data of the thread and return it. + /// All the following invocations just return the same thread-local data. F local() { void * pf = _daal_get_tls_local(tlsPtr); return (static_cast(pf)); } - template - void reduce(const lambdaType & lambda) + /// Sequential reduction. + /// + /// @tparam callableType Callable object of type `[/* captures */](F) -> void` + /// + /// @param func Callable object that is applied to each element of thread-local + /// storage sequentially. + template + void reduce(const callableType & func) { - const void * ac = static_cast(&lambda); + const void * ac = static_cast(&func); void * a = const_cast(ac); - _daal_reduce_tls(tlsPtr, a, tls_reduce_func); + _daal_reduce_tls(tlsPtr, a, tls_reduce_func); } - template - void parallel_reduce(const lambdaType & lambda) + /// Parallel reduction. + /// + /// @tparam callableType Callable object of type `[/* captures */](F) -> void` + /// + /// @param func Callable object that is applied to each element of thread-local + /// storage in parallel. + template + void parallel_reduce(const callableType & func) { - const void * ac = static_cast(&lambda); + const void * ac = static_cast(&func); void * a = const_cast(ac); - _daal_parallel_reduce_tls(tlsPtr, a, tls_reduce_func); + _daal_parallel_reduce_tls(tlsPtr, a, tls_reduce_func); } private: @@ -374,11 +497,11 @@ class tls : public tlsBase tls_deleter * d; }; -template +template inline void * creater_func(const void * a) { - const lambdaType & lambda = *static_cast(a); - return lambda(); + const callableType & func = *static_cast(a); + return func(); } class static_tls_deleter @@ -388,20 +511,28 @@ class static_tls_deleter virtual void del(void * a) = 0; }; -template +template class static_tls_deleter_ : public static_tls_deleter { public: virtual ~static_tls_deleter_() {} - virtual void del(void * a) { delete static_cast(a); } + virtual void del(void * a) { delete static_cast(a); } }; +/// Thread-local storage (TLS) for the case of static parallel work scheduling. +/// +/// @tparam F Type of the data located in the storage template class static_tls { public: - template - explicit static_tls(const lambdaType & lambda) + /// Initialize thread-local storage. + /// + /// @tparam callableType Callable object of type `[/* captures */]() -> F` + /// + /// @param func Callable object that initializes a thread-local storage + template + explicit static_tls(const callableType & func) { _nThreads = threader_get_max_threads_number(); @@ -417,8 +548,8 @@ class static_tls _storage[i] = nullptr; } - lambdaType * locall = new lambdaType(lambda); - _deleter = new static_tls_deleter_(); + callableType * locall = new callableType(func); + _deleter = new static_tls_deleter_(); if (!locall || !_deleter) { return; @@ -428,9 +559,14 @@ class static_tls void * a = const_cast(ac); _creater = a; - _creater_func = creater_func; + _creater_func = creater_func; } + /// Destroys the memory associated with a thread-local storage. + /// + /// @note Static TLS does not release the memory allocated by a callable object + /// provided to the constructor. + /// Developers are responsible for deletion of that memory. virtual ~static_tls() { if (_deleter) @@ -441,9 +577,16 @@ class static_tls delete[] _storage; } + /// Access a local data of a specified thread by value. + /// + /// @param tid Index of the thread. + /// + /// @return When first invoked by a thread, a callable object provided to the constructor is + /// called to initialize the local data of the thread and return it. + /// All the following invocations just return the same thread-local data. F local(size_t tid) { - if (_storage) + if (_storage && tid < _nThreads) { if (!_storage[tid]) { @@ -458,18 +601,27 @@ class static_tls } } - template - void reduce(const lambdaType & lambda) + /// Sequential reduction. + /// + /// @tparam callableType Callable object of type `[/* captures */](F) -> void` + /// + /// @param func Callable object that is applied to each element of thread-local + /// storage sequentially. + template + void reduce(const callableType & func) { if (_storage) { for (size_t i = 0; i < _nThreads; ++i) { - if (_storage[i]) lambda(_storage[i]); + if (_storage[i]) func(_storage[i]); } } } + /// Full number of threads. + /// + /// @return Total number of threads available to oneDAL. size_t nthreads() const { return _nThreads; } private: @@ -480,25 +632,43 @@ class static_tls static_tls_deleter * _deleter = nullptr; }; +/// Local storage (LS) for the data of a thread. +/// Does not change its local variable after nested parallel constructs, +/// but can have performance penalties compared to thread-local storage type `daal::tls`. +/// Can be safely used in case of nested parallel regions. +/// +/// @tparam F Type of the data located in the storage template class ls : public tlsBase { public: - template - explicit ls(const lambdaType & lambda, const bool isTls = false) + /// Initialize local storage. + /// + /// @tparam callableType Callable object of type `[/* captures */]() -> F` + /// + /// @param func Callable object that initializes local storage + /// @param isTls if `true`, then local storage is a thread-local storage (`daal::tls`) + /// and might have problems in case of nested parallel regions. + template + explicit ls(const callableType & func, const bool isTls = false) { - _isTls = isTls; - lambdaType * locall = new lambdaType(lambda); - d = new tls_deleter_(); + _isTls = isTls; + callableType * localfunc = new callableType(func); + d = new tls_deleter_(); - //const void* ac = static_cast(&lambda); - const void * ac = static_cast(locall); + //const void* ac = static_cast(&func); + const void * ac = static_cast(localfunc); void * a = const_cast(ac); voidLambda = a; - lsPtr = _isTls ? _daal_get_tls_ptr(a, tls_func) : _daal_get_ls_ptr(a, tls_func); + lsPtr = _isTls ? _daal_get_tls_ptr(a, tls_func) : _daal_get_ls_ptr(a, tls_func); } + /// Destroys the memory associated with local storage. + /// + /// @note `ls` does not release the memory allocated by a callable object + /// provided to the constructor. + /// Developers are responsible for deletion of that memory. virtual ~ls() { d->del(voidLambda); @@ -506,6 +676,11 @@ class ls : public tlsBase _isTls ? _daal_del_tls_ptr(lsPtr) : _daal_del_ls_ptr(lsPtr); } + /// Access the local data of a thread by value. + /// + /// @return When first invoked by a thread, a callable object provided to the constructor is + /// called to initialize the local data of the thread and return it. + /// All the following invocations just return the same thread-local data. F local() { void * pf = _isTls ? _daal_get_tls_local(lsPtr) : _daal_get_ls_local(lsPtr); @@ -517,12 +692,18 @@ class ls : public tlsBase if (!_isTls) _daal_release_ls_local(lsPtr, p); } - template - void reduce(const lambdaType & lambda) + /// Sequential reduction. + /// + /// @tparam callableType Callable object of type `[/* captures */](F) -> void` + /// + /// @param func Callable object that is applied to each element of thread-local + /// storage sequentially. + template + void reduce(const callableType & func) { - const void * ac = static_cast(&lambda); + const void * ac = static_cast(&func); void * a = const_cast(ac); - _isTls ? _daal_reduce_tls(lsPtr, a, tls_reduce_func) : _daal_reduce_ls(lsPtr, a, tls_reduce_func); + _isTls ? _daal_reduce_tls(lsPtr, a, tls_reduce_func) : _daal_reduce_ls(lsPtr, a, tls_reduce_func); } private: diff --git a/docs/source/contribution/threading.rst b/docs/source/contribution/threading.rst new file mode 100644 index 00000000000..cd1acd84e95 --- /dev/null +++ b/docs/source/contribution/threading.rst @@ -0,0 +1,166 @@ +.. ****************************************************************************** +.. * Copyright contributors to the oneDAL project +.. * +.. * Licensed under the Apache License, Version 2.0 (the "License"); +.. * you may not use this file except in compliance with the License. +.. * You may obtain a copy of the License at +.. * +.. * http://www.apache.org/licenses/LICENSE-2.0 +.. * +.. * Unless required by applicable law or agreed to in writing, software +.. * distributed under the License is distributed on an "AS IS" BASIS, +.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. * See the License for the specific language governing permissions and +.. * limitations under the License. +.. *******************************************************************************/ + +.. highlight:: cpp + +Threading Layer +^^^^^^^^^^^^^^^ + +oneDAL uses Intel\ |reg|\ oneAPI Threading Building Blocks (Intel\ |reg|\ oneTBB) to do parallel +computations on CPU. + +But oneTBB is not used in the code of oneDAL algorithms directly. The algorithms rather +use custom primitives that either wrap oneTBB functionality or are in-house developed. +Those primitives form oneDAL's threading layer. + +This is done in order not to be dependent on possible oneTBB API changes and even +on the particular threading technology like oneTBB, C++11 standard threads, etc. + +The API of the layer is defined in +`threading.h `_. +Please be aware that the threading API is not a part of oneDAL product API. +This is the product internal API that aimed to be used only by oneDAL developers, and can be changed at any time +without any prior notification. + +This chapter describes common parallel patterns and primitives of the threading layer. + +threader_for +************ + +Consider a case where you need to compute an elementwise sum of two arrays and store the results +into another array. +Here is a variant of sequential implementation: + +.. include:: ../includes/threading/sum-sequential.rst + +There are several options available in the threading layer of oneDAL to let the iterations of this code +run in parallel. +One of the options is to use ``daal::threader_for`` as shown here: + +.. include:: ../includes/threading/sum-parallel.rst + +The iteration space here goes from ``0`` to ``n-1``. +The last argument is a function object that performs a single iteration of the loop, given loop index ``i``. + +Blocking +-------- + +To have more control over the parallel execution and to increase +`cache locality `_ oneDAL usually splits +the data into blocks and then processes those blocks in parallel. + +This code shows how a typical parallel loop in oneDAL looks like: + +.. include:: ../includes/threading/sum-parallel-by-blocks.rst + +Thread-local Storage (TLS) +************************** + +Consider you need to compute a dot product of two arrays. +Here is a variant of sequential implementation: + +.. include:: ../includes/threading/dot-sequential.rst + +Parallel computations can be performed in two steps: + + 1. Compute partial dot product in each thread. + 2. Perform a reduction: Add the partial results from all threads to compute the final dot product. + +``daal::tls`` provides a local storage where each thread can accumulate its local results. +The following code allocates memory that would store partial dot products for each thread: + +.. include:: ../includes/threading/dot-parallel-init-tls.rst + +``SafeStatus`` in this code denotes a thread-safe counterpart of the ``Status`` class. +``SafeStatus`` allows to collect errors from all threads and report them to the user using the +``detach()`` method. An example will be shown later in the documentation. + +Checking the status right after the initialization code won't show the allocation errors, +because oneTBB uses lazy evaluation and the lambda function passed to the constructor of the TLS +is evaluated on first use of the thread-local storage (TLS). + +There are several options available in the threading layer of oneDAL to compute the partial +dot product results at each thread. +One of the options is to use the already mentioned ``daal::threader_for`` and blocking approach +as shown here: + +.. include:: ../includes/threading/dot-parallel-partial-compute.rst + +To compute the final result it is required to reduce each thread's partial results +as shown here: + +.. include:: ../includes/threading/dot-parallel-reduction.rst + +Local memory of the threads should be released when it is no longer needed. + +The complete parallel version of dot product computations would look like: + +.. include:: ../includes/threading/dot-parallel.rst + +Static Work Scheduling +********************** + +By default, oneTBB uses +`dynamic work scheduling `_ +and work stealing. +It means that two different runs of the same parallel loop can produce different +mappings of the loop's iteration space to the available threads. +This strategy is beneficial when it is difficult to estimate the amount of work performed +by each iteration. + +In the cases when it is known that the iterations perform an equal amount of work, it +is more performant to use predefined mapping of the loop's iterations to threads. +This is what static work scheduling does. + +``daal::static_threader_for`` and ``daal::static_tls`` allow implementation of static +work scheduling within oneDAL. + +Here is a variant of parallel dot product computation with static scheduling: + +.. include:: ../includes/threading/dot-static-parallel.rst + +Nested Parallelism +****************** + +oneDAL supports nested parallel loops. +It is important to know that: + + "when a parallel construct calls another parallel construct, a thread can obtain a task + from the outer-level construct while waiting for completion of the inner-level one." + + -- `oneTBB documentation `_ + +In practice, this means that a thread-local variable might unexpectedly +change its value after a nested parallel construct: + +.. include:: ../includes/threading/nested-parallel.rst + +In some scenarios this can lead to deadlocks, segmentation faults and other issues. + +oneTBB provides ways to isolate execution of a parallel construct, for its tasks +to not interfere with other simultaneously running tasks. + +Those options are preferred when the parallel loops are initially written as nested. +But in oneDAL there are cases when one parallel algorithm, the outer one, +calls another parallel algorithm, the inner one, within a parallel region. + +The inner algorithm in this case can also be called solely, without additional nesting. +And we do not always want to make it isolated. + +For the cases like that, oneDAL provides ``daal::ls``. Its ``local()`` method always +returns the same value for the same thread, regardless of the nested execution: + +.. include:: ../includes/threading/nested-parallel-ls.rst diff --git a/docs/source/includes/threading/dot-parallel-init-tls.rst b/docs/source/includes/threading/dot-parallel-init-tls.rst new file mode 100644 index 00000000000..8e72646a7ca --- /dev/null +++ b/docs/source/includes/threading/dot-parallel-init-tls.rst @@ -0,0 +1,30 @@ +.. ****************************************************************************** +.. * Copyright contributors to the oneDAL project +.. * +.. * Licensed under the Apache License, Version 2.0 (the "License"); +.. * you may not use this file except in compliance with the License. +.. * You may obtain a copy of the License at +.. * +.. * http://www.apache.org/licenses/LICENSE-2.0 +.. * +.. * Unless required by applicable law or agreed to in writing, software +.. * distributed under the License is distributed on an "AS IS" BASIS, +.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. * See the License for the specific language governing permissions and +.. * limitations under the License. +.. *******************************************************************************/ + +:: + + #include "src/algorithms/service_error_handling.h" + #include "src/threading/threading.h" + + SafeStatus safeStat; + daal::tls dotProductTLS([=, &safeStat]() { + float * dotProductPtr = new (std::nothrow) float; + if (!dotProductPtr) { + safeStat.add(services::ErrorMemoryAllocationFailed); + } + dotProductPtr[0] = 0.0f; + return dotProductPtr; + }); diff --git a/docs/source/includes/threading/dot-parallel-partial-compute.rst b/docs/source/includes/threading/dot-parallel-partial-compute.rst new file mode 100644 index 00000000000..0521e4bb2cf --- /dev/null +++ b/docs/source/includes/threading/dot-parallel-partial-compute.rst @@ -0,0 +1,40 @@ +.. ****************************************************************************** +.. * Copyright contributors to the oneDAL project +.. * +.. * Licensed under the Apache License, Version 2.0 (the "License"); +.. * you may not use this file except in compliance with the License. +.. * You may obtain a copy of the License at +.. * +.. * http://www.apache.org/licenses/LICENSE-2.0 +.. * +.. * Unless required by applicable law or agreed to in writing, software +.. * distributed under the License is distributed on an "AS IS" BASIS, +.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. * See the License for the specific language governing permissions and +.. * limitations under the License. +.. *******************************************************************************/ + +:: + + constexpr size_t blockSize = 1024; + const size_t nBlocks = (n + blockSize - 1) / blockSize; + + daal::threader_for(nBlocks, nBlocks, [&](size_t iBlock) { + const size_t iStart = iBlock * blockSize; + const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n; + + // Compute partial result for this block + float partialDotProduct = 0.0f; + for (size_t i = iStart; i < iEnd; ++i) { + partialDotProduct += a[i] * b[i]; + } + + // Update thread-local result + float * localDotProduct = dotProductTLS.local(); + if (!localDotProduct) { + // Allocation error happened earlier + return; + } + localDotProduct[0] += partialDotProduct; + }); + DAAL_CHECK_SAFE_STATUS(); // if (!safeStat) return safeStat.detach(); diff --git a/docs/source/includes/threading/dot-parallel-reduction.rst b/docs/source/includes/threading/dot-parallel-reduction.rst new file mode 100644 index 00000000000..e86ca030246 --- /dev/null +++ b/docs/source/includes/threading/dot-parallel-reduction.rst @@ -0,0 +1,25 @@ +.. ****************************************************************************** +.. * Copyright contributors to the oneDAL project +.. * +.. * Licensed under the Apache License, Version 2.0 (the "License"); +.. * you may not use this file except in compliance with the License. +.. * You may obtain a copy of the License at +.. * +.. * http://www.apache.org/licenses/LICENSE-2.0 +.. * +.. * Unless required by applicable law or agreed to in writing, software +.. * distributed under the License is distributed on an "AS IS" BASIS, +.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. * See the License for the specific language governing permissions and +.. * limitations under the License. +.. *******************************************************************************/ + +:: + + float dotProduct = 0.0f; + tls.reduce([&](float * localDotProduct) { + if (localDotProduct) { + dotProduct += localDotProduct[0]; + delete localDotProduct; + } + }); diff --git a/docs/source/includes/threading/dot-parallel.rst b/docs/source/includes/threading/dot-parallel.rst new file mode 100644 index 00000000000..d0230715a01 --- /dev/null +++ b/docs/source/includes/threading/dot-parallel.rst @@ -0,0 +1,63 @@ +.. ****************************************************************************** +.. * Copyright contributors to the oneDAL project +.. * +.. * Licensed under the Apache License, Version 2.0 (the "License"); +.. * you may not use this file except in compliance with the License. +.. * You may obtain a copy of the License at +.. * +.. * http://www.apache.org/licenses/LICENSE-2.0 +.. * +.. * Unless required by applicable law or agreed to in writing, software +.. * distributed under the License is distributed on an "AS IS" BASIS, +.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. * See the License for the specific language governing permissions and +.. * limitations under the License. +.. *******************************************************************************/ + +:: + + #include "src/algorithms/service_error_handling.h" + #include "src/threading/threading.h" + + services::Status dot(const size_t n, const float* a, const float* b, float &dotProduct) { + SafeStatus safeStat; + daal::tls dotProductTLS([=, &safeStat]() { + float * dotProductPtr = new (std::nothrow) float; + if (!dotProductPtr) { + safeStat.add(services::ErrorMemoryAllocationFailed); + } + dotProductPtr[0] = 0.0f; + return dotProductPtr; + }); + + constexpr size_t blockSize = 1024; + const size_t nBlocks = (n + blockSize - 1) / blockSize; + + daal::threader_for(nBlocks, nBlocks, [&](size_t iBlock) { + const size_t iStart = iBlock * blockSize; + const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n; + + // Compute partial result for this block + float partialDotProduct = 0.0f; + for (size_t i = iStart; i < iEnd; ++i) { + partialDotProduct += a[i] * b[i]; + } + + // Update thread-local result + float * localDotProduct = dotProductTLS.local(); + if (!localDotProduct) { + // Allocation error happened earlier + return; + } + localDotProduct[0] += partialDotProduct; + }); + DAAL_CHECK_SAFE_STATUS(); + + tls.reduce([&](float * localDotProduct) { + if (localDotProduct) { + dotProduct += localDotProduct[0]; + delete localDotProduct; + } + }); + return services::Status(); + } diff --git a/docs/source/includes/threading/dot-sequential.rst b/docs/source/includes/threading/dot-sequential.rst new file mode 100644 index 00000000000..93300053c32 --- /dev/null +++ b/docs/source/includes/threading/dot-sequential.rst @@ -0,0 +1,25 @@ +.. ****************************************************************************** +.. * Copyright contributors to the oneDAL project +.. * +.. * Licensed under the Apache License, Version 2.0 (the "License"); +.. * you may not use this file except in compliance with the License. +.. * You may obtain a copy of the License at +.. * +.. * http://www.apache.org/licenses/LICENSE-2.0 +.. * +.. * Unless required by applicable law or agreed to in writing, software +.. * distributed under the License is distributed on an "AS IS" BASIS, +.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. * See the License for the specific language governing permissions and +.. * limitations under the License. +.. *******************************************************************************/ + +:: + + float dot(const size_t n, const float* a, const float* b) { + float result = 0.0f; + for (size_t i = 0; i < n; ++i) { + result += a[i] * b[i]; + } + return result; + } diff --git a/docs/source/includes/threading/dot-static-parallel.rst b/docs/source/includes/threading/dot-static-parallel.rst new file mode 100644 index 00000000000..dfec18b6d21 --- /dev/null +++ b/docs/source/includes/threading/dot-static-parallel.rst @@ -0,0 +1,64 @@ +.. ****************************************************************************** +.. * Copyright contributors to the oneDAL project +.. * +.. * Licensed under the Apache License, Version 2.0 (the "License"); +.. * you may not use this file except in compliance with the License. +.. * You may obtain a copy of the License at +.. * +.. * http://www.apache.org/licenses/LICENSE-2.0 +.. * +.. * Unless required by applicable law or agreed to in writing, software +.. * distributed under the License is distributed on an "AS IS" BASIS, +.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. * See the License for the specific language governing permissions and +.. * limitations under the License. +.. *******************************************************************************/ + +:: + + #include "src/algorithms/service_error_handling.h" + #include "src/threading/threading.h" + + services::Status dot(const size_t n, const float* a, const float* b, float &dotProduct) { + SafeStatus safeStat; + daal::static_tls dotProductTLS([=, &safeStat]() { + float * dotProductPtr = new (std::nothrow) float; + if (!dotProductPtr) { + safeStat.add(services::ErrorMemoryAllocationFailed); + } + dotProductPtr[0] = 0.0f; + return dotProductPtr; + }); + + constexpr size_t blockSize = 1024; + const size_t nBlocks = (n + blockSize - 1) / blockSize; + + daal::static_threader_for(nBlocks, [&](size_t iBlock, size_t threadId) { + const size_t iStart = iBlock * blockSize; + const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n; + + // Compute partial result for this block + float partialDotProduct = 0.0f; + for (size_t i = iStart; i < iEnd; ++i) { + partialDotProduct += a[i] * b[i]; + } + + // Update thread-local result + // Note that exact thread index is used to get access to the thread's data + float * localDotProduct = dotProductTLS.local(threadId); + if (!localDotProduct) { + // Allocation error happened earlier + return; + } + localDotProduct[0] += partialDotProduct; + }); + DAAL_CHECK_SAFE_STATUS(); + + tls.reduce([&](float * localDotProduct) { + if (localDotProduct) { + dotProduct += localDotProduct[0]; + delete localDotProduct; + } + }); + return services::Status(); + } diff --git a/docs/source/includes/threading/nested-parallel-ls.rst b/docs/source/includes/threading/nested-parallel-ls.rst new file mode 100644 index 00000000000..1ceb0414c0b --- /dev/null +++ b/docs/source/includes/threading/nested-parallel-ls.rst @@ -0,0 +1,53 @@ +.. ****************************************************************************** +.. * Copyright contributors to the oneDAL project +.. * +.. * Licensed under the Apache License, Version 2.0 (the "License"); +.. * you may not use this file except in compliance with the License. +.. * You may obtain a copy of the License at +.. * +.. * http://www.apache.org/licenses/LICENSE-2.0 +.. * +.. * Unless required by applicable law or agreed to in writing, software +.. * distributed under the License is distributed on an "AS IS" BASIS, +.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. * See the License for the specific language governing permissions and +.. * limitations under the License. +.. *******************************************************************************/ + +:: + + #include "src/algorithms/service_error_handling.h" + #include "src/threading/threading.h" + + SafeStatus safeStat; + daal::ls ls([=, &safeStat]() { + float * localBuffer = new (std::nothrow) float[localSize]; + if (!localBuffer) { + safeStat.add(services::ErrorMemoryAllocationFailed); + } + return localBuffer; + }) + daal::threader_for(n, n, [&](size_t i) { + float * localBuffer = ls.local(); + if (!localBuffer) { + // Allocation error happened earlier + return; + } + + // Initialize localBuffer with some data here + + daal::threader_for(m, m, [&](size_t j) { + /* Some work */ + }); + + // The thread specific value always stays unchanged after the nested execution. + assert(localBuffer == ls.local()); // Assertion never fails! + }); + DAAL_CHECK_SAFE_STATUS() + + ls.reduce([&](float * localBuffer) { + if (localBuffer) { + /* Do reduction */ + delete localBuffer; + } + }); diff --git a/docs/source/includes/threading/nested-parallel.rst b/docs/source/includes/threading/nested-parallel.rst new file mode 100644 index 00000000000..fc41f70398b --- /dev/null +++ b/docs/source/includes/threading/nested-parallel.rst @@ -0,0 +1,54 @@ +.. ****************************************************************************** +.. * Copyright contributors to the oneDAL project +.. * +.. * Licensed under the Apache License, Version 2.0 (the "License"); +.. * you may not use this file except in compliance with the License. +.. * You may obtain a copy of the License at +.. * +.. * http://www.apache.org/licenses/LICENSE-2.0 +.. * +.. * Unless required by applicable law or agreed to in writing, software +.. * distributed under the License is distributed on an "AS IS" BASIS, +.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. * See the License for the specific language governing permissions and +.. * limitations under the License. +.. *******************************************************************************/ + +:: + + #include "src/algorithms/service_error_handling.h" + #include "src/threading/threading.h" + + SafeStatus safeStat; + daal::tls tls([=, &safeStat]() { + float * localBuffer = new (std::nothrow) float[localSize]; + if (!localBuffer) { + safeStat.add(services::ErrorMemoryAllocationFailed); + } + return localBuffer; + }) + daal::threader_for(n, n, [&](size_t i) { + float * localBuffer = tls.local(); + if (!localBuffer) { + // Allocation error happened earlier + return; + } + + // Initialize localBuffer with some data here + + daal::threader_for(m, m, [&](size_t j) { + /* Some work */ + }); + + // While executing the above parallel_for, the thread might have run iterations + // of the outer parallel_for, and so might have changed the thread specific value. + assert(localBuffer == tls.local()); // The assertion may fail! + }); + DAAL_CHECK_SAFE_STATUS() + + tls.reduce([&](float * localBuffer) { + if (localBuffer) { + /* Do reduction */ + delete localBuffer; + } + }); diff --git a/docs/source/includes/threading/sum-parallel-by-blocks.rst b/docs/source/includes/threading/sum-parallel-by-blocks.rst new file mode 100644 index 00000000000..0e05cae1008 --- /dev/null +++ b/docs/source/includes/threading/sum-parallel-by-blocks.rst @@ -0,0 +1,32 @@ +.. ****************************************************************************** +.. * Copyright contributors to the oneDAL project +.. * +.. * Licensed under the Apache License, Version 2.0 (the "License"); +.. * you may not use this file except in compliance with the License. +.. * You may obtain a copy of the License at +.. * +.. * http://www.apache.org/licenses/LICENSE-2.0 +.. * +.. * Unless required by applicable law or agreed to in writing, software +.. * distributed under the License is distributed on an "AS IS" BASIS, +.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. * See the License for the specific language governing permissions and +.. * limitations under the License. +.. *******************************************************************************/ + +:: + + #include "src/threading/threading.h" + + void sum(const size_t n, const float* a, const float* b, float* c) { + constexpr size_t blockSize = 256; + const size_t nBlocks = (n + blockSize - 1) / blockSize; + + daal::threader_for(nBlocks, nBlocks, [&](size_t iBlock) { + const size_t iStart = iBlock * blockSize; + const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n; + for (size_t i = iStart; i < iEnd; ++i) { + c[i] = a[i] + b[i]; + } + }); + } diff --git a/docs/source/includes/threading/sum-parallel.rst b/docs/source/includes/threading/sum-parallel.rst new file mode 100644 index 00000000000..ba4ee4ae591 --- /dev/null +++ b/docs/source/includes/threading/sum-parallel.rst @@ -0,0 +1,26 @@ +.. ****************************************************************************** +.. * Copyright contributors to the oneDAL project +.. * +.. * Licensed under the Apache License, Version 2.0 (the "License"); +.. * you may not use this file except in compliance with the License. +.. * You may obtain a copy of the License at +.. * +.. * http://www.apache.org/licenses/LICENSE-2.0 +.. * +.. * Unless required by applicable law or agreed to in writing, software +.. * distributed under the License is distributed on an "AS IS" BASIS, +.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. * See the License for the specific language governing permissions and +.. * limitations under the License. +.. *******************************************************************************/ + +:: + + #include "src/threading/threading.h" + + void sum(const size_t n, const float* a, const float* b, float* c) { + daal::threader_for(n, n, [&](size_t i) { + c[i] = a[i] + b[i]; + }); + } + diff --git a/docs/source/includes/threading/sum-sequential.rst b/docs/source/includes/threading/sum-sequential.rst new file mode 100644 index 00000000000..b91c7c2d836 --- /dev/null +++ b/docs/source/includes/threading/sum-sequential.rst @@ -0,0 +1,23 @@ +.. ****************************************************************************** +.. * Copyright contributors to the oneDAL project +.. * +.. * Licensed under the Apache License, Version 2.0 (the "License"); +.. * you may not use this file except in compliance with the License. +.. * You may obtain a copy of the License at +.. * +.. * http://www.apache.org/licenses/LICENSE-2.0 +.. * +.. * Unless required by applicable law or agreed to in writing, software +.. * distributed under the License is distributed on an "AS IS" BASIS, +.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. * See the License for the specific language governing permissions and +.. * limitations under the License. +.. *******************************************************************************/ + +:: + + void sum(const size_t n, const float* a, const float* b, float* c) { + for (size_t i = 0; i < n; ++i) { + c[i] = a[i] + b[i]; + } + } diff --git a/docs/source/index-toc.rst b/docs/source/index-toc.rst index 2c0ff1e6011..6ce2d33a458 100644 --- a/docs/source/index-toc.rst +++ b/docs/source/index-toc.rst @@ -22,7 +22,7 @@ data-analytics-pipeline.rst system-requirements.rst - + .. toctree:: :caption: Get Started :maxdepth: 2 @@ -52,3 +52,10 @@ :caption: Contributing Guide contribution/coding_guide.rst + +.. toctree:: + :maxdepth: 1 + :hidden: + :caption: Custom Components + + contribution/threading.rst From 716a02a449c5d2824dd18e7b41037639edd5a3fe Mon Sep 17 00:00:00 2001 From: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Date: Mon, 23 Sep 2024 06:23:34 -0700 Subject: [PATCH 8/9] Update docs codeowners (#2909) --- .github/CODEOWNERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d6a47717935..789ff63d6e2 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,6 +1,6 @@ # Documentation owners and reviewers -/docs/ @Vika-F @maria-Petrova @Alexsandruss @bdmoore1 -*.md @Vika-F @maria-Petrova @Alexsandruss @bdmoore1 +/docs/ @Vika-F @maria-Petrova @Alexsandruss @emmwalsh +*.md @Vika-F @maria-Petrova @Alexsandruss @emmwalsh # TTP files third-party* @maria-Petrova From a4e440bdb4402730c04ab33af1ff34257c8e06a7 Mon Sep 17 00:00:00 2001 From: Aleksei Khomenko Date: Mon, 23 Sep 2024 15:25:43 +0200 Subject: [PATCH 9/9] chore(ci): add `read` permissions for `label-enforcement` workflow (#2901) --- .github/workflows/label-enforcement.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/label-enforcement.yml b/.github/workflows/label-enforcement.yml index 5ad1f6ff8d9..c862f64b798 100644 --- a/.github/workflows/label-enforcement.yml +++ b/.github/workflows/label-enforcement.yml @@ -3,6 +3,9 @@ on: pull_request: branches: [ "main" ] +permissions: + contents: read + jobs: label_checker: name: Please include labels on your pull request