From 93465cf9e34327c526470e807831dca600d26603 Mon Sep 17 00:00:00 2001
From: Aleksei Khomenko <aleksei.khomenko@intel.com>
Date: Wed, 11 Sep 2024 14:00:22 +0200
Subject: [PATCH 1/9] fix(samples): replace `CL/sycl.hpp` with `sycl/sycl.hpp`
 (#2893)

---
 samples/oneapi/dpc/ccl/sources/knn_bf_distr_ccl.cpp | 2 +-
 samples/oneapi/dpc/mpi/sources/knn_bf_distr_mpi.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/samples/oneapi/dpc/ccl/sources/knn_bf_distr_ccl.cpp b/samples/oneapi/dpc/ccl/sources/knn_bf_distr_ccl.cpp
index 19a9cd06f67..8183638097b 100644
--- a/samples/oneapi/dpc/ccl/sources/knn_bf_distr_ccl.cpp
+++ b/samples/oneapi/dpc/ccl/sources/knn_bf_distr_ccl.cpp
@@ -14,7 +14,7 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <iomanip>
 #include <iostream>
 
diff --git a/samples/oneapi/dpc/mpi/sources/knn_bf_distr_mpi.cpp b/samples/oneapi/dpc/mpi/sources/knn_bf_distr_mpi.cpp
index 19ccf8df5ee..1234a6d30fe 100644
--- a/samples/oneapi/dpc/mpi/sources/knn_bf_distr_mpi.cpp
+++ b/samples/oneapi/dpc/mpi/sources/knn_bf_distr_mpi.cpp
@@ -14,7 +14,7 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <iomanip>
 #include <iostream>
 

From 3416bc90f75ac0d1236f6055deeed94cee8f9a27 Mon Sep 17 00:00:00 2001
From: ethanglaser <42726565+ethanglaser@users.noreply.github.com>
Date: Thu, 12 Sep 2024 07:27:58 -0700
Subject: [PATCH 2/9] fix: resolve omp simd error in mdd (#2894)

* fix: resolve omp simd error in mdd

* updating makefile instead

* simplify slightly
---
 dev/make/compiler_definitions/icx.mkl.32e.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/make/compiler_definitions/icx.mkl.32e.mk b/dev/make/compiler_definitions/icx.mkl.32e.mk
index af18b3fe0ec..e61e6bc39bc 100644
--- a/dev/make/compiler_definitions/icx.mkl.32e.mk
+++ b/dev/make/compiler_definitions/icx.mkl.32e.mk
@@ -33,7 +33,7 @@ CORE.SERV.COMPILER.icx = generic
 COMPILER.lnx.icx = icx -m64 \
                      -Werror -Wreturn-type -qopenmp-simd
 
-COMPILER.win.icx = icx $(if $(MSVC_RT_is_release),-MD, -MDd) -nologo -WX -Qopenmp-simd -Wno-deprecated-declarations
+COMPILER.win.icx = icx $(if $(MSVC_RT_is_release),-MD -Qopenmp-simd, -MDd) -nologo -WX -Wno-deprecated-declarations
 
 link.dynamic.lnx.icx = icx -m64 -no-intel-lib
 

From c27a8fc57c84549c9f823347265ae7f6d5e45ffc Mon Sep 17 00:00:00 2001
From: Alexander Andreev <alexander.andreev@intel.com>
Date: Mon, 16 Sep 2024 16:49:56 +0100
Subject: [PATCH 3/9] Change default COMPILER to icx in makefile (#2898)

---
 dev/make/function_definitions/32e.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/make/function_definitions/32e.mk b/dev/make/function_definitions/32e.mk
index 41dfbb96fe9..cfee5fc4a11 100644
--- a/dev/make/function_definitions/32e.mk
+++ b/dev/make/function_definitions/32e.mk
@@ -20,7 +20,7 @@ ifeq ($(filter mkl ref,$(BACKEND_CONFIG)),)
 endif
 
 COMPILERs = icc icx gnu clang vc
-COMPILER ?= icc
+COMPILER ?= icx
 CPUs := sse2 sse42 avx2 avx512
 CPUs.files := nrh neh hsw skx
 

From 00dacc9bef307d2695e138412263831c42499eb1 Mon Sep 17 00:00:00 2001
From: olegkkruglov <102592747+olegkkruglov@users.noreply.github.com>
Date: Tue, 17 Sep 2024 05:50:27 -0700
Subject: [PATCH 4/9] Add DEPENDENTLOAD flag to vc linker on Windows (#2896)

---
 dev/make/compiler_definitions/vc.mkl.32e.mk | 2 ++
 examples/cmake/setup_examples.cmake         | 3 +++
 samples/cmake/setup_samples.cmake           | 3 +++
 3 files changed, 8 insertions(+)

diff --git a/dev/make/compiler_definitions/vc.mkl.32e.mk b/dev/make/compiler_definitions/vc.mkl.32e.mk
index de73bd6cb5a..975dc788fb9 100644
--- a/dev/make/compiler_definitions/vc.mkl.32e.mk
+++ b/dev/make/compiler_definitions/vc.mkl.32e.mk
@@ -30,6 +30,8 @@ CORE.SERV.COMPILER.vc = generic
 # Disable C4661 because of false positives
 COMPILER.win.vc = cl $(if $(MSVC_RT_is_release),-MD, -MDd) -nologo -EHsc -wd4661 -WX
 
+link.dynamic.win.vc = /DEPENDENTLOADFLAG:0x2000
+
 p4_OPT.vc   =
 mc3_OPT.vc  =
 avx2_OPT.vc = -arch:AVX2
diff --git a/examples/cmake/setup_examples.cmake b/examples/cmake/setup_examples.cmake
index 3f2058417c1..7d1fb3e5d50 100644
--- a/examples/cmake/setup_examples.cmake
+++ b/examples/cmake/setup_examples.cmake
@@ -123,6 +123,9 @@ function (add_examples examples_paths)
         endif()
         target_compile_options(${example} PRIVATE ${ONEDAL_CUSTOM_COMPILE_OPTIONS})
         target_link_options(${example} PRIVATE ${ONEDAL_CUSTOM_LINK_OPTIONS})
+        if(WIN32 AND "${ONEDAL_LINK}" STREQUAL "dynamic" AND CMAKE_CXX_COMPILER_ID MATCHES "MSVC|IntelLLVM")
+            target_link_options(${example} PRIVATE /DEPENDENTLOADFLAG:0x2000)
+        endif()
         set_target_properties(${example} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/_cmake_results/${CPU_ARCHITECTURE}_${LINK_TYPE}")
     endforeach()
     set_common_compiler_options()
diff --git a/samples/cmake/setup_samples.cmake b/samples/cmake/setup_samples.cmake
index dd7d3cc758b..abf480f02e9 100644
--- a/samples/cmake/setup_samples.cmake
+++ b/samples/cmake/setup_samples.cmake
@@ -108,6 +108,9 @@ function(add_samples samples_paths)
 
         target_compile_options(${sample} PRIVATE ${ONEDAL_CUSTOM_COMPILE_OPTIONS})
         target_link_options(${sample} PRIVATE ${ONEDAL_CUSTOM_LINK_OPTIONS})
+        if(WIN32 AND "${ONEDAL_LINK}" STREQUAL "dynamic" AND CMAKE_CXX_COMPILER_ID MATCHES "MSVC|IntelLLVM")
+            target_link_options(${sample} PRIVATE /DEPENDENTLOADFLAG:0x2000)
+        endif()
         set_target_properties(${sample} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/_cmake_results/${CPU_ARCHITECTURE}_${LINK_TYPE}")
 
         add_custom_target(run_${sample}

From b6b6a65a59b46d6770198abfc11cfa476c1338a2 Mon Sep 17 00:00:00 2001
From: david-cortes-intel <david.cortes@intel.com>
Date: Thu, 19 Sep 2024 13:08:44 +0200
Subject: [PATCH 5/9] increase timeout for failing jobs (#2900)

---
 .ci/pipeline/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/pipeline/ci.yml b/.ci/pipeline/ci.yml
index a0620f66e9d..2f768b8b801 100755
--- a/.ci/pipeline/ci.yml
+++ b/.ci/pipeline/ci.yml
@@ -620,7 +620,7 @@ jobs:
       conda activate CB
       source $(Pipeline.Workspace)/daal/latest/env/vars.sh
       ./sklearnex/conda-recipe/run_test.sh
-    timeoutInMinutes: 15
+    timeoutInMinutes: 20
     displayName: sklearnex test
   - script: |
       source /usr/share/miniconda/etc/profile.d/conda.sh

From 9b315cf8124c700f79bdc3007aac103df3b831bf Mon Sep 17 00:00:00 2001
From: Samir Nasibli <samir.nasibli@intel.com>
Date: Thu, 19 Sep 2024 23:32:38 +0200
Subject: [PATCH 6/9] FIX: `Kmeans` timeouts issue on GPU for sparse inputs
 (#2906)

---
 cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp b/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp
index dba39bb9d01..228e9e01863 100644
--- a/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp
+++ b/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp
@@ -371,7 +371,7 @@ sycl::event handle_empty_clusters(const dal::backend::context_gpu& ctx,
     auto event = queue.submit([&](sycl::handler& cgh) {
         cgh.depends_on(deps);
         cgh.parallel_for(range, [=](auto it) {
-            const auto local_id = it.get_local_id(1);
+            const auto local_id = it.get_local_id()[1];
             for (std::int64_t cluster_id = rank; cluster_id < num_clusters;
                  cluster_id += rank_count) {
                 // no need to handle non-empty clusters

From 488c4006a5a2f29c272c1c48e71b7a3a2ece5a61 Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Fri, 20 Sep 2024 12:55:02 +0200
Subject: [PATCH 7/9] Add a chapter about threading layer into the docs (#2848)

- Added a chapter about internal threading primitives into oneDAL documentation
- Improved commenting in the header specifying those primitives APIs (threading.h)
---
 CONTRIBUTING.md                               |   6 +
 cpp/daal/src/threading/threading.cpp          |   6 +-
 cpp/daal/src/threading/threading.h            | 333 ++++++++++++++----
 docs/source/contribution/threading.rst        | 166 +++++++++
 .../threading/dot-parallel-init-tls.rst       |  30 ++
 .../dot-parallel-partial-compute.rst          |  40 +++
 .../threading/dot-parallel-reduction.rst      |  25 ++
 .../includes/threading/dot-parallel.rst       |  63 ++++
 .../includes/threading/dot-sequential.rst     |  25 ++
 .../threading/dot-static-parallel.rst         |  64 ++++
 .../includes/threading/nested-parallel-ls.rst |  53 +++
 .../includes/threading/nested-parallel.rst    |  54 +++
 .../threading/sum-parallel-by-blocks.rst      |  32 ++
 .../includes/threading/sum-parallel.rst       |  26 ++
 .../includes/threading/sum-sequential.rst     |  23 ++
 docs/source/index-toc.rst                     |   9 +-
 16 files changed, 875 insertions(+), 80 deletions(-)
 create mode 100644 docs/source/contribution/threading.rst
 create mode 100644 docs/source/includes/threading/dot-parallel-init-tls.rst
 create mode 100644 docs/source/includes/threading/dot-parallel-partial-compute.rst
 create mode 100644 docs/source/includes/threading/dot-parallel-reduction.rst
 create mode 100644 docs/source/includes/threading/dot-parallel.rst
 create mode 100644 docs/source/includes/threading/dot-sequential.rst
 create mode 100644 docs/source/includes/threading/dot-static-parallel.rst
 create mode 100644 docs/source/includes/threading/nested-parallel-ls.rst
 create mode 100644 docs/source/includes/threading/nested-parallel.rst
 create mode 100644 docs/source/includes/threading/sum-parallel-by-blocks.rst
 create mode 100644 docs/source/includes/threading/sum-parallel.rst
 create mode 100644 docs/source/includes/threading/sum-sequential.rst

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2644e0f06fe..d3e45c45a9d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -83,6 +83,12 @@ editorconfig-checker
 
 For your convenience we also added [coding guidelines](http://oneapi-src.github.io/oneDAL/contribution/coding_guide.html) with examples and detailed descriptions of the coding style oneDAL follows. We encourage you to consult them when writing your code.
 
+## Custom Components
+
+### Threading Layer
+
+In the source code of the algorithms, oneDAL does not use threading primitives directly. All the threading primitives used within oneDAL form are called the [threading layer](http://oneapi-src.github.io/oneDAL/contribution/threading.html). Contributors should leverage the primitives from the layer to implement parallel algorithms.
+
 ## Documentation Guidelines
 
 oneDAL uses `Doxygen` for inline comments in public header files that are used to build the API reference and  `reStructuredText` for the Developer Guide. See [oneDAL documentation](https://oneapi-src.github.io/oneDAL/) for reference.
diff --git a/cpp/daal/src/threading/threading.cpp b/cpp/daal/src/threading/threading.cpp
index 15c39368238..3b280229ee7 100644
--- a/cpp/daal/src/threading/threading.cpp
+++ b/cpp/daal/src/threading/threading.cpp
@@ -103,7 +103,7 @@ DAAL_EXPORT size_t _setNumberOfThreads(const size_t numThreads, void ** globalCo
     return 1;
 }
 
-DAAL_EXPORT void _daal_threader_for(int n, int threads_request, const void * a, daal::functype func)
+DAAL_EXPORT void _daal_threader_for(int n, int reserved, const void * a, daal::functype func)
 {
     if (daal::threader_env()->getNumberOfThreads() > 1)
     {
@@ -160,7 +160,7 @@ DAAL_EXPORT void _daal_threader_for_blocked_size(size_t n, size_t block, const v
     }
 }
 
-DAAL_EXPORT void _daal_threader_for_simple(int n, int threads_request, const void * a, daal::functype func)
+DAAL_EXPORT void _daal_threader_for_simple(int n, int reserved, const void * a, daal::functype func)
 {
     if (daal::threader_env()->getNumberOfThreads() > 1)
     {
@@ -318,7 +318,7 @@ DAAL_PARALLEL_SORT_IMPL(daal::IdxValType<double>, pair_fp64_uint64)
 
 #undef DAAL_PARALLEL_SORT_IMPL
 
-DAAL_EXPORT void _daal_threader_for_blocked(int n, int threads_request, const void * a, daal::functype2 func)
+DAAL_EXPORT void _daal_threader_for_blocked(int n, int reserved, const void * a, daal::functype2 func)
 {
     if (daal::threader_env()->getNumberOfThreads() > 1)
     {
diff --git a/cpp/daal/src/threading/threading.h b/cpp/daal/src/threading/threading.h
index 0b4a9881b97..ca8661f2203 100644
--- a/cpp/daal/src/threading/threading.h
+++ b/cpp/daal/src/threading/threading.h
@@ -198,107 +198,196 @@ inline size_t setNumberOfThreads(const size_t numThreads, void ** globalControl)
 template <typename F>
 inline void threader_func(int i, const void * a)
 {
-    const F & lambda = *static_cast<const F *>(a);
-    lambda(i);
+    const F & func = *static_cast<const F *>(a);
+    func(i);
 }
 
 template <typename F>
 inline void static_threader_func(size_t i, size_t tid, const void * a)
 {
-    const F & lambda = *static_cast<const F *>(a);
-    lambda(i, tid);
+    const F & func = *static_cast<const F *>(a);
+    func(i, tid);
 }
 
 template <typename F>
 inline void threader_func_b(int i0, int in, const void * a)
 {
-    const F & lambda = *static_cast<const F *>(a);
-    lambda(i0, in);
+    const F & func = *static_cast<const F *>(a);
+    func(i0, in);
 }
 
 template <typename F>
 inline void threader_func_break(int i, bool & needBreak, const void * a)
 {
-    const F & lambda = *static_cast<const F *>(a);
-    lambda(i, needBreak);
+    const F & func = *static_cast<const F *>(a);
+    func(i, needBreak);
 }
 
+/// Pass a function to be executed in a for loop to the threading layer.
+/// The maximal number of iterations in the loop is `2^31 - 1 (INT32_MAX)`.
+/// The default scheduling of the threading layer is used to assign
+/// the iterations of the loop to threads.
+/// Data dependencies between the iterations are allowed, but may requre the use
+/// of synchronization primitives.
+///
+/// @tparam F   Callable object of type `[/* captures */](int i) -> void`,
+///             where `i` is the loop's iteration index, `0 <= i < n`.
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] reserved Parameter reserved for the future. Currently unused.
+/// @param[in] func     Callable object that defines the loop body.
 template <typename F>
-inline void threader_for(int n, int threads_request, const F & lambda)
+inline void threader_for(int n, int reserved, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
-    _daal_threader_for(n, threads_request, a, threader_func<F>);
+    _daal_threader_for(n, reserved, a, threader_func<F>);
 }
 
+/// Pass a function to be executed in a for loop to the threading layer.
+/// The maximal number of iterations in the loop is `2^63 - 1 (INT64_MAX)`.
+/// The default scheduling of the threading layer is used to assign
+/// the iterations of the loop to threads.
+/// The iterations of the loop should be logically independent.
+/// Data dependencies between the iterations are allowed, but may requre the use
+/// of synchronization primitives.
+///
+/// @tparam F   Callable object of type `[/* captures */](int64_t i) -> void`,
+///             where `i` is the loop's iteration index, `0 <= i < n`.
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] func     Callable object that defines the loop body.
 template <typename F>
-inline void threader_for_int64(int64_t n, const F & lambda)
+inline void threader_for_int64(int64_t n, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for_int64(n, a, threader_func<F>);
 }
 
+/// Pass a function to be executed in a for loop to the threading layer.
+/// The maximal number of iterations in the loop is 2^31 - 1.
+///
+/// The specifics of this loop comparing to `threader_for` is that the iteration space
+/// of the loop is always chunked with chunk size 1.
+/// This means the threading layer tries to assign consecutive iterations to
+/// different threads, if possible.
+/// In case of oneTBB threading backend this means that `simple_partitioner`
+/// (https://oneapi-src.github.io/oneTBB/main/tbb_userguide/Partitioner_Summary.html)
+/// with chunk size 1 is used to produce iteration to threads mappings.
+///
+/// Data dependencies between the iterations are allowed, but may requre the use
+/// of synchronization primitives.
+///
+/// @tparam F   Callable object of type `[/* captures */](int i) -> void`,
+///             where `i` is the loop's iteration index, `0 <= i < n`.
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] reserved Parameter reserved for the future. Currently unused.
+/// @param[in] func     Callable object that defines iteration's body.
 template <typename F>
-inline void threader_for_simple(int n, int threads_request, const F & lambda)
+inline void threader_for_simple(int n, int reserved, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
-    _daal_threader_for_simple(n, threads_request, a, threader_func<F>);
+    _daal_threader_for_simple(n, reserved, a, threader_func<F>);
 }
 
 template <typename F>
-inline void threader_for_int32ptr(const int * begin, const int * end, const F & lambda)
+inline void threader_for_int32ptr(const int * begin, const int * end, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for_int32ptr(begin, end, a, threader_func<F>);
 }
 
+/// Execute the for loop defined by the input parameters in parallel.
+/// The maximal number of iterations in the loop is `SIZE_MAX` in C99 standard.
+///
+/// The work is scheduled statically across threads.
+/// This means that the work is always scheduled in the same way across the threads:
+/// each thread processes the same set of iterations on each invocation of this loop.
+///
+/// It is recommended to use this parallel loop if each iteration of the loop
+/// performs equal amount of work.
+///
+/// Let `t` be the number of threads available to oneDAL. The number of iterations
+/// processed by each threads (except maybe the last one) is computed as:
+/// `nI = (n + t - 1) / t`
+///
+/// Here is how the work is split across the threads:
+/// The 1st thread executes iterations `0, ..., nI - 1`;
+/// the 2nd thread executes iterations `nI, ..., 2 * nI - 1`;
+/// ...
+/// the `t`-th thread executes iterations `(t - 1) * nI, ..., n - 1`.
+///
+/// @tparam F   Callable object of type `[/* captures */](size_t i, size_t tid) -> void`,
+///             where
+///                 `i` is the loop's iteration index, `0 <= i < n`;
+///                 `tid` is the index of the thread, `0 <= tid < t`.
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] func     Callable object that defines iteration's body.
 template <typename F>
-inline void static_threader_for(size_t n, const F & lambda)
+inline void static_threader_for(size_t n, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_static_threader_for(n, a, static_threader_func<F>);
 }
 
+/// Pass a function to be executed in a for loop to the threading layer.
+/// The maximal number of iterations in the loop is `2^31 - 1 INT32_MAX`.
+/// The default scheduling of the threading layer is used to assign
+/// the iterations of the loop to threads.
+///
+/// @tparam F   Callable object of type `[/* captures */](int beginRange, int endRange) -> void`
+///             where
+///                 `beginRange` is the starting index of the loop iterations block to be
+///                                processed by a thread, `0 <= beginRange < n`;
+///                 `endRange`   is the index after the end of the loop's iterations block to be
+///                                processed by a thread, `beginRange < endRange <= n`;
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] reserved Parameter reserved for the future. Currently unused.
+/// @param[in] func     Callable object that processes the block of loop's iterations
+///                     `[beginRange, endRange)`.
 template <typename F>
-inline void threader_for_blocked(int n, int threads_request, const F & lambda)
+inline void threader_for_blocked(int n, int reserved, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
-    _daal_threader_for_blocked(n, threads_request, a, threader_func_b<F>);
+    _daal_threader_for_blocked(n, reserved, a, threader_func_b<F>);
 }
 
 template <typename F>
-inline void threader_for_optional(int n, int threads_request, const F & lambda)
+inline void threader_for_optional(int n, int threads_request, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for_optional(n, threads_request, a, threader_func<F>);
 }
 
 template <typename F>
-inline void threader_for_break(int n, int threads_request, const F & lambda)
+inline void threader_for_break(int n, int threads_request, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for_break(n, threads_request, a, threader_func_break<F>);
 }
 
-template <typename lambdaType>
+template <typename callableType>
 inline void * tls_func(const void * a)
 {
-    const lambdaType & lambda = *static_cast<const lambdaType *>(a);
-    return lambda();
+    const callableType & func = *static_cast<const callableType *>(a);
+    return func();
 }
 
-template <typename F, typename lambdaType>
+template <typename F, typename callableType>
 inline void tls_reduce_func(void * v, const void * a)
 {
-    const lambdaType & lambda = *static_cast<const lambdaType *>(a);
-    lambda((F)v);
+    const callableType & func = *static_cast<const callableType *>(a);
+    func((F)v);
 }
 
 struct tlsBase
@@ -313,32 +402,49 @@ class tls_deleter : public tlsBase
     virtual void del(void * a) = 0;
 };
 
-template <typename lambdaType>
+template <typename callableType>
 class tls_deleter_ : public tls_deleter
 {
 public:
     virtual ~tls_deleter_() {}
-    virtual void del(void * a) { delete static_cast<lambdaType *>(a); }
+    virtual void del(void * a) { delete static_cast<callableType *>(a); }
 };
 
+/// Thread-local storage (TLS).
+/// Can change its local variable after a nested parallel constructs.
+/// @note Thread-local storage in nested parallel regions is, in general, not thread local.
+/// The use of nested parallelism should be avoided if possible, otherwise extra care
+/// must be taken with thread-local values.
+///
+/// @tparam F  Type of the data located in the storage
 template <typename F>
 class tls : public tlsBase
 {
 public:
-    template <typename lambdaType>
-    explicit tls(const lambdaType & lambda)
+    /// Initialize thread-local storage
+    ///
+    /// @tparam callableType  Callable object of type `[/* captures */]() -> F`
+    ///
+    /// @param func Callable object that initializes a thread-local storage
+    template <typename callableType>
+    explicit tls(const callableType & func)
     {
-        lambdaType * locall = new lambdaType(lambda);
-        d                   = new tls_deleter_<lambdaType>();
+        callableType * localfunc = new callableType(func);
+        d                        = new tls_deleter_<callableType>();
 
-        //const void* ac = static_cast<const void*>(&lambda);
-        const void * ac = static_cast<const void *>(locall);
+        //const void* ac = static_cast<const void*>(&func);
+        const void * ac = static_cast<const void *>(localfunc);
         void * a        = const_cast<void *>(ac);
         voidLambda      = a;
 
-        tlsPtr = _daal_get_tls_ptr(a, tls_func<lambdaType>);
+        tlsPtr = _daal_get_tls_ptr(a, tls_func<callableType>);
     }
 
+    /// Destroys the memory associated with a thread-local storage
+    ///
+    /// @note TLS does not release the memory allocated by a callable object
+    ///       provided to the constructor.
+    ///       Developers are responsible for deletion of that memory.
     virtual ~tls()
     {
         d->del(voidLambda);
@@ -346,26 +452,43 @@ class tls : public tlsBase
         _daal_del_tls_ptr(tlsPtr);
     }
 
+    /// Access a local data of a thread by value
+    ///
+    /// @return When first invoked by a thread, a callable object provided to the constructor is
+    ///         called to initialize the local data of the thread and return it.
+    ///         All the following invocations just return the same thread-local data.
     F local()
     {
         void * pf = _daal_get_tls_local(tlsPtr);
         return (static_cast<F>(pf));
     }
 
-    template <typename lambdaType>
-    void reduce(const lambdaType & lambda)
+    /// Sequential reduction.
+    ///
+    /// @tparam callableType  Callable object of type `[/* captures */](F) -> void`
+    ///
+    /// @param func Callable object that is applied to each element of thread-local
+    ///             storage sequentially.
+    template <typename callableType>
+    void reduce(const callableType & func)
     {
-        const void * ac = static_cast<const void *>(&lambda);
+        const void * ac = static_cast<const void *>(&func);
         void * a        = const_cast<void *>(ac);
-        _daal_reduce_tls(tlsPtr, a, tls_reduce_func<F, lambdaType>);
+        _daal_reduce_tls(tlsPtr, a, tls_reduce_func<F, callableType>);
     }
 
-    template <typename lambdaType>
-    void parallel_reduce(const lambdaType & lambda)
+    /// Parallel reduction.
+    ///
+    /// @tparam callableType  Callable object of type `[/* captures */](F) -> void`
+    ///
+    /// @param func     Callable object that is applied to each element of thread-local
+    ///                 storage in parallel.
+    template <typename callableType>
+    void parallel_reduce(const callableType & func)
     {
-        const void * ac = static_cast<const void *>(&lambda);
+        const void * ac = static_cast<const void *>(&func);
         void * a        = const_cast<void *>(ac);
-        _daal_parallel_reduce_tls(tlsPtr, a, tls_reduce_func<F, lambdaType>);
+        _daal_parallel_reduce_tls(tlsPtr, a, tls_reduce_func<F, callableType>);
     }
 
 private:
@@ -374,11 +497,11 @@ class tls : public tlsBase
     tls_deleter * d;
 };
 
-template <typename F, typename lambdaType>
+template <typename F, typename callableType>
 inline void * creater_func(const void * a)
 {
-    const lambdaType & lambda = *static_cast<const lambdaType *>(a);
-    return lambda();
+    const callableType & func = *static_cast<const callableType *>(a);
+    return func();
 }
 
 class static_tls_deleter
@@ -388,20 +511,28 @@ class static_tls_deleter
     virtual void del(void * a) = 0;
 };
 
-template <typename lambdaType>
+template <typename callableType>
 class static_tls_deleter_ : public static_tls_deleter
 {
 public:
     virtual ~static_tls_deleter_() {}
-    virtual void del(void * a) { delete static_cast<lambdaType *>(a); }
+    virtual void del(void * a) { delete static_cast<callableType *>(a); }
 };
 
+/// Thread-local storage (TLS) for the case of static parallel work scheduling.
+///
+/// @tparam F  Type of the data located in the storage
 template <typename F>
 class static_tls
 {
 public:
-    template <typename lambdaType>
-    explicit static_tls(const lambdaType & lambda)
+    /// Initialize thread-local storage.
+    ///
+    /// @tparam callableType  Callable object of type `[/* captures */]() -> F`
+    ///
+    /// @param func Callable object that initializes a thread-local storage
+    template <typename callableType>
+    explicit static_tls(const callableType & func)
     {
         _nThreads = threader_get_max_threads_number();
 
@@ -417,8 +548,8 @@ class static_tls
             _storage[i] = nullptr;
         }
 
-        lambdaType * locall = new lambdaType(lambda);
-        _deleter            = new static_tls_deleter_<lambdaType>();
+        callableType * locall = new callableType(func);
+        _deleter              = new static_tls_deleter_<callableType>();
         if (!locall || !_deleter)
         {
             return;
@@ -428,9 +559,14 @@ class static_tls
         void * a        = const_cast<void *>(ac);
         _creater        = a;
 
-        _creater_func = creater_func<F, lambdaType>;
+        _creater_func = creater_func<F, callableType>;
     }
 
+    /// Destroys the memory associated with a thread-local storage.
+    ///
+    /// @note Static TLS does not release the memory allocated by a callable object
+    ///       provided to the constructor.
+    ///       Developers are responsible for deletion of that memory.
     virtual ~static_tls()
     {
         if (_deleter)
@@ -441,9 +577,16 @@ class static_tls
         delete[] _storage;
     }
 
+    /// Access a local data of a specified thread by value.
+    ///
+    /// @param tid  Index of the thread.
+    ///
+    /// @return When first invoked by a thread, a callable object provided to the constructor is
+    ///         called to initialize the local data of the thread and return it.
+    ///         All the following invocations just return the same thread-local data.
     F local(size_t tid)
     {
-        if (_storage)
+        if (_storage && tid < _nThreads)
         {
             if (!_storage[tid])
             {
@@ -458,18 +601,27 @@ class static_tls
         }
     }
 
-    template <typename lambdaType>
-    void reduce(const lambdaType & lambda)
+    /// Sequential reduction.
+    ///
+    /// @tparam callableType  Callable object of type `[/* captures */](F) -> void`
+    ///
+    /// @param func Callable object that is applied to each element of thread-local
+    ///             storage sequentially.
+    template <typename callableType>
+    void reduce(const callableType & func)
     {
         if (_storage)
         {
             for (size_t i = 0; i < _nThreads; ++i)
             {
-                if (_storage[i]) lambda(_storage[i]);
+                if (_storage[i]) func(_storage[i]);
             }
         }
     }
 
+    /// Full number of threads.
+    ///
+    /// @return Total number of threads available to oneDAL.
     size_t nthreads() const { return _nThreads; }
 
 private:
@@ -480,25 +632,43 @@ class static_tls
     static_tls_deleter * _deleter    = nullptr;
 };
 
+/// Local storage (LS) for the data of a thread.
+/// Does not change its local variable after nested parallel constructs,
+/// but can have performance penalties compared to thread-local storage type `daal::tls`.
+/// Can be safely used in case of nested parallel regions.
+///
+/// @tparam F  Type of the data located in the storage
 template <typename F>
 class ls : public tlsBase
 {
 public:
-    template <typename lambdaType>
-    explicit ls(const lambdaType & lambda, const bool isTls = false)
+    /// Initialize local storage.
+    ///
+    /// @tparam callableType  Callable object of type `[/* captures */]() -> F`
+    ///
+    /// @param func     Callable object that initializes local storage
+    /// @param isTls    if `true`, then local storage is a thread-local storage (`daal::tls`)
+    ///                 and might have problems in case of nested parallel regions.
+    template <typename callableType>
+    explicit ls(const callableType & func, const bool isTls = false)
     {
-        _isTls              = isTls;
-        lambdaType * locall = new lambdaType(lambda);
-        d                   = new tls_deleter_<lambdaType>();
+        _isTls                   = isTls;
+        callableType * localfunc = new callableType(func);
+        d                        = new tls_deleter_<callableType>();
 
-        //const void* ac = static_cast<const void*>(&lambda);
-        const void * ac = static_cast<const void *>(locall);
+        //const void* ac = static_cast<const void*>(&func);
+        const void * ac = static_cast<const void *>(localfunc);
         void * a        = const_cast<void *>(ac);
         voidLambda      = a;
 
-        lsPtr = _isTls ? _daal_get_tls_ptr(a, tls_func<lambdaType>) : _daal_get_ls_ptr(a, tls_func<lambdaType>);
+        lsPtr = _isTls ? _daal_get_tls_ptr(a, tls_func<callableType>) : _daal_get_ls_ptr(a, tls_func<callableType>);
     }
 
+    /// Destroys the memory associated with local storage.
+    ///
+    /// @note `ls` does not release the memory allocated by a callable object
+    ///       provided to the constructor.
+    ///       Developers are responsible for deletion of that memory.
     virtual ~ls()
     {
         d->del(voidLambda);
@@ -506,6 +676,11 @@ class ls : public tlsBase
         _isTls ? _daal_del_tls_ptr(lsPtr) : _daal_del_ls_ptr(lsPtr);
     }
 
+    /// Access the local data of a thread by value.
+    ///
+    /// @return When first invoked by a thread, a callable object provided to the constructor is
+    ///         called to initialize the local data of the thread and return it.
+    ///         All the following invocations just return the same thread-local data.
     F local()
     {
         void * pf = _isTls ? _daal_get_tls_local(lsPtr) : _daal_get_ls_local(lsPtr);
@@ -517,12 +692,18 @@ class ls : public tlsBase
         if (!_isTls) _daal_release_ls_local(lsPtr, p);
     }
 
-    template <typename lambdaType>
-    void reduce(const lambdaType & lambda)
+    /// Sequential reduction.
+    ///
+    /// @tparam callableType  Callable object of type `[/* captures */](F) -> void`
+    ///
+    /// @param func Callable object that is applied to each element of thread-local
+    ///             storage sequentially.
+    template <typename callableType>
+    void reduce(const callableType & func)
     {
-        const void * ac = static_cast<const void *>(&lambda);
+        const void * ac = static_cast<const void *>(&func);
         void * a        = const_cast<void *>(ac);
-        _isTls ? _daal_reduce_tls(lsPtr, a, tls_reduce_func<F, lambdaType>) : _daal_reduce_ls(lsPtr, a, tls_reduce_func<F, lambdaType>);
+        _isTls ? _daal_reduce_tls(lsPtr, a, tls_reduce_func<F, callableType>) : _daal_reduce_ls(lsPtr, a, tls_reduce_func<F, callableType>);
     }
 
 private:
diff --git a/docs/source/contribution/threading.rst b/docs/source/contribution/threading.rst
new file mode 100644
index 00000000000..cd1acd84e95
--- /dev/null
+++ b/docs/source/contribution/threading.rst
@@ -0,0 +1,166 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+.. highlight:: cpp
+
+Threading Layer
+^^^^^^^^^^^^^^^
+
+oneDAL uses Intel\ |reg|\  oneAPI Threading Building Blocks (Intel\ |reg|\  oneTBB) to do parallel
+computations on CPU.
+
+But oneTBB is not used in the code of oneDAL algorithms directly. The algorithms rather
+use custom primitives that either wrap oneTBB functionality or are in-house developed.
+Those primitives form oneDAL's threading layer.
+
+This is done in order not to be dependent on possible oneTBB API changes and even
+on the particular threading technology like oneTBB, C++11 standard threads, etc.
+
+The API of the layer is defined in
+`threading.h <https://github.com/oneapi-src/oneDAL/blob/main/cpp/daal/src/threading/threading.h>`_.
+Please be aware that the threading API is not a part of oneDAL product API.
+This is the product internal API that aimed to be used only by oneDAL developers, and can be changed at any time
+without any prior notification.
+
+This chapter describes common parallel patterns and primitives of the threading layer.
+
+threader_for
+************
+
+Consider a case where you need to compute an elementwise sum of two arrays and store the results
+into another array.
+Here is a variant of sequential implementation:
+
+.. include:: ../includes/threading/sum-sequential.rst
+
+There are several options available in the threading layer of oneDAL to let the iterations of this code
+run in parallel.
+One of the options is to use ``daal::threader_for`` as shown here:
+
+.. include:: ../includes/threading/sum-parallel.rst
+
+The iteration space here goes from ``0`` to ``n-1``.
+The last argument is a function object that performs a single iteration of the loop, given loop index ``i``.
+
+Blocking
+--------
+
+To have more control over the parallel execution and to increase
+`cache locality <https://en.wikipedia.org/wiki/Locality_of_reference>`_ oneDAL usually splits
+the data into blocks and then processes those blocks in parallel.
+
+This code shows how a typical parallel loop in oneDAL looks like:
+
+.. include:: ../includes/threading/sum-parallel-by-blocks.rst
+
+Thread-local Storage (TLS)
+**************************
+
+Consider you need to compute a dot product of two arrays.
+Here is a variant of sequential implementation:
+
+.. include:: ../includes/threading/dot-sequential.rst
+
+Parallel computations can be performed in two steps:
+
+    1. Compute partial dot product in each thread.
+    2. Perform a reduction: Add the partial results from all threads to compute the final dot product.
+
+``daal::tls`` provides a local storage where each thread can accumulate its local results.
+The following code allocates memory that would store partial dot products for each thread:
+
+.. include:: ../includes/threading/dot-parallel-init-tls.rst
+
+``SafeStatus`` in this code denotes a thread-safe counterpart of the ``Status`` class.
+``SafeStatus`` allows to collect errors from all threads and report them to the user using the
+``detach()`` method. An example will be shown later in the documentation.
+
+Checking the status right after the initialization code won't show the allocation errors,
+because oneTBB uses lazy evaluation and the lambda function passed to the constructor of the TLS
+is evaluated on first use of the thread-local storage (TLS).
+
+There are several options available in the threading layer of oneDAL to compute the partial
+dot product results at each thread.
+One of the options is to use the already mentioned ``daal::threader_for`` and blocking approach
+as shown here:
+
+.. include:: ../includes/threading/dot-parallel-partial-compute.rst
+
+To compute the final result it is required to reduce each thread's partial results
+as shown here:
+
+.. include:: ../includes/threading/dot-parallel-reduction.rst
+
+Local memory of the threads should be released when it is no longer needed.
+
+The complete parallel version of dot product computations would look like:
+
+.. include:: ../includes/threading/dot-parallel.rst
+
+Static Work Scheduling
+**********************
+
+By default, oneTBB uses
+`dynamic work scheduling <https://oneapi-src.github.io/oneTBB/main/tbb_userguide/How_Task_Scheduler_Works.html>`_
+and work stealing.
+It means that two different runs of the same parallel loop can produce different
+mappings of the loop's iteration space to the available threads.
+This strategy is beneficial when it is difficult to estimate the amount of work performed
+by each iteration.
+
+In the cases when it is known that the iterations perform an equal amount of work, it
+is more performant to use predefined mapping of the loop's iterations to threads.
+This is what static work scheduling does.
+
+``daal::static_threader_for`` and ``daal::static_tls`` allow implementation of static
+work scheduling within oneDAL.
+
+Here is a variant of parallel dot product computation with static scheduling:
+
+.. include:: ../includes/threading/dot-static-parallel.rst
+
+Nested Parallelism
+******************
+
+oneDAL supports nested parallel loops.
+It is important to know that:
+
+    "when a parallel construct calls another parallel construct, a thread can obtain a task
+     from the outer-level construct while waiting for completion of the inner-level one."
+
+    -- `oneTBB documentation <https://www.intel.com/content/www/us/en/docs/onetbb/developer-guide-api-reference/2021-13/work-isolation.html>`_
+
+In practice, this means that a thread-local variable might unexpectedly
+change its value after a nested parallel construct:
+
+.. include:: ../includes/threading/nested-parallel.rst
+
+In some scenarios this can lead to deadlocks, segmentation faults and other issues.
+
+oneTBB provides ways to isolate execution of a parallel construct, for its tasks
+to not interfere with other simultaneously running tasks.
+
+Those options are preferred when the parallel loops are initially written as nested.
+But in oneDAL there are cases when one parallel algorithm, the outer one,
+calls another parallel algorithm, the inner one, within a parallel region.
+
+The inner algorithm in this case can also be called solely, without additional nesting.
+And we do not always want to make it isolated.
+
+For the cases like that, oneDAL provides ``daal::ls``. Its ``local()`` method always
+returns the same value for the same thread, regardless of the nested execution:
+
+.. include:: ../includes/threading/nested-parallel-ls.rst
diff --git a/docs/source/includes/threading/dot-parallel-init-tls.rst b/docs/source/includes/threading/dot-parallel-init-tls.rst
new file mode 100644
index 00000000000..8e72646a7ca
--- /dev/null
+++ b/docs/source/includes/threading/dot-parallel-init-tls.rst
@@ -0,0 +1,30 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   SafeStatus safeStat;
+   daal::tls<float *> dotProductTLS([=, &safeStat]() {
+      float * dotProductPtr = new (std::nothrow) float;
+      if (!dotProductPtr) {
+         safeStat.add(services::ErrorMemoryAllocationFailed);
+      }
+      dotProductPtr[0] = 0.0f;
+      return dotProductPtr;
+   });
diff --git a/docs/source/includes/threading/dot-parallel-partial-compute.rst b/docs/source/includes/threading/dot-parallel-partial-compute.rst
new file mode 100644
index 00000000000..0521e4bb2cf
--- /dev/null
+++ b/docs/source/includes/threading/dot-parallel-partial-compute.rst
@@ -0,0 +1,40 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   constexpr size_t blockSize = 1024;
+   const size_t nBlocks = (n + blockSize - 1) / blockSize;
+
+   daal::threader_for(nBlocks, nBlocks, [&](size_t iBlock) {
+      const size_t iStart = iBlock * blockSize;
+      const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
+
+      // Compute partial result for this block
+      float partialDotProduct = 0.0f;
+      for (size_t i = iStart; i < iEnd; ++i) {
+         partialDotProduct += a[i] * b[i];
+      }
+
+      // Update thread-local result
+      float * localDotProduct = dotProductTLS.local();
+      if (!localDotProduct) {
+         // Allocation error happened earlier
+         return;
+      }
+      localDotProduct[0] += partialDotProduct;
+   });
+   DAAL_CHECK_SAFE_STATUS();  // if (!safeStat) return safeStat.detach();
diff --git a/docs/source/includes/threading/dot-parallel-reduction.rst b/docs/source/includes/threading/dot-parallel-reduction.rst
new file mode 100644
index 00000000000..e86ca030246
--- /dev/null
+++ b/docs/source/includes/threading/dot-parallel-reduction.rst
@@ -0,0 +1,25 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+  float dotProduct = 0.0f;
+  tls.reduce([&](float * localDotProduct) {
+    if (localDotProduct) {
+      dotProduct += localDotProduct[0];
+      delete localDotProduct;
+    }
+  });
diff --git a/docs/source/includes/threading/dot-parallel.rst b/docs/source/includes/threading/dot-parallel.rst
new file mode 100644
index 00000000000..d0230715a01
--- /dev/null
+++ b/docs/source/includes/threading/dot-parallel.rst
@@ -0,0 +1,63 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   services::Status dot(const size_t n, const float* a, const float* b, float &dotProduct) {
+      SafeStatus safeStat;
+      daal::tls<float *> dotProductTLS([=, &safeStat]() {
+         float * dotProductPtr = new (std::nothrow) float;
+         if (!dotProductPtr) {
+            safeStat.add(services::ErrorMemoryAllocationFailed);
+         }
+         dotProductPtr[0] = 0.0f;
+         return dotProductPtr;
+      });
+
+      constexpr size_t blockSize = 1024;
+      const size_t nBlocks = (n + blockSize - 1) / blockSize;
+
+      daal::threader_for(nBlocks, nBlocks, [&](size_t iBlock) {
+         const size_t iStart = iBlock * blockSize;
+         const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
+
+         // Compute partial result for this block
+         float partialDotProduct = 0.0f;
+         for (size_t i = iStart; i < iEnd; ++i) {
+            partialDotProduct += a[i] * b[i];
+         }
+
+         // Update thread-local result
+         float * localDotProduct = dotProductTLS.local();
+         if (!localDotProduct) {
+            // Allocation error happened earlier
+            return;
+         }
+         localDotProduct[0] += partialDotProduct;
+      });
+      DAAL_CHECK_SAFE_STATUS();
+
+      tls.reduce([&](float * localDotProduct) {
+         if (localDotProduct) {
+            dotProduct += localDotProduct[0];
+            delete localDotProduct;
+         }
+      });
+      return services::Status();
+   }
diff --git a/docs/source/includes/threading/dot-sequential.rst b/docs/source/includes/threading/dot-sequential.rst
new file mode 100644
index 00000000000..93300053c32
--- /dev/null
+++ b/docs/source/includes/threading/dot-sequential.rst
@@ -0,0 +1,25 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   float dot(const size_t n, const float* a, const float* b) {
+      float result = 0.0f;
+      for (size_t i = 0; i < n; ++i) {
+         result += a[i] * b[i];
+      }
+      return result;
+   }
diff --git a/docs/source/includes/threading/dot-static-parallel.rst b/docs/source/includes/threading/dot-static-parallel.rst
new file mode 100644
index 00000000000..dfec18b6d21
--- /dev/null
+++ b/docs/source/includes/threading/dot-static-parallel.rst
@@ -0,0 +1,64 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   services::Status dot(const size_t n, const float* a, const float* b, float &dotProduct) {
+      SafeStatus safeStat;
+      daal::static_tls<float *> dotProductTLS([=, &safeStat]() {
+         float * dotProductPtr = new (std::nothrow) float;
+         if (!dotProductPtr) {
+            safeStat.add(services::ErrorMemoryAllocationFailed);
+         }
+         dotProductPtr[0] = 0.0f;
+         return dotProductPtr;
+      });
+
+      constexpr size_t blockSize = 1024;
+      const size_t nBlocks = (n + blockSize - 1) / blockSize;
+
+      daal::static_threader_for(nBlocks, [&](size_t iBlock, size_t threadId) {
+         const size_t iStart = iBlock * blockSize;
+         const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
+
+         // Compute partial result for this block
+         float partialDotProduct = 0.0f;
+         for (size_t i = iStart; i < iEnd; ++i) {
+            partialDotProduct += a[i] * b[i];
+         }
+
+         // Update thread-local result
+         // Note that exact thread index is used to get access to the thread's data
+         float * localDotProduct = dotProductTLS.local(threadId);
+         if (!localDotProduct) {
+            // Allocation error happened earlier
+            return;
+         }
+         localDotProduct[0] += partialDotProduct;
+      });
+      DAAL_CHECK_SAFE_STATUS();
+
+      tls.reduce([&](float * localDotProduct) {
+         if (localDotProduct) {
+            dotProduct += localDotProduct[0];
+            delete localDotProduct;
+         }
+      });
+      return services::Status();
+   }
diff --git a/docs/source/includes/threading/nested-parallel-ls.rst b/docs/source/includes/threading/nested-parallel-ls.rst
new file mode 100644
index 00000000000..1ceb0414c0b
--- /dev/null
+++ b/docs/source/includes/threading/nested-parallel-ls.rst
@@ -0,0 +1,53 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   SafeStatus safeStat;
+   daal::ls<float *> ls([=, &safeStat]() {
+      float * localBuffer = new (std::nothrow) float[localSize];
+      if (!localBuffer) {
+         safeStat.add(services::ErrorMemoryAllocationFailed);
+      }
+      return localBuffer;
+   })
+   daal::threader_for(n, n, [&](size_t i) {
+      float * localBuffer = ls.local();
+      if (!localBuffer) {
+         // Allocation error happened earlier
+         return;
+      }
+
+      // Initialize localBuffer with some data here
+
+      daal::threader_for(m, m, [&](size_t j) {
+         /* Some work */
+      });
+
+      // The thread specific value always stays unchanged after the nested execution.
+      assert(localBuffer == ls.local()); // Assertion never fails!
+   });
+   DAAL_CHECK_SAFE_STATUS()
+
+   ls.reduce([&](float * localBuffer) {
+      if (localBuffer) {
+         /* Do reduction */
+         delete localBuffer;
+      }
+   });
diff --git a/docs/source/includes/threading/nested-parallel.rst b/docs/source/includes/threading/nested-parallel.rst
new file mode 100644
index 00000000000..fc41f70398b
--- /dev/null
+++ b/docs/source/includes/threading/nested-parallel.rst
@@ -0,0 +1,54 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   SafeStatus safeStat;
+   daal::tls<float *> tls([=, &safeStat]() {
+      float * localBuffer = new (std::nothrow) float[localSize];
+      if (!localBuffer) {
+         safeStat.add(services::ErrorMemoryAllocationFailed);
+      }
+      return localBuffer;
+   })
+   daal::threader_for(n, n, [&](size_t i) {
+      float * localBuffer = tls.local();
+      if (!localBuffer) {
+         // Allocation error happened earlier
+         return;
+      }
+
+      // Initialize localBuffer with some data here
+
+      daal::threader_for(m, m, [&](size_t j) {
+         /* Some work */
+      });
+
+      // While executing the above parallel_for, the thread might have run iterations
+      // of the outer parallel_for, and so might have changed the thread specific value.
+      assert(localBuffer == tls.local()); // The assertion may fail!
+   });
+   DAAL_CHECK_SAFE_STATUS()
+
+   tls.reduce([&](float * localBuffer) {
+      if (localBuffer) {
+         /* Do reduction */
+         delete localBuffer;
+      }
+   });
diff --git a/docs/source/includes/threading/sum-parallel-by-blocks.rst b/docs/source/includes/threading/sum-parallel-by-blocks.rst
new file mode 100644
index 00000000000..0e05cae1008
--- /dev/null
+++ b/docs/source/includes/threading/sum-parallel-by-blocks.rst
@@ -0,0 +1,32 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/threading/threading.h"
+
+   void sum(const size_t n, const float* a, const float* b, float* c) {
+      constexpr size_t blockSize = 256;
+      const size_t nBlocks = (n + blockSize - 1) / blockSize;
+
+      daal::threader_for(nBlocks, nBlocks, [&](size_t iBlock) {
+         const size_t iStart = iBlock * blockSize;
+         const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
+         for (size_t i = iStart; i < iEnd; ++i) {
+            c[i] = a[i] + b[i];
+         }
+      });
+   }
diff --git a/docs/source/includes/threading/sum-parallel.rst b/docs/source/includes/threading/sum-parallel.rst
new file mode 100644
index 00000000000..ba4ee4ae591
--- /dev/null
+++ b/docs/source/includes/threading/sum-parallel.rst
@@ -0,0 +1,26 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/threading/threading.h"
+
+   void sum(const size_t n, const float* a, const float* b, float* c) {
+      daal::threader_for(n, n, [&](size_t i) {
+         c[i] = a[i] + b[i];
+      });
+   }
+
diff --git a/docs/source/includes/threading/sum-sequential.rst b/docs/source/includes/threading/sum-sequential.rst
new file mode 100644
index 00000000000..b91c7c2d836
--- /dev/null
+++ b/docs/source/includes/threading/sum-sequential.rst
@@ -0,0 +1,23 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   void sum(const size_t n, const float* a, const float* b, float* c) {
+      for (size_t i = 0; i < n; ++i) {
+        c[i] = a[i] + b[i];
+      }
+   }
diff --git a/docs/source/index-toc.rst b/docs/source/index-toc.rst
index 2c0ff1e6011..6ce2d33a458 100644
--- a/docs/source/index-toc.rst
+++ b/docs/source/index-toc.rst
@@ -22,7 +22,7 @@
 
    data-analytics-pipeline.rst
    system-requirements.rst
-   
+
 .. toctree::
    :caption: Get Started
    :maxdepth: 2
@@ -52,3 +52,10 @@
    :caption: Contributing Guide
 
    contribution/coding_guide.rst
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+   :caption: Custom Components
+
+   contribution/threading.rst

From 716a02a449c5d2824dd18e7b41037639edd5a3fe Mon Sep 17 00:00:00 2001
From: ethanglaser <42726565+ethanglaser@users.noreply.github.com>
Date: Mon, 23 Sep 2024 06:23:34 -0700
Subject: [PATCH 8/9] Update docs codeowners (#2909)

---
 .github/CODEOWNERS | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index d6a47717935..789ff63d6e2 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,6 +1,6 @@
 # Documentation owners and reviewers
-/docs/         @Vika-F @maria-Petrova @Alexsandruss @bdmoore1
-*.md           @Vika-F @maria-Petrova @Alexsandruss @bdmoore1
+/docs/         @Vika-F @maria-Petrova @Alexsandruss @emmwalsh
+*.md           @Vika-F @maria-Petrova @Alexsandruss @emmwalsh
 
 # TTP files
 third-party*   @maria-Petrova

From a4e440bdb4402730c04ab33af1ff34257c8e06a7 Mon Sep 17 00:00:00 2001
From: Aleksei Khomenko <aleksei.khomenko@intel.com>
Date: Mon, 23 Sep 2024 15:25:43 +0200
Subject: [PATCH 9/9] chore(ci): add `read` permissions for `label-enforcement`
 workflow (#2901)

---
 .github/workflows/label-enforcement.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/label-enforcement.yml b/.github/workflows/label-enforcement.yml
index 5ad1f6ff8d9..c862f64b798 100644
--- a/.github/workflows/label-enforcement.yml
+++ b/.github/workflows/label-enforcement.yml
@@ -3,6 +3,9 @@ on:
   pull_request:
     branches: [ "main" ]
 
+permissions:
+  contents: read
+
 jobs:
   label_checker:
     name: Please include labels on your pull request