From 67e40de7e49317f8f899798aca86060fbc57881f Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Tue, 16 Jul 2024 03:25:32 -0700
Subject: [PATCH 01/19] Add a chapter about threading layer into the docs

---
 CONTRIBUTING.md                        |  8 +++++++-
 docs/source/contribution/threading.rst | 22 ++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/contribution/threading.rst

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 9fc5122a5c7..533fc1e7af7 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -25,7 +25,7 @@ Refer to our guidelines on [pull requests](#pull-requests) and [issues](#issues)
 
 ## Contacting maintainers
 You may reach out to Intel project maintainers privately at onedal.maintainers@intel.com.
-[Codeowners](https://github.com/oneapi-src/oneDAL/blob/main/.github/CODEOWNERS) configuration defines specific maintainers for corresponding code sections, however it's currently limited to Intel members. With further migration to UXL we will be changing this, but here are non-Intel contacts: 
+[Codeowners](https://github.com/oneapi-src/oneDAL/blob/main/.github/CODEOWNERS) configuration defines specific maintainers for corresponding code sections, however it's currently limited to Intel members. With further migration to UXL we will be changing this, but here are non-Intel contacts:
 
 For ARM specifics you may contact: [@rakshithgb-fujitsu](https://github.com/rakshithgb-fujitsu/)
 
@@ -69,6 +69,12 @@ Refer to [ClangFormat documentation](https://clang.llvm.org/docs/ClangFormat.htm
 
 For your convenience we also added [coding guidelines](http://oneapi-src.github.io/oneDAL/contribution/coding_guide.html) with examples and detailed descriptions of the coding style oneDAL follows. We encourage you to consult them when writing your code.
 
+## Custom Components
+
+### Threading Layer
+
+In the source code of the algorithms, oneDAL does not use threading primitives directly. All the threading primitives used within oneDAL form so called [threading layer](http://oneapi-src.github.io/oneDAL/contribution/threading.html).
+
 ## Documentation Guidelines
 
 oneDAL uses `Doxygen` for inline comments in public header files that are used to build the API reference and  `reStructuredText` for the Developer Guide. See [oneDAL documentation](https://oneapi-src.github.io/oneDAL/) for reference.
diff --git a/docs/source/contribution/threading.rst b/docs/source/contribution/threading.rst
new file mode 100644
index 00000000000..f9820e3b044
--- /dev/null
+++ b/docs/source/contribution/threading.rst
@@ -0,0 +1,22 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+.. highlight:: cpp
+
+Threading Layer
+^^^^^^^^^^^^^^^
+
+DEscription of the threading layer to go here.

From 67bd2b786ba94098db02c63d4fb250e9ac805686 Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Tue, 16 Jul 2024 04:08:12 -0700
Subject: [PATCH 02/19] Add chapter about threading into table of context

---
 CONTRIBUTING.md                        | 2 +-
 docs/source/contribution/threading.rst | 2 +-
 docs/source/index-toc.rst              | 9 ++++++++-
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 533fc1e7af7..be6ce220456 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -73,7 +73,7 @@ For your convenience we also added [coding guidelines](http://oneapi-src.github.
 
 ### Threading Layer
 
-In the source code of the algorithms, oneDAL does not use threading primitives directly. All the threading primitives used within oneDAL form so called [threading layer](http://oneapi-src.github.io/oneDAL/contribution/threading.html).
+In the source code of the algorithms, oneDAL does not use threading primitives directly. All the threading primitives used within oneDAL form so called [threading layer](http://oneapi-src.github.io/oneDAL/contribution/threading.html). Contributors should leverage the primitives from the layer to implement parallel algorithms.
 
 ## Documentation Guidelines
 
diff --git a/docs/source/contribution/threading.rst b/docs/source/contribution/threading.rst
index f9820e3b044..dc333a2198c 100644
--- a/docs/source/contribution/threading.rst
+++ b/docs/source/contribution/threading.rst
@@ -19,4 +19,4 @@
 Threading Layer
 ^^^^^^^^^^^^^^^
 
-DEscription of the threading layer to go here.
+oneDAL uses Intel oneAPI Threading Building Blocks (Intel oneTBB)
diff --git a/docs/source/index-toc.rst b/docs/source/index-toc.rst
index 2c0ff1e6011..6ce2d33a458 100644
--- a/docs/source/index-toc.rst
+++ b/docs/source/index-toc.rst
@@ -22,7 +22,7 @@
 
    data-analytics-pipeline.rst
    system-requirements.rst
-   
+
 .. toctree::
    :caption: Get Started
    :maxdepth: 2
@@ -52,3 +52,10 @@
    :caption: Contributing Guide
 
    contribution/coding_guide.rst
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+   :caption: Custom Components
+
+   contribution/threading.rst

From 125d3be6bef4fc974dbf49e4c6cd12de70337790 Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Thu, 18 Jul 2024 03:04:04 -0700
Subject: [PATCH 03/19] Add chapter about threader_for primitive

---
 docs/source/contribution/threading.rst        | 34 ++++++++++++++++++-
 .../includes/threading/sum-parallel.rst       | 26 ++++++++++++++
 .../includes/threading/sum-sequential.rst     | 23 +++++++++++++
 3 files changed, 82 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/includes/threading/sum-parallel.rst
 create mode 100644 docs/source/includes/threading/sum-sequential.rst

diff --git a/docs/source/contribution/threading.rst b/docs/source/contribution/threading.rst
index dc333a2198c..63b2ac78dbe 100644
--- a/docs/source/contribution/threading.rst
+++ b/docs/source/contribution/threading.rst
@@ -19,4 +19,36 @@
 Threading Layer
 ^^^^^^^^^^^^^^^
 
-oneDAL uses Intel oneAPI Threading Building Blocks (Intel oneTBB)
+oneDAL uses Intel(R) oneAPI Threading Building Blocks (Intel(R) oneTBB) to do parallel
+computations on CPU.
+
+But oneTBB is not used in the code of oneDAL algorithms directly. The algorithms rather
+use custom primitives that either wrap oneTBB functionality or are inhome developed.
+Those primitives form oneDAL's threading layer.
+
+This is done in order not to be dependent on possible oneTBB API changes and even
+on the particular threading technology.
+
+The API of the layer is defined in `threading.h <https://github.com/oneapi-src/oneDAL/blob/main/cpp/daal/src/threading/threading.h>`_.
+
+This chapter describes common parallel patterns and primitives of the threading layer.
+
+threader_for
+************
+
+Lets consider you need to compute an elementwise sum of two arrays and store the results
+into another array.
+Here is a variant of sequential implementation:
+
+.. include:: ../includes/threading/sum-sequential.rst
+
+There are several options available in oneDAL's threaded layer to let the iterations of this code
+run in parallel.
+One of the options is to use `daal::threader_for` as shown here:
+
+.. include:: ../includes/threading/sum-parallel.rst
+
+The iteration space here goes from `0` to `n-1`.
+`nThreads` is the number of threads that execute the loop's body.
+And the last argument is the lambda function that defines a function object that proceeds `i`-th
+iteration of the loop.
diff --git a/docs/source/includes/threading/sum-parallel.rst b/docs/source/includes/threading/sum-parallel.rst
new file mode 100644
index 00000000000..4de12c968a6
--- /dev/null
+++ b/docs/source/includes/threading/sum-parallel.rst
@@ -0,0 +1,26 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+  #include "src/threading/threading.h"
+
+  void sum(const size_t n, const float* a, const float* b, float* c) {
+    const size_t nThreads = 32;
+    daal::threader_for(n, nThreads, [&](size_t i) {
+      c[i] = a[i] + b[i];
+    });
+  }
+
diff --git a/docs/source/includes/threading/sum-sequential.rst b/docs/source/includes/threading/sum-sequential.rst
new file mode 100644
index 00000000000..b91c7c2d836
--- /dev/null
+++ b/docs/source/includes/threading/sum-sequential.rst
@@ -0,0 +1,23 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   void sum(const size_t n, const float* a, const float* b, float* c) {
+      for (size_t i = 0; i < n; ++i) {
+        c[i] = a[i] + b[i];
+      }
+   }

From 97b804b709ba61be231e8cd1537c29ffd26af34a Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Thu, 18 Jul 2024 03:14:23 -0700
Subject: [PATCH 04/19] Attempt to fix Sphinx unexpected indentation error

---
 docs/source/includes/threading/sum-parallel.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/includes/threading/sum-parallel.rst b/docs/source/includes/threading/sum-parallel.rst
index 4de12c968a6..4011bdfe524 100644
--- a/docs/source/includes/threading/sum-parallel.rst
+++ b/docs/source/includes/threading/sum-parallel.rst
@@ -15,6 +15,7 @@
 .. *******************************************************************************/
 
 ::
+
   #include "src/threading/threading.h"
 
   void sum(const size_t n, const float* a, const float* b, float* c) {

From edae16a8d21acc894f7ffd986a4e4531f72e2775 Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Mon, 22 Jul 2024 04:33:16 -0700
Subject: [PATCH 05/19] Add section about TLS

---
 docs/source/contribution/threading.rst        | 70 +++++++++++++++++--
 .../threading/dot-parallel-init-tls.rst       | 31 ++++++++
 .../dot-parallel-partial-compute.rst          | 41 +++++++++++
 .../threading/dot-parallel-reduction.rst      | 23 ++++++
 .../includes/threading/dot-parallel.rst       | 65 +++++++++++++++++
 .../includes/threading/dot-sequential.rst     | 25 +++++++
 .../threading/sum-parallel-by-block.rst       | 33 +++++++++
 .../includes/threading/sum-parallel.rst       |  2 +-
 8 files changed, 282 insertions(+), 8 deletions(-)
 create mode 100644 docs/source/includes/threading/dot-parallel-init-tls.rst
 create mode 100644 docs/source/includes/threading/dot-parallel-partial-compute.rst
 create mode 100644 docs/source/includes/threading/dot-parallel-reduction.rst
 create mode 100644 docs/source/includes/threading/dot-parallel.rst
 create mode 100644 docs/source/includes/threading/dot-sequential.rst
 create mode 100644 docs/source/includes/threading/sum-parallel-by-block.rst

diff --git a/docs/source/contribution/threading.rst b/docs/source/contribution/threading.rst
index 63b2ac78dbe..a5298d389aa 100644
--- a/docs/source/contribution/threading.rst
+++ b/docs/source/contribution/threading.rst
@@ -19,7 +19,7 @@
 Threading Layer
 ^^^^^^^^^^^^^^^
 
-oneDAL uses Intel(R) oneAPI Threading Building Blocks (Intel(R) oneTBB) to do parallel
+oneDAL uses Intel\ |reg|\ oneAPI Threading Building Blocks (Intel\ |reg|\ oneTBB) to do parallel
 computations on CPU.
 
 But oneTBB is not used in the code of oneDAL algorithms directly. The algorithms rather
@@ -29,7 +29,8 @@ Those primitives form oneDAL's threading layer.
 This is done in order not to be dependent on possible oneTBB API changes and even
 on the particular threading technology.
 
-The API of the layer is defined in `threading.h <https://github.com/oneapi-src/oneDAL/blob/main/cpp/daal/src/threading/threading.h>`_.
+The API of the layer is defined in
+`threading.h <https://github.com/oneapi-src/oneDAL/blob/main/cpp/daal/src/threading/threading.h>`_.
 
 This chapter describes common parallel patterns and primitives of the threading layer.
 
@@ -42,13 +43,68 @@ Here is a variant of sequential implementation:
 
 .. include:: ../includes/threading/sum-sequential.rst
 
-There are several options available in oneDAL's threaded layer to let the iterations of this code
+There are several options available in oneDAL's threading layer to let the iterations of this code
 run in parallel.
-One of the options is to use `daal::threader_for` as shown here:
+One of the options is to use ``daal::threader_for`` as shown here:
 
 .. include:: ../includes/threading/sum-parallel.rst
 
-The iteration space here goes from `0` to `n-1`.
-`nThreads` is the number of threads that execute the loop's body.
-And the last argument is the lambda function that defines a function object that proceeds `i`-th
+The iteration space here goes from ``0`` to ``n-1``.
+``nThreads`` is the number of threads that execute the loop's body.
+And the last argument is the lambda function that defines a function object that proceeds ``i``-th
 iteration of the loop.
+
+Blocking
+--------
+
+To have more control over the parallel execution and to increase
+`cache locality <https://en.wikipedia.org/wiki/Locality_of_reference>`_ oneDAL usually splits
+the data into blocks and then processes those blocks in parallel.
+
+This code shows how a typical parallel loop in oneDAL looks like:
+
+.. include:: ../includes/threading/sum-parallel-by-blocks.rst
+
+Thread-local Storage (TLS)
+**************************
+
+Lets consider you need to compute a dot product of two arrays.
+Here is a variant of sequential implementation:
+
+.. include:: ../includes/threading/dot-sequential.rst
+
+Parallel computations can be performed in two steps:
+
+    1. Compute partial dot product at each threaded.
+    2. Perform a reduction: Sum the partial results from all threads to compute the final dot product.
+
+``daal::tls`` provides a local storage where each thread can accumulate its local results.
+Following code allocates memory that would store partial dot products for each thread:
+
+.. include:: ../includes/threading/dot-parallel-init-tls.rst
+
+``SafeStatus`` in this code denotes a thread-safe counterpart of oneDAL's ``Status`` class.
+``SafeStatus`` allows to collect errors from all threads and report them to user using
+``detach()`` method as it will be shown later in the code.
+
+Checking the status right after the initialization code won't show the allocation errors though.
+Because oneTBB uses lazy evaluation and the lambda function passed to the constructor of the TLS
+is evaluated in the moment of the TLS's first use.
+
+Again, there are several options available in oneDAL's threading layer to compute the partial
+dot product results at each thread.
+One of the options is to use already mentioned ``daal::threader_for`` and blocking approach
+as shown here:
+
+.. include:: ../includes/threading/dot-parallel-partial-compute.rst
+
+To compute the final result it is requred to reduce TLS's partial results over all threads
+as it is shown here:
+
+.. include:: ../includes/threading/dot-parallel-reduction.rst
+
+Local memory of the threads should also be released when it is no longer needed.
+
+The complete parallel verision of dot product computations would look like:
+
+.. include:: ../includes/threading/dot-parallel.rst
diff --git a/docs/source/includes/threading/dot-parallel-init-tls.rst b/docs/source/includes/threading/dot-parallel-init-tls.rst
new file mode 100644
index 00000000000..f97c1960f47
--- /dev/null
+++ b/docs/source/includes/threading/dot-parallel-init-tls.rst
@@ -0,0 +1,31 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   SafeStatus safeStat;
+   daal::tls<float *> dotProductTLS([=, &safeStat]() {
+      float * dotProductPtr = new (std::nothrow) float;
+      if (!dotProductPtr)
+      {
+         safeStat.add(services::ErrorMemoryAllocationFailed);
+      }
+      dotProductPtr[0] = 0.0f;
+      return dotProductPtr;
+   });
diff --git a/docs/source/includes/threading/dot-parallel-partial-compute.rst b/docs/source/includes/threading/dot-parallel-partial-compute.rst
new file mode 100644
index 00000000000..931a8fb2cd6
--- /dev/null
+++ b/docs/source/includes/threading/dot-parallel-partial-compute.rst
@@ -0,0 +1,41 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+  constexpr size_t nThreads = 64;
+  constexpr size_t blockSize = 1024;
+  const size_t nBlocks = (n + blockSize - 1) / blockSize;
+
+  daal::threader_for(nBlocks, nThreads, [&](size_t iBlock) {
+    const size_t iStart = iBlock * blockSize;
+    const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
+
+    // Compute partial result for this block
+    float partialDotProduct = 0.0f;
+    for (size_t i = iStart; i < iEnd; ++i) {
+      partialDotProduct += a[i] * b[i];
+    }
+
+    // Update thread-local result
+    float * localDotProduct = dotProductTLS.local();
+    if (!localDotProduct) {
+      // Allocation error happened earlier
+      return;
+    }
+    localDotProduct[0] += partialDotProduct;
+  });
+  DAAL_CHECK_SAFE_STATUS();  // if (!safeStat) return safeStat.detach();
diff --git a/docs/source/includes/threading/dot-parallel-reduction.rst b/docs/source/includes/threading/dot-parallel-reduction.rst
new file mode 100644
index 00000000000..6cfc07353d2
--- /dev/null
+++ b/docs/source/includes/threading/dot-parallel-reduction.rst
@@ -0,0 +1,23 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+  float dotProduct = 0.0f;
+  tls.reduce([&](float * localDotProduct) {
+    if (localDotProduct) {
+      dotProduct += localDotProduct[0];
+      delete localDotProduct;
+    }
+  });
diff --git a/docs/source/includes/threading/dot-parallel.rst b/docs/source/includes/threading/dot-parallel.rst
new file mode 100644
index 00000000000..6242de6c9fd
--- /dev/null
+++ b/docs/source/includes/threading/dot-parallel.rst
@@ -0,0 +1,65 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   services::Status dot(const size_t n, const float* a, const float* b, float &dotProduct) {
+      SafeStatus safeStat;
+      daal::tls<float *> dotProductTLS([=, &safeStat]() {
+         float * dotProductPtr = new (std::nothrow) float;
+         if (!dotProductPtr)
+         {
+            safeStat.add(services::ErrorMemoryAllocationFailed);
+         }
+         dotProductPtr[0] = 0.0f;
+         return dotProductPtr;
+      });
+
+      constexpr size_t nThreads = 64;
+      constexpr size_t blockSize = 1024;
+      const size_t nBlocks = (n + blockSize - 1) / blockSize;
+
+      daal::threader_for(nBlocks, nThreads, [&](size_t iBlock) {
+         const size_t iStart = iBlock * blockSize;
+         const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
+
+         // Compute partial result for this block
+         float partialDotProduct = 0.0f;
+         for (size_t i = iStart; i < iEnd; ++i) {
+            partialDotProduct += a[i] * b[i];
+         }
+
+         // Update thread-local result
+         float * localDotProduct = dotProductTLS.local();
+         if (!localDotProduct) {
+            // Allocation error happened earlier
+            return;
+         }
+         localDotProduct[0] += partialDotProduct;
+      });
+      DAAL_CHECK_SAFE_STATUS();
+
+      tls.reduce([&](float * localDotProduct) {
+         if (localDotProduct) {
+            dotProduct += localDotProduct[0];
+            delete localDotProduct;
+         }
+      });
+      return services::Status();
+   }
diff --git a/docs/source/includes/threading/dot-sequential.rst b/docs/source/includes/threading/dot-sequential.rst
new file mode 100644
index 00000000000..93300053c32
--- /dev/null
+++ b/docs/source/includes/threading/dot-sequential.rst
@@ -0,0 +1,25 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   float dot(const size_t n, const float* a, const float* b) {
+      float result = 0.0f;
+      for (size_t i = 0; i < n; ++i) {
+         result += a[i] * b[i];
+      }
+      return result;
+   }
diff --git a/docs/source/includes/threading/sum-parallel-by-block.rst b/docs/source/includes/threading/sum-parallel-by-block.rst
new file mode 100644
index 00000000000..f87c81ec7f3
--- /dev/null
+++ b/docs/source/includes/threading/sum-parallel-by-block.rst
@@ -0,0 +1,33 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+  #include "src/threading/threading.h"
+
+  void sum(const size_t n, const float* a, const float* b, float* c) {
+    constexpr size_t nThreads = 32;
+    constexpr size_t blockSize = 256;
+    const size_t nBlocks = (n + blockSize - 1) / blockSize;
+
+    daal::threader_for(nBlocks, nThreads, [&](size_t iBlock) {
+      const size_t iStart = iBlock * blockSize;
+      const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
+      for (size_t i = iStart; i < iEnd; ++i) {
+        c[i] = a[i] + b[i];
+      }
+    });
+  }
diff --git a/docs/source/includes/threading/sum-parallel.rst b/docs/source/includes/threading/sum-parallel.rst
index 4011bdfe524..3631924c318 100644
--- a/docs/source/includes/threading/sum-parallel.rst
+++ b/docs/source/includes/threading/sum-parallel.rst
@@ -19,7 +19,7 @@
   #include "src/threading/threading.h"
 
   void sum(const size_t n, const float* a, const float* b, float* c) {
-    const size_t nThreads = 32;
+    constexpr size_t nThreads = 32;
     daal::threader_for(n, nThreads, [&](size_t i) {
       c[i] = a[i] + b[i];
     });

From 029ec41834f7fc2ca0824085637a12688170c37a Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Mon, 22 Jul 2024 04:43:05 -0700
Subject: [PATCH 06/19] Fix example naming

---
 .../{sum-parallel-by-block.rst => sum-parallel-by-blocks.rst}     | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename docs/source/includes/threading/{sum-parallel-by-block.rst => sum-parallel-by-blocks.rst} (100%)

diff --git a/docs/source/includes/threading/sum-parallel-by-block.rst b/docs/source/includes/threading/sum-parallel-by-blocks.rst
similarity index 100%
rename from docs/source/includes/threading/sum-parallel-by-block.rst
rename to docs/source/includes/threading/sum-parallel-by-blocks.rst

From 1221b9c79436a3809ef0775c714ee1ff68de1ad8 Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Wed, 24 Jul 2024 02:39:28 -0700
Subject: [PATCH 07/19] Improve descriptions in threading.h

---
 cpp/daal/src/threading/threading.cpp          |  6 +-
 cpp/daal/src/threading/threading.h            | 88 +++++++++++++++++--
 docs/source/contribution/threading.rst        | 16 +++-
 .../dot-parallel-partial-compute.rst          | 39 ++++----
 .../includes/threading/dot-parallel.rst       |  3 +-
 .../threading/sum-parallel-by-blocks.rst      | 25 +++---
 .../includes/threading/sum-parallel.rst       | 13 ++-
 7 files changed, 136 insertions(+), 54 deletions(-)

diff --git a/cpp/daal/src/threading/threading.cpp b/cpp/daal/src/threading/threading.cpp
index 15c39368238..3b280229ee7 100644
--- a/cpp/daal/src/threading/threading.cpp
+++ b/cpp/daal/src/threading/threading.cpp
@@ -103,7 +103,7 @@ DAAL_EXPORT size_t _setNumberOfThreads(const size_t numThreads, void ** globalCo
     return 1;
 }
 
-DAAL_EXPORT void _daal_threader_for(int n, int threads_request, const void * a, daal::functype func)
+DAAL_EXPORT void _daal_threader_for(int n, int reserved, const void * a, daal::functype func)
 {
     if (daal::threader_env()->getNumberOfThreads() > 1)
     {
@@ -160,7 +160,7 @@ DAAL_EXPORT void _daal_threader_for_blocked_size(size_t n, size_t block, const v
     }
 }
 
-DAAL_EXPORT void _daal_threader_for_simple(int n, int threads_request, const void * a, daal::functype func)
+DAAL_EXPORT void _daal_threader_for_simple(int n, int reserved, const void * a, daal::functype func)
 {
     if (daal::threader_env()->getNumberOfThreads() > 1)
     {
@@ -318,7 +318,7 @@ DAAL_PARALLEL_SORT_IMPL(daal::IdxValType<double>, pair_fp64_uint64)
 
 #undef DAAL_PARALLEL_SORT_IMPL
 
-DAAL_EXPORT void _daal_threader_for_blocked(int n, int threads_request, const void * a, daal::functype2 func)
+DAAL_EXPORT void _daal_threader_for_blocked(int n, int reserved, const void * a, daal::functype2 func)
 {
     if (daal::threader_env()->getNumberOfThreads() > 1)
     {
diff --git a/cpp/daal/src/threading/threading.h b/cpp/daal/src/threading/threading.h
index 0b4a9881b97..2852b9c9ee3 100644
--- a/cpp/daal/src/threading/threading.h
+++ b/cpp/daal/src/threading/threading.h
@@ -223,14 +223,33 @@ inline void threader_func_break(int i, bool & needBreak, const void * a)
     lambda(i, needBreak);
 }
 
+/// Execute the for loop defined by the input parameters in parallel.
+/// The maximal number of iterations in the loop is 2^31 - 1.
+/// The work is scheduled dynamically across threads.
+///
+/// @tparam F   Lambda function of type ``[/* captures */](int i) -> void``,
+///             where ``i`` is the loop's iteration index, ``0 <= i < n``.
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] reserved Parameter reserved for the future. Currently unused.
+/// @param[in] lambda   Lambda function that defines iteration's body.
 template <typename F>
-inline void threader_for(int n, int threads_request, const F & lambda)
+inline void threader_for(int n, int reserved, const F & lambda)
 {
     const void * a = static_cast<const void *>(&lambda);
 
-    _daal_threader_for(n, threads_request, a, threader_func<F>);
+    _daal_threader_for(n, reserved, a, threader_func<F>);
 }
 
+/// Execute the for loop defined by the input parameters in parallel.
+/// The maximal number of iterations in the loop is 2^63 - 1.
+/// The work is scheduled dynamically across threads.
+///
+/// @tparam F   Lambda function of type [/* captures */](int64_t i) -> void,
+///             where ``i`` is the loop's iteration index, ``0 <= i < n``.
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] lambda   Lambda function that defines iteration's body.
 template <typename F>
 inline void threader_for_int64(int64_t n, const F & lambda)
 {
@@ -239,12 +258,25 @@ inline void threader_for_int64(int64_t n, const F & lambda)
     _daal_threader_for_int64(n, a, threader_func<F>);
 }
 
+/// Execute the for loop defined by the input parameters in parallel.
+/// The maximal number of iterations in the loop is 2^31 - 1.
+/// The work is scheduled dynamically across threads.
+/// The iteration space is chunked using oneTBB ``simple_partitioner``
+/// (https://oneapi-src.github.io/oneTBB/main/tbb_userguide/Partitioner_Summary.html)
+/// with chunk size 1.
+///
+/// @tparam F   Lambda function of type [/* captures */](int i) -> void,
+///             where ``i`` is the loop's iteration index, ``0 <= i < n``.
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] reserved Parameter reserved for the future. Currently unused.
+/// @param[in] lambda   Lambda function that defines iteration's body.
 template <typename F>
-inline void threader_for_simple(int n, int threads_request, const F & lambda)
+inline void threader_for_simple(int n, int reserved, const F & lambda)
 {
     const void * a = static_cast<const void *>(&lambda);
 
-    _daal_threader_for_simple(n, threads_request, a, threader_func<F>);
+    _daal_threader_for_simple(n, reserved, a, threader_func<F>);
 }
 
 template <typename F>
@@ -255,6 +287,35 @@ inline void threader_for_int32ptr(const int * begin, const int * end, const F &
     _daal_threader_for_int32ptr(begin, end, a, threader_func<F>);
 }
 
+/// Execute the for loop defined by the input parameters in parallel.
+/// The maximal number of iterations in the loop is ``SIZE_MAX`` in C99 standard.
+///
+/// The work is scheduled statically across threads.
+/// This means that the work is always scheduled in the same way across the threads:
+/// each thread processes the same set of iterations on each invocation of this loop.
+///
+/// It is recommended to use this parallel loop if each iteration of the loop
+/// performs equal amount of work.
+///
+/// Let ``t`` be the number of threads available to oneDAL.
+///
+/// Then the number of iterations processed by each threads (except maybe the last one)
+/// is copmputed as:
+/// ``nI = (n + t - 1) / t``
+///
+/// Here is how the work in split across the threads:
+/// The 1st thread executes iterations ``0``, ..., ``nI - 1``;
+/// the 2nd thread executes iterations ``nI``, ..., ``2 * nI - 1``;
+/// ...
+/// the t-th thread executes iterations ``(t - 1) * nI``, ..., ``n - 1``.
+///
+/// @tparam F   Lambda function of type [/* captures */](size_t i, size_t tid) -> void,
+///             where
+///                 ``i`` is the loop's iteration index, ``0 <= i < n``;
+///                 ``tid`` is the index of the thread, ``0 <= tid < t``.
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] lambda   Lambda function that defines iteration's body.
 template <typename F>
 inline void static_threader_for(size_t n, const F & lambda)
 {
@@ -263,12 +324,27 @@ inline void static_threader_for(size_t n, const F & lambda)
     _daal_static_threader_for(n, a, static_threader_func<F>);
 }
 
+/// Execute the for loop defined by the input parameters in parallel.
+/// The maximal number of iterations in the loop is 2^31 - 1.
+/// The work is scheduled dynamically across threads.
+///
+/// @tparam F   Lambda function of type [/* captures */](int beginRange, int endRange) -> void
+///             where
+///                 ``beginRange`` is the starting index of the loop's iterations block to be
+///                                processed by a thread, ``0 <= beginRange < n``;
+///                 ``endRange``   is the index after the end of the loop's iterations block to be
+///                                processed by a thread, ``beginRange < endRange <= n``;
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] reserved Parameter reserved for the future. Currently unused.
+/// @param[in] lambda   Lambda function that processes the block of loop's iterations
+///                     ``[beginRange, endRange)``.
 template <typename F>
-inline void threader_for_blocked(int n, int threads_request, const F & lambda)
+inline void threader_for_blocked(int n, int reserved, const F & lambda)
 {
     const void * a = static_cast<const void *>(&lambda);
 
-    _daal_threader_for_blocked(n, threads_request, a, threader_func_b<F>);
+    _daal_threader_for_blocked(n, reserved, a, threader_func_b<F>);
 }
 
 template <typename F>
diff --git a/docs/source/contribution/threading.rst b/docs/source/contribution/threading.rst
index a5298d389aa..42ff5e491ce 100644
--- a/docs/source/contribution/threading.rst
+++ b/docs/source/contribution/threading.rst
@@ -19,7 +19,7 @@
 Threading Layer
 ^^^^^^^^^^^^^^^
 
-oneDAL uses Intel\ |reg|\ oneAPI Threading Building Blocks (Intel\ |reg|\ oneTBB) to do parallel
+oneDAL uses Intel\ |reg|\  oneAPI Threading Building Blocks (Intel\ |reg|\  oneTBB) to do parallel
 computations on CPU.
 
 But oneTBB is not used in the code of oneDAL algorithms directly. The algorithms rather
@@ -31,6 +31,8 @@ on the particular threading technology.
 
 The API of the layer is defined in
 `threading.h <https://github.com/oneapi-src/oneDAL/blob/main/cpp/daal/src/threading/threading.h>`_.
+Please be aware that those APIs are not publicly defined. So they can be changed at any time
+without any notification.
 
 This chapter describes common parallel patterns and primitives of the threading layer.
 
@@ -50,8 +52,7 @@ One of the options is to use ``daal::threader_for`` as shown here:
 .. include:: ../includes/threading/sum-parallel.rst
 
 The iteration space here goes from ``0`` to ``n-1``.
-``nThreads`` is the number of threads that execute the loop's body.
-And the last argument is the lambda function that defines a function object that proceeds ``i``-th
+The last argument is the lambda function that defines a function object that proceeds ``i``-th
 iteration of the loop.
 
 Blocking
@@ -108,3 +109,12 @@ Local memory of the threads should also be released when it is no longer needed.
 The complete parallel verision of dot product computations would look like:
 
 .. include:: ../includes/threading/dot-parallel.rst
+
+Static work scheduling
+**********************
+
+By default oneTBB uses dynamic work scheaduling and work stealing.
+It means that
+
+Nested parallelism
+******************
diff --git a/docs/source/includes/threading/dot-parallel-partial-compute.rst b/docs/source/includes/threading/dot-parallel-partial-compute.rst
index 931a8fb2cd6..0521e4bb2cf 100644
--- a/docs/source/includes/threading/dot-parallel-partial-compute.rst
+++ b/docs/source/includes/threading/dot-parallel-partial-compute.rst
@@ -16,26 +16,25 @@
 
 ::
 
-  constexpr size_t nThreads = 64;
-  constexpr size_t blockSize = 1024;
-  const size_t nBlocks = (n + blockSize - 1) / blockSize;
+   constexpr size_t blockSize = 1024;
+   const size_t nBlocks = (n + blockSize - 1) / blockSize;
 
-  daal::threader_for(nBlocks, nThreads, [&](size_t iBlock) {
-    const size_t iStart = iBlock * blockSize;
-    const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
+   daal::threader_for(nBlocks, nBlocks, [&](size_t iBlock) {
+      const size_t iStart = iBlock * blockSize;
+      const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
 
-    // Compute partial result for this block
-    float partialDotProduct = 0.0f;
-    for (size_t i = iStart; i < iEnd; ++i) {
-      partialDotProduct += a[i] * b[i];
-    }
+      // Compute partial result for this block
+      float partialDotProduct = 0.0f;
+      for (size_t i = iStart; i < iEnd; ++i) {
+         partialDotProduct += a[i] * b[i];
+      }
 
-    // Update thread-local result
-    float * localDotProduct = dotProductTLS.local();
-    if (!localDotProduct) {
-      // Allocation error happened earlier
-      return;
-    }
-    localDotProduct[0] += partialDotProduct;
-  });
-  DAAL_CHECK_SAFE_STATUS();  // if (!safeStat) return safeStat.detach();
+      // Update thread-local result
+      float * localDotProduct = dotProductTLS.local();
+      if (!localDotProduct) {
+         // Allocation error happened earlier
+         return;
+      }
+      localDotProduct[0] += partialDotProduct;
+   });
+   DAAL_CHECK_SAFE_STATUS();  // if (!safeStat) return safeStat.detach();
diff --git a/docs/source/includes/threading/dot-parallel.rst b/docs/source/includes/threading/dot-parallel.rst
index 6242de6c9fd..8fb2212b7ac 100644
--- a/docs/source/includes/threading/dot-parallel.rst
+++ b/docs/source/includes/threading/dot-parallel.rst
@@ -31,11 +31,10 @@
          return dotProductPtr;
       });
 
-      constexpr size_t nThreads = 64;
       constexpr size_t blockSize = 1024;
       const size_t nBlocks = (n + blockSize - 1) / blockSize;
 
-      daal::threader_for(nBlocks, nThreads, [&](size_t iBlock) {
+      daal::threader_for(nBlocks, nBlocks, [&](size_t iBlock) {
          const size_t iStart = iBlock * blockSize;
          const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
 
diff --git a/docs/source/includes/threading/sum-parallel-by-blocks.rst b/docs/source/includes/threading/sum-parallel-by-blocks.rst
index f87c81ec7f3..0e05cae1008 100644
--- a/docs/source/includes/threading/sum-parallel-by-blocks.rst
+++ b/docs/source/includes/threading/sum-parallel-by-blocks.rst
@@ -16,18 +16,17 @@
 
 ::
 
-  #include "src/threading/threading.h"
+   #include "src/threading/threading.h"
 
-  void sum(const size_t n, const float* a, const float* b, float* c) {
-    constexpr size_t nThreads = 32;
-    constexpr size_t blockSize = 256;
-    const size_t nBlocks = (n + blockSize - 1) / blockSize;
+   void sum(const size_t n, const float* a, const float* b, float* c) {
+      constexpr size_t blockSize = 256;
+      const size_t nBlocks = (n + blockSize - 1) / blockSize;
 
-    daal::threader_for(nBlocks, nThreads, [&](size_t iBlock) {
-      const size_t iStart = iBlock * blockSize;
-      const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
-      for (size_t i = iStart; i < iEnd; ++i) {
-        c[i] = a[i] + b[i];
-      }
-    });
-  }
+      daal::threader_for(nBlocks, nBlocks, [&](size_t iBlock) {
+         const size_t iStart = iBlock * blockSize;
+         const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
+         for (size_t i = iStart; i < iEnd; ++i) {
+            c[i] = a[i] + b[i];
+         }
+      });
+   }
diff --git a/docs/source/includes/threading/sum-parallel.rst b/docs/source/includes/threading/sum-parallel.rst
index 3631924c318..ba4ee4ae591 100644
--- a/docs/source/includes/threading/sum-parallel.rst
+++ b/docs/source/includes/threading/sum-parallel.rst
@@ -16,12 +16,11 @@
 
 ::
 
-  #include "src/threading/threading.h"
+   #include "src/threading/threading.h"
 
-  void sum(const size_t n, const float* a, const float* b, float* c) {
-    constexpr size_t nThreads = 32;
-    daal::threader_for(n, nThreads, [&](size_t i) {
-      c[i] = a[i] + b[i];
-    });
-  }
+   void sum(const size_t n, const float* a, const float* b, float* c) {
+      daal::threader_for(n, n, [&](size_t i) {
+         c[i] = a[i] + b[i];
+      });
+   }
 

From 5dae0446141227cfbd030c5dca4bca7ffc703c5d Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Tue, 30 Jul 2024 04:23:52 -0700
Subject: [PATCH 08/19] Add section about nesting

---
 cpp/daal/src/threading/threading.h            | 30 +++++++++
 docs/source/contribution/threading.rst        | 50 ++++++++++++++-
 .../threading/dot-parallel-init-tls.rst       |  3 +-
 .../includes/threading/dot-parallel.rst       |  3 +-
 .../threading/dot-static-parallel.rst         | 64 +++++++++++++++++++
 .../includes/threading/nested-parallel-ls.rst | 53 +++++++++++++++
 .../includes/threading/nested-parallel.rst    | 54 ++++++++++++++++
 7 files changed, 251 insertions(+), 6 deletions(-)
 create mode 100644 docs/source/includes/threading/dot-static-parallel.rst
 create mode 100644 docs/source/includes/threading/nested-parallel-ls.rst
 create mode 100644 docs/source/includes/threading/nested-parallel.rst

diff --git a/cpp/daal/src/threading/threading.h b/cpp/daal/src/threading/threading.h
index 2852b9c9ee3..71db62c070e 100644
--- a/cpp/daal/src/threading/threading.h
+++ b/cpp/daal/src/threading/threading.h
@@ -397,10 +397,18 @@ class tls_deleter_ : public tls_deleter
     virtual void del(void * a) { delete static_cast<lambdaType *>(a); }
 };
 
+/// Thread-local storage (TLS)
+///
+/// @tparam F  Type of the data located in the storage
 template <typename F>
 class tls : public tlsBase
 {
 public:
+    /// Initialize thread-local storage
+    ///
+    /// @tparam lambdaType  Lambda function of type [/* captures */]() -> F
+    ///
+    /// @param lambda       Lambda function that initializes a thread-local storage
     template <typename lambdaType>
     explicit tls(const lambdaType & lambda)
     {
@@ -415,6 +423,11 @@ class tls : public tlsBase
         tlsPtr = _daal_get_tls_ptr(a, tls_func<lambdaType>);
     }
 
+    /// Destroys the memory associated with a thread-local storage
+    ///
+    /// @note TLS does not release the memory allocated by a lambda-function
+    ///       provided to the constructor.
+    ///       Developers are responsible for deletion of that memory.
     virtual ~tls()
     {
         d->del(voidLambda);
@@ -422,12 +435,23 @@ class tls : public tlsBase
         _daal_del_tls_ptr(tlsPtr);
     }
 
+    /// Access a local data of a thread by value
+    ///
+    /// @return When first ionvoced by a thread, a lambda provided to the constructor is
+    ///         called to initialize the local data of the thread and return it.
+    ///         All the following invocations just return the same thread-local data.
     F local()
     {
         void * pf = _daal_get_tls_local(tlsPtr);
         return (static_cast<F>(pf));
     }
 
+    /// Sequential reduction.
+    ///
+    /// @tparam lambdaType  Lambda function of type [/* captures */](F) -> void
+    ///
+    /// @param lambda       Lambda function that is applied to each element of thread-local
+    ///                     storage sequentially.
     template <typename lambdaType>
     void reduce(const lambdaType & lambda)
     {
@@ -436,6 +460,12 @@ class tls : public tlsBase
         _daal_reduce_tls(tlsPtr, a, tls_reduce_func<F, lambdaType>);
     }
 
+    /// Parallel reduction.
+    ///
+    /// @tparam lambdaType  Lambda function of type [/* captures */](F) -> void
+    ///
+    /// @param lambda       Lambda function that is applied to each element of thread-local
+    ///                     storage in parallel.
     template <typename lambdaType>
     void parallel_reduce(const lambdaType & lambda)
     {
diff --git a/docs/source/contribution/threading.rst b/docs/source/contribution/threading.rst
index 42ff5e491ce..7e365491054 100644
--- a/docs/source/contribution/threading.rst
+++ b/docs/source/contribution/threading.rst
@@ -113,8 +113,54 @@ The complete parallel verision of dot product computations would look like:
 Static work scheduling
 **********************
 
-By default oneTBB uses dynamic work scheaduling and work stealing.
-It means that
+By default oneTBB uses
+`dynamic work scheduling <https://oneapi-src.github.io/oneTBB/main/tbb_userguide/How_Task_Scheduler_Works.html>`_
+and work stealing.
+It means that two different runs of the same parallel loop can produce different
+mappings of the loop's iteration space to the available threads.
+This strategy is benefitial when it is hard to estimate the amount of work performed
+by each iteration.
+
+In the cases when it is known that the iterations perform equal amount of work it
+might be benefitial to use pre-defined mapping of the loop's iterations to threads.
+This is what static work scheduling does.
+
+``daal::static_threader_for`` and ``daal::static_tls`` allow to implement static
+work scheduling within oneDAL.
+
+Here is a variant of parallel dot product computation with static scheduling:
+
+.. include:: ../includes/threading/dot-static-parallel.rst
 
 Nested parallelism
 ******************
+
+It is allowed to have nested parallel loops within oneDAL.
+What is important to know is that
+
+    "when a parallel construct calls another parallel construct, a thread can obtain a task
+     from the outer-level construct while waiting for completion of the inner-level one."
+
+    -- `oneTBB documentation <https://www.intel.com/content/www/us/en/docs/onetbb/developer-guide-api-reference/2021-13/work-isolation.html>`_
+
+In practice this means that, for example, a thread-local variable might unexpectedly
+change its value after a nested parallel construct:
+
+.. include:: ../includes/threading/nested-parallel.rst
+
+In some scenarios this can lead to deadlocks, segmentation faults and other issues.
+
+oneTBB provides ways to isolate execution of a parallel construct, for its tasks
+to not interfere with other simultaneously running tasks.
+
+Those options are preferred when the parallel loops are initially written as nested.
+But in oneDAL there are cases when one parallel algorithm, the outer one,
+calls another parallel algorithm, the inner one, within a parallel region.
+
+The inner algorithm in this case can also be called solely, without additional nesting.
+And we do not want to always make it isolated.
+
+For the cases like that oneDAL provides ``daal::ls``. Its ``local()`` method always
+returns the same value for the same thread, regardless of the nested execution:
+
+.. include:: ../includes/threading/nested-parallel-ls.rst
diff --git a/docs/source/includes/threading/dot-parallel-init-tls.rst b/docs/source/includes/threading/dot-parallel-init-tls.rst
index f97c1960f47..8e72646a7ca 100644
--- a/docs/source/includes/threading/dot-parallel-init-tls.rst
+++ b/docs/source/includes/threading/dot-parallel-init-tls.rst
@@ -22,8 +22,7 @@
    SafeStatus safeStat;
    daal::tls<float *> dotProductTLS([=, &safeStat]() {
       float * dotProductPtr = new (std::nothrow) float;
-      if (!dotProductPtr)
-      {
+      if (!dotProductPtr) {
          safeStat.add(services::ErrorMemoryAllocationFailed);
       }
       dotProductPtr[0] = 0.0f;
diff --git a/docs/source/includes/threading/dot-parallel.rst b/docs/source/includes/threading/dot-parallel.rst
index 8fb2212b7ac..d0230715a01 100644
--- a/docs/source/includes/threading/dot-parallel.rst
+++ b/docs/source/includes/threading/dot-parallel.rst
@@ -23,8 +23,7 @@
       SafeStatus safeStat;
       daal::tls<float *> dotProductTLS([=, &safeStat]() {
          float * dotProductPtr = new (std::nothrow) float;
-         if (!dotProductPtr)
-         {
+         if (!dotProductPtr) {
             safeStat.add(services::ErrorMemoryAllocationFailed);
          }
          dotProductPtr[0] = 0.0f;
diff --git a/docs/source/includes/threading/dot-static-parallel.rst b/docs/source/includes/threading/dot-static-parallel.rst
new file mode 100644
index 00000000000..dfec18b6d21
--- /dev/null
+++ b/docs/source/includes/threading/dot-static-parallel.rst
@@ -0,0 +1,64 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   services::Status dot(const size_t n, const float* a, const float* b, float &dotProduct) {
+      SafeStatus safeStat;
+      daal::static_tls<float *> dotProductTLS([=, &safeStat]() {
+         float * dotProductPtr = new (std::nothrow) float;
+         if (!dotProductPtr) {
+            safeStat.add(services::ErrorMemoryAllocationFailed);
+         }
+         dotProductPtr[0] = 0.0f;
+         return dotProductPtr;
+      });
+
+      constexpr size_t blockSize = 1024;
+      const size_t nBlocks = (n + blockSize - 1) / blockSize;
+
+      daal::static_threader_for(nBlocks, [&](size_t iBlock, size_t threadId) {
+         const size_t iStart = iBlock * blockSize;
+         const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
+
+         // Compute partial result for this block
+         float partialDotProduct = 0.0f;
+         for (size_t i = iStart; i < iEnd; ++i) {
+            partialDotProduct += a[i] * b[i];
+         }
+
+         // Update thread-local result
+         // Note that exact thread index is used to get access to the thread's data
+         float * localDotProduct = dotProductTLS.local(threadId);
+         if (!localDotProduct) {
+            // Allocation error happened earlier
+            return;
+         }
+         localDotProduct[0] += partialDotProduct;
+      });
+      DAAL_CHECK_SAFE_STATUS();
+
+      tls.reduce([&](float * localDotProduct) {
+         if (localDotProduct) {
+            dotProduct += localDotProduct[0];
+            delete localDotProduct;
+         }
+      });
+      return services::Status();
+   }
diff --git a/docs/source/includes/threading/nested-parallel-ls.rst b/docs/source/includes/threading/nested-parallel-ls.rst
new file mode 100644
index 00000000000..1ceb0414c0b
--- /dev/null
+++ b/docs/source/includes/threading/nested-parallel-ls.rst
@@ -0,0 +1,53 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   SafeStatus safeStat;
+   daal::ls<float *> ls([=, &safeStat]() {
+      float * localBuffer = new (std::nothrow) float[localSize];
+      if (!localBuffer) {
+         safeStat.add(services::ErrorMemoryAllocationFailed);
+      }
+      return localBuffer;
+   })
+   daal::threader_for(n, n, [&](size_t i) {
+      float * localBuffer = ls.local();
+      if (!localBuffer) {
+         // Allocation error happened earlier
+         return;
+      }
+
+      // Initialize localBuffer with some data here
+
+      daal::threader_for(m, m, [&](size_t j) {
+         /* Some work */
+      });
+
+      // The thread specific value always stays unchanged after the nested execution.
+      assert(localBuffer == ls.local()); // Assertion never fails!
+   });
+   DAAL_CHECK_SAFE_STATUS()
+
+   ls.reduce([&](float * localBuffer) {
+      if (localBuffer) {
+         /* Do reduction */
+         delete localBuffer;
+      }
+   });
diff --git a/docs/source/includes/threading/nested-parallel.rst b/docs/source/includes/threading/nested-parallel.rst
new file mode 100644
index 00000000000..fc41f70398b
--- /dev/null
+++ b/docs/source/includes/threading/nested-parallel.rst
@@ -0,0 +1,54 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   SafeStatus safeStat;
+   daal::tls<float *> tls([=, &safeStat]() {
+      float * localBuffer = new (std::nothrow) float[localSize];
+      if (!localBuffer) {
+         safeStat.add(services::ErrorMemoryAllocationFailed);
+      }
+      return localBuffer;
+   })
+   daal::threader_for(n, n, [&](size_t i) {
+      float * localBuffer = tls.local();
+      if (!localBuffer) {
+         // Allocation error happened earlier
+         return;
+      }
+
+      // Initialize localBuffer with some data here
+
+      daal::threader_for(m, m, [&](size_t j) {
+         /* Some work */
+      });
+
+      // While executing the above parallel_for, the thread might have run iterations
+      // of the outer parallel_for, and so might have changed the thread specific value.
+      assert(localBuffer == tls.local()); // The assertion may fail!
+   });
+   DAAL_CHECK_SAFE_STATUS()
+
+   tls.reduce([&](float * localBuffer) {
+      if (localBuffer) {
+         /* Do reduction */
+         delete localBuffer;
+      }
+   });

From 2419cde3fe42e4bf7e22bfefbae1a71276a2c8f4 Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Wed, 31 Jul 2024 01:54:01 -0700
Subject: [PATCH 09/19] Improve comments in threading.h

---
 cpp/daal/src/threading/threading.h | 66 ++++++++++++++++++++++++++++--
 1 file changed, 63 insertions(+), 3 deletions(-)

diff --git a/cpp/daal/src/threading/threading.h b/cpp/daal/src/threading/threading.h
index 71db62c070e..8fd1c188d54 100644
--- a/cpp/daal/src/threading/threading.h
+++ b/cpp/daal/src/threading/threading.h
@@ -397,7 +397,9 @@ class tls_deleter_ : public tls_deleter
     virtual void del(void * a) { delete static_cast<lambdaType *>(a); }
 };
 
-/// Thread-local storage (TLS)
+/// Thread-local storage (TLS).
+/// Can change its local variable after a nested parallel constructs.
+/// @note Use carefully in case of nested parallel regions.
 ///
 /// @tparam F  Type of the data located in the storage
 template <typename F>
@@ -437,7 +439,7 @@ class tls : public tlsBase
 
     /// Access a local data of a thread by value
     ///
-    /// @return When first ionvoced by a thread, a lambda provided to the constructor is
+    /// @return When first invoced by a thread, a lambda provided to the constructor is
     ///         called to initialize the local data of the thread and return it.
     ///         All the following invocations just return the same thread-local data.
     F local()
@@ -502,10 +504,18 @@ class static_tls_deleter_ : public static_tls_deleter
     virtual void del(void * a) { delete static_cast<lambdaType *>(a); }
 };
 
+/// Thread-local storage (TLS) for the case of static parallel work scheduling.
+///
+/// @tparam F  Type of the data located in the storage
 template <typename F>
 class static_tls
 {
 public:
+    /// Initialize thread-local storage.
+    ///
+    /// @tparam lambdaType  Lambda function of type [/* captures */]() -> F
+    ///
+    /// @param lambda       Lambda function that initializes a thread-local storage
     template <typename lambdaType>
     explicit static_tls(const lambdaType & lambda)
     {
@@ -537,6 +547,11 @@ class static_tls
         _creater_func = creater_func<F, lambdaType>;
     }
 
+    /// Destroys the memory associated with a thread-local storage.
+    ///
+    /// @note Static TLS does not release the memory allocated by a lambda-function
+    ///       provided to the constructor.
+    ///       Developers are responsible for deletion of that memory.
     virtual ~static_tls()
     {
         if (_deleter)
@@ -547,9 +562,16 @@ class static_tls
         delete[] _storage;
     }
 
+    /// Access a local data of a specified thread by value.
+    ///
+    /// @param tid  Index of the thread.
+    ///
+    /// @return When first invoced by a thread, a lambda provided to the constructor is
+    ///         called to initialize the local data of the thread and return it.
+    ///         All the following invocations just return the same thread-local data.
     F local(size_t tid)
     {
-        if (_storage)
+        if (_storage && tid < _nThreads)
         {
             if (!_storage[tid])
             {
@@ -564,6 +586,12 @@ class static_tls
         }
     }
 
+    /// Sequential reduction.
+    ///
+    /// @tparam lambdaType  Lambda function of type [/* captures */](F) -> void
+    ///
+    /// @param lambda       Lambda function that is applied to each element of thread-local
+    ///                     storage sequentially.
     template <typename lambdaType>
     void reduce(const lambdaType & lambda)
     {
@@ -576,6 +604,9 @@ class static_tls
         }
     }
 
+    /// Full number of threads.
+    ///
+    /// @return Number of threads available.
     size_t nthreads() const { return _nThreads; }
 
 private:
@@ -586,10 +617,23 @@ class static_tls
     static_tls_deleter * _deleter    = nullptr;
 };
 
+/// Local storage (LS) for a data of a thread.
+/// Does not change its local variable after nested parallel constructs,
+/// but can have performance penalties comparing to daal::tls.
+/// Can be safely used in case of nested parallel regions.
+///
+/// @tparam F  Type of the data located in the storage
 template <typename F>
 class ls : public tlsBase
 {
 public:
+    /// Initialize a local storage.
+    ///
+    /// @tparam lambdaType  Lambda function of type [/* captures */]() -> F
+    ///
+    /// @param lambda   Lambda function that initializes a local storage
+    /// @param isTls    if true, then local storage is a thread-local storage (daal::tls)
+    ///                 and might have problems in case of nested parallel regions.
     template <typename lambdaType>
     explicit ls(const lambdaType & lambda, const bool isTls = false)
     {
@@ -605,6 +649,11 @@ class ls : public tlsBase
         lsPtr = _isTls ? _daal_get_tls_ptr(a, tls_func<lambdaType>) : _daal_get_ls_ptr(a, tls_func<lambdaType>);
     }
 
+    /// Destroys the memory associated with a local storage.
+    ///
+    /// @note LS does not release the memory allocated by a lambda-function
+    ///       provided to the constructor.
+    ///       Developers are responsible for deletion of that memory.
     virtual ~ls()
     {
         d->del(voidLambda);
@@ -612,6 +661,11 @@ class ls : public tlsBase
         _isTls ? _daal_del_tls_ptr(lsPtr) : _daal_del_ls_ptr(lsPtr);
     }
 
+    /// Access a local data of a thread by value.
+    ///
+    /// @return When first invoced by a thread, a lambda provided to the constructor is
+    ///         called to initialize the local data of the thread and return it.
+    ///         All the following invocations just return the same thread-local data.
     F local()
     {
         void * pf = _isTls ? _daal_get_tls_local(lsPtr) : _daal_get_ls_local(lsPtr);
@@ -623,6 +677,12 @@ class ls : public tlsBase
         if (!_isTls) _daal_release_ls_local(lsPtr, p);
     }
 
+    /// Sequential reduction.
+    ///
+    /// @tparam lambdaType  Lambda function of type [/* captures */](F) -> void
+    ///
+    /// @param lambda       Lambda function that is applied to each element of thread-local
+    ///                     storage sequentially.
     template <typename lambdaType>
     void reduce(const lambdaType & lambda)
     {

From 8ae4036c03341e47b7e4d7d8e468fd7b1ec3e6f9 Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Wed, 31 Jul 2024 02:25:36 -0700
Subject: [PATCH 10/19] Apply reviewer's suggestions

---
 CONTRIBUTING.md                        |  2 +-
 cpp/daal/src/threading/threading.h     | 10 +++---
 docs/source/contribution/threading.rst | 44 +++++++++++++-------------
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index be6ce220456..0804750897d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -73,7 +73,7 @@ For your convenience we also added [coding guidelines](http://oneapi-src.github.
 
 ### Threading Layer
 
-In the source code of the algorithms, oneDAL does not use threading primitives directly. All the threading primitives used within oneDAL form so called [threading layer](http://oneapi-src.github.io/oneDAL/contribution/threading.html). Contributors should leverage the primitives from the layer to implement parallel algorithms.
+In the source code of the algorithms, oneDAL does not use threading primitives directly. All the threading primitives used within oneDAL form are called the [threading layer](http://oneapi-src.github.io/oneDAL/contribution/threading.html). Contributors should leverage the primitives from the layer to implement parallel algorithms.
 
 ## Documentation Guidelines
 
diff --git a/cpp/daal/src/threading/threading.h b/cpp/daal/src/threading/threading.h
index 8fd1c188d54..a4ff895a5ab 100644
--- a/cpp/daal/src/threading/threading.h
+++ b/cpp/daal/src/threading/threading.h
@@ -300,10 +300,10 @@ inline void threader_for_int32ptr(const int * begin, const int * end, const F &
 /// Let ``t`` be the number of threads available to oneDAL.
 ///
 /// Then the number of iterations processed by each threads (except maybe the last one)
-/// is copmputed as:
+/// is computed as:
 /// ``nI = (n + t - 1) / t``
 ///
-/// Here is how the work in split across the threads:
+/// Here is how the work is split across the threads:
 /// The 1st thread executes iterations ``0``, ..., ``nI - 1``;
 /// the 2nd thread executes iterations ``nI``, ..., ``2 * nI - 1``;
 /// ...
@@ -439,7 +439,7 @@ class tls : public tlsBase
 
     /// Access a local data of a thread by value
     ///
-    /// @return When first invoced by a thread, a lambda provided to the constructor is
+    /// @return When first invoked by a thread, a lambda provided to the constructor is
     ///         called to initialize the local data of the thread and return it.
     ///         All the following invocations just return the same thread-local data.
     F local()
@@ -566,7 +566,7 @@ class static_tls
     ///
     /// @param tid  Index of the thread.
     ///
-    /// @return When first invoced by a thread, a lambda provided to the constructor is
+    /// @return When first invoked by a thread, a lambda provided to the constructor is
     ///         called to initialize the local data of the thread and return it.
     ///         All the following invocations just return the same thread-local data.
     F local(size_t tid)
@@ -663,7 +663,7 @@ class ls : public tlsBase
 
     /// Access a local data of a thread by value.
     ///
-    /// @return When first invoced by a thread, a lambda provided to the constructor is
+    /// @return When first invoked by a thread, a lambda provided to the constructor is
     ///         called to initialize the local data of the thread and return it.
     ///         All the following invocations just return the same thread-local data.
     F local()
diff --git a/docs/source/contribution/threading.rst b/docs/source/contribution/threading.rst
index 7e365491054..4a40d460b13 100644
--- a/docs/source/contribution/threading.rst
+++ b/docs/source/contribution/threading.rst
@@ -23,7 +23,7 @@ oneDAL uses Intel\ |reg|\  oneAPI Threading Building Blocks (Intel\ |reg|\  oneT
 computations on CPU.
 
 But oneTBB is not used in the code of oneDAL algorithms directly. The algorithms rather
-use custom primitives that either wrap oneTBB functionality or are inhome developed.
+use custom primitives that either wrap oneTBB functionality or are in-house developed.
 Those primitives form oneDAL's threading layer.
 
 This is done in order not to be dependent on possible oneTBB API changes and even
@@ -31,7 +31,7 @@ on the particular threading technology.
 
 The API of the layer is defined in
 `threading.h <https://github.com/oneapi-src/oneDAL/blob/main/cpp/daal/src/threading/threading.h>`_.
-Please be aware that those APIs are not publicly defined. So they can be changed at any time
+Please be aware that those APIs are not publicly defined, so they can be changed at any time
 without any notification.
 
 This chapter describes common parallel patterns and primitives of the threading layer.
@@ -39,13 +39,13 @@ This chapter describes common parallel patterns and primitives of the threading
 threader_for
 ************
 
-Lets consider you need to compute an elementwise sum of two arrays and store the results
+Consider a case where you need to compute an elementwise sum of two arrays and store the results
 into another array.
 Here is a variant of sequential implementation:
 
 .. include:: ../includes/threading/sum-sequential.rst
 
-There are several options available in oneDAL's threading layer to let the iterations of this code
+There are several options available in the threading layer of oneDAL to let the iterations of this code
 run in parallel.
 One of the options is to use ``daal::threader_for`` as shown here:
 
@@ -84,23 +84,23 @@ Following code allocates memory that would store partial dot products for each t
 
 .. include:: ../includes/threading/dot-parallel-init-tls.rst
 
-``SafeStatus`` in this code denotes a thread-safe counterpart of oneDAL's ``Status`` class.
+``SafeStatus`` in this code denotes a thread-safe counterpart of the ``Status`` class.
 ``SafeStatus`` allows to collect errors from all threads and report them to user using
 ``detach()`` method as it will be shown later in the code.
 
-Checking the status right after the initialization code won't show the allocation errors though.
-Because oneTBB uses lazy evaluation and the lambda function passed to the constructor of the TLS
+Checking the status right after the initialization code won't show the allocation errors,
+because oneTBB uses lazy evaluation and the lambda function passed to the constructor of the TLS
 is evaluated in the moment of the TLS's first use.
 
-Again, there are several options available in oneDAL's threading layer to compute the partial
+Again, there are several options available in the threading layer of oneDAL to compute the partial
 dot product results at each thread.
-One of the options is to use already mentioned ``daal::threader_for`` and blocking approach
+One of the options is to use the already mentioned ``daal::threader_for`` and blocking approach
 as shown here:
 
 .. include:: ../includes/threading/dot-parallel-partial-compute.rst
 
 To compute the final result it is requred to reduce TLS's partial results over all threads
-as it is shown here:
+as shown here:
 
 .. include:: ../includes/threading/dot-parallel-reduction.rst
 
@@ -110,40 +110,40 @@ The complete parallel verision of dot product computations would look like:
 
 .. include:: ../includes/threading/dot-parallel.rst
 
-Static work scheduling
+Static Work Scheduling
 **********************
 
-By default oneTBB uses
+By default, oneTBB uses
 `dynamic work scheduling <https://oneapi-src.github.io/oneTBB/main/tbb_userguide/How_Task_Scheduler_Works.html>`_
 and work stealing.
 It means that two different runs of the same parallel loop can produce different
 mappings of the loop's iteration space to the available threads.
-This strategy is benefitial when it is hard to estimate the amount of work performed
+This strategy is benefitial when it is difficult to estimate the amount of work performed
 by each iteration.
 
-In the cases when it is known that the iterations perform equal amount of work it
-might be benefitial to use pre-defined mapping of the loop's iterations to threads.
+In the cases when it is known that the iterations perform an equal amount of work, it
+might be benefitial to use predefined mapping of the loop's iterations to threads.
 This is what static work scheduling does.
 
-``daal::static_threader_for`` and ``daal::static_tls`` allow to implement static
+``daal::static_threader_for`` and ``daal::static_tls`` allow implementation of static
 work scheduling within oneDAL.
 
 Here is a variant of parallel dot product computation with static scheduling:
 
 .. include:: ../includes/threading/dot-static-parallel.rst
 
-Nested parallelism
+Nested Parallelism
 ******************
 
-It is allowed to have nested parallel loops within oneDAL.
-What is important to know is that
+oneDAL supports nested parallel loops.
+It is important to know that:
 
     "when a parallel construct calls another parallel construct, a thread can obtain a task
      from the outer-level construct while waiting for completion of the inner-level one."
 
     -- `oneTBB documentation <https://www.intel.com/content/www/us/en/docs/onetbb/developer-guide-api-reference/2021-13/work-isolation.html>`_
 
-In practice this means that, for example, a thread-local variable might unexpectedly
+In practice, this means that a thread-local variable might unexpectedly
 change its value after a nested parallel construct:
 
 .. include:: ../includes/threading/nested-parallel.rst
@@ -158,9 +158,9 @@ But in oneDAL there are cases when one parallel algorithm, the outer one,
 calls another parallel algorithm, the inner one, within a parallel region.
 
 The inner algorithm in this case can also be called solely, without additional nesting.
-And we do not want to always make it isolated.
+And we do not always want to make it isolated.
 
-For the cases like that oneDAL provides ``daal::ls``. Its ``local()`` method always
+For the cases like that, oneDAL provides ``daal::ls``. Its ``local()`` method always
 returns the same value for the same thread, regardless of the nested execution:
 
 .. include:: ../includes/threading/nested-parallel-ls.rst

From 28fdc89033c79abd9f8343ae799556749a57183c Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Wed, 31 Jul 2024 02:31:46 -0700
Subject: [PATCH 11/19] Minor change: sum -> add

---
 docs/source/contribution/threading.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/contribution/threading.rst b/docs/source/contribution/threading.rst
index 4a40d460b13..2400a312fb6 100644
--- a/docs/source/contribution/threading.rst
+++ b/docs/source/contribution/threading.rst
@@ -77,7 +77,7 @@ Here is a variant of sequential implementation:
 Parallel computations can be performed in two steps:
 
     1. Compute partial dot product at each threaded.
-    2. Perform a reduction: Sum the partial results from all threads to compute the final dot product.
+    2. Perform a reduction: Add the partial results from all threads to compute the final dot product.
 
 ``daal::tls`` provides a local storage where each thread can accumulate its local results.
 Following code allocates memory that would store partial dot products for each thread:

From 9278d4cdfd6e062060261c37687bbecec8f2b7a9 Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Thu, 1 Aug 2024 00:59:42 -0700
Subject: [PATCH 12/19] Spelling: benefitial -> beneficial

---
 docs/source/contribution/threading.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/contribution/threading.rst b/docs/source/contribution/threading.rst
index 2400a312fb6..6a25b3c9c78 100644
--- a/docs/source/contribution/threading.rst
+++ b/docs/source/contribution/threading.rst
@@ -118,11 +118,11 @@ By default, oneTBB uses
 and work stealing.
 It means that two different runs of the same parallel loop can produce different
 mappings of the loop's iteration space to the available threads.
-This strategy is benefitial when it is difficult to estimate the amount of work performed
+This strategy is beneficial when it is difficult to estimate the amount of work performed
 by each iteration.
 
 In the cases when it is known that the iterations perform an equal amount of work, it
-might be benefitial to use predefined mapping of the loop's iterations to threads.
+might be beneficial to use predefined mapping of the loop's iterations to threads.
 This is what static work scheduling does.
 
 ``daal::static_threader_for`` and ``daal::static_tls`` allow implementation of static

From dc9739a991c67514e764e00aa8a8e0bf62f15bb6 Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Thu, 1 Aug 2024 01:20:25 -0700
Subject: [PATCH 13/19] Fix in parallel reduction example

---
 docs/source/includes/threading/dot-parallel-reduction.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/includes/threading/dot-parallel-reduction.rst b/docs/source/includes/threading/dot-parallel-reduction.rst
index 6cfc07353d2..e86ca030246 100644
--- a/docs/source/includes/threading/dot-parallel-reduction.rst
+++ b/docs/source/includes/threading/dot-parallel-reduction.rst
@@ -14,6 +14,8 @@
 .. * limitations under the License.
 .. *******************************************************************************/
 
+::
+
   float dotProduct = 0.0f;
   tls.reduce([&](float * localDotProduct) {
     if (localDotProduct) {

From 54b8e943d9ec85bf1cf74aaba3ee8a5d9ccc608c Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Mon, 12 Aug 2024 00:51:47 -0700
Subject: [PATCH 14/19] Replace double backtick with single backtick in
 comments

---
 cpp/daal/src/threading/threading.h | 62 +++++++++++++++---------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/cpp/daal/src/threading/threading.h b/cpp/daal/src/threading/threading.h
index a4ff895a5ab..85fe5c09481 100644
--- a/cpp/daal/src/threading/threading.h
+++ b/cpp/daal/src/threading/threading.h
@@ -227,8 +227,8 @@ inline void threader_func_break(int i, bool & needBreak, const void * a)
 /// The maximal number of iterations in the loop is 2^31 - 1.
 /// The work is scheduled dynamically across threads.
 ///
-/// @tparam F   Lambda function of type ``[/* captures */](int i) -> void``,
-///             where ``i`` is the loop's iteration index, ``0 <= i < n``.
+/// @tparam F   Lambda function of type `[/* captures */](int i) -> void`,
+///             where `i` is the loop's iteration index, `0 <= i < n`.
 ///
 /// @param[in] n        Number of iterations in the for loop.
 /// @param[in] reserved Parameter reserved for the future. Currently unused.
@@ -245,8 +245,8 @@ inline void threader_for(int n, int reserved, const F & lambda)
 /// The maximal number of iterations in the loop is 2^63 - 1.
 /// The work is scheduled dynamically across threads.
 ///
-/// @tparam F   Lambda function of type [/* captures */](int64_t i) -> void,
-///             where ``i`` is the loop's iteration index, ``0 <= i < n``.
+/// @tparam F   Lambda function of type `[/* captures */](int64_t i) -> void`,
+///             where `i` is the loop's iteration index, `0 <= i < n`.
 ///
 /// @param[in] n        Number of iterations in the for loop.
 /// @param[in] lambda   Lambda function that defines iteration's body.
@@ -261,12 +261,12 @@ inline void threader_for_int64(int64_t n, const F & lambda)
 /// Execute the for loop defined by the input parameters in parallel.
 /// The maximal number of iterations in the loop is 2^31 - 1.
 /// The work is scheduled dynamically across threads.
-/// The iteration space is chunked using oneTBB ``simple_partitioner``
+/// The iteration space is chunked using oneTBB `simple_partitioner`
 /// (https://oneapi-src.github.io/oneTBB/main/tbb_userguide/Partitioner_Summary.html)
 /// with chunk size 1.
 ///
-/// @tparam F   Lambda function of type [/* captures */](int i) -> void,
-///             where ``i`` is the loop's iteration index, ``0 <= i < n``.
+/// @tparam F   Lambda function of type `[/* captures */](int i) -> void`,
+///             where `i` is the loop's iteration index, `0 <= i < n`.
 ///
 /// @param[in] n        Number of iterations in the for loop.
 /// @param[in] reserved Parameter reserved for the future. Currently unused.
@@ -288,7 +288,7 @@ inline void threader_for_int32ptr(const int * begin, const int * end, const F &
 }
 
 /// Execute the for loop defined by the input parameters in parallel.
-/// The maximal number of iterations in the loop is ``SIZE_MAX`` in C99 standard.
+/// The maximal number of iterations in the loop is `SIZE_MAX` in C99 standard.
 ///
 /// The work is scheduled statically across threads.
 /// This means that the work is always scheduled in the same way across the threads:
@@ -297,22 +297,22 @@ inline void threader_for_int32ptr(const int * begin, const int * end, const F &
 /// It is recommended to use this parallel loop if each iteration of the loop
 /// performs equal amount of work.
 ///
-/// Let ``t`` be the number of threads available to oneDAL.
+/// Let `t` be the number of threads available to oneDAL.
 ///
 /// Then the number of iterations processed by each threads (except maybe the last one)
 /// is computed as:
-/// ``nI = (n + t - 1) / t``
+/// `nI = (n + t - 1) / t`
 ///
 /// Here is how the work is split across the threads:
-/// The 1st thread executes iterations ``0``, ..., ``nI - 1``;
-/// the 2nd thread executes iterations ``nI``, ..., ``2 * nI - 1``;
+/// The 1st thread executes iterations `0, ..., nI - 1`;
+/// the 2nd thread executes iterations `nI, ..., 2 * nI - 1`;
 /// ...
-/// the t-th thread executes iterations ``(t - 1) * nI``, ..., ``n - 1``.
+/// the t-th thread executes iterations `(t - 1) * nI, ..., n - 1`.
 ///
-/// @tparam F   Lambda function of type [/* captures */](size_t i, size_t tid) -> void,
+/// @tparam F   Lambda function of type `[/* captures */](size_t i, size_t tid) -> void`,
 ///             where
-///                 ``i`` is the loop's iteration index, ``0 <= i < n``;
-///                 ``tid`` is the index of the thread, ``0 <= tid < t``.
+///                 `i` is the loop's iteration index, `0 <= i < n`;
+///                 `tid` is the index of the thread, `0 <= tid < t`.
 ///
 /// @param[in] n        Number of iterations in the for loop.
 /// @param[in] lambda   Lambda function that defines iteration's body.
@@ -328,17 +328,17 @@ inline void static_threader_for(size_t n, const F & lambda)
 /// The maximal number of iterations in the loop is 2^31 - 1.
 /// The work is scheduled dynamically across threads.
 ///
-/// @tparam F   Lambda function of type [/* captures */](int beginRange, int endRange) -> void
+/// @tparam F   Lambda function of type `[/* captures */](int beginRange, int endRange) -> void`
 ///             where
-///                 ``beginRange`` is the starting index of the loop's iterations block to be
-///                                processed by a thread, ``0 <= beginRange < n``;
-///                 ``endRange``   is the index after the end of the loop's iterations block to be
-///                                processed by a thread, ``beginRange < endRange <= n``;
+///                 `beginRange` is the starting index of the loop's iterations block to be
+///                                processed by a thread, `0 <= beginRange < n`;
+///                 `endRange`   is the index after the end of the loop's iterations block to be
+///                                processed by a thread, `beginRange < endRange <= n`;
 ///
 /// @param[in] n        Number of iterations in the for loop.
 /// @param[in] reserved Parameter reserved for the future. Currently unused.
 /// @param[in] lambda   Lambda function that processes the block of loop's iterations
-///                     ``[beginRange, endRange)``.
+///                     `[beginRange, endRange)`.
 template <typename F>
 inline void threader_for_blocked(int n, int reserved, const F & lambda)
 {
@@ -408,7 +408,7 @@ class tls : public tlsBase
 public:
     /// Initialize thread-local storage
     ///
-    /// @tparam lambdaType  Lambda function of type [/* captures */]() -> F
+    /// @tparam lambdaType  Lambda function of type `[/* captures */]() -> F`
     ///
     /// @param lambda       Lambda function that initializes a thread-local storage
     template <typename lambdaType>
@@ -450,7 +450,7 @@ class tls : public tlsBase
 
     /// Sequential reduction.
     ///
-    /// @tparam lambdaType  Lambda function of type [/* captures */](F) -> void
+    /// @tparam lambdaType  Lambda function of type `[/* captures */](F) -> void`
     ///
     /// @param lambda       Lambda function that is applied to each element of thread-local
     ///                     storage sequentially.
@@ -464,7 +464,7 @@ class tls : public tlsBase
 
     /// Parallel reduction.
     ///
-    /// @tparam lambdaType  Lambda function of type [/* captures */](F) -> void
+    /// @tparam lambdaType  Lambda function of type `[/* captures */](F) -> void`
     ///
     /// @param lambda       Lambda function that is applied to each element of thread-local
     ///                     storage in parallel.
@@ -513,7 +513,7 @@ class static_tls
 public:
     /// Initialize thread-local storage.
     ///
-    /// @tparam lambdaType  Lambda function of type [/* captures */]() -> F
+    /// @tparam lambdaType  Lambda function of type `[/* captures */]() -> F`
     ///
     /// @param lambda       Lambda function that initializes a thread-local storage
     template <typename lambdaType>
@@ -588,7 +588,7 @@ class static_tls
 
     /// Sequential reduction.
     ///
-    /// @tparam lambdaType  Lambda function of type [/* captures */](F) -> void
+    /// @tparam lambdaType  Lambda function of type `[/* captures */](F) -> void`
     ///
     /// @param lambda       Lambda function that is applied to each element of thread-local
     ///                     storage sequentially.
@@ -619,7 +619,7 @@ class static_tls
 
 /// Local storage (LS) for a data of a thread.
 /// Does not change its local variable after nested parallel constructs,
-/// but can have performance penalties comparing to daal::tls.
+/// but can have performance penalties comparing to `daal::tls`.
 /// Can be safely used in case of nested parallel regions.
 ///
 /// @tparam F  Type of the data located in the storage
@@ -629,10 +629,10 @@ class ls : public tlsBase
 public:
     /// Initialize a local storage.
     ///
-    /// @tparam lambdaType  Lambda function of type [/* captures */]() -> F
+    /// @tparam lambdaType  Lambda function of type `[/* captures */]() -> F`
     ///
     /// @param lambda   Lambda function that initializes a local storage
-    /// @param isTls    if true, then local storage is a thread-local storage (daal::tls)
+    /// @param isTls    if true, then local storage is a thread-local storage (`daal::tls`)
     ///                 and might have problems in case of nested parallel regions.
     template <typename lambdaType>
     explicit ls(const lambdaType & lambda, const bool isTls = false)
@@ -679,7 +679,7 @@ class ls : public tlsBase
 
     /// Sequential reduction.
     ///
-    /// @tparam lambdaType  Lambda function of type [/* captures */](F) -> void
+    /// @tparam lambdaType  Lambda function of type `[/* captures */](F) -> void`
     ///
     /// @param lambda       Lambda function that is applied to each element of thread-local
     ///                     storage sequentially.

From 267e7b65d4b85f258b45356ea7892c0312077724 Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Tue, 13 Aug 2024 05:29:19 -0700
Subject: [PATCH 15/19] Updated comments in threading.h

---
 cpp/daal/src/threading/threading.h | 63 +++++++++++++++++++-----------
 1 file changed, 41 insertions(+), 22 deletions(-)

diff --git a/cpp/daal/src/threading/threading.h b/cpp/daal/src/threading/threading.h
index 85fe5c09481..78edceb627d 100644
--- a/cpp/daal/src/threading/threading.h
+++ b/cpp/daal/src/threading/threading.h
@@ -223,16 +223,20 @@ inline void threader_func_break(int i, bool & needBreak, const void * a)
     lambda(i, needBreak);
 }
 
-/// Execute the for loop defined by the input parameters in parallel.
+/// Pass a function to be executed in a for loop to the threading layer.
 /// The maximal number of iterations in the loop is 2^31 - 1.
-/// The work is scheduled dynamically across threads.
+/// The default scheduling of the threading layer is used to assign
+/// the iterations of the loop to threads.
+/// The iterations of the loop should be logically independent.
+/// Data dependencies between the iterations are allowed, but may requre the use
+/// of synchronization primitives.
 ///
 /// @tparam F   Lambda function of type `[/* captures */](int i) -> void`,
 ///             where `i` is the loop's iteration index, `0 <= i < n`.
 ///
 /// @param[in] n        Number of iterations in the for loop.
 /// @param[in] reserved Parameter reserved for the future. Currently unused.
-/// @param[in] lambda   Lambda function that defines iteration's body.
+/// @param[in] lambda   Lambda function that defines the loop body.
 template <typename F>
 inline void threader_for(int n, int reserved, const F & lambda)
 {
@@ -241,15 +245,19 @@ inline void threader_for(int n, int reserved, const F & lambda)
     _daal_threader_for(n, reserved, a, threader_func<F>);
 }
 
-/// Execute the for loop defined by the input parameters in parallel.
+/// Pass a function to be executed in a for loop to the threading layer.
 /// The maximal number of iterations in the loop is 2^63 - 1.
-/// The work is scheduled dynamically across threads.
+/// The default scheduling of the threading layer is used to assign
+/// the iterations of the loop to threads.
+/// The iterations of the loop should be logically independent.
+/// Data dependencies between the iterations are allowed, but may requre the use
+/// of synchronization primitives.
 ///
 /// @tparam F   Lambda function of type `[/* captures */](int64_t i) -> void`,
 ///             where `i` is the loop's iteration index, `0 <= i < n`.
 ///
 /// @param[in] n        Number of iterations in the for loop.
-/// @param[in] lambda   Lambda function that defines iteration's body.
+/// @param[in] lambda   Lambda function that defines the loop body.
 template <typename F>
 inline void threader_for_int64(int64_t n, const F & lambda)
 {
@@ -258,12 +266,20 @@ inline void threader_for_int64(int64_t n, const F & lambda)
     _daal_threader_for_int64(n, a, threader_func<F>);
 }
 
-/// Execute the for loop defined by the input parameters in parallel.
+/// Pass a function to be executed in a for loop to the threading layer.
 /// The maximal number of iterations in the loop is 2^31 - 1.
-/// The work is scheduled dynamically across threads.
-/// The iteration space is chunked using oneTBB `simple_partitioner`
+///
+/// The specifics of this loop comparing to `threader_for` is that the iteration spase
+/// of the loop is always chunked to the chunks of size 1.
+/// This means the threading layer tries to assign the consecutive iterations to
+/// a different threads if possible.
+/// In case of oneTBB threading backend this means that `simple_partitioner`
 /// (https://oneapi-src.github.io/oneTBB/main/tbb_userguide/Partitioner_Summary.html)
-/// with chunk size 1.
+/// with chunk size 1 is used to produce iterations to threads mapping.
+///
+/// The iterations of the loop should be logically independent.
+/// Data dependencies between the iterations are allowed, but may requre the use
+/// of synchronization primitives.
 ///
 /// @tparam F   Lambda function of type `[/* captures */](int i) -> void`,
 ///             where `i` is the loop's iteration index, `0 <= i < n`.
@@ -324,9 +340,10 @@ inline void static_threader_for(size_t n, const F & lambda)
     _daal_static_threader_for(n, a, static_threader_func<F>);
 }
 
-/// Execute the for loop defined by the input parameters in parallel.
+/// Pass a function to be executed in a for loop to the threading layer.
 /// The maximal number of iterations in the loop is 2^31 - 1.
-/// The work is scheduled dynamically across threads.
+/// The default scheduling of the threading layer is used to assign
+/// the iterations of the loop to threads.
 ///
 /// @tparam F   Lambda function of type `[/* captures */](int beginRange, int endRange) -> void`
 ///             where
@@ -399,7 +416,9 @@ class tls_deleter_ : public tls_deleter
 
 /// Thread-local storage (TLS).
 /// Can change its local variable after a nested parallel constructs.
-/// @note Use carefully in case of nested parallel regions.
+/// @note Thread-local storage in nested parallel regions is, in general, not thread local.
+/// The use of nested parallelism should be avioded if possible, otherwise extra care
+/// must be taken with thread-local values.
 ///
 /// @tparam F  Type of the data located in the storage
 template <typename F>
@@ -606,7 +625,7 @@ class static_tls
 
     /// Full number of threads.
     ///
-    /// @return Number of threads available.
+    /// @return Total number of threads available to oneDAL.
     size_t nthreads() const { return _nThreads; }
 
 private:
@@ -617,9 +636,9 @@ class static_tls
     static_tls_deleter * _deleter    = nullptr;
 };
 
-/// Local storage (LS) for a data of a thread.
+/// Local storage (LS) for the data of a thread.
 /// Does not change its local variable after nested parallel constructs,
-/// but can have performance penalties comparing to `daal::tls`.
+/// but can have performance penalties compared to thread-local storage type `daal::tls`.
 /// Can be safely used in case of nested parallel regions.
 ///
 /// @tparam F  Type of the data located in the storage
@@ -627,12 +646,12 @@ template <typename F>
 class ls : public tlsBase
 {
 public:
-    /// Initialize a local storage.
+    /// Initialize local storage.
     ///
     /// @tparam lambdaType  Lambda function of type `[/* captures */]() -> F`
     ///
-    /// @param lambda   Lambda function that initializes a local storage
-    /// @param isTls    if true, then local storage is a thread-local storage (`daal::tls`)
+    /// @param lambda   Lambda function that initializes local storage
+    /// @param isTls    if `true`, then local storage is a thread-local storage (`daal::tls`)
     ///                 and might have problems in case of nested parallel regions.
     template <typename lambdaType>
     explicit ls(const lambdaType & lambda, const bool isTls = false)
@@ -649,9 +668,9 @@ class ls : public tlsBase
         lsPtr = _isTls ? _daal_get_tls_ptr(a, tls_func<lambdaType>) : _daal_get_ls_ptr(a, tls_func<lambdaType>);
     }
 
-    /// Destroys the memory associated with a local storage.
+    /// Destroys the memory associated with local storage.
     ///
-    /// @note LS does not release the memory allocated by a lambda-function
+    /// @note `ls` does not release the memory allocated by a lambda-function
     ///       provided to the constructor.
     ///       Developers are responsible for deletion of that memory.
     virtual ~ls()
@@ -661,7 +680,7 @@ class ls : public tlsBase
         _isTls ? _daal_del_tls_ptr(lsPtr) : _daal_del_ls_ptr(lsPtr);
     }
 
-    /// Access a local data of a thread by value.
+    /// Access the local data of a thread by value.
     ///
     /// @return When first invoked by a thread, a lambda provided to the constructor is
     ///         called to initialize the local data of the thread and return it.

From 74be85a33620a33bcf44ea7ae4ae4d1a05d13d41 Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Tue, 13 Aug 2024 05:57:41 -0700
Subject: [PATCH 16/19] Apply review comments to threading.rst

---
 docs/source/contribution/threading.rst | 30 +++++++++++++-------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/docs/source/contribution/threading.rst b/docs/source/contribution/threading.rst
index 6a25b3c9c78..91ce3e2a2ad 100644
--- a/docs/source/contribution/threading.rst
+++ b/docs/source/contribution/threading.rst
@@ -27,12 +27,13 @@ use custom primitives that either wrap oneTBB functionality or are in-house deve
 Those primitives form oneDAL's threading layer.
 
 This is done in order not to be dependent on possible oneTBB API changes and even
-on the particular threading technology.
+on the particular threading technology like oneTBB, C++11 standard threads, etc.
 
 The API of the layer is defined in
 `threading.h <https://github.com/oneapi-src/oneDAL/blob/main/cpp/daal/src/threading/threading.h>`_.
-Please be aware that those APIs are not publicly defined, so they can be changed at any time
-without any notification.
+Please be aware that the threading API is not a part of oneDAL product API.
+This is the product internal API that aimed to be used only by oneDAL developers, and can be changed at any time
+without any prior notification.
 
 This chapter describes common parallel patterns and primitives of the threading layer.
 
@@ -52,8 +53,7 @@ One of the options is to use ``daal::threader_for`` as shown here:
 .. include:: ../includes/threading/sum-parallel.rst
 
 The iteration space here goes from ``0`` to ``n-1``.
-The last argument is the lambda function that defines a function object that proceeds ``i``-th
-iteration of the loop.
+The last argument is a function object that performs a single iteration of the loop, given loop index ``i``.
 
 Blocking
 --------
@@ -76,37 +76,37 @@ Here is a variant of sequential implementation:
 
 Parallel computations can be performed in two steps:
 
-    1. Compute partial dot product at each threaded.
+    1. Compute partial dot product in each thread.
     2. Perform a reduction: Add the partial results from all threads to compute the final dot product.
 
 ``daal::tls`` provides a local storage where each thread can accumulate its local results.
-Following code allocates memory that would store partial dot products for each thread:
+The following code allocates memory that would store partial dot products for each thread:
 
 .. include:: ../includes/threading/dot-parallel-init-tls.rst
 
 ``SafeStatus`` in this code denotes a thread-safe counterpart of the ``Status`` class.
-``SafeStatus`` allows to collect errors from all threads and report them to user using
-``detach()`` method as it will be shown later in the code.
+``SafeStatus`` allows to collect errors from all threads and report them to the user using the
+``detach()`` method. An example will be shown later in the documentation.
 
 Checking the status right after the initialization code won't show the allocation errors,
 because oneTBB uses lazy evaluation and the lambda function passed to the constructor of the TLS
-is evaluated in the moment of the TLS's first use.
+is evaluated on first use of the thread-local storage (TLS).
 
-Again, there are several options available in the threading layer of oneDAL to compute the partial
+There are several options available in the threading layer of oneDAL to compute the partial
 dot product results at each thread.
 One of the options is to use the already mentioned ``daal::threader_for`` and blocking approach
 as shown here:
 
 .. include:: ../includes/threading/dot-parallel-partial-compute.rst
 
-To compute the final result it is requred to reduce TLS's partial results over all threads
+To compute the final result it is required to reduce each thread's partial results
 as shown here:
 
 .. include:: ../includes/threading/dot-parallel-reduction.rst
 
-Local memory of the threads should also be released when it is no longer needed.
+Local memory of the threads should be released when it is no longer needed.
 
-The complete parallel verision of dot product computations would look like:
+The complete parallel version of dot product computations would look like:
 
 .. include:: ../includes/threading/dot-parallel.rst
 
@@ -122,7 +122,7 @@ This strategy is beneficial when it is difficult to estimate the amount of work
 by each iteration.
 
 In the cases when it is known that the iterations perform an equal amount of work, it
-might be beneficial to use predefined mapping of the loop's iterations to threads.
+is more performant to use predefined mapping of the loop's iterations to threads.
 This is what static work scheduling does.
 
 ``daal::static_threader_for`` and ``daal::static_tls`` allow implementation of static

From e728c8c60d00a856973366037155161516f7cfc7 Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Thu, 19 Sep 2024 03:03:27 -0700
Subject: [PATCH 17/19] Minor fixes

---
 cpp/daal/src/threading/threading.h     | 30 +++++++++++---------------
 docs/source/contribution/threading.rst |  2 +-
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/cpp/daal/src/threading/threading.h b/cpp/daal/src/threading/threading.h
index 78edceb627d..b83564f25c1 100644
--- a/cpp/daal/src/threading/threading.h
+++ b/cpp/daal/src/threading/threading.h
@@ -224,10 +224,9 @@ inline void threader_func_break(int i, bool & needBreak, const void * a)
 }
 
 /// Pass a function to be executed in a for loop to the threading layer.
-/// The maximal number of iterations in the loop is 2^31 - 1.
+/// The maximal number of iterations in the loop is `2^31 - 1 (INT32_MAX)`.
 /// The default scheduling of the threading layer is used to assign
 /// the iterations of the loop to threads.
-/// The iterations of the loop should be logically independent.
 /// Data dependencies between the iterations are allowed, but may requre the use
 /// of synchronization primitives.
 ///
@@ -246,7 +245,7 @@ inline void threader_for(int n, int reserved, const F & lambda)
 }
 
 /// Pass a function to be executed in a for loop to the threading layer.
-/// The maximal number of iterations in the loop is 2^63 - 1.
+/// The maximal number of iterations in the loop is `2^63 - 1 (INT64_MAX)`.
 /// The default scheduling of the threading layer is used to assign
 /// the iterations of the loop to threads.
 /// The iterations of the loop should be logically independent.
@@ -269,15 +268,14 @@ inline void threader_for_int64(int64_t n, const F & lambda)
 /// Pass a function to be executed in a for loop to the threading layer.
 /// The maximal number of iterations in the loop is 2^31 - 1.
 ///
-/// The specifics of this loop comparing to `threader_for` is that the iteration spase
-/// of the loop is always chunked to the chunks of size 1.
-/// This means the threading layer tries to assign the consecutive iterations to
-/// a different threads if possible.
+/// The specifics of this loop comparing to `threader_for` is that the iteration space
+/// of the loop is always chunked with chunk size 1.
+/// This means the threading layer tries to assign consecutive iterations to
+/// different threads, if possible.
 /// In case of oneTBB threading backend this means that `simple_partitioner`
 /// (https://oneapi-src.github.io/oneTBB/main/tbb_userguide/Partitioner_Summary.html)
-/// with chunk size 1 is used to produce iterations to threads mapping.
+/// with chunk size 1 is used to produce iteration to threads mappings.
 ///
-/// The iterations of the loop should be logically independent.
 /// Data dependencies between the iterations are allowed, but may requre the use
 /// of synchronization primitives.
 ///
@@ -313,17 +311,15 @@ inline void threader_for_int32ptr(const int * begin, const int * end, const F &
 /// It is recommended to use this parallel loop if each iteration of the loop
 /// performs equal amount of work.
 ///
-/// Let `t` be the number of threads available to oneDAL.
-///
-/// Then the number of iterations processed by each threads (except maybe the last one)
-/// is computed as:
+/// Let `t` be the number of threads available to oneDAL. The number of iterations
+/// processed by each threads (except maybe the last one) is computed as:
 /// `nI = (n + t - 1) / t`
 ///
 /// Here is how the work is split across the threads:
 /// The 1st thread executes iterations `0, ..., nI - 1`;
 /// the 2nd thread executes iterations `nI, ..., 2 * nI - 1`;
 /// ...
-/// the t-th thread executes iterations `(t - 1) * nI, ..., n - 1`.
+/// the `t`-th thread executes iterations `(t - 1) * nI, ..., n - 1`.
 ///
 /// @tparam F   Lambda function of type `[/* captures */](size_t i, size_t tid) -> void`,
 ///             where
@@ -341,13 +337,13 @@ inline void static_threader_for(size_t n, const F & lambda)
 }
 
 /// Pass a function to be executed in a for loop to the threading layer.
-/// The maximal number of iterations in the loop is 2^31 - 1.
+/// The maximal number of iterations in the loop is `2^31 - 1 INT32_MAX`.
 /// The default scheduling of the threading layer is used to assign
 /// the iterations of the loop to threads.
 ///
 /// @tparam F   Lambda function of type `[/* captures */](int beginRange, int endRange) -> void`
 ///             where
-///                 `beginRange` is the starting index of the loop's iterations block to be
+///                 `beginRange` is the starting index of the loop iterations block to be
 ///                                processed by a thread, `0 <= beginRange < n`;
 ///                 `endRange`   is the index after the end of the loop's iterations block to be
 ///                                processed by a thread, `beginRange < endRange <= n`;
@@ -417,7 +413,7 @@ class tls_deleter_ : public tls_deleter
 /// Thread-local storage (TLS).
 /// Can change its local variable after a nested parallel constructs.
 /// @note Thread-local storage in nested parallel regions is, in general, not thread local.
-/// The use of nested parallelism should be avioded if possible, otherwise extra care
+/// The use of nested parallelism should be avoided if possible, otherwise extra care
 /// must be taken with thread-local values.
 ///
 /// @tparam F  Type of the data located in the storage
diff --git a/docs/source/contribution/threading.rst b/docs/source/contribution/threading.rst
index 91ce3e2a2ad..cd1acd84e95 100644
--- a/docs/source/contribution/threading.rst
+++ b/docs/source/contribution/threading.rst
@@ -69,7 +69,7 @@ This code shows how a typical parallel loop in oneDAL looks like:
 Thread-local Storage (TLS)
 **************************
 
-Lets consider you need to compute a dot product of two arrays.
+Consider you need to compute a dot product of two arrays.
 Here is a variant of sequential implementation:
 
 .. include:: ../includes/threading/dot-sequential.rst

From 2b44995c07226199292c26f23313605ad56f5735 Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Thu, 19 Sep 2024 04:28:35 -0700
Subject: [PATCH 18/19] 'Lambda function' term was changed to more generic and
 appropriate 'Callable object' term

---
 cpp/daal/src/threading/threading.h | 212 ++++++++++++++---------------
 1 file changed, 106 insertions(+), 106 deletions(-)

diff --git a/cpp/daal/src/threading/threading.h b/cpp/daal/src/threading/threading.h
index b83564f25c1..832b1ef8c13 100644
--- a/cpp/daal/src/threading/threading.h
+++ b/cpp/daal/src/threading/threading.h
@@ -198,29 +198,29 @@ inline size_t setNumberOfThreads(const size_t numThreads, void ** globalControl)
 template <typename F>
 inline void threader_func(int i, const void * a)
 {
-    const F & lambda = *static_cast<const F *>(a);
-    lambda(i);
+    const F & func = *static_cast<const F *>(a);
+    func(i);
 }
 
 template <typename F>
 inline void static_threader_func(size_t i, size_t tid, const void * a)
 {
-    const F & lambda = *static_cast<const F *>(a);
-    lambda(i, tid);
+    const F & func = *static_cast<const F *>(a);
+    func(i, tid);
 }
 
 template <typename F>
 inline void threader_func_b(int i0, int in, const void * a)
 {
-    const F & lambda = *static_cast<const F *>(a);
-    lambda(i0, in);
+    const F & func = *static_cast<const F *>(a);
+    func(i0, in);
 }
 
 template <typename F>
 inline void threader_func_break(int i, bool & needBreak, const void * a)
 {
-    const F & lambda = *static_cast<const F *>(a);
-    lambda(i, needBreak);
+    const F & func = *static_cast<const F *>(a);
+    func(i, needBreak);
 }
 
 /// Pass a function to be executed in a for loop to the threading layer.
@@ -230,16 +230,16 @@ inline void threader_func_break(int i, bool & needBreak, const void * a)
 /// Data dependencies between the iterations are allowed, but may requre the use
 /// of synchronization primitives.
 ///
-/// @tparam F   Lambda function of type `[/* captures */](int i) -> void`,
+/// @tparam F   Callable object of type `[/* captures */](int i) -> void`,
 ///             where `i` is the loop's iteration index, `0 <= i < n`.
 ///
 /// @param[in] n        Number of iterations in the for loop.
 /// @param[in] reserved Parameter reserved for the future. Currently unused.
-/// @param[in] lambda   Lambda function that defines the loop body.
+/// @param[in] func     Callable object that defines the loop body.
 template <typename F>
-inline void threader_for(int n, int reserved, const F & lambda)
+inline void threader_for(int n, int reserved, const F & callable)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&callable);
 
     _daal_threader_for(n, reserved, a, threader_func<F>);
 }
@@ -252,15 +252,15 @@ inline void threader_for(int n, int reserved, const F & lambda)
 /// Data dependencies between the iterations are allowed, but may requre the use
 /// of synchronization primitives.
 ///
-/// @tparam F   Lambda function of type `[/* captures */](int64_t i) -> void`,
+/// @tparam F   Callable object of type `[/* captures */](int64_t i) -> void`,
 ///             where `i` is the loop's iteration index, `0 <= i < n`.
 ///
 /// @param[in] n        Number of iterations in the for loop.
-/// @param[in] lambda   Lambda function that defines the loop body.
+/// @param[in] func     Callable object that defines the loop body.
 template <typename F>
-inline void threader_for_int64(int64_t n, const F & lambda)
+inline void threader_for_int64(int64_t n, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for_int64(n, a, threader_func<F>);
 }
@@ -279,24 +279,24 @@ inline void threader_for_int64(int64_t n, const F & lambda)
 /// Data dependencies between the iterations are allowed, but may requre the use
 /// of synchronization primitives.
 ///
-/// @tparam F   Lambda function of type `[/* captures */](int i) -> void`,
+/// @tparam F   Callable object of type `[/* captures */](int i) -> void`,
 ///             where `i` is the loop's iteration index, `0 <= i < n`.
 ///
 /// @param[in] n        Number of iterations in the for loop.
 /// @param[in] reserved Parameter reserved for the future. Currently unused.
-/// @param[in] lambda   Lambda function that defines iteration's body.
+/// @param[in] func     Callable object that defines iteration's body.
 template <typename F>
-inline void threader_for_simple(int n, int reserved, const F & lambda)
+inline void threader_for_simple(int n, int reserved, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for_simple(n, reserved, a, threader_func<F>);
 }
 
 template <typename F>
-inline void threader_for_int32ptr(const int * begin, const int * end, const F & lambda)
+inline void threader_for_int32ptr(const int * begin, const int * end, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for_int32ptr(begin, end, a, threader_func<F>);
 }
@@ -321,17 +321,17 @@ inline void threader_for_int32ptr(const int * begin, const int * end, const F &
 /// ...
 /// the `t`-th thread executes iterations `(t - 1) * nI, ..., n - 1`.
 ///
-/// @tparam F   Lambda function of type `[/* captures */](size_t i, size_t tid) -> void`,
+/// @tparam F   Callable object of type `[/* captures */](size_t i, size_t tid) -> void`,
 ///             where
 ///                 `i` is the loop's iteration index, `0 <= i < n`;
 ///                 `tid` is the index of the thread, `0 <= tid < t`.
 ///
 /// @param[in] n        Number of iterations in the for loop.
-/// @param[in] lambda   Lambda function that defines iteration's body.
+/// @param[in] func     Callable object that defines iteration's body.
 template <typename F>
-inline void static_threader_for(size_t n, const F & lambda)
+inline void static_threader_for(size_t n, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_static_threader_for(n, a, static_threader_func<F>);
 }
@@ -341,7 +341,7 @@ inline void static_threader_for(size_t n, const F & lambda)
 /// The default scheduling of the threading layer is used to assign
 /// the iterations of the loop to threads.
 ///
-/// @tparam F   Lambda function of type `[/* captures */](int beginRange, int endRange) -> void`
+/// @tparam F   Callable object of type `[/* captures */](int beginRange, int endRange) -> void`
 ///             where
 ///                 `beginRange` is the starting index of the loop iterations block to be
 ///                                processed by a thread, `0 <= beginRange < n`;
@@ -350,44 +350,44 @@ inline void static_threader_for(size_t n, const F & lambda)
 ///
 /// @param[in] n        Number of iterations in the for loop.
 /// @param[in] reserved Parameter reserved for the future. Currently unused.
-/// @param[in] lambda   Lambda function that processes the block of loop's iterations
+/// @param[in] func     Callable object that processes the block of loop's iterations
 ///                     `[beginRange, endRange)`.
 template <typename F>
-inline void threader_for_blocked(int n, int reserved, const F & lambda)
+inline void threader_for_blocked(int n, int reserved, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for_blocked(n, reserved, a, threader_func_b<F>);
 }
 
 template <typename F>
-inline void threader_for_optional(int n, int threads_request, const F & lambda)
+inline void threader_for_optional(int n, int threads_request, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for_optional(n, threads_request, a, threader_func<F>);
 }
 
 template <typename F>
-inline void threader_for_break(int n, int threads_request, const F & lambda)
+inline void threader_for_break(int n, int threads_request, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for_break(n, threads_request, a, threader_func_break<F>);
 }
 
-template <typename lambdaType>
+template <typename callableType>
 inline void * tls_func(const void * a)
 {
-    const lambdaType & lambda = *static_cast<const lambdaType *>(a);
-    return lambda();
+    const callableType & func = *static_cast<const callableType *>(a);
+    return func();
 }
 
-template <typename F, typename lambdaType>
+template <typename F, typename callableType>
 inline void tls_reduce_func(void * v, const void * a)
 {
-    const lambdaType & lambda = *static_cast<const lambdaType *>(a);
-    lambda((F)v);
+    const callableType & func = *static_cast<const callableType *>(a);
+    func((F)v);
 }
 
 struct tlsBase
@@ -402,12 +402,12 @@ class tls_deleter : public tlsBase
     virtual void del(void * a) = 0;
 };
 
-template <typename lambdaType>
+template <typename callableType>
 class tls_deleter_ : public tls_deleter
 {
 public:
     virtual ~tls_deleter_() {}
-    virtual void del(void * a) { delete static_cast<lambdaType *>(a); }
+    virtual void del(void * a) { delete static_cast<callableType *>(a); }
 };
 
 /// Thread-local storage (TLS).
@@ -423,26 +423,26 @@ class tls : public tlsBase
 public:
     /// Initialize thread-local storage
     ///
-    /// @tparam lambdaType  Lambda function of type `[/* captures */]() -> F`
+    /// @tparam callableType  Callable object of type `[/* captures */]() -> F`
     ///
-    /// @param lambda       Lambda function that initializes a thread-local storage
-    template <typename lambdaType>
-    explicit tls(const lambdaType & lambda)
+    /// @param func Callable object that initializes a thread-local storage
+    template <typename callableType>
+    explicit tls(const callableType & func)
     {
-        lambdaType * locall = new lambdaType(lambda);
-        d                   = new tls_deleter_<lambdaType>();
+        callableType * localfunc = new callableType(func);
+        d                        = new tls_deleter_<callableType>();
 
-        //const void* ac = static_cast<const void*>(&lambda);
-        const void * ac = static_cast<const void *>(locall);
+        //const void* ac = static_cast<const void*>(&func);
+        const void * ac = static_cast<const void *>(localfunc);
         void * a        = const_cast<void *>(ac);
         voidLambda      = a;
 
-        tlsPtr = _daal_get_tls_ptr(a, tls_func<lambdaType>);
+        tlsPtr = _daal_get_tls_ptr(a, tls_func<callableType>);
     }
 
     /// Destroys the memory associated with a thread-local storage
     ///
-    /// @note TLS does not release the memory allocated by a lambda-function
+    /// @note TLS does not release the memory allocated by a callable object
     ///       provided to the constructor.
     ///       Developers are responsible for deletion of that memory.
     virtual ~tls()
@@ -454,7 +454,7 @@ class tls : public tlsBase
 
     /// Access a local data of a thread by value
     ///
-    /// @return When first invoked by a thread, a lambda provided to the constructor is
+    /// @return When first invoked by a thread, a callable object provided to the constructor is
     ///         called to initialize the local data of the thread and return it.
     ///         All the following invocations just return the same thread-local data.
     F local()
@@ -465,30 +465,30 @@ class tls : public tlsBase
 
     /// Sequential reduction.
     ///
-    /// @tparam lambdaType  Lambda function of type `[/* captures */](F) -> void`
+    /// @tparam callableType  Callable object of type `[/* captures */](F) -> void`
     ///
-    /// @param lambda       Lambda function that is applied to each element of thread-local
-    ///                     storage sequentially.
-    template <typename lambdaType>
-    void reduce(const lambdaType & lambda)
+    /// @param func Callable object that is applied to each element of thread-local
+    ///             storage sequentially.
+    template <typename callableType>
+    void reduce(const callableType & func)
     {
-        const void * ac = static_cast<const void *>(&lambda);
+        const void * ac = static_cast<const void *>(&func);
         void * a        = const_cast<void *>(ac);
-        _daal_reduce_tls(tlsPtr, a, tls_reduce_func<F, lambdaType>);
+        _daal_reduce_tls(tlsPtr, a, tls_reduce_func<F, callableType>);
     }
 
     /// Parallel reduction.
     ///
-    /// @tparam lambdaType  Lambda function of type `[/* captures */](F) -> void`
+    /// @tparam callableType  Callable object of type `[/* captures */](F) -> void`
     ///
-    /// @param lambda       Lambda function that is applied to each element of thread-local
-    ///                     storage in parallel.
-    template <typename lambdaType>
-    void parallel_reduce(const lambdaType & lambda)
+    /// @param func     Callable object that is applied to each element of thread-local
+    ///                 storage in parallel.
+    template <typename callableType>
+    void parallel_reduce(const callableType & func)
     {
-        const void * ac = static_cast<const void *>(&lambda);
+        const void * ac = static_cast<const void *>(&func);
         void * a        = const_cast<void *>(ac);
-        _daal_parallel_reduce_tls(tlsPtr, a, tls_reduce_func<F, lambdaType>);
+        _daal_parallel_reduce_tls(tlsPtr, a, tls_reduce_func<F, callableType>);
     }
 
 private:
@@ -497,11 +497,11 @@ class tls : public tlsBase
     tls_deleter * d;
 };
 
-template <typename F, typename lambdaType>
+template <typename F, typename callableType>
 inline void * creater_func(const void * a)
 {
-    const lambdaType & lambda = *static_cast<const lambdaType *>(a);
-    return lambda();
+    const callableType & func = *static_cast<const callableType *>(a);
+    return func();
 }
 
 class static_tls_deleter
@@ -511,12 +511,12 @@ class static_tls_deleter
     virtual void del(void * a) = 0;
 };
 
-template <typename lambdaType>
+template <typename callableType>
 class static_tls_deleter_ : public static_tls_deleter
 {
 public:
     virtual ~static_tls_deleter_() {}
-    virtual void del(void * a) { delete static_cast<lambdaType *>(a); }
+    virtual void del(void * a) { delete static_cast<callableType *>(a); }
 };
 
 /// Thread-local storage (TLS) for the case of static parallel work scheduling.
@@ -528,11 +528,11 @@ class static_tls
 public:
     /// Initialize thread-local storage.
     ///
-    /// @tparam lambdaType  Lambda function of type `[/* captures */]() -> F`
+    /// @tparam callableType  Callable object of type `[/* captures */]() -> F`
     ///
-    /// @param lambda       Lambda function that initializes a thread-local storage
-    template <typename lambdaType>
-    explicit static_tls(const lambdaType & lambda)
+    /// @param func Callable object that initializes a thread-local storage
+    template <typename callableType>
+    explicit static_tls(const callableType & func)
     {
         _nThreads = threader_get_max_threads_number();
 
@@ -548,8 +548,8 @@ class static_tls
             _storage[i] = nullptr;
         }
 
-        lambdaType * locall = new lambdaType(lambda);
-        _deleter            = new static_tls_deleter_<lambdaType>();
+        callableType * locall = new callableType(func);
+        _deleter              = new static_tls_deleter_<callableType>();
         if (!locall || !_deleter)
         {
             return;
@@ -559,12 +559,12 @@ class static_tls
         void * a        = const_cast<void *>(ac);
         _creater        = a;
 
-        _creater_func = creater_func<F, lambdaType>;
+        _creater_func = creater_func<F, callableType>;
     }
 
     /// Destroys the memory associated with a thread-local storage.
     ///
-    /// @note Static TLS does not release the memory allocated by a lambda-function
+    /// @note Static TLS does not release the memory allocated by a callable object
     ///       provided to the constructor.
     ///       Developers are responsible for deletion of that memory.
     virtual ~static_tls()
@@ -581,7 +581,7 @@ class static_tls
     ///
     /// @param tid  Index of the thread.
     ///
-    /// @return When first invoked by a thread, a lambda provided to the constructor is
+    /// @return When first invoked by a thread, a callable object provided to the constructor is
     ///         called to initialize the local data of the thread and return it.
     ///         All the following invocations just return the same thread-local data.
     F local(size_t tid)
@@ -603,18 +603,18 @@ class static_tls
 
     /// Sequential reduction.
     ///
-    /// @tparam lambdaType  Lambda function of type `[/* captures */](F) -> void`
+    /// @tparam callableType  Callable object of type `[/* captures */](F) -> void`
     ///
-    /// @param lambda       Lambda function that is applied to each element of thread-local
-    ///                     storage sequentially.
-    template <typename lambdaType>
-    void reduce(const lambdaType & lambda)
+    /// @param func Callable object that is applied to each element of thread-local
+    ///             storage sequentially.
+    template <typename callableType>
+    void reduce(const callableType & func)
     {
         if (_storage)
         {
             for (size_t i = 0; i < _nThreads; ++i)
             {
-                if (_storage[i]) lambda(_storage[i]);
+                if (_storage[i]) func(_storage[i]);
             }
         }
     }
@@ -644,29 +644,29 @@ class ls : public tlsBase
 public:
     /// Initialize local storage.
     ///
-    /// @tparam lambdaType  Lambda function of type `[/* captures */]() -> F`
+    /// @tparam callableType  Callable object of type `[/* captures */]() -> F`
     ///
-    /// @param lambda   Lambda function that initializes local storage
+    /// @param func     Callable object that initializes local storage
     /// @param isTls    if `true`, then local storage is a thread-local storage (`daal::tls`)
     ///                 and might have problems in case of nested parallel regions.
-    template <typename lambdaType>
-    explicit ls(const lambdaType & lambda, const bool isTls = false)
+    template <typename callableType>
+    explicit ls(const callableType & func, const bool isTls = false)
     {
-        _isTls              = isTls;
-        lambdaType * locall = new lambdaType(lambda);
-        d                   = new tls_deleter_<lambdaType>();
+        _isTls                   = isTls;
+        callableType * localfunc = new callableType(func);
+        d                        = new tls_deleter_<callableType>();
 
-        //const void* ac = static_cast<const void*>(&lambda);
-        const void * ac = static_cast<const void *>(locall);
+        //const void* ac = static_cast<const void*>(&func);
+        const void * ac = static_cast<const void *>(localfunc);
         void * a        = const_cast<void *>(ac);
         voidLambda      = a;
 
-        lsPtr = _isTls ? _daal_get_tls_ptr(a, tls_func<lambdaType>) : _daal_get_ls_ptr(a, tls_func<lambdaType>);
+        lsPtr = _isTls ? _daal_get_tls_ptr(a, tls_func<callableType>) : _daal_get_ls_ptr(a, tls_func<callableType>);
     }
 
     /// Destroys the memory associated with local storage.
     ///
-    /// @note `ls` does not release the memory allocated by a lambda-function
+    /// @note `ls` does not release the memory allocated by a callable object
     ///       provided to the constructor.
     ///       Developers are responsible for deletion of that memory.
     virtual ~ls()
@@ -678,7 +678,7 @@ class ls : public tlsBase
 
     /// Access the local data of a thread by value.
     ///
-    /// @return When first invoked by a thread, a lambda provided to the constructor is
+    /// @return When first invoked by a thread, a callable object provided to the constructor is
     ///         called to initialize the local data of the thread and return it.
     ///         All the following invocations just return the same thread-local data.
     F local()
@@ -694,16 +694,16 @@ class ls : public tlsBase
 
     /// Sequential reduction.
     ///
-    /// @tparam lambdaType  Lambda function of type `[/* captures */](F) -> void`
+    /// @tparam callableType  Callable object of type `[/* captures */](F) -> void`
     ///
-    /// @param lambda       Lambda function that is applied to each element of thread-local
-    ///                     storage sequentially.
-    template <typename lambdaType>
-    void reduce(const lambdaType & lambda)
+    /// @param func Callable object that is applied to each element of thread-local
+    ///             storage sequentially.
+    template <typename callableType>
+    void reduce(const callableType & func)
     {
-        const void * ac = static_cast<const void *>(&lambda);
+        const void * ac = static_cast<const void *>(&func);
         void * a        = const_cast<void *>(ac);
-        _isTls ? _daal_reduce_tls(lsPtr, a, tls_reduce_func<F, lambdaType>) : _daal_reduce_ls(lsPtr, a, tls_reduce_func<F, lambdaType>);
+        _isTls ? _daal_reduce_tls(lsPtr, a, tls_reduce_func<F, callableType>) : _daal_reduce_ls(lsPtr, a, tls_reduce_func<F, callableType>);
     }
 
 private:

From 050e6bef713bf21ac6cf91c5a3cf2706ea0a705c Mon Sep 17 00:00:00 2001
From: Victoriya Fedotova <victoriya.s.fedotova@intel.com>
Date: Thu, 19 Sep 2024 04:31:17 -0700
Subject: [PATCH 19/19] Minor fix

---
 cpp/daal/src/threading/threading.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/daal/src/threading/threading.h b/cpp/daal/src/threading/threading.h
index 832b1ef8c13..ca8661f2203 100644
--- a/cpp/daal/src/threading/threading.h
+++ b/cpp/daal/src/threading/threading.h
@@ -237,9 +237,9 @@ inline void threader_func_break(int i, bool & needBreak, const void * a)
 /// @param[in] reserved Parameter reserved for the future. Currently unused.
 /// @param[in] func     Callable object that defines the loop body.
 template <typename F>
-inline void threader_for(int n, int reserved, const F & callable)
+inline void threader_for(int n, int reserved, const F & func)
 {
-    const void * a = static_cast<const void *>(&callable);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for(n, reserved, a, threader_func<F>);
 }