diff --git a/include/ur_api.h b/include/ur_api.h
index 9d88eecbc6..80df5a6fc0 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -224,6 +224,7 @@ typedef enum ur_function_t {
     UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP = 222,                     ///< Enumerator for ::urCommandBufferCommandGetInfoExp
     UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 223,                         ///< Enumerator for ::urEnqueueTimestampRecordingExp
     UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP = 224,                        ///< Enumerator for ::urEnqueueKernelLaunchCustomExp
+    UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE = 225,                    ///< Enumerator for ::urKernelGetSuggestedLocalWorkSize
     /// @cond
     UR_FUNCTION_FORCE_UINT32 = 0x7fffffff
     /// @endcond
@@ -5230,6 +5231,43 @@ urKernelCreateWithNativeHandle(
     ur_kernel_handle_t *phKernel                      ///< [out] pointer to the handle of the kernel object created.
 );
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Get the suggested local work size for a kernel.
+///
+/// @details
+///     - Query a suggested local work size for a kernel given a global size for
+///       each dimension.
+///     - The application may call this function from simultaneous threads for
+///       the same context.
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hKernel`
+///         + `NULL == hQueue`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
+///         + `NULL == pGlobalWorkSize`
+///         + `NULL == pSuggestedLocalWorkSize`
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel,      ///< [in] handle of the kernel
+    ur_queue_handle_t hQueue,        ///< [in] handle of the queue object
+    uint32_t numWorkDim,             ///< [in] number of dimensions, from 1 to 3, to specify the global
+                                     ///< and work-group work-items
+    const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+                                     ///< the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkSize,   ///< [in] pointer to an array of numWorkDim unsigned values that specify
+                                     ///< the number of global work-items in workDim that will execute the
+                                     ///< kernel function
+    size_t *pSuggestedLocalWorkSize  ///< [out] pointer to an array of numWorkDim unsigned values that specify
+                                     ///< suggested local work size that will contain the result of the query
+);
+
 #if !defined(__GNUC__)
 #pragma endregion
 #endif
@@ -9943,6 +9981,19 @@ typedef struct ur_kernel_create_with_native_handle_params_t {
     ur_kernel_handle_t **pphKernel;
 } ur_kernel_create_with_native_handle_params_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for urKernelGetSuggestedLocalWorkSize
+/// @details Each entry is a pointer to the parameter passed to the function;
+///     allowing the callback the ability to modify the parameter's value
+typedef struct ur_kernel_get_suggested_local_work_size_params_t {
+    ur_kernel_handle_t *phKernel;
+    ur_queue_handle_t *phQueue;
+    uint32_t *pnumWorkDim;
+    const size_t **ppGlobalWorkOffset;
+    const size_t **ppGlobalWorkSize;
+    size_t **ppSuggestedLocalWorkSize;
+} ur_kernel_get_suggested_local_work_size_params_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function parameters for urKernelSetArgValue
 /// @details Each entry is a pointer to the parameter passed to the function;
diff --git a/include/ur_ddi.h b/include/ur_ddi.h
index fb1f1823b3..4aaa6d9fe3 100644
--- a/include/ur_ddi.h
+++ b/include/ur_ddi.h
@@ -535,6 +535,16 @@ typedef ur_result_t(UR_APICALL *ur_pfnKernelCreateWithNativeHandle_t)(
     const ur_kernel_native_properties_t *,
     ur_kernel_handle_t *);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function-pointer for urKernelGetSuggestedLocalWorkSize
+typedef ur_result_t(UR_APICALL *ur_pfnKernelGetSuggestedLocalWorkSize_t)(
+    ur_kernel_handle_t,
+    ur_queue_handle_t,
+    uint32_t,
+    const size_t *,
+    const size_t *,
+    size_t *);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function-pointer for urKernelSetArgValue
 typedef ur_result_t(UR_APICALL *ur_pfnKernelSetArgValue_t)(
@@ -603,6 +613,7 @@ typedef struct ur_kernel_dditable_t {
     ur_pfnKernelRelease_t pfnRelease;
     ur_pfnKernelGetNativeHandle_t pfnGetNativeHandle;
     ur_pfnKernelCreateWithNativeHandle_t pfnCreateWithNativeHandle;
+    ur_pfnKernelGetSuggestedLocalWorkSize_t pfnGetSuggestedLocalWorkSize;
     ur_pfnKernelSetArgValue_t pfnSetArgValue;
     ur_pfnKernelSetArgLocal_t pfnSetArgLocal;
     ur_pfnKernelSetArgPointer_t pfnSetArgPointer;
diff --git a/include/ur_print.h b/include/ur_print.h
index 753875ace9..c8fb41753e 100644
--- a/include/ur_print.h
+++ b/include/ur_print.h
@@ -1442,6 +1442,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelGetNativeHandleParams(const str
 ///         - `buff_size < out_size`
 UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelCreateWithNativeHandleParams(const struct ur_kernel_create_with_native_handle_params_t *params, char *buffer, const size_t buff_size, size_t *out_size);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print ur_kernel_get_suggested_local_work_size_params_t struct
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_SIZE
+///         - `buff_size < out_size`
+UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelGetSuggestedLocalWorkSizeParams(const struct ur_kernel_get_suggested_local_work_size_params_t *params, char *buffer, const size_t buff_size, size_t *out_size);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print ur_kernel_set_arg_value_params_t struct
 /// @returns
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
index db230c91d7..0e5026c521 100644
--- a/include/ur_print.hpp
+++ b/include/ur_print.hpp
@@ -929,6 +929,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) {
     case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP:
         os << "UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP";
         break;
+    case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE:
+        os << "UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE";
+        break;
     default:
         os << "unknown enumerator";
         break;
@@ -11462,6 +11465,49 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
     return os;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ur_kernel_get_suggested_local_work_size_params_t type
+/// @returns
+///     std::ostream &
+inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_kernel_get_suggested_local_work_size_params_t *params) {
+
+    os << ".hKernel = ";
+
+    ur::details::printPtr(os,
+                          *(params->phKernel));
+
+    os << ", ";
+    os << ".hQueue = ";
+
+    ur::details::printPtr(os,
+                          *(params->phQueue));
+
+    os << ", ";
+    os << ".numWorkDim = ";
+
+    os << *(params->pnumWorkDim);
+
+    os << ", ";
+    os << ".pGlobalWorkOffset = ";
+
+    ur::details::printPtr(os,
+                          *(params->ppGlobalWorkOffset));
+
+    os << ", ";
+    os << ".pGlobalWorkSize = ";
+
+    ur::details::printPtr(os,
+                          *(params->ppGlobalWorkSize));
+
+    os << ", ";
+    os << ".pSuggestedLocalWorkSize = ";
+
+    ur::details::printPtr(os,
+                          *(params->ppSuggestedLocalWorkSize));
+
+    return os;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print operator for the ur_kernel_set_arg_value_params_t type
 /// @returns
@@ -17143,6 +17189,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_
     case UR_FUNCTION_KERNEL_CREATE_WITH_NATIVE_HANDLE: {
         os << (const struct ur_kernel_create_with_native_handle_params_t *)params;
     } break;
+    case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE: {
+        os << (const struct ur_kernel_get_suggested_local_work_size_params_t *)params;
+    } break;
     case UR_FUNCTION_KERNEL_SET_ARG_VALUE: {
         os << (const struct ur_kernel_set_arg_value_params_t *)params;
     } break;
diff --git a/scripts/core/kernel.yml b/scripts/core/kernel.yml
index 4a0bf0bab1..5446f3bc1d 100644
--- a/scripts/core/kernel.yml
+++ b/scripts/core/kernel.yml
@@ -534,3 +534,44 @@ params:
 returns:
     - $X_RESULT_ERROR_UNSUPPORTED_FEATURE:
         - "If the adapter has no underlying equivalent handle."
+--- #--------------------------------------------------------------------------
+type: function
+desc: "Get the suggested local work size for a kernel."
+class: $xKernel
+name: GetSuggestedLocalWorkSize
+ordinal: "0"
+details:
+    - "Query a suggested local work size for a kernel given a global size for each dimension."
+    - "The application may call this function from simultaneous threads for the same context."
+params:
+    - type: $x_kernel_handle_t
+      name: hKernel
+      desc: |
+            [in] handle of the kernel
+    - type: $x_queue_handle_t
+      name: hQueue
+      desc: |
+            [in] handle of the queue object
+    - type: uint32_t
+      name: numWorkDim
+      desc: |
+            [in] number of dimensions, from 1 to 3, to specify the global
+            and work-group work-items
+    - type: const size_t*
+      name: pGlobalWorkOffset
+      desc: |
+            [in] pointer to an array of numWorkDim unsigned values that specify
+            the offset used to calculate the global ID of a work-item
+    - type: const size_t*
+      name: pGlobalWorkSize
+      desc: |
+            [in] pointer to an array of numWorkDim unsigned values that specify
+            the number of global work-items in workDim that will execute the
+            kernel function
+    - type: size_t*
+      name: pSuggestedLocalWorkSize
+      desc: |
+            [out] pointer to an array of numWorkDim unsigned values that specify
+            suggested local work size that will contain the result of the query
+returns:
+    - $X_RESULT_ERROR_UNSUPPORTED_FEATURE
diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml
index b0a61e7f88..52585ade3a 100644
--- a/scripts/core/registry.yml
+++ b/scripts/core/registry.yml
@@ -586,6 +586,9 @@ etors:
 - name: ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP
   desc: Enumerator for $xEnqueueKernelLaunchCustomExp
   value: '224'
+- name: KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE
+  desc: Enumerator for $xKernelGetSuggestedLocalWorkSize
+  value: '225'
 ---
 type: enum
 desc: Defines structure types
diff --git a/source/adapters/cuda/enqueue.hpp b/source/adapters/cuda/enqueue.hpp
index c925a27295..be141f7b20 100644
--- a/source/adapters/cuda/enqueue.hpp
+++ b/source/adapters/cuda/enqueue.hpp
@@ -17,6 +17,10 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
                               uint32_t NumEventsInWaitList,
                               const ur_event_handle_t *EventWaitList);
 
+void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
+                        const size_t *GlobalWorkSize, const uint32_t WorkDim,
+                        ur_kernel_handle_t Kernel);
+
 bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device,
                                      ur_kernel_handle_t Kernel,
                                      size_t BlockSize);
diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp
index 675fdbe0a3..5e01845a56 100644
--- a/source/adapters/cuda/kernel.cpp
+++ b/source/adapters/cuda/kernel.cpp
@@ -9,7 +9,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "kernel.hpp"
+#include "enqueue.hpp"
 #include "memory.hpp"
+#include "queue.hpp"
 #include "sampler.hpp"
 
 UR_APIEXPORT ur_result_t UR_APICALL
@@ -380,3 +382,30 @@ urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex,
   }
   return Result;
 }
+
+// Suggests a local work size for launching hKernel on hQueue's device.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    [[maybe_unused]] const size_t *pGlobalWorkOffset,
+    const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) {
+  // Preconditions
+  UR_ASSERT(hQueue->getContext() == hKernel->getContext(),
+            UR_RESULT_ERROR_INVALID_KERNEL);
+  UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  // pGlobalWorkSize is handed to guessLocalWorkSize, so it must be valid too.
+  UR_ASSERT(pGlobalWorkSize != nullptr && pSuggestedLocalWorkSize != nullptr,
+            UR_RESULT_ERROR_INVALID_NULL_POINTER);
+
+  ur_device_handle_t Device = hQueue->Device;
+  size_t ThreadsPerBlock[3] = {};
+
+  // Set the active context here as guessLocalWorkSize needs an active context
+  ScopedContext Active(Device);
+  guessLocalWorkSize(Device, ThreadsPerBlock, pGlobalWorkSize, workDim,
+                     hKernel);
+
+  std::copy(ThreadsPerBlock, ThreadsPerBlock + workDim,
+            pSuggestedLocalWorkSize);
+  return UR_RESULT_SUCCESS;
+}
diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp
index b70198b227..fc8cad9d43 100644
--- a/source/adapters/cuda/ur_interface_loader.cpp
+++ b/source/adapters/cuda/ur_interface_loader.cpp
@@ -125,6 +125,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetArgValue = urKernelSetArgValue;
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = nullptr;
+  pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/source/adapters/hip/enqueue.hpp b/source/adapters/hip/enqueue.hpp
index a1f86b3678..eacac72a82 100644
--- a/source/adapters/hip/enqueue.hpp
+++ b/source/adapters/hip/enqueue.hpp
@@ -30,3 +30,7 @@ void setCopyRectParams(ur_rect_region_t Region, const void *SrcPtr,
                        const hipMemoryType DstType, ur_rect_offset_t DstOffset,
                        size_t DstRowPitch, size_t DstSlicePitch,
                        hipMemcpy3DParms &Params);
+
+void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
+                        const size_t *GlobalWorkSize, const uint32_t WorkDim,
+                        const size_t MaxThreadsPerBlock[3]);
diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp
index f35d3957bc..b433c06852 100644
--- a/source/adapters/hip/kernel.cpp
+++ b/source/adapters/hip/kernel.cpp
@@ -9,6 +9,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "kernel.hpp"
+#include "enqueue.hpp"
 #include "memory.hpp"
 #include "sampler.hpp"
 
@@ -349,3 +350,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants(
     [[maybe_unused]] const ur_specialization_constant_info_t *pSpecConstants) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
+
+// Suggests a local work size for launching hKernel on hQueue's device.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    [[maybe_unused]] const size_t *pGlobalWorkOffset,
+    const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) {
+  UR_ASSERT(hQueue->getContext() == hKernel->getContext(),
+            UR_RESULT_ERROR_INVALID_QUEUE);
+  UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  // pGlobalWorkSize is handed to guessLocalWorkSize, so it must be valid too.
+  UR_ASSERT(pGlobalWorkSize != nullptr && pSuggestedLocalWorkSize != nullptr,
+            UR_RESULT_ERROR_INVALID_NULL_POINTER);
+
+  size_t MaxThreadsPerBlock[3];
+  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
+  MaxThreadsPerBlock[0] = hQueue->Device->getMaxBlockDimX();
+  MaxThreadsPerBlock[1] = hQueue->Device->getMaxBlockDimY();
+  MaxThreadsPerBlock[2] = hQueue->Device->getMaxBlockDimZ();
+
+  ur_device_handle_t Device = hQueue->getDevice();
+  ScopedContext Active(Device);
+  guessLocalWorkSize(Device, ThreadsPerBlock, pGlobalWorkSize, workDim,
+                     MaxThreadsPerBlock);
+  std::copy(ThreadsPerBlock, ThreadsPerBlock + workDim,
+            pSuggestedLocalWorkSize);
+  return UR_RESULT_SUCCESS;
+}
diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp
index 71979b75b1..7a28623e0b 100644
--- a/source/adapters/hip/ur_interface_loader.cpp
+++ b/source/adapters/hip/ur_interface_loader.cpp
@@ -125,6 +125,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetArgValue = urKernelSetArgValue;
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
+  pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp
index 40d6260ac9..3f8e8c6986 100644
--- a/source/adapters/level_zero/kernel.cpp
+++ b/source/adapters/level_zero/kernel.cpp
@@ -13,6 +13,93 @@
 #include "ur_api.h"
 #include "ur_level_zero.hpp"
 
+// Suggests a local work size for running hKernel on hQueue's device.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    [[maybe_unused]] const size_t *pGlobalWorkOffset,
+    const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) {
+  UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  // Both arrays are accessed by std::copy below, so neither may be null.
+  UR_ASSERT(pGlobalWorkSize != nullptr && pSuggestedLocalWorkSize != nullptr,
+            UR_RESULT_ERROR_INVALID_NULL_POINTER);
+
+  uint32_t LocalWorkSize[3] = {};
+  size_t GlobalWorkSize3D[3]{1, 1, 1};
+  std::copy(pGlobalWorkSize, pGlobalWorkSize + workDim, GlobalWorkSize3D);
+
+  ze_kernel_handle_t ZeKernel{};
+  UR_CALL(getZeKernel(hQueue, hKernel, &ZeKernel));
+  UR_CALL(getSuggestedLocalWorkSize(hQueue, ZeKernel, GlobalWorkSize3D,
+                                    LocalWorkSize));
+  std::copy(LocalWorkSize, LocalWorkSize + workDim, pSuggestedLocalWorkSize);
+  return UR_RESULT_SUCCESS;
+}
+
+// Looks up the Level Zero kernel handle of hKernel for hQueue's device.
+ur_result_t getZeKernel(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
+                        ze_kernel_handle_t *phZeKernel) {
+  auto ZeDevice = hQueue->Device->ZeDevice;
+  // An empty per-device map means the kernel's own handle is the one to use.
+  if (hKernel->ZeKernelMap.empty()) {
+    *phZeKernel = hKernel->ZeKernel;
+    return UR_RESULT_SUCCESS;
+  }
+  auto Entry = hKernel->ZeKernelMap.find(ZeDevice);
+  if (Entry == hKernel->ZeKernelMap.end()) {
+    /* kernel and queue don't match */
+    return UR_RESULT_ERROR_INVALID_QUEUE;
+  }
+  *phZeKernel = Entry->second;
+  return UR_RESULT_SUCCESS;
+}
+
+// Writes a 3D work-group size for hZeKernel into SuggestedLocalWorkSize3D.
+ur_result_t getSuggestedLocalWorkSize(ur_queue_handle_t hQueue,
+                                      ze_kernel_handle_t hZeKernel,
+                                      size_t GlobalWorkSize3D[3],
+                                      uint32_t SuggestedLocalWorkSize3D[3]) {
+  uint32_t *WG = SuggestedLocalWorkSize3D;
+
+  // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize
+  // values do not fit to 32-bit that the API only supports currently.
+  const bool SuggestGroupSize = GlobalWorkSize3D[0] <= UINT32_MAX &&
+                                GlobalWorkSize3D[1] <= UINT32_MAX &&
+                                GlobalWorkSize3D[2] <= UINT32_MAX;
+  if (SuggestGroupSize) {
+    ZE2UR_CALL(zeKernelSuggestGroupSize,
+               (hZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1],
+                GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2]));
+    return UR_RESULT_SUCCESS;
+  }
+
+  // Fallback: per dimension, search downwards from the device maximum for a
+  // WG size that the global work size is fully divisible by.
+  const uint32_t MaxGroupSize[] = {
+      hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeX,
+      hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeY,
+      hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeZ};
+  for (int I : {0, 1, 2}) {
+    // Never start below 1: a zero global size (or zero device maximum) would
+    // otherwise make the modulo and division below divide by zero.
+    auto GroupSize = static_cast<uint32_t>((std::max)(
+        size_t(1), (std::min)(size_t(MaxGroupSize[I]), GlobalWorkSize3D[I])));
+    while (GlobalWorkSize3D[I] % GroupSize) {
+      --GroupSize;
+    }
+    if (GlobalWorkSize3D[I] / GroupSize > UINT32_MAX) {
+      logger::error("getSuggestedLocalWorkSize: can't find a WG size "
+                    "suitable for global work size > UINT32_MAX");
+      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+    }
+    WG[I] = GroupSize;
+  }
+  logger::debug(
+      "getSuggestedLocalWorkSize: using computed WG size = {{{}, {}, {}}}",
+      WG[0], WG[1], WG[2]);
+  return UR_RESULT_SUCCESS;
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t Queue,   ///< [in] handle of the queue object
     ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object
@@ -43,19 +130,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
         *OutEvent ///< [in,out][optional] return an event object that identifies
                   ///< this particular kernel execution instance.
 ) {
-  auto ZeDevice = Queue->Device->ZeDevice;
-
   ze_kernel_handle_t ZeKernel{};
-  if (Kernel->ZeKernelMap.empty()) {
-    ZeKernel = Kernel->ZeKernel;
-  } else {
-    auto It = Kernel->ZeKernelMap.find(ZeDevice);
-    if (It == Kernel->ZeKernelMap.end()) {
-      /* kernel and queue don't match */
-      return UR_RESULT_ERROR_INVALID_QUEUE;
-    }
-    ZeKernel = It->second;
-  }
+  UR_CALL(getZeKernel(Queue, Kernel, &ZeKernel));
+
   // Lock automatically releases when this goes out of scope.
   std::scoped_lock<ur_shared_mutex, ur_shared_mutex, ur_shared_mutex> Lock(
       Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex);
@@ -92,54 +169,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D);
 
   if (LocalWorkSize) {
-    // L0
-    UR_ASSERT(LocalWorkSize[0] < (std::numeric_limits<uint32_t>::max)(),
-              UR_RESULT_ERROR_INVALID_VALUE);
-    UR_ASSERT(LocalWorkSize[1] < (std::numeric_limits<uint32_t>::max)(),
-              UR_RESULT_ERROR_INVALID_VALUE);
-    UR_ASSERT(LocalWorkSize[2] < (std::numeric_limits<uint32_t>::max)(),
-              UR_RESULT_ERROR_INVALID_VALUE);
-    WG[0] = static_cast<uint32_t>(LocalWorkSize[0]);
-    WG[1] = static_cast<uint32_t>(LocalWorkSize[1]);
-    WG[2] = static_cast<uint32_t>(LocalWorkSize[2]);
-  } else {
-    // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize
-    // values do not fit to 32-bit that the API only supports currently.
-    bool SuggestGroupSize = true;
-    for (int I : {0, 1, 2}) {
-      if (GlobalWorkSize3D[I] > UINT32_MAX) {
-        SuggestGroupSize = false;
-      }
-    }
-    if (SuggestGroupSize) {
-      ZE2UR_CALL(zeKernelSuggestGroupSize,
-                 (ZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1],
-                  GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2]));
-    } else {
-      for (int I : {0, 1, 2}) {
-        // Try to find a I-dimension WG size that the GlobalWorkSize[I] is
-        // fully divisable with. Start with the max possible size in
-        // each dimension.
-        uint32_t GroupSize[] = {
-            Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX,
-            Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY,
-            Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ};
-        GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]);
-        while (GlobalWorkSize3D[I] % GroupSize[I]) {
-          --GroupSize[I];
-        }
-
-        if (GlobalWorkSize3D[I] / GroupSize[I] > UINT32_MAX) {
-          logger::error("urEnqueueKernelLaunch: can't find a WG size "
-                        "suitable for global work size > UINT32_MAX");
-          return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
-        }
-        WG[I] = GroupSize[I];
-      }
-      logger::debug(
-          "urEnqueueKernelLaunch: using computed WG size = {{{}, {}, {}}}",
-          WG[0], WG[1], WG[2]);
+    for (uint32_t I = 0; I < WorkDim; ++I) {
+      UR_ASSERT(LocalWorkSize[I] < (std::numeric_limits<uint32_t>::max)(),
+                UR_RESULT_ERROR_INVALID_VALUE);
+      WG[I] = static_cast<uint32_t>(LocalWorkSize[I]);
     }
+  } else {
+    UR_CALL(getSuggestedLocalWorkSize(Queue, ZeKernel, GlobalWorkSize3D, WG));
   }
 
   // TODO: assert if sizes do not fit into 32-bit?
diff --git a/source/adapters/level_zero/kernel.hpp b/source/adapters/level_zero/kernel.hpp
index 1cc146d262..2db3af0514 100644
--- a/source/adapters/level_zero/kernel.hpp
+++ b/source/adapters/level_zero/kernel.hpp
@@ -107,3 +107,10 @@ struct ur_kernel_handle_t_ : _ur_object {
   ZeCache<ZeStruct<ze_kernel_properties_t>> ZeKernelProperties;
   ZeCache<std::string> ZeKernelName;
 };
+
+ur_result_t getSuggestedLocalWorkSize(ur_queue_handle_t hQueue,
+                                      ze_kernel_handle_t hZeKernel,
+                                      size_t GlobalWorkSize3D[3],
+                                      uint32_t SuggestedLocalWorkSize3D[3]);
+ur_result_t getZeKernel(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
+                        ze_kernel_handle_t *phZeKernel);
diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp
index d6d6060ea6..45568a7885 100644
--- a/source/adapters/level_zero/ur_interface_loader.cpp
+++ b/source/adapters/level_zero/ur_interface_loader.cpp
@@ -151,6 +151,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetArgSampler = urKernelSetArgSampler;
   pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
+  pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
   return retVal;
 }
 
diff --git a/source/adapters/native_cpu/kernel.cpp b/source/adapters/native_cpu/kernel.cpp
index 7ef17b0c28..29b54503eb 100644
--- a/source/adapters/native_cpu/kernel.cpp
+++ b/source/adapters/native_cpu/kernel.cpp
@@ -297,3 +297,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
 
   DIE_NO_IMPLEMENTATION
 }
+
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    [[maybe_unused]] ur_kernel_handle_t hKernel,
+    [[maybe_unused]] ur_queue_handle_t hQueue,
+    [[maybe_unused]] uint32_t workDim,
+    [[maybe_unused]] const size_t *pGlobalWorkOffset,
+    [[maybe_unused]] const size_t *pGlobalWorkSize,
+    [[maybe_unused]] size_t *pSuggestedLocalWorkSize) {
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; // query not implemented here
+}
diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp
index 065012613e..053fc32d9f 100644
--- a/source/adapters/native_cpu/ur_interface_loader.cpp
+++ b/source/adapters/native_cpu/ur_interface_loader.cpp
@@ -123,6 +123,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetArgValue = urKernelSetArgValue;
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
+  pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp
index 2278d5907e..a713a385a7 100644
--- a/source/adapters/null/ur_nullddi.cpp
+++ b/source/adapters/null/ur_nullddi.cpp
@@ -2616,6 +2616,43 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     return exceptionToResult(std::current_exception());
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelGetSuggestedLocalWorkSize
+__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    uint32_t
+        numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global
+                    ///< and work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the offset used to calculate the global ID of a work-item
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the number of global work-items in workDim that will execute the
+    ///< kernel function
+    size_t *
+        pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify
+    ///< suggested local work size that will contain the result of the query
+    ) try {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    // if the driver has created a custom function, then call it instead of using the generic path
+    auto pfnGetSuggestedLocalWorkSize =
+        d_context.urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize;
+    if (nullptr != pfnGetSuggestedLocalWorkSize) {
+        result = pfnGetSuggestedLocalWorkSize(
+            hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize,
+            pSuggestedLocalWorkSize);
+    } else {
+        // generic implementation
+    }
+
+    return result;
+} catch (...) {
+    return exceptionToResult(std::current_exception());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urQueueGetInfo
 __urdlllocal ur_result_t UR_APICALL urQueueGetInfo(
@@ -6248,6 +6285,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
     pDdiTable->pfnCreateWithNativeHandle =
         driver::urKernelCreateWithNativeHandle;
 
+    pDdiTable->pfnGetSuggestedLocalWorkSize =
+        driver::urKernelGetSuggestedLocalWorkSize;
+
     pDdiTable->pfnSetArgValue = driver::urKernelSetArgValue;
 
     pDdiTable->pfnSetArgLocal = driver::urKernelSetArgLocal;
diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp
index 4fcbdeefa5..3accd84778 100644
--- a/source/adapters/opencl/kernel.cpp
+++ b/source/adapters/opencl/kernel.cpp
@@ -419,3 +419,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler(
   CL_RETURN_ON_FAILURE(RetErr);
   return UR_RESULT_SUCCESS;
 }
+
+// Forwards the query to clGetKernelSuggestedLocalWorkSizeKHR, looked up on the
+// platform that owns hQueue's device; unsupported if the lookup yields null.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    size_t *pSuggestedLocalWorkSize) {
+  cl_command_queue CLQueue = cl_adapter::cast<cl_command_queue>(hQueue);
+  cl_device_id CLDevice;
+  CL_RETURN_ON_FAILURE(clGetCommandQueueInfo(
+      CLQueue, CL_QUEUE_DEVICE, sizeof(CLDevice), &CLDevice, nullptr));
+  cl_platform_id CLPlatform;
+  CL_RETURN_ON_FAILURE(clGetDeviceInfo(CLDevice, CL_DEVICE_PLATFORM,
+                                       sizeof(CLPlatform), &CLPlatform,
+                                       nullptr));
+
+  auto SuggestFn = reinterpret_cast<clGetKernelSuggestedLocalWorkSizeKHR_fn>(
+      clGetExtensionFunctionAddressForPlatform(
+          CLPlatform, "clGetKernelSuggestedLocalWorkSizeKHR"));
+  if (SuggestFn == nullptr) {
+    return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  }
+
+  CL_RETURN_ON_FAILURE(
+      SuggestFn(CLQueue, cl_adapter::cast<cl_kernel>(hKernel), workDim,
+                pGlobalWorkOffset, pGlobalWorkSize, pSuggestedLocalWorkSize));
+  return UR_RESULT_SUCCESS;
+}
diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp
index effb2128c3..687b541911 100644
--- a/source/adapters/opencl/ur_interface_loader.cpp
+++ b/source/adapters/opencl/ur_interface_loader.cpp
@@ -125,6 +125,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetArgValue = urKernelSetArgValue;
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = nullptr;
+  pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp
index 56f270a9d9..da61c34992 100644
--- a/source/loader/layers/tracing/ur_trcddi.cpp
+++ b/source/loader/layers/tracing/ur_trcddi.cpp
@@ -3420,6 +3420,57 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Tracing-layer intercept for urKernelGetSuggestedLocalWorkSize: emits begin/end notifications and log lines around the forwarded call
+__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    uint32_t
+        numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global
+                    ///< and work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the offset used to calculate the global ID of a work-item
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the number of global work-items in each of the numWorkDim dimensions that will execute the
+    ///< kernel function
+    size_t *
+        pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that receives
+    ///< the suggested local work size produced by the query
+) {
+    auto pfnGetSuggestedLocalWorkSize =
+        context.urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize;
+
+    if (nullptr == pfnGetSuggestedLocalWorkSize) { // no entry in the table to forward to
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    ur_kernel_get_suggested_local_work_size_params_t params = { // holds the addresses of this function's own arguments
+        &hKernel,           &hQueue,          &numWorkDim,
+        &pGlobalWorkOffset, &pGlobalWorkSize, &pSuggestedLocalWorkSize};
+    uint64_t instance = // value returned by notify_begin; handed back to notify_end below
+        context.notify_begin(UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE,
+                             "urKernelGetSuggestedLocalWorkSize", &params);
+
+    context.logger.info("---> urKernelGetSuggestedLocalWorkSize");
+
+    ur_result_t result = pfnGetSuggestedLocalWorkSize(
+        hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pSuggestedLocalWorkSize);
+
+    context.notify_end(UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE,
+                       "urKernelGetSuggestedLocalWorkSize", &params, &result,
+                       instance);
+
+    std::ostringstream args_str; // receives the formatted parameters for the completion log line
+    ur::extras::printFunctionParams(
+        args_str, UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE, &params);
+    context.logger.info("({}) -> {};\n", args_str.str(), result);
+
+    return result; // result of the forwarded call (notify_end was given its address)
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urQueueGetInfo
 __urdlllocal ur_result_t UR_APICALL urQueueGetInfo(
@@ -8348,6 +8399,11 @@ __urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable(
     pDdiTable->pfnCreateWithNativeHandle =
         ur_tracing_layer::urKernelCreateWithNativeHandle;
 
+    dditable.pfnGetSuggestedLocalWorkSize =
+        pDdiTable->pfnGetSuggestedLocalWorkSize;
+    pDdiTable->pfnGetSuggestedLocalWorkSize =
+        ur_tracing_layer::urKernelGetSuggestedLocalWorkSize;
+
     dditable.pfnSetArgValue = pDdiTable->pfnSetArgValue;
     pDdiTable->pfnSetArgValue = ur_tracing_layer::urKernelSetArgValue;
 
diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp
index 4bdd801c1a..6435cc24e1 100644
--- a/source/loader/layers/validation/ur_valddi.cpp
+++ b/source/loader/layers/validation/ur_valddi.cpp
@@ -3857,6 +3857,71 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Validation-layer intercept for urKernelGetSuggestedLocalWorkSize: optional null checks and handle-lifetime logging, then forwards the call
+__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    uint32_t
+        numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global
+                    ///< and work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the offset used to calculate the global ID of a work-item
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the number of global work-items in each of the numWorkDim dimensions that will execute the
+    ///< kernel function
+    size_t *
+        pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that receives
+    ///< the suggested local work size produced by the query
+) {
+    auto pfnGetSuggestedLocalWorkSize =
+        context.urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize;
+
+    if (nullptr == pfnGetSuggestedLocalWorkSize) { // no entry in the table to forward to
+        return UR_RESULT_ERROR_UNINITIALIZED;
+    }
+
+    if (context.enableParameterValidation) { // null checks only; NOTE(review): numWorkDim is not range-checked in this function
+        if (NULL == hKernel) {
+            return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
+        }
+
+        if (NULL == hQueue) {
+            return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
+        }
+
+        if (NULL == pGlobalWorkOffset) {
+            return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+        }
+
+        if (NULL == pGlobalWorkSize) {
+            return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+        }
+
+        if (NULL == pSuggestedLocalWorkSize) {
+            return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+        }
+    }
+
+    if (context.enableLifetimeValidation &&
+        !refCountContext.isReferenceValid(hKernel)) {
+        refCountContext.logInvalidReference(hKernel); // logged only; the call below is still made
+    }
+
+    if (context.enableLifetimeValidation &&
+        !refCountContext.isReferenceValid(hQueue)) {
+        refCountContext.logInvalidReference(hQueue); // logged only; the call below is still made
+    }
+
+    ur_result_t result = pfnGetSuggestedLocalWorkSize(
+        hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pSuggestedLocalWorkSize);
+
+    return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urQueueGetInfo
 __urdlllocal ur_result_t UR_APICALL urQueueGetInfo(
@@ -10006,6 +10071,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
     pDdiTable->pfnCreateWithNativeHandle =
         ur_validation_layer::urKernelCreateWithNativeHandle;
 
+    dditable.pfnGetSuggestedLocalWorkSize =
+        pDdiTable->pfnGetSuggestedLocalWorkSize;
+    pDdiTable->pfnGetSuggestedLocalWorkSize =
+        ur_validation_layer::urKernelGetSuggestedLocalWorkSize;
+
     dditable.pfnSetArgValue = pDdiTable->pfnSetArgValue;
     pDdiTable->pfnSetArgValue = ur_validation_layer::urKernelSetArgValue;
 
diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp
index d7a9447b06..fb392dd607 100644
--- a/source/loader/ur_ldrddi.cpp
+++ b/source/loader/ur_ldrddi.cpp
@@ -3449,6 +3449,49 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Loader intercept for urKernelGetSuggestedLocalWorkSize: unwraps the loader handles and forwards to the table stored in the kernel object
+__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    uint32_t
+        numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global
+                    ///< and work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the offset used to calculate the global ID of a work-item
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the number of global work-items in each of the numWorkDim dimensions that will execute the
+    ///< kernel function
+    size_t *
+        pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that receives
+    ///< the suggested local work size produced by the query
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    // extract the function pointer table from the kernel object (NOTE(review): hKernel is dereferenced here without a null check)
+    auto dditable = reinterpret_cast<ur_kernel_object_t *>(hKernel)->dditable;
+    auto pfnGetSuggestedLocalWorkSize =
+        dditable->ur.Kernel.pfnGetSuggestedLocalWorkSize;
+    if (nullptr == pfnGetSuggestedLocalWorkSize) {
+        return UR_RESULT_ERROR_UNINITIALIZED;
+    }
+
+    // convert loader kernel handle to the handle stored inside the loader object
+    hKernel = reinterpret_cast<ur_kernel_object_t *>(hKernel)->handle;
+
+    // convert loader queue handle the same way (NOTE(review): hQueue is dereferenced here without a null check)
+    hQueue = reinterpret_cast<ur_queue_object_t *>(hQueue)->handle;
+
+    // forward the unwrapped handles and the untouched pointer arguments
+    result = pfnGetSuggestedLocalWorkSize(hKernel, hQueue, numWorkDim,
+                                          pGlobalWorkOffset, pGlobalWorkSize,
+                                          pSuggestedLocalWorkSize);
+
+    return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urQueueGetInfo
 __urdlllocal ur_result_t UR_APICALL urQueueGetInfo(
@@ -8599,6 +8642,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
             pDdiTable->pfnGetNativeHandle = ur_loader::urKernelGetNativeHandle;
             pDdiTable->pfnCreateWithNativeHandle =
                 ur_loader::urKernelCreateWithNativeHandle;
+            pDdiTable->pfnGetSuggestedLocalWorkSize =
+                ur_loader::urKernelGetSuggestedLocalWorkSize;
             pDdiTable->pfnSetArgValue = ur_loader::urKernelSetArgValue;
             pDdiTable->pfnSetArgLocal = ur_loader::urKernelSetArgLocal;
             pDdiTable->pfnSetArgPointer = ur_loader::urKernelSetArgPointer;
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index 35e5d68e36..2fa318e71c 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -4096,6 +4096,58 @@ ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     return exceptionToResult(std::current_exception());
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Get the suggested local work size for a kernel.
+///
+/// @details
+///     - Query a suggested local work size for a kernel given a global size for
+///       each dimension.
+///     - The application may call this function from simultaneous threads for
+///       the same context.
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hKernel`
+///         + `NULL == hQueue`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
+///         + `NULL == pGlobalWorkSize`
+///         + `NULL == pSuggestedLocalWorkSize`
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
+ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    uint32_t
+        numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global
+                    ///< and work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the offset used to calculate the global ID of a work-item
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the number of global work-items in each of the numWorkDim dimensions that will execute the
+    ///< kernel function
+    size_t *
+        pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that receives
+    ///< the suggested local work size produced by the query
+    ) try {
+    auto pfnGetSuggestedLocalWorkSize =
+        ur_lib::context->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize;
+    if (nullptr == pfnGetSuggestedLocalWorkSize) { // no entry installed in the library's DDI table
+        return UR_RESULT_ERROR_UNINITIALIZED;
+    }
+
+    return pfnGetSuggestedLocalWorkSize(hKernel, hQueue, numWorkDim,
+                                        pGlobalWorkOffset, pGlobalWorkSize,
+                                        pSuggestedLocalWorkSize);
+} catch (...) { // function-try-block: any exception thrown above is turned into a ur_result_t
+    return exceptionToResult(std::current_exception());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Query information about a command queue
 ///
diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp
index 5af2165ea4..79107c733d 100644
--- a/source/loader/ur_print.cpp
+++ b/source/loader/ur_print.cpp
@@ -1821,6 +1821,14 @@ ur_result_t urPrintKernelCreateWithNativeHandleParams(
     return str_copy(&ss, buffer, buff_size, out_size);
 }
 
+ur_result_t urPrintKernelGetSuggestedLocalWorkSizeParams( // Formats *params as text and copies it into the caller's buffer.
+    const struct ur_kernel_get_suggested_local_work_size_params_t *params,
+    char *buffer, const size_t buff_size, size_t *out_size) {
+    std::stringstream ss;
+    ss << params; // streams the pointer itself; presumably picks an operator<< overload for this params type declared elsewhere - confirm
+    return str_copy(&ss, buffer, buff_size, out_size); // result code comes from str_copy (defined elsewhere in this file)
+}
+
 ur_result_t urPrintKernelSetArgValueParams(
     const struct ur_kernel_set_arg_value_params_t *params, char *buffer,
     const size_t buff_size, size_t *out_size) {
diff --git a/source/ur_api.cpp b/source/ur_api.cpp
index bf90700e7d..b8496a83c7 100644
--- a/source/ur_api.cpp
+++ b/source/ur_api.cpp
@@ -3472,6 +3472,49 @@ ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Get the suggested local work size for a kernel.
+///
+/// @details
+///     - Query a suggested local work size for a kernel given a global size for
+///       each dimension.
+///     - The application may call this function from simultaneous threads for
+///       the same context.
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hKernel`
+///         + `NULL == hQueue`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
+///         + `NULL == pGlobalWorkSize`
+///         + `NULL == pSuggestedLocalWorkSize`
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
+ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    uint32_t
+        numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global
+                    ///< and work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the offset used to calculate the global ID of a work-item
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the number of global work-items in each of the numWorkDim dimensions that will execute the
+    ///< kernel function
+    size_t *
+        pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that receives
+    ///< the suggested local work size produced by the query
+) {
+    ur_result_t result = UR_RESULT_SUCCESS; // stub body: no argument is read and nothing is written to pSuggestedLocalWorkSize
+    return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Query information about a command queue
 ///
diff --git a/test/conformance/kernel/CMakeLists.txt b/test/conformance/kernel/CMakeLists.txt
index df19ba2550..73ab3f1101 100644
--- a/test/conformance/kernel/CMakeLists.txt
+++ b/test/conformance/kernel/CMakeLists.txt
@@ -18,4 +18,5 @@ add_conformance_test_with_kernels_environment(kernel
     urKernelSetArgSampler.cpp
     urKernelSetArgValue.cpp
     urKernelSetExecInfo.cpp
-    urKernelSetSpecializationConstants.cpp)
+    urKernelSetSpecializationConstants.cpp
+    urKernelGetSuggestedLocalWorkSize.cpp)
diff --git a/test/conformance/kernel/kernel_adapter_level_zero.match b/test/conformance/kernel/kernel_adapter_level_zero.match
index 2668b6821a..82c92e3f28 100644
--- a/test/conformance/kernel/kernel_adapter_level_zero.match
+++ b/test/conformance/kernel/kernel_adapter_level_zero.match
@@ -8,9 +8,6 @@ urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_
 urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_REGS
 urKernelSetArgLocalTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
-urKernelSetArgPointerTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
-urKernelSetArgPointerTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
-urKernelSetArgPointerTest.SuccessShared/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urKernelSetArgSamplerTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urKernelSetArgValueTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
diff --git a/test/conformance/kernel/kernel_adapter_native_cpu.match b/test/conformance/kernel/kernel_adapter_native_cpu.match
index 93e3ddd67d..818c625e92 100644
--- a/test/conformance/kernel/kernel_adapter_native_cpu.match
+++ b/test/conformance/kernel/kernel_adapter_native_cpu.match
@@ -162,3 +162,12 @@ urKernelSetSpecializationConstantsTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU
 urKernelSetSpecializationConstantsTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urKernelSetSpecializationConstantsTest.InvalidNullPointerSpecConstants/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urKernelSetSpecializationConstantsTest.InvalidSizeCount/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.Success2D/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.Success3D/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.InvalidWorkDimension/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.InvalidGlobalOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.InvalidGlobalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.InvalidSuggestedLocalWorkSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_
diff --git a/test/conformance/kernel/urKernelGetSuggestedLocalWorkSize.cpp b/test/conformance/kernel/urKernelGetSuggestedLocalWorkSize.cpp
new file mode 100644
index 0000000000..4eeabf5573
--- /dev/null
+++ b/test/conformance/kernel/urKernelGetSuggestedLocalWorkSize.cpp
@@ -0,0 +1,112 @@
+// Copyright (C) 2023 Intel Corporation
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+// See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <uur/fixtures.h>
+
+struct urKernelGetSuggestedLocalWorkSizeTest : uur::urKernelExecutionTest { // Fixture shared by every test in this file.
+    void SetUp() override {
+        program_name = "bar"; // assigned before the base SetUp runs; presumably selects the kernel program to build - confirm in uur fixtures
+        UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
+    }
+    size_t global_size = 32;  // 1-D global work size used by the single-dimension tests
+    size_t global_offset = 0; // 1-D global work offset used by the single-dimension tests
+    size_t n_dimensions = 1;  // passed as the work-dimension argument
+
+    size_t suggested_local_work_size; // output slot; no initializer here, the Success test assigns SIZE_MAX before querying
+};
+UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelGetSuggestedLocalWorkSizeTest);
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, Success) { // 1-D query: expects success and a suggestion no larger than the global size.
+    suggested_local_work_size = SIZE_MAX; // sentinel: the ASSERT_LE below fails if the call leaves this untouched
+    auto result = urKernelGetSuggestedLocalWorkSize(
+        kernel, queue, n_dimensions, &global_offset, &global_size,
+        &suggested_local_work_size);
+    if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { // unsupported is treated as a skip, not a failure
+        GTEST_SKIP();
+    }
+    ASSERT_SUCCESS(result);
+    ASSERT_LE(suggested_local_work_size, global_size);
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, Success2D) { // 2-D query: each suggested extent must not exceed the matching global extent.
+    size_t global_size_2d[2] = {32, 32};
+    size_t global_offset_2d[2] = {0, 0};
+    size_t suggested_local_work_size_2d[2] = {SIZE_MAX, SIZE_MAX}; // sentinels: an element left unwritten fails the ASSERT_LE below
+    auto result = urKernelGetSuggestedLocalWorkSize(
+        kernel, queue, 2, global_offset_2d, global_size_2d,
+        suggested_local_work_size_2d);
+    if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { // unsupported is treated as a skip, not a failure
+        GTEST_SKIP();
+    }
+    ASSERT_SUCCESS(result);
+    for (int I = 0; I < 2; ++I) {
+        ASSERT_LE(suggested_local_work_size_2d[I], global_size_2d[I]);
+    }
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, Success3D) { // 3-D query: each suggested extent must not exceed the matching global extent.
+    size_t global_size_3d[3] = {32, 32, 32};
+    size_t global_offset_3d[3] = {0, 0, 0};
+    size_t suggested_local_work_size_3d[3] = {SIZE_MAX, SIZE_MAX, SIZE_MAX}; // sentinels: an element left unwritten fails the ASSERT_LE below
+    auto result = urKernelGetSuggestedLocalWorkSize(
+        kernel, queue, 3, global_offset_3d, global_size_3d,
+        suggested_local_work_size_3d);
+    if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { // unsupported is treated as a skip, not a failure
+        GTEST_SKIP();
+    }
+    ASSERT_SUCCESS(result);
+    for (int I = 0; I < 3; ++I) {
+        ASSERT_LE(suggested_local_work_size_3d[I], global_size_3d[I]);
+    }
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidNullHandleKernel) { // Only the kernel handle is null; every other argument is valid.
+    ASSERT_EQ_RESULT(urKernelGetSuggestedLocalWorkSize(
+                         nullptr, queue, n_dimensions, &global_offset,
+                         &global_size, &suggested_local_work_size),
+                     UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidNullHandleQueue) { // Only the queue handle is null; every other argument is valid.
+    ASSERT_EQ_RESULT(urKernelGetSuggestedLocalWorkSize(
+                         kernel, nullptr, n_dimensions, &global_offset,
+                         &global_size, &suggested_local_work_size),
+                     UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidWorkDimension) { // Requests one dimension more than the device reports as its maximum.
+    uint32_t max_work_item_dimensions = 0;
+    ASSERT_SUCCESS(urDeviceGetInfo(
+        device, UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS,
+        sizeof(max_work_item_dimensions), &max_work_item_dimensions, nullptr));
+    auto result = urKernelGetSuggestedLocalWorkSize(
+        kernel, queue, max_work_item_dimensions + 1, &global_offset, // the offset/size/output arguments here each point at a single size_t
+        &global_size, &suggested_local_work_size);
+    if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { // unsupported is treated as a skip, not a failure
+        GTEST_SKIP();
+    }
+    ASSERT_EQ_RESULT(result, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidGlobalOffset) { // Only the global-offset pointer is null; every other argument is valid.
+    ASSERT_EQ_RESULT(urKernelGetSuggestedLocalWorkSize(
+                         kernel, queue, n_dimensions, nullptr, &global_size,
+                         &suggested_local_work_size),
+                     UR_RESULT_ERROR_INVALID_NULL_POINTER);
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidGlobalSize) {
+    ASSERT_EQ_RESULT(
+        urKernelGetSuggestedLocalWorkSize(kernel, queue, n_dimensions,
+                                          &global_offset, nullptr, nullptr),
+        UR_RESULT_ERROR_INVALID_NULL_POINTER);
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidSuggestedLocalWorkSize) { // Only the output pointer is null; every other argument is valid.
+    ASSERT_EQ_RESULT(
+        urKernelGetSuggestedLocalWorkSize(
+            kernel, queue, n_dimensions, &global_offset, &global_size, nullptr),
+        UR_RESULT_ERROR_INVALID_NULL_POINTER);
+}