diff --git a/.github/docker/README.md b/.github/docker/README.md index 782dce372e..81adbc8f1a 100644 --- a/.github/docker/README.md +++ b/.github/docker/README.md @@ -8,10 +8,10 @@ development environment. # How to build docker image -To build docker image on local machine execute: +To build docker image on local machine, enter the root dir of the repository and execute: ```sh -docker build -t ur:ubuntu-22.04 -f ./ubuntu-22.04.Dockerfile . +docker build -t ur:ubuntu-22.04 -f .github/docker/ubuntu-22.04.Dockerfile . ``` To set any build time variable (e.g., an optional ARG from docker recipe), add to the command (after `build`), e.g.: diff --git a/.github/docker/fedora-40.Dockerfile b/.github/docker/fedora-40.Dockerfile new file mode 100644 index 0000000000..70f77345fa --- /dev/null +++ b/.github/docker/fedora-40.Dockerfile @@ -0,0 +1,82 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of fedora-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("40") +FROM registry.hub.docker.com/library/fedora@sha256:5ce8497aeea599bf6b54ab3979133923d82aaa4f6ca5ced1812611b197c79eb0 + +# Set environment variables +ENV OS fedora +ENV OS_VER 40 +ENV NOTTY 1 + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). 
+ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + cmake \ + git \ + make" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + ncurses-libs-6.4 \ + passwd \ + sudo \ + wget" + +# Update and install required packages +RUN dnf update -y \ + && dnf install -y \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && dnf clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +# +# It's actively used and tested only on selected distros. Be aware +# they may not work, because pip packages list differ from OS to OS. +COPY third_party/requirements.txt /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' +ENV USER test_user +ENV USERPASS pass +# Change shell to bash with safe pipe usage +SHELL [ "/bin/bash", "-o", "pipefail", "-c" ] +RUN useradd -m ${USER} \ + && echo "${USER}:${USERPASS}" | chpasswd \ + && gpasswd wheel -a ${USER} + +# Change shell back to default and switch to 'test_user' +SHELL ["/bin/sh", "-c"] +USER test_user diff --git a/.github/docker/install_dpcpp.sh b/.github/docker/install_dpcpp.sh index 0aac93eee4..aa5831c734 100755 --- a/.github/docker/install_dpcpp.sh +++ b/.github/docker/install_dpcpp.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
# See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -15,9 +15,6 @@ if [ "${SKIP_DPCPP_BUILD}" ]; then exit fi -apt-get install -y --no-install-recommends \ - libncurses5 - -mkdir -p ${DPCPP_PATH} -wget -O ${DPCPP_PATH}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/sycl-nightly%2F20230626/dpcpp-compiler.tar.gz -tar -xvf ${DPCPP_PATH}/dpcpp_compiler.tar.gz -C ${DPCPP_PATH}/ +mkdir -p ${DPCPP_PATH}/dpcpp_compiler +wget -O ${DPCPP_PATH}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz +tar -xvf ${DPCPP_PATH}/dpcpp_compiler.tar.gz -C ${DPCPP_PATH}/dpcpp_compiler diff --git a/.github/docker/opensuse-leap-15.Dockerfile b/.github/docker/opensuse-leap-15.Dockerfile new file mode 100644 index 0000000000..62a09b27ef --- /dev/null +++ b/.github/docker/opensuse-leap-15.Dockerfile @@ -0,0 +1,92 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of opensuse-leap-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("15") +FROM registry.hub.docker.com/opensuse/leap@sha256:1cf79e78bb69f39fb2f78a7c2c7ebc4b64cf8d82eb1df76cd36767a595ada7a8 + +# Set environment variables +ENV OS opensuse-leap +ENV OS_VER 15 +ENV NOTTY 1 + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). 
+ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + cmake \ + gcc \ + gcc-c++ \ + git \ + glibc-devel \ + libstdc++-devel \ + make" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-devel \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + gzip \ + libncurses5 \ + sudo \ + tar \ + wget" + +# add openSUSE Leap 15.5 Oss repo +RUN zypper ar -f https://download.opensuse.org/distribution/leap/15.5/repo/oss/ oss + +# Update and install required packages +RUN zypper update -y \ + && zypper install -y \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && zypper clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +# +# It's actively used and tested only on selected distros. Be aware +# they may not work, because pip packages list differ from OS to OS. +COPY third_party/requirements.txt /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' and switch to it +ENV USER test_user +ENV USERPASS pass +ENV PFILE ./password +RUN useradd -m ${USER} \ + && echo ${USERPASS} > ${PFILE} \ + && echo ${USERPASS} >> ${PFILE} \ + && passwd ${USER} < ${PFILE} \ + && rm -f ${PFILE} \ + && sed -i 's/# %wheel/%wheel/g' /etc/sudoers \ + && groupadd wheel \ + && gpasswd wheel -a ${USER} +USER test_user diff --git a/.github/docker/rockylinux-8.Dockerfile b/.github/docker/rockylinux-8.Dockerfile new file mode 100644 index 0000000000..7581cf5bd7 --- /dev/null +++ b/.github/docker/rockylinux-8.Dockerfile @@ -0,0 +1,93 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, 
under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of rockylinux-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("8.9") +FROM registry.hub.docker.com/library/rockylinux@sha256:9794037624aaa6212aeada1d28861ef5e0a935adaf93e4ef79837119f2a2d04c + +# Set environment variables +ENV OS rockylinux +ENV OS_VER 8 +ENV NOTTY 1 + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). +ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + cmake \ + git \ + glibc-devel \ + libstdc++-devel \ + make" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Packages required by requirements.txt +ARG PRE_PYTHON_DEPS="\ + libjpeg-turbo-devel \ + python3-devel \ + python3-wheel \ + zlib-devel" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + ncurses-libs-6.1 \ + passwd \ + sudo \ + wget" + +# Update and install required packages +RUN dnf update -y \ + && dnf --enablerepo devel install -y \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${PRE_PYTHON_DEPS} \ + ${MISC_DEPS} \ + && dnf clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +# +# It's actively used and tested only on selected distros. Be aware +# they may not work, because pip packages list differ from OS to OS. 
+COPY third_party/requirements.txt /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' +ENV USER test_user +ENV USERPASS pass +# Change shell to bash with safe pipe usage +SHELL [ "/bin/bash", "-o", "pipefail", "-c" ] +RUN useradd -m $USER \ + && echo "${USERPASS}" | passwd "${USER}" --stdin \ + && gpasswd wheel -a "${USER}" \ + && echo "%wheel ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers + +# Change shell back to default and switch to 'test_user' +SHELL ["/bin/sh", "-c"] +USER test_user diff --git a/.github/docker/rockylinux-9.Dockerfile b/.github/docker/rockylinux-9.Dockerfile new file mode 100644 index 0000000000..171e315cbe --- /dev/null +++ b/.github/docker/rockylinux-9.Dockerfile @@ -0,0 +1,85 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of rockylinux-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("9.3") +FROM registry.hub.docker.com/library/rockylinux@sha256:d7be1c094cc5845ee815d4632fe377514ee6ebcf8efaed6892889657e5ddaaa6 + +# Set environment variables +ENV OS rockylinux +ENV OS_VER 9 +ENV NOTTY 1 + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). 
+ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + cmake \ + git \ + glibc-devel \ + libstdc++-devel \ + make" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + ncurses-libs-6.2 \ + passwd \ + sudo \ + wget" + +# Update and install required packages +RUN dnf update -y \ + && dnf --enablerepo devel install -y \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && dnf clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +# +# It's actively used and tested only on selected distros. Be aware +# they may not work, because pip packages list differ from OS to OS. +COPY third_party/requirements.txt /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' +ENV USER test_user +ENV USERPASS pass +# Change shell to bash with safe pipe usage +SHELL [ "/bin/bash", "-o", "pipefail", "-c" ] +RUN useradd -m $USER \ + && echo "${USERPASS}" | passwd "${USER}" --stdin \ + && gpasswd wheel -a "${USER}" \ + && echo "%wheel ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers + +# Change shell back to default and switch to 'test_user' +SHELL ["/bin/sh", "-c"] +USER test_user diff --git a/.github/docker/ubuntu-20.04.Dockerfile b/.github/docker/ubuntu-20.04.Dockerfile new file mode 100644 index 0000000000..2560bb10b9 --- /dev/null +++ b/.github/docker/ubuntu-20.04.Dockerfile @@ -0,0 +1,75 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of ubuntu-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("20.04") +FROM registry.hub.docker.com/library/ubuntu@sha256:d86db849e59626d94f768c679aba441163c996caf7a3426f44924d0239ffe03f + +# Set environment variables +ENV OS ubuntu +ENV OS_VER 20.04 +ENV NOTTY 1 +ENV DEBIAN_FRONTEND noninteractive + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). +ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + build-essential \ + git" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + g++-7 \ + libncurses5 \ + sudo \ + wget \ + whois" + +# Update and install required packages +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean all + +# Install CMake from source (the version in apt is too old) +RUN wget https://cmake.org/files/v3.20/cmake-3.20.0-linux-x86_64.sh -O cmake.sh \ + && chmod +x cmake.sh \ + && ./cmake.sh --skip-license --prefix=/usr/local + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. 
dependencies (installed via pip) +COPY third_party/requirements.txt /opt/ur/requirements.txt +RUN pip3 install --no-cache-dir -r /opt/ur/requirements.txt + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' and switch to it +ENV USER test_user +ENV USERPASS pass +RUN useradd -m "${USER}" -g sudo -p "$(mkpasswd ${USERPASS})" +USER test_user diff --git a/.github/docker/ubuntu-22.04.Dockerfile b/.github/docker/ubuntu-22.04.Dockerfile index 55e63f2c03..d4b3a828fc 100644 --- a/.github/docker/ubuntu-22.04.Dockerfile +++ b/.github/docker/ubuntu-22.04.Dockerfile @@ -4,11 +4,12 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Dockerfile - image with all Unified Runtime dependencies. +# Dockerfile - a 'recipe' for Docker to build an image of ubuntu-based +# environment for building the Unified Runtime project. # -# Pull base image -FROM registry.hub.docker.com/library/ubuntu:22.04 +# Pull base image ("22.04") +FROM registry.hub.docker.com/library/ubuntu@sha256:0eb0f877e1c869a300c442c41120e778db7161419244ee5cbc6fa5f134e74736 # Set environment variables ENV OS ubuntu @@ -35,13 +36,10 @@ ARG UR_DEPS="\ python3-pip \ libhwloc-dev" -# Unified Runtime's dependencies (installed via pip) -ARG UR_PYTHON_DEPS="\ - clang-format==15.0.7" - # Miscellaneous for our builds/CI (optional) ARG MISC_DEPS="\ clang \ + libncurses5 \ sudo \ wget \ whois" @@ -55,18 +53,21 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean all -# pip package is pinned to a version, but it's probably improperly parsed here -# hadolint ignore=DL3013 -RUN pip3 install --no-cache-dir ${UR_PYTHON_DEPS} +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. 
dependencies (installed via pip) +COPY third_party/requirements.txt /opt/ur/requirements.txt +RUN pip3 install --no-cache-dir -r /opt/ur/requirements.txt # Install DPC++ -COPY install_dpcpp.sh /opt/install_dpcpp.sh +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh ENV DPCPP_PATH=/opt/dpcpp -RUN /opt/install_dpcpp.sh +RUN /opt/ur/install_dpcpp.sh # Install libbacktrace -COPY install_libbacktrace.sh /opt/install_libbacktrace.sh -RUN /opt/install_libbacktrace.sh +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh # Add a new (non-root) 'test_user' and switch to it ENV USER test_user diff --git a/.github/docker/ubuntu-24.04.Dockerfile b/.github/docker/ubuntu-24.04.Dockerfile new file mode 100644 index 0000000000..6d232e1296 --- /dev/null +++ b/.github/docker/ubuntu-24.04.Dockerfile @@ -0,0 +1,75 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of ubuntu-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("24.04") +FROM registry.hub.docker.com/library/ubuntu@sha256:340d9b015b194dc6e2a13938944e0d016e57b9679963fdeb9ce021daac430221 + +# Set environment variables +ENV OS ubuntu +ENV OS_VER 24.04 +ENV NOTTY 1 +ENV DEBIAN_FRONTEND noninteractive + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). 
+ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + build-essential \ + cmake \ + git" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + libncurses5 \ + sudo \ + wget \ + whois" + +# Update and install required packages +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +COPY third_party/requirements.txt /opt/ur/requirements.txt +RUN pip3 install --no-cache-dir -r /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' and switch to it +ENV USER test_user +ENV USERPASS pass +RUN useradd -m "${USER}" -g sudo -p "$(mkpasswd ${USERPASS})" +USER test_user diff --git a/.github/workflows/benchmarks_compute.yml b/.github/workflows/benchmarks_compute.yml index 619784b263..86fbb1ddc8 100644 --- a/.github/workflows/benchmarks_compute.yml +++ b/.github/workflows/benchmarks_compute.yml @@ -34,6 +34,16 @@ on: type: string required: false default: '' + sycl_repo: + description: 'Compiler repo' + type: string + required: true + default: 'intel/llvm' + sycl_commit: + description: 'Compiler commit' + type: string + required: false + default: '' permissions: contents: read @@ -41,8 +51,6 @@ permissions: jobs: e2e-build-hw: - # Run only on upstream; forks will not have the HW - # if: github.repository == 'oneapi-src/unified-runtime' name: Build SYCL, UR, run Compute Benchmarks strategy: 
matrix: @@ -105,12 +113,19 @@ jobs: - name: Checkout SYCL uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: - repository: intel/llvm + repository: ${{inputs.sycl_repo}} ref: refs/heads/sycl path: sycl-repo fetch-depth: 1 fetch-tags: false + - name: Fetch specific SYCL commit + if: inputs.sycl_commit != '' + working-directory: ./sycl-repo + run: | + git fetch --depth=1 origin ${{ inputs.sycl_commit }} + git checkout ${{ inputs.sycl_commit }} + - name: Set CUDA env vars if: matrix.adapter.str_name == 'cuda' run: | diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 3b5ef70e19..41d57bb591 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -36,8 +36,7 @@ jobs: - os: 'ubuntu-20.04' build_type: Release compiler: {c: gcc-7, cxx: g++-7} - - runs-on: ${{matrix.os}} + runs-on: ${{ (matrix.os == 'ubuntu-22.04' && github.repository_owner == 'oneapi-src') && 'intel-ubuntu-22.04' || matrix.os }} steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -50,6 +49,9 @@ jobs: - name: Install libhwloc run: .github/scripts/install_hwloc.sh + - name: Setup PATH + run: echo "$HOME/.local/bin" >> $GITHUB_PATH + - name: Install g++-7 if: matrix.compiler.cxx == 'g++-7' run: | diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 2ad96fb348..fdc5d0c0c0 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -12,7 +12,7 @@ permissions: jobs: analyze-ubuntu: name: Analyze on Ubuntu - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} permissions: security-events: write diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 0704038829..710aa659c8 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -23,7 +23,7 @@ concurrency: jobs: # Build job build: - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 
'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} steps: - name: Checkout uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -57,7 +57,7 @@ jobs: environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} needs: build steps: - name: Deploy to GitHub Pages diff --git a/.github/workflows/e2e_core.yml b/.github/workflows/e2e_core.yml index 00055638df..f606b4a8fa 100644 --- a/.github/workflows/e2e_core.yml +++ b/.github/workflows/e2e_core.yml @@ -54,7 +54,7 @@ permissions: jobs: changed-files: name: Check for changed files - runs-on: ubuntu-22.04 + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} outputs: any_changed: ${{ steps.get-changed.outputs.any_changed }} steps: @@ -66,6 +66,7 @@ jobs: files: | source/adapters/${{inputs.str_name}}/** source/loader/** + .github/workflows/e2e* e2e-build-hw: # We want to run the job only if there are changes in the specific adapter @@ -168,17 +169,24 @@ jobs: -DCMAKE_CXX_COMPILER="$(which clang++)" -DLLVM_LIT="${{github.workspace}}/sycl-repo/llvm/utils/lit/lit.py" - - name: Set test filters for L0 - if: matrix.adapter.name == 'L0' - run: | - echo "LIT_XFAIL_NOT=${{inputs.xfail_not}}" >> $GITHUB_ENV - echo "LIT_XFAIL=${{inputs.xfail}}" >> $GITHUB_ENV - echo "LIT_FILTER_OUT=${{inputs.filter_out}}" >> $GITHUB_ENV + - name: Set LIT_XFAIL_NOT + if: inputs.xfail_not != '' + run: echo "LIT_XFAIL_NOT=${{inputs.xfail_not}}" >> $GITHUB_ENV + + - name: Set LIT_XFAIL + if: inputs.xfail != '' + run: echo "LIT_XFAIL=${{inputs.xfail}}" >> $GITHUB_ENV + + - name: Set LIT_FILTER_OUT + if: inputs.filter_out != '' + run: echo "LIT_FILTER_OUT=${{inputs.filter_out}}" >> $GITHUB_ENV # TODO: remove once intel/llvm lit tests can properly recognize the GPU - name: Configure hardware platform feature for L0 if: matrix.adapter.name 
== 'L0' - run: sed -i '/import lit.llvm/i config.available_features.add("gpu-intel-pvc-1T")' build-e2e/lit.site.cfg.py + run: | + sed -i '/import lit.llvm/i config.available_features.add("gpu-intel-pvc-1T")' build-e2e/lit.site.cfg.py + sed -i '/import lit.llvm/i config.available_features.add("gpu-intel-pvc")' build-e2e/lit.site.cfg.py - name: Run e2e tests id: tests diff --git a/.github/workflows/e2e_level_zero.yml b/.github/workflows/e2e_level_zero.yml index 39f4a3082c..3fff36bb9c 100644 --- a/.github/workflows/e2e_level_zero.yml +++ b/.github/workflows/e2e_level_zero.yml @@ -21,9 +21,9 @@ jobs: config: "" unit: "gpu" # Failing tests - xfail: "Matrix/SG32/get_coord_int8_matB.cpp;Matrix/element_wise_all_ops_1d.cpp;Matrix/element_wise_all_ops_1d_cont.cpp;Matrix/element_wise_all_ops_scalar.cpp;Matrix/get_coord_int8_matB.cpp;Matrix/joint_matrix_rowmajorA_rowmajorB.cpp;Plugin/level_zero_barrier_optimization.cpp" + xfail: "DeviceCodeSplit/grf.cpp;ESIMD/grf.cpp;ESIMD/named_barriers/loop_extended.cpp;KernelAndProgram/target_register_alloc_mode.cpp;Matrix/SG32/get_coord_int8_matB.cpp;Matrix/get_coord_int8_matB.cpp;Matrix/joint_matrix_prefetch.cpp;Matrix/joint_matrix_rowmajorA_rowmajorB.cpp" # Flaky tests - filter_out: "UserDefinedReductions/user_defined_reductions.cpp" + filter_out: "" # These runners by default spawn upwards of 260 workers. 
# We also add a time out just in case some test hangs - extra_lit_flags: "--param gpu-intel-pvc=True -sv -j 100 --max-time 600" + extra_lit_flags: "--param gpu-intel-pvc=True --param gpu-intel-pvc-1T=True -sv -j 100 --max-time 600" diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index d0cb335d96..faf7060503 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -18,6 +18,6 @@ jobs: permissions: contents: read pull-requests: write - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} steps: - uses: actions/labeler@8558fd74291d67161a8a78ce36a881fa63b766a9 # v5.0.0 diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index f1436fc46a..f466cc693e 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -11,7 +11,7 @@ permissions: jobs: weekly-prerelease: - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} permissions: contents: write steps: diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 2efb04c86a..c2ef1d47e7 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -19,7 +19,7 @@ permissions: jobs: linux: name: Trivy - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} permissions: security-events: write diff --git a/README.md b/README.md index ae61b76b09..7ba72b43d3 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # Unified Runtime [![Build and test](https://github.com/oneapi-src/unified-runtime/actions/workflows/cmake.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/cmake.yml) -[![E2E 
Cuda](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_cuda.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_cuda.yml) -[![E2E OpenCL](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_opencl.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_opencl.yml) -[![E2E Level Zero](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_level_zero.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_level_zero.yml) -[![CodeQL](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml) [![Bandit](https://github.com/oneapi-src/unified-runtime/actions/workflows/bandit.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/bandit.yml) +[![CodeQL](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml) [![Coverity](https://scan.coverity.com/projects/28213/badge.svg)](https://scan.coverity.com/projects/oneapi-src-unified-runtime) +[![Nightly](https://github.com/oneapi-src/unified-runtime/actions/workflows/nightly.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/nightly.yml) [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/oneapi-src/unified-runtime/badge)](https://securityscorecards.dev/viewer/?uri=github.com/oneapi-src/unified-runtime) +[![Trivy](https://github.com/oneapi-src/unified-runtime/actions/workflows/trivy.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/trivy.yml) +[![Deploy documentation to Pages](https://github.com/oneapi-src/unified-runtime/actions/workflows/docs.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/docs.yml) diff --git a/examples/collector/collector.cpp 
b/examples/collector/collector.cpp index 910964e02c..cc9580bc4f 100644 --- a/examples/collector/collector.cpp +++ b/examples/collector/collector.cpp @@ -31,7 +31,7 @@ constexpr uint16_t TRACE_FN_BEGIN = static_cast(xpti::trace_point_type_t::function_with_args_begin); constexpr uint16_t TRACE_FN_END = static_cast(xpti::trace_point_type_t::function_with_args_end); -constexpr std::string_view UR_STREAM_NAME = "ur"; +constexpr std::string_view UR_STREAM_NAME = "ur.call"; /** * @brief Formats the function parameters and arguments for urAdapterGet diff --git a/include/ur_api.h b/include/ur_api.h index 8dd757afd2..5be733a429 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -227,6 +227,7 @@ typedef enum ur_function_t { UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP = 228, ///< Enumerator for ::urEnqueueNativeCommandExp UR_FUNCTION_LOADER_CONFIG_SET_MOCKING_ENABLED = 229, ///< Enumerator for ::urLoaderConfigSetMockingEnabled UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP = 230, ///< Enumerator for ::urBindlessImagesReleaseExternalMemoryExp + UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP = 231, ///< Enumerator for ::urBindlessImagesMapExternalLinearMemoryExp /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -2061,7 +2062,7 @@ typedef struct ur_device_native_properties_t { /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hPlatform` +/// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phDevice` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -2069,7 +2070,7 @@ typedef struct ur_device_native_properties_t { UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. 
- ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t *pProperties, ///< [in][optional] pointer to native device properties struct. ur_device_handle_t *phDevice ///< [out] pointer to the handle of the device object created. ); @@ -3796,7 +3797,7 @@ urUSMPoolGetInfo( #endif // Intel 'oneAPI' Unified Runtime APIs #if !defined(__GNUC__) -#pragma region virtual memory +#pragma region virtual_memory #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Virtual memory granularity info @@ -7353,7 +7354,7 @@ urEnqueueWriteHostPipe( #endif // Bindless Images Extension APIs #if !defined(__GNUC__) -#pragma region bindless images(experimental) +#pragma region bindless_images_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Handle of bindless image @@ -7962,6 +7963,36 @@ urBindlessImagesMapExternalArrayExp( ur_exp_image_mem_native_handle_t *phImageMem ///< [out] image memory handle to the externally allocated memory ); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Map an external memory handle to a device memory region described by +/// void* +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// + `NULL == hDevice` +/// + `NULL == hExternalMem` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == ppRetMem` +/// - ::UR_RESULT_ERROR_INVALID_CONTEXT +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +UR_APIEXPORT ur_result_t UR_APICALL 
+urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory +); + /////////////////////////////////////////////////////////////////////////////// /// @brief Release external memory /// @@ -8121,7 +8152,7 @@ urBindlessImagesSignalExternalSemaphoreExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for Command-Buffers #if !defined(__GNUC__) -#pragma region command buffer(experimental) +#pragma region command_buffer_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Command-buffer query information type @@ -8974,7 +9005,7 @@ urCommandBufferCommandGetInfoExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for Cooperative Kernels #if !defined(__GNUC__) -#pragma region cooperative kernels(experimental) +#pragma region cooperative_kernels_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// #ifndef UR_COOPERATIVE_KERNELS_EXTENSION_STRING_EXP @@ -9062,7 +9093,7 @@ urKernelSuggestMaxCooperativeGroupCountExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for enqueuing timestamp recordings #if !defined(__GNUC__) -#pragma region enqueue timestamp recording(experimental) +#pragma region enqueue_timestamp_recording_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a command for recording the device timestamp @@ -9104,7 +9135,7 @@ urEnqueueTimestampRecordingExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for (kernel) Launch Properties #if 
!defined(__GNUC__) -#pragma region launch properties(experimental) +#pragma region launch_properties_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// #ifndef UR_LAUNCH_PROPERTIES_EXTENSION_STRING_EXP @@ -9231,7 +9262,7 @@ urEnqueueKernelLaunchCustomExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for multi-device compile #if !defined(__GNUC__) -#pragma region multi device compile(experimental) +#pragma region multi_device_compile_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// #ifndef UR_MULTI_DEVICE_COMPILE_EXTENSION_STRING_EXP @@ -9361,7 +9392,7 @@ urProgramLinkExp( #endif // Intel 'oneAPI' USM Import/Release Extension APIs #if !defined(__GNUC__) -#pragma region usm import release(experimental) +#pragma region usm_import_release_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Import memory into USM @@ -9414,7 +9445,7 @@ urUSMReleaseExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for USM P2P #if !defined(__GNUC__) -#pragma region usm p2p(experimental) +#pragma region usm_p2p_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// #ifndef UR_USM_P2P_EXTENSION_STRING_EXP @@ -9570,7 +9601,7 @@ urUsmP2PPeerAccessGetInfoExp( #endif // Intel 'oneAPI' Unified Runtime Experimental API for enqueuing work through native APIs #if !defined(__GNUC__) -#pragma region native enqueue(experimental) +#pragma region native_enqueue_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Native enqueue properties @@ -11231,6 +11262,19 @@ typedef struct ur_bindless_images_map_external_array_exp_params_t { ur_exp_image_mem_native_handle_t **pphImageMem; } ur_bindless_images_map_external_array_exp_params_t; +/////////////////////////////////////////////////////////////////////////////// 
+/// @brief Function parameters for urBindlessImagesMapExternalLinearMemoryExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_bindless_images_map_external_linear_memory_exp_params_t { + ur_context_handle_t *phContext; + ur_device_handle_t *phDevice; + uint64_t *poffset; + uint64_t *psize; + ur_exp_external_mem_handle_t *phExternalMem; + void ***pppRetMem; +} ur_bindless_images_map_external_linear_memory_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urBindlessImagesReleaseExternalMemoryExp /// @details Each entry is a pointer to the parameter passed to the function; @@ -11928,7 +11972,7 @@ typedef struct ur_device_get_native_handle_params_t { /// allowing the callback the ability to modify the parameter's value typedef struct ur_device_create_with_native_handle_params_t { ur_native_handle_t *phNativeDevice; - ur_platform_handle_t *phPlatform; + ur_adapter_handle_t *phAdapter; const ur_device_native_properties_t **ppProperties; ur_device_handle_t **pphDevice; } ur_device_create_with_native_handle_params_t; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index ac47d7559f..13785a2d65 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1639,6 +1639,16 @@ typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesMapExternalArrayExp_t)( ur_exp_external_mem_handle_t, ur_exp_image_mem_native_handle_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urBindlessImagesMapExternalLinearMemoryExp +typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesMapExternalLinearMemoryExp_t)( + ur_context_handle_t, + ur_device_handle_t, + uint64_t, + uint64_t, + ur_exp_external_mem_handle_t, + void **); + /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for 
urBindlessImagesReleaseExternalMemoryExp typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesReleaseExternalMemoryExp_t)( @@ -1699,6 +1709,7 @@ typedef struct ur_bindless_images_exp_dditable_t { ur_pfnBindlessImagesMipmapFreeExp_t pfnMipmapFreeExp; ur_pfnBindlessImagesImportExternalMemoryExp_t pfnImportExternalMemoryExp; ur_pfnBindlessImagesMapExternalArrayExp_t pfnMapExternalArrayExp; + ur_pfnBindlessImagesMapExternalLinearMemoryExp_t pfnMapExternalLinearMemoryExp; ur_pfnBindlessImagesReleaseExternalMemoryExp_t pfnReleaseExternalMemoryExp; ur_pfnBindlessImagesImportExternalSemaphoreExp_t pfnImportExternalSemaphoreExp; ur_pfnBindlessImagesReleaseExternalSemaphoreExp_t pfnReleaseExternalSemaphoreExp; @@ -2362,7 +2373,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnDeviceGetNativeHandle_t)( /// @brief Function-pointer for urDeviceCreateWithNativeHandle typedef ur_result_t(UR_APICALL *ur_pfnDeviceCreateWithNativeHandle_t)( ur_native_handle_t, - ur_platform_handle_t, + ur_adapter_handle_t, const ur_device_native_properties_t *, ur_device_handle_t *); diff --git a/include/ur_print.h b/include/ur_print.h index a3a915827b..c70e661fb1 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -2146,6 +2146,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintBindlessImagesImportExternalMemoryExp /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintBindlessImagesMapExternalArrayExpParams(const struct ur_bindless_images_map_external_array_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_bindless_images_map_external_linear_memory_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintBindlessImagesMapExternalLinearMemoryExpParams(const struct ur_bindless_images_map_external_linear_memory_exp_params_t *params, char 
*buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_bindless_images_release_external_memory_exp_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index d8ac521bdc..9aeb5e3341 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -942,6 +942,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP: os << "UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP"; break; + case UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP: + os << "UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP"; + break; default: os << "unknown enumerator"; break; @@ -15190,6 +15193,48 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_bindless_images_map_external_linear_memory_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_bindless_images_map_external_linear_memory_exp_params_t *params) { + + os << ".hContext = "; + + ur::details::printPtr(os, + *(params->phContext)); + + os << ", "; + os << ".hDevice = "; + + ur::details::printPtr(os, + *(params->phDevice)); + + os << ", "; + os << ".offset = "; + + os << *(params->poffset); + + os << ", "; + os << ".size = "; + + os << *(params->psize); + + os << ", "; + os << ".hExternalMem = "; + + ur::details::printPtr(os, + *(params->phExternalMem)); + + os << ", "; + os << ".ppRetMem = "; + + ur::details::printPtr(os, + *(params->pppRetMem)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_bindless_images_release_external_memory_exp_params_t type /// @returns @@ 
-17312,10 +17357,10 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct *(params->phNativeDevice))); os << ", "; - os << ".hPlatform = "; + os << ".hAdapter = "; ur::details::printPtr(os, - *(params->phPlatform)); + *(params->phAdapter)); os << ", "; os << ".pProperties = "; @@ -17804,6 +17849,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_ARRAY_EXP: { os << (const struct ur_bindless_images_map_external_array_exp_params_t *)params; } break; + case UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP: { + os << (const struct ur_bindless_images_map_external_linear_memory_exp_params_t *)params; + } break; case UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP: { os << (const struct ur_bindless_images_release_external_memory_exp_params_t *)params; } break; diff --git a/scripts/benchmarks/benches/SobelFilter.py b/scripts/benchmarks/benches/SobelFilter.py index e976bfaee8..b28681c2ee 100644 --- a/scripts/benchmarks/benches/SobelFilter.py +++ b/scripts/benchmarks/benches/SobelFilter.py @@ -12,7 +12,10 @@ class SobelFilter(VelocityBase): def __init__(self, vb: VelocityBench): super().__init__("sobel_filter", "sobel_filter", vb) + + def download_deps(self): self.download_untar("sobel_filter", "https://github.com/oneapi-src/Velocity-Bench/raw/main/sobel_filter/res/sobel_filter_data.tgz?download=", "sobel_filter_data.tgz") + return def name(self): return "Velocity-Bench Sobel Filter" diff --git a/scripts/benchmarks/benches/api_overhead.py b/scripts/benchmarks/benches/api_overhead.py deleted file mode 100644 index d34f4c4ee8..0000000000 --- a/scripts/benchmarks/benches/api_overhead.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
-# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import os -import csv -import io -from utils.utils import run, git_clone -from .base import Benchmark -from .result import Result -from .options import options - -class APIOverheadSYCL(Benchmark): - def __init__(self, directory): - super().__init__(directory) - - def name(self): - return "api_overhead_benchmark_sycl, mean execution time per 10 kernels" - - def unit(self): - return "μs" - - def setup(self): - repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "0f758021dce9ba32341a503739b69db057433c59") - build_path = self.create_build_path('compute-benchmarks-build') - - configure_command = [ - "cmake", - f"-B {build_path}", - f"-S {repo_path}", - f"-DCMAKE_BUILD_TYPE=Release", - f"-DBUILD_SYCL=ON", - f"-DSYCL_COMPILER_ROOT={options.sycl}", - f"-DALLOW_WARNINGS=ON" - ] - run(configure_command, add_sycl=True) - - run(f"cmake --build {build_path} -j", add_sycl=True) - self.benchmark_bin = f"{build_path}/bin/api_overhead_benchmark_sycl" - - def run_internal(self, ioq, env_vars): - command = [ - f"{self.benchmark_bin}", - "--test=SubmitKernel", - f"--Ioq={ioq}", - "--DiscardEvents=0", - "--MeasureCompletion=0", - "--iterations=100000", - "--Profiling=0", - "--NumKernels=10", - "--KernelExecTime=1", - "--csv", - "--noHeaders" - ] - result = self.run_bench(command, env_vars) - (label, mean) = self.parse_output(result) - return Result(label=label, value=mean, command=command, env=env_vars, stdout=result) - - def run(self, env_vars) -> list[Result]: - results = [] - for ioq in [0, 1]: - results.append(self.run_internal(ioq, env_vars)) - - return results - - def parse_output(self, output): - csv_file = io.StringIO(output) - reader = csv.reader(csv_file) - next(reader, None) - data_row = next(reader, None) - if data_row is None: - raise ValueError("Benchmark output does not contain data.") - try: - label = data_row[0] - mean = 
float(data_row[1]) - return (label, mean) - except (ValueError, IndexError) as e: - raise ValueError(f"Error parsing output: {e}") - - def teardown(self): - return diff --git a/scripts/benchmarks/benches/base.py b/scripts/benchmarks/benches/base.py index 25b5d2619f..c7f263c253 100644 --- a/scripts/benchmarks/benches/base.py +++ b/scripts/benchmarks/benches/base.py @@ -20,16 +20,6 @@ def __init__(self, directory): def run_bench(self, command, env_vars): return run(command=command, env_vars=env_vars, add_sycl=True, cwd=options.benchmark_cwd).stdout.decode() - def create_build_path(self, name): - build_path = os.path.join(self.directory, name) - - if options.rebuild and Path(build_path).exists(): - shutil.rmtree(build_path) - - Path(build_path).mkdir(parents=True, exist_ok=True) - - return build_path - def create_data_path(self, name): data_path = os.path.join(self.directory, "data", name) @@ -58,10 +48,13 @@ def name(self): def unit(self): raise NotImplementedError() + def lower_is_better(self): + return True + def setup(self): raise NotImplementedError() - def run(self, env_vars): + def run(self, env_vars) -> Result: raise NotImplementedError() def teardown(self): diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py new file mode 100644 index 0000000000..19bc0b7fd0 --- /dev/null +++ b/scripts/benchmarks/benches/compute.py @@ -0,0 +1,212 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +import csv +import io +from utils.utils import run, git_clone, create_build_path +from .base import Benchmark +from .result import Result +from .options import options + +class ComputeBench: + def __init__(self, directory): + self.directory = directory + self.built = False + return + + def setup(self): + if self.built: + return + + repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "0f758021dce9ba32341a503739b69db057433c59") + build_path = create_build_path(self.directory, 'compute-benchmarks-build') + + configure_command = [ + "cmake", + f"-B {build_path}", + f"-S {repo_path}", + f"-DCMAKE_BUILD_TYPE=Release", + f"-DBUILD_SYCL=ON", + f"-DSYCL_COMPILER_ROOT={options.sycl}", + f"-DALLOW_WARNINGS=ON" + ] + run(configure_command, add_sycl=True) + + run(f"cmake --build {build_path} -j", add_sycl=True) + + self.built = True + self.bins = os.path.join(build_path, 'bin') + +class ComputeBenchmark(Benchmark): + def __init__(self, bench, name, test): + self.bench = bench + self.bench_name = name + self.test = test + super().__init__(bench.directory) + + def bin_args(self) -> list[str]: + return [] + + def extra_env_vars(self) -> dict: + return {} + + def unit(self): + return "μs" + + def setup(self): + self.bench.setup() + self.benchmark_bin = os.path.join(self.bench.bins, self.bench_name) + + def run(self, env_vars) -> Result: + command = [ + f"{self.benchmark_bin}", + f"--test={self.test}", + "--csv", + "--noHeaders" + ] + + command += self.bin_args() + env_vars.update(self.extra_env_vars()) + + result = self.run_bench(command, env_vars) + (label, mean) = self.parse_output(result) + return Result(label=label, value=mean, command=command, env=env_vars, stdout=result) + + def parse_output(self, output): + csv_file = io.StringIO(output) + reader = csv.reader(csv_file) + next(reader, None) + data_row = next(reader, None) 
+ if data_row is None: + raise ValueError("Benchmark output does not contain data.") + try: + label = data_row[0] + mean = float(data_row[1]) + return (label, mean) + except (ValueError, IndexError) as e: + raise ValueError(f"Error parsing output: {e}") + + def teardown(self): + return + +class SubmitKernelSYCL(ComputeBenchmark): + def __init__(self, bench, ioq): + self.ioq = ioq + super().__init__(bench, "api_overhead_benchmark_sycl", "SubmitKernel") + + def name(self): + order = "in order" if self.ioq else "out of order" + return f"api_overhead_benchmark_sycl SubmitKernel {order}" + + def bin_args(self) -> list[str]: + return [ + f"--Ioq={self.ioq}", + "--DiscardEvents=0", + "--MeasureCompletion=0", + "--iterations=100000", + "--Profiling=0", + "--NumKernels=10", + "--KernelExecTime=1" + ] + +class ExecImmediateCopyQueue(ComputeBenchmark): + def __init__(self, bench, ioq, isCopyOnly, source, destination, size): + self.ioq = ioq + self.isCopyOnly = isCopyOnly + self.source = source + self.destination = destination + self.size = size + super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue") + + def name(self): + order = "in order" if self.ioq else "out of order" + return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}" + + def bin_args(self) -> list[str]: + return [ + "--iterations=100000", + f"--ioq={self.ioq}", + f"--IsCopyOnly={self.isCopyOnly}", + "--MeasureCompletionTime=0", + f"--src={self.source}", + f"--dst={self.destination}", + f"--size={self.size}" + ] + +class QueueInOrderMemcpy(ComputeBenchmark): + def __init__(self, bench, isCopyOnly, source, destination, size): + self.isCopyOnly = isCopyOnly + self.source = source + self.destination = destination + self.size = size + super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy") + + def name(self): + return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, 
size {self.size}" + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--IsCopyOnly={self.isCopyOnly}", + f"--sourcePlacement={self.source}", + f"--destinationPlacement={self.destination}", + f"--size={self.size}", + "--count=100" + ] + +class QueueMemcpy(ComputeBenchmark): + def __init__(self, bench, source, destination, size): + self.source = source + self.destination = destination + self.size = size + super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy") + + def name(self): + return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}" + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--sourcePlacement={self.source}", + f"--destinationPlacement={self.destination}", + f"--size={self.size}", + ] + +class StreamMemory(ComputeBenchmark): + def __init__(self, bench, type, size, placement): + self.type = type + self.size = size + self.placement = placement + super().__init__(bench, "memory_benchmark_sycl", "StreamMemory") + + def name(self): + return f"memory_benchmark_sycl StreamMemory, placement {self.placement}, type {self.type}, size {self.size}" + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--type={self.type}", + f"--size={self.size}", + f"--memoryPlacement={self.placement}", + "--useEvents=0", + "--contents=Zeros", + ] + +class VectorSum(ComputeBenchmark): + def __init__(self, bench): + super().__init__(bench, "miscellaneous_benchmark_sycl", "VectorSum") + + def name(self): + return f"miscellaneous_benchmark_sycl VectorSum" + + def bin_args(self) -> list[str]: + return [ + "--iterations=1000", + "--numberOfElementsX=512", + "--numberOfElementsY=256", + "--numberOfElementsZ=256", + ] + diff --git a/scripts/benchmarks/benches/cudaSift.py b/scripts/benchmarks/benches/cudaSift.py index 6f9c19040e..482d258052 100644 --- a/scripts/benchmarks/benches/cudaSift.py +++ b/scripts/benchmarks/benches/cudaSift.py @@ -9,11 +9,18 @@ from utils.utils 
import run import os import re +import shutil class CudaSift(VelocityBase): def __init__(self, vb: VelocityBench): super().__init__("cudaSift", "cudaSift", vb) + def download_deps(self): + images = os.path.join(self.vb.repo_path, self.bench_name, 'inputData') + dest = os.path.join(self.directory, 'inputData') + if not os.path.exists(dest): + shutil.copytree(images, dest) + def name(self): return "Velocity-Bench CudaSift" diff --git a/scripts/benchmarks/benches/easywave.py b/scripts/benchmarks/benches/easywave.py index 2fa4d95685..2f89482329 100644 --- a/scripts/benchmarks/benches/easywave.py +++ b/scripts/benchmarks/benches/easywave.py @@ -14,6 +14,8 @@ class Easywave(VelocityBase): def __init__(self, vb: VelocityBench): super().__init__("easywave", "easyWave_sycl", vb) + + def download_deps(self): self.download_untar("easywave", "https://git.gfz-potsdam.de/id2/geoperil/easyWave/-/raw/master/data/examples.tar.gz", "examples.tar.gz") def name(self): diff --git a/scripts/benchmarks/benches/hashtable.py b/scripts/benchmarks/benches/hashtable.py index c8cb0bdb03..7558183bf0 100644 --- a/scripts/benchmarks/benches/hashtable.py +++ b/scripts/benchmarks/benches/hashtable.py @@ -23,6 +23,9 @@ def unit(self): def bin_args(self) -> list[str]: return ["--no-verify"] + def lower_is_better(self): + return False + def parse_output(self, stdout: str) -> float: match = re.search(r'(\d+\.\d+) million keys/second', stdout) if match: diff --git a/scripts/benchmarks/benches/options.py b/scripts/benchmarks/benches/options.py index c990a44d5f..c035ce6800 100644 --- a/scripts/benchmarks/benches/options.py +++ b/scripts/benchmarks/benches/options.py @@ -5,6 +5,9 @@ class Options: sycl: str = "" rebuild: bool = True benchmark_cwd: str = "INVALID" + timeout: float = 600 + iterations: int = 5 + verbose: bool = False options = Options() diff --git a/scripts/benchmarks/benches/quicksilver.py b/scripts/benchmarks/benches/quicksilver.py index 383c8dd5be..7e1f65ee1d 100644 --- 
a/scripts/benchmarks/benches/quicksilver.py +++ b/scripts/benchmarks/benches/quicksilver.py @@ -15,10 +15,10 @@ def __init__(self, vb: VelocityBench): super().__init__("QuickSilver", "qs", vb) self.data_path = os.path.join(vb.repo_path, "QuickSilver", "Examples", "AllScattering") - def run(self, env_vars) -> list[Result]: + def run(self, env_vars) -> Result: # TODO: fix the crash in QuickSilver when UR_L0_USE_IMMEDIATE_COMMANDLISTS=0 if 'UR_L0_USE_IMMEDIATE_COMMANDLISTS' in env_vars and env_vars['UR_L0_USE_IMMEDIATE_COMMANDLISTS'] == '0': - return [] + return None return super().run(env_vars) @@ -28,6 +28,9 @@ def name(self): def unit(self): return "MMS/CTT" + def lower_is_better(self): + return False + def bin_args(self) -> list[str]: return ["-i", f"{self.data_path}/scatteringOnly.inp"] diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py index 8dd2f4ba9c..896ff4da98 100644 --- a/scripts/benchmarks/benches/result.py +++ b/scripts/benchmarks/benches/result.py @@ -16,3 +16,4 @@ class Result: stdout: str unit: str = "" name: str = "" + lower_is_better: bool = True diff --git a/scripts/benchmarks/benches/velocity.py b/scripts/benchmarks/benches/velocity.py index fec3abb842..e5601c6563 100644 --- a/scripts/benchmarks/benches/velocity.py +++ b/scripts/benchmarks/benches/velocity.py @@ -6,15 +6,14 @@ from utils.utils import git_clone from .base import Benchmark from .result import Result -from utils.utils import run +from utils.utils import run, create_build_path import os import re class VelocityBench: def __init__(self, directory): self.directory = directory - # TODO: replace with https://github.com/oneapi-src/Velocity-Bench once all fixes land upstream - self.repo_path = git_clone(self.directory, "velocity-bench-repo", "https://github.com/pbalcer/Velocity-Bench.git", "ae0ae05c7fd1469779ecea4f36e4741b1d956eb4") + self.repo_path = git_clone(self.directory, "velocity-bench-repo", "https://github.com/oneapi-src/Velocity-Bench", 
"34ee4ebe18d91dfdd38b7d798fd986b41874fcbc") class VelocityBase(Benchmark): def __init__(self, name: str, bin_name: str, vb: VelocityBench): @@ -24,8 +23,13 @@ def __init__(self, name: str, bin_name: str, vb: VelocityBench): self.bin_name = bin_name self.code_path = os.path.join(self.vb.repo_path, self.bench_name, 'SYCL') + def download_deps(self): + return + def setup(self): - build_path = self.create_build_path(self.bench_name) + self.download_deps() + + build_path = create_build_path(self.directory, self.bench_name) configure_command = [ "cmake", @@ -47,7 +51,7 @@ def extra_env_vars(self) -> dict: def parse_output(self, stdout: str) -> float: raise NotImplementedError() - def run(self, env_vars) -> list[Result]: + def run(self, env_vars) -> Result: env_vars.update(self.extra_env_vars()) command = [ @@ -57,7 +61,7 @@ def run(self, env_vars) -> list[Result]: result = self.run_bench(command, env_vars) - return [Result(label=self.bench_name, value=self.parse_output(result), command=command, env=env_vars, stdout=result)] + return Result(label=self.bench_name, value=self.parse_output(result), command=command, env=env_vars, stdout=result) def teardown(self): return diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index 5dad40c7fe..34238f773c 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -5,9 +5,8 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import os from utils.utils import prepare_workdir, load_benchmark_results, save_benchmark_results; -from benches.api_overhead import APIOverheadSYCL +from benches.compute import * from benches.hashtable import Hashtable from benches.bitcracker import Bitcracker from benches.cudaSift import CudaSift @@ -18,46 +17,72 @@ from benches.options import options from output import generate_markdown import argparse +import re # Update this if you are changing the layout of the results files -INTERNAL_WORKDIR_VERSION = '1.0' - -def main(directory, 
additional_env_vars, save_name, compare_names): - variants = [ - ({'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '0'}, "Imm-CmdLists-OFF"), - ({'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '1'}, ""), - ] +INTERNAL_WORKDIR_VERSION = '1.6' +def main(directory, additional_env_vars, save_name, compare_names, filter): prepare_workdir(directory, INTERNAL_WORKDIR_VERSION) vb = VelocityBench(directory) + cb = ComputeBench(directory) benchmarks = [ - APIOverheadSYCL(directory), + SubmitKernelSYCL(cb, 0), + SubmitKernelSYCL(cb, 1), + QueueInOrderMemcpy(cb, 0, 'Device', 'Device', 1024), + QueueInOrderMemcpy(cb, 0, 'Host', 'Device', 1024), + QueueMemcpy(cb, 'Device', 'Device', 1024), + StreamMemory(cb, 'Triad', 10 * 1024, 'Device'), + ExecImmediateCopyQueue(cb, 0, 1, 'Device', 'Device', 1024), + ExecImmediateCopyQueue(cb, 1, 1, 'Device', 'Host', 1024), + VectorSum(cb), Hashtable(vb), Bitcracker(vb), - #CudaSift(vb), TODO: the benchmark is passing, but is outputting "Failed to allocate device data" + CudaSift(vb), Easywave(vb), QuickSilver(vb), SobelFilter(vb) ] + if filter: + benchmarks = [benchmark for benchmark in benchmarks if filter.search(benchmark.name())] + for benchmark in benchmarks: + print(f"setting up {benchmark.name()}... ", end='', flush=True) benchmark.setup() + print("complete.") results = [] for benchmark in benchmarks: - for env_vars, extra_label in variants: - merged_env_vars = {**env_vars, **additional_env_vars} + merged_env_vars = {**additional_env_vars} + iteration_results = [] + for iter in range(options.iterations): + print(f"running {benchmark.name()}, iteration {iter}... 
", end='', flush=True) bench_results = benchmark.run(merged_env_vars) - for res in bench_results: - res.unit = benchmark.unit() - res.name = benchmark.name() - res.label += f" {extra_label}" - results.append(res) + if bench_results is not None: + print(f"complete ({bench_results.value} {benchmark.unit()}).") + iteration_results.append(bench_results) + else: + print(f"did not finish.") + + if len(iteration_results) == 0: + continue + + iteration_results.sort(key=lambda res: res.value) + median_index = len(iteration_results) // 2 + median_result = iteration_results[median_index] + + median_result.unit = benchmark.unit() + median_result.name = benchmark.name() + + results.append(median_result) for benchmark in benchmarks: + print(f"tearing down {benchmark.name()}... ", end='', flush=True) benchmark.teardown() + print("complete.") chart_data = {"This PR" : results} @@ -93,11 +118,20 @@ def validate_and_parse_env_args(env_args): parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[]) parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.') parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"]) + parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=5) + parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=600) + parser.add_argument("--filter", type=str, help='Regex pattern to filter benchmarks by name.', default=None) + parser.add_argument("--verbose", help='Print output of all the commands.', action="store_true") args = parser.parse_args() additional_env_vars = validate_and_parse_env_args(args.env) + options.verbose = args.verbose options.rebuild = not args.no_rebuild options.sycl = args.sycl + options.iterations = args.iterations + options.timeout = args.timeout 
+ + benchmark_filter = re.compile(args.filter) if args.filter else None - main(args.benchmark_directory, additional_env_vars, args.save, args.compare) + main(args.benchmark_directory, additional_env_vars, args.save, args.compare, benchmark_filter) diff --git a/scripts/benchmarks/output.py b/scripts/benchmarks/output.py index 9cfee303b1..26deabe099 100644 --- a/scripts/benchmarks/output.py +++ b/scripts/benchmarks/output.py @@ -5,6 +5,7 @@ import collections from benches.base import Result +import math # Function to generate the mermaid bar chart script def generate_mermaid_script(chart_data: dict[str, list[Result]]): @@ -19,6 +20,9 @@ def generate_mermaid_script(chart_data: dict[str, list[Result]]): # remove duplicates labels = list(dict.fromkeys(labels)) mermaid_script += f""" +
+{bname} + ```mermaid --- config: @@ -57,6 +61,8 @@ def generate_mermaid_script(chart_data: dict[str, list[Result]]): """ mermaid_script += f""" ``` + +
""" return mermaid_script @@ -83,44 +89,52 @@ def generate_markdown_details(results: list[Result]): """) return "\n".join(markdown_sections) -def generate_summary(chart_data: dict[str, list[Result]]) -> str: - # Calculate the mean value of "This PR" for each benchmark - this_pr_means = {} - for res in chart_data["This PR"]: - if res.name not in this_pr_means: - this_pr_means[res.name] = [] - this_pr_means[res.name].append(res.value) - for bname in this_pr_means: - this_pr_means[bname] = sum(this_pr_means[bname]) / len(this_pr_means[bname]) - - # Calculate the percentage for each entry relative to "This PR" - summary_data = {"This PR": 100} - for entry_name, results in chart_data.items(): - if entry_name == "This PR": - continue - entry_sum = 0 - for res in results: - if res.name in this_pr_means: - percentage = (res.value / this_pr_means[res.name]) * 100 - entry_sum += percentage - - entry_average = entry_sum / len(results) if results else 0 - summary_data[entry_name] = entry_average +def generate_summary_table(chart_data: dict[str, list[Result]]): + summary_table = "| Benchmark | " + " | ".join(chart_data.keys()) + " |\n" + summary_table += "|---" * (len(chart_data) + 1) + "|\n" - markdown_table = "| Name | Result % |\n| --- | --- |\n" - for entry_name, percentage in summary_data.items(): - markdown_table += f"| {entry_name} | {percentage:.2f}% |\n" - - return markdown_table + # Collect all benchmarks and their results + benchmark_results = collections.defaultdict(dict) + for key, results in chart_data.items(): + for res in results: + benchmark_results[res.name][key] = res + + # Generate the table rows + for bname, results in benchmark_results.items(): + row = f"| {bname} |" + best_value = None + best_key = None + + # Determine the best value + for key, res in results.items(): + if best_value is None or (res.lower_is_better and res.value < best_value) or (not res.lower_is_better and res.value > best_value): + best_value = res.value + best_key = key + + # Generate 
the row with the best value highlighted + for key in chart_data.keys(): + if key in results: + value = results[key].value + if key == best_key: + row += f" `**{value}**` |" # Highlight the best value + else: + row += f" {value} |" + else: + row += " - |" + + summary_table += row + "\n" + + return summary_table def generate_markdown(chart_data: dict[str, list[Result]]): mermaid_script = generate_mermaid_script(chart_data) + summary_table = generate_summary_table(chart_data) return f""" # Summary -{generate_summary(chart_data)} -# Benchmark Results +{summary_table} +# Charts {mermaid_script} -## Details +# Details {generate_markdown_details(chart_data["This PR"])} """ diff --git a/scripts/benchmarks/utils/utils.py b/scripts/benchmarks/utils/utils.py index 9dc3f23a9b..5c7beb95d0 100644 --- a/scripts/benchmarks/utils/utils.py +++ b/scripts/benchmarks/utils/utils.py @@ -28,9 +28,12 @@ def run(command, env_vars={}, cwd=None, add_sycl=False): env['LD_LIBRARY_PATH'] = sycl_lib_path + os.pathsep + env.get('LD_LIBRARY_PATH', '') env.update(env_vars) - result = subprocess.run(command, cwd=cwd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) # nosec B603 - print(result.stdout.decode()) - print(result.stderr.decode()) + result = subprocess.run(command, cwd=cwd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, timeout=options.timeout) # nosec B603 + + if options.verbose: + print(result.stdout.decode()) + print(result.stderr.decode()) + return result except subprocess.CalledProcessError as e: print(e.stdout.decode()) @@ -70,7 +73,8 @@ def load_benchmark_results(dir, compare_name) -> list[Result]: return None def prepare_bench_cwd(dir): - options.benchmark_cwd = os.path.join(dir, 'bcwd') + # we need 2 deep to workaround a problem with a fixed relative path in cudaSift + options.benchmark_cwd = os.path.join(dir, 'bcwd', 'bcwd') if os.path.exists(options.benchmark_cwd): shutil.rmtree(options.benchmark_cwd) os.makedirs(options.benchmark_cwd) 
@@ -97,3 +101,13 @@ def prepare_workdir(dir, version): with open(version_file_path, 'w') as version_file: version_file.write(version) + +def create_build_path(directory, name): + build_path = os.path.join(directory, name) + + if options.rebuild and Path(build_path).exists(): + shutil.rmtree(build_path) + + Path(build_path).mkdir(parents=True, exist_ok=True) + + return build_path diff --git a/scripts/core/EXP-BINDLESS-IMAGES.rst b/scripts/core/EXP-BINDLESS-IMAGES.rst index 398d3ba06c..c2b3d1114e 100644 --- a/scripts/core/EXP-BINDLESS-IMAGES.rst +++ b/scripts/core/EXP-BINDLESS-IMAGES.rst @@ -181,6 +181,7 @@ Functions * Interop * ${x}BindlessImagesImportExternalMemoryExp * ${x}BindlessImagesMapExternalArrayExp + * ${x}BindlessImagesMapExternalLinearMemoryExp * ${x}BindlessImagesReleaseExternalMemoryExp * ${x}BindlessImagesImportExternalSemaphoreExp * ${x}BindlessImagesReleaseExternalSemaphoreExp @@ -250,6 +251,8 @@ Changelog | 17.0 || Rename interop related structs and funcs with "external" | | || keyword over "interop". | +----------+-------------------------------------------------------------+ +| 18.0 | Added BindlessImagesMapExternalLinearMemoryExp function. | ++----------+-------------------------------------------------------------+ Contributors -------------------------------------------------------------------------------- diff --git a/scripts/core/INTRO.rst b/scripts/core/INTRO.rst index 1f319d6884..448e3569e2 100644 --- a/scripts/core/INTRO.rst +++ b/scripts/core/INTRO.rst @@ -161,7 +161,7 @@ Tracing Unified Runtime loader implements tracing support through the `XPTI framework `__. -.. list-table:: UR Stream `"ur"` Notification Signatures +.. list-table:: UR Stream `"ur.call"` Notification Signatures :header-rows: 1 * - Trace Point Type @@ -295,6 +295,8 @@ Layers currently included with the runtime are as follows: - Description * - UR_LAYER_PARAMETER_VALIDATION - Enables non-adapter-specific parameter validation (e.g. checking for null values). 
+ * - UR_LAYER_BOUNDS_CHECKING + - Enables non-adapter-specific bounds checking of USM allocations for enqueued commands. Automatically enables UR_LAYER_PARAMETER_VALIDATION. * - UR_LAYER_LEAK_CHECKING - Performs some leak checking for API calls involving object creation/destruction. * - UR_LAYER_LIFETIME_VALIDATION diff --git a/scripts/core/device.yml b/scripts/core/device.yml index ead3ceeb8d..23c0233ef7 100644 --- a/scripts/core/device.yml +++ b/scripts/core/device.yml @@ -820,9 +820,9 @@ params: - type: $x_native_handle_t name: hNativeDevice desc: "[in][nocheck] the native handle of the device." - - type: $x_platform_handle_t - name: hPlatform - desc: "[in] handle of the platform instance" + - type: $x_adapter_handle_t + name: hAdapter + desc: "[in] handle of the adapter to which `hNativeDevice` belongs" - type: const $x_device_native_properties_t* name: pProperties desc: "[in][optional] pointer to native device properties struct." diff --git a/scripts/core/exp-bindless-images.yml b/scripts/core/exp-bindless-images.yml index 52cd754644..d78583ac39 100644 --- a/scripts/core/exp-bindless-images.yml +++ b/scripts/core/exp-bindless-images.yml @@ -737,6 +737,37 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function +desc: "Map an external memory handle to a device memory region described by void*" +class: $xBindlessImages +name: MapExternalLinearMemoryExp +ordinal: "0" +params: + - type: $x_context_handle_t + name: hContext + desc: "[in] handle of the context object" + - type: $x_device_handle_t + name: hDevice + desc: "[in] handle of the device object" + - type: uint64_t + name: offset + desc: "[in] offset into memory region to map" + - type: uint64_t + name: size + desc: "[in] size of memory region to map" + - type: $x_exp_external_mem_handle_t + name: hExternalMem + desc: "[in] external memory handle to the external memory" + - type: void** + name: ppRetMem + desc: "[out] 
pointer of the externally allocated memory" +returns: + - $X_RESULT_ERROR_INVALID_CONTEXT + - $X_RESULT_ERROR_INVALID_VALUE + - $X_RESULT_ERROR_INVALID_IMAGE_SIZE + - $X_RESULT_ERROR_INVALID_OPERATION + - $X_RESULT_ERROR_OUT_OF_RESOURCES +--- #-------------------------------------------------------------------------- +type: function desc: "Release external memory" class: $xBindlessImages name: ReleaseExternalMemoryExp diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index d80c56b500..ab59404bb4 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -595,6 +595,9 @@ etors: - name: BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP desc: Enumerator for $xBindlessImagesReleaseExternalMemoryExp value: '230' +- name: BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP + desc: Enumerator for $xBindlessImagesMapExternalLinearMemoryExp + value: '231' --- type: enum desc: Defines structure types diff --git a/scripts/templates/api.h.mako b/scripts/templates/api.h.mako index 9fc9944b47..85b8a78c2a 100644 --- a/scripts/templates/api.h.mako +++ b/scripts/templates/api.h.mako @@ -42,7 +42,7 @@ extern "C" { %if len(spec['objects']): // ${th.subt(n, tags, spec['header']['desc'])} #if !defined(__GNUC__) -#pragma region ${spec['name']} +#pragma region ${spec['name'].replace(' ', '_')} #endif %endif %for obj in spec['objects']: diff --git a/scripts/templates/valddi.cpp.mako b/scripts/templates/valddi.cpp.mako index 778595b052..8cc4a9dc0f 100644 --- a/scripts/templates/valddi.cpp.mako +++ b/scripts/templates/valddi.cpp.mako @@ -57,8 +57,16 @@ namespace ur_validation_layer { %for key, values in sorted_param_checks: %for val in values: - if( ${val} ) + %if 'boundsError' in val: + if ( getContext()->enableBoundsChecking ) { + if ( ${val} ) { + return ${key}; + } + } + %else: + if ( ${val} ) return ${key}; + %endif %endfor %endfor @@ -178,9 +186,13 @@ namespace ur_validation_layer if (enabledLayerNames.count(nameFullValidation)) { enableParameterValidation = true; 
+ enableBoundsChecking = true; enableLeakChecking = true; enableLifetimeValidation = true; } else { + if (enabledLayerNames.count(nameBoundsChecking)) { + enableBoundsChecking = true; + } if (enabledLayerNames.count(nameParameterValidation)) { enableParameterValidation = true; } @@ -209,13 +221,11 @@ namespace ur_validation_layer } ${x}_result_t context_t::tearDown() { - ${x}_result_t result = ${X}_RESULT_SUCCESS; - if (enableLeakChecking) { getContext()->refCountContext->logInvalidReferences(); - getContext()->refCountContext->clear(); } - return result; + + return ${X}_RESULT_SUCCESS; } } // namespace ur_validation_layer diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt index 013bd9b4d9..ca27eaa422 100644 --- a/source/adapters/cuda/CMakeLists.txt +++ b/source/adapters/cuda/CMakeLists.txt @@ -98,6 +98,7 @@ if (UR_ENABLE_TRACING) endif() target_compile_definitions(${TARGET_NAME} PRIVATE XPTI_ENABLE_INSTRUMENTATION + XPTI_STATIC_LIBRARY ) target_include_directories(${TARGET_NAME} PUBLIC ${XPTI_INCLUDES} diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 8d21a93c75..2fdb6b08a3 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -74,12 +74,11 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { ur_exp_command_buffer_command_handle_t_:: ur_exp_command_buffer_command_handle_t_( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, - std::shared_ptr &&Node, CUDA_KERNEL_NODE_PARAMS Params, - uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, - const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr) - : CommandBuffer(CommandBuffer), Kernel(Kernel), Node{std::move(Node)}, - Params(Params), WorkDim(WorkDim), RefCountInternal(1), - RefCountExternal(1) { + CUgraphNode Node, CUDA_KERNEL_NODE_PARAMS Params, uint32_t WorkDim, + const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, + 
const size_t *LocalWorkSizePtr) + : CommandBuffer(CommandBuffer), Kernel(Kernel), Node(Node), Params(Params), + WorkDim(WorkDim), RefCountInternal(1), RefCountExternal(1) { CommandBuffer->incrementInternalReferenceCount(); const size_t CopySize = sizeof(size_t) * WorkDim; @@ -124,7 +123,7 @@ static ur_result_t getNodesFromSyncPoints( for (size_t i = 0; i < NumSyncPointsInWaitList; i++) { if (auto NodeHandle = SyncPoints.find(SyncPointWaitList[i]); NodeHandle != SyncPoints.end()) { - CuNodesList.push_back(*NodeHandle->second.get()); + CuNodesList.push_back(NodeHandle->second); } else { return UR_RESULT_ERROR_INVALID_VALUE; } @@ -161,22 +160,22 @@ static ur_result_t enqueueCommandBufferFillHelper( const CUmemorytype_enum DstType, const void *Pattern, size_t PatternSize, size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; + ur_exp_command_buffer_sync_point_t *RetSyncPoint) { std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, DepsList)); try { + // Graph node added to graph, if multiple nodes are created this will + // be set to the leaf node + CUgraphNode GraphNode; + const size_t N = Size / PatternSize; auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE ? 
*static_cast(DstDevice) : (CUdeviceptr)DstDevice; if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) { - // Create a new node - CUgraphNode GraphNode; CUDA_MEMSET_NODE_PARAMS NodeParams = {}; NodeParams.dst = DstPtr; NodeParams.elementSize = PatternSize; @@ -207,11 +206,6 @@ static ur_result_t enqueueCommandBufferFillHelper( cuGraphAddMemsetNode(&GraphNode, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParams, CommandBuffer->Device->getNativeContext())); - - // Get sync point and register the cuNode with it. - *SyncPoint = - CommandBuffer->addSyncPoint(std::make_shared(GraphNode)); - } else { // CUDA has no memset functions that allow setting values more than 4 // bytes. UR API lets you pass an arbitrary "pattern" to the buffer @@ -222,10 +216,6 @@ static ur_result_t enqueueCommandBufferFillHelper( size_t NumberOfSteps = PatternSize / sizeof(uint8_t); - // Shared pointer that will point to the last node created - std::shared_ptr GraphNodePtr; - // Create a new node - CUgraphNode GraphNodeFirst; // Update NodeParam CUDA_MEMSET_NODE_PARAMS NodeParamsStepFirst = {}; NodeParamsStepFirst.dst = DstPtr; @@ -236,16 +226,12 @@ static ur_result_t enqueueCommandBufferFillHelper( NodeParamsStepFirst.width = 1; UR_CHECK_ERROR(cuGraphAddMemsetNode( - &GraphNodeFirst, CommandBuffer->CudaGraph, DepsList.data(), + &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParamsStepFirst, CommandBuffer->Device->getNativeContext())); - // Get sync point and register the cuNode with it. - *SyncPoint = CommandBuffer->addSyncPoint( - std::make_shared(GraphNodeFirst)); - DepsList.clear(); - DepsList.push_back(GraphNodeFirst); + DepsList.push_back(GraphNode); // we walk up the pattern in 1-byte steps, and call cuMemset for each // 1-byte chunk of the pattern. 
@@ -256,8 +242,6 @@ static ur_result_t enqueueCommandBufferFillHelper( // offset the pointer to the part of the buffer we want to write to auto OffsetPtr = DstPtr + (Step * sizeof(uint8_t)); - // Create a new node - CUgraphNode GraphNode; // Update NodeParam CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {}; NodeParamsStep.dst = (CUdeviceptr)OffsetPtr; @@ -272,18 +256,20 @@ static ur_result_t enqueueCommandBufferFillHelper( DepsList.size(), &NodeParamsStep, CommandBuffer->Device->getNativeContext())); - GraphNodePtr = std::make_shared(GraphNode); - // Get sync point and register the cuNode with it. - *SyncPoint = CommandBuffer->addSyncPoint(GraphNodePtr); - DepsList.clear(); - DepsList.push_back(*GraphNodePtr.get()); + DepsList.push_back(GraphNode); } } + + // Get sync point and register the cuNode with it. + auto SyncPoint = CommandBuffer->addSyncPoint(GraphNode); + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( @@ -368,18 +354,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); if (*pGlobalWorkSize == 0) { try { @@ -388,12 +367,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( DepsList.data(), DepsList.size())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } // Set the number of threads per block to the number of threads per warp @@ -403,13 +384,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( uint32_t LocalSize = hKernel->getLocalSize(); CUfunction CuFunc = hKernel->get(); - Result = + UR_CHECK_ERROR( setKernelParams(hCommandBuffer->Context, hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid)); try { // Set node param structure with the kernel related data @@ -434,14 +412,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( hKernel->clearLocalSize(); // Get sync point and register the cuNode with it. 
- auto NodeSP = std::make_shared(GraphNode); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); if (pSyncPoint) { - *pSyncPoint = hCommandBuffer->addSyncPoint(NodeSP); + *pSyncPoint = SyncPoint; } auto NewCommand = new ur_exp_command_buffer_command_handle_t_{ - hCommandBuffer, hKernel, std::move(NodeSP), NodeParams, - workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize}; + hCommandBuffer, hKernel, GraphNode, NodeParams, + workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize}; NewCommand->incrementInternalReferenceCount(); hCommandBuffer->CommandHandles.push_back(NewCommand); @@ -451,9 +429,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( @@ -461,16 +439,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( size_t size, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { CUDA_MEMCPY3D NodeParams = {}; @@ -482,12 +454,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( @@ -496,7 +470,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; @@ -505,13 +478,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( UR_ASSERT(size + srcOffset <= std::get(hSrcMem->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Src = std::get(hSrcMem->Mem) @@ -528,12 +496,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( @@ -544,16 +514,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto SrcPtr = @@ -571,12 +535,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT @@ -586,16 +552,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Dst = std::get(hBuffer->Mem) @@ -610,12 +570,14 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT @@ -624,16 +586,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( size_t offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Src = std::get(hBuffer->Mem) @@ -648,12 +604,14 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT @@ -665,16 +623,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto DstPtr = @@ -691,12 +643,14 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT @@ -708,16 +662,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto SrcPtr = @@ -734,12 +682,14 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. - *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( @@ -750,13 +700,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( ur_exp_command_buffer_sync_point_t *pSyncPoint) { // Prefetch cmd is not supported by Cuda Graph. // We implement it as an empty node to enforce dependencies. 
- ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { // Add an empty node to preserve dependencies. @@ -764,17 +712,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( DepsList.data(), DepsList.size())); // Get sync point and register the cuNode with it. - *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } - setErrorMessage("Prefetch hint ignored and replaced with empty node as " - "prefetch is not supported by CUDA Graph backend", - UR_RESULT_SUCCESS); - Result = UR_RESULT_ERROR_ADAPTER_SPECIFIC; } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( @@ -785,13 +731,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( ur_exp_command_buffer_sync_point_t *pSyncPoint) { // Mem-Advise cmd is not supported by Cuda Graph. // We implement it as an empty node to enforce dependencies. - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { // Add an empty node to preserve dependencies. @@ -799,18 +743,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( DepsList.data(), DepsList.size())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); - - setErrorMessage("Memory advice ignored and replaced with empty node as " - "memory advice is not supported by CUDA Graph backend", - UR_RESULT_SUCCESS); - Result = UR_RESULT_ERROR_ADAPTER_SPECIFIC; + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( @@ -860,7 +801,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; try { std::unique_ptr RetImplEvent{nullptr}; @@ -870,10 +810,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( CUstream CuStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if ((Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList)) != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr( @@ -890,10 +828,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( *phEvent = RetImplEvent.release(); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainCommandExp( @@ -1067,7 +1005,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.sharedMemBytes = Kernel->getLocalSize(); Params.kernelParams = const_cast(Kernel->getArgIndices().data()); - CUgraphNode Node = *(hCommand->Node); + CUgraphNode Node = hCommand->Node; CUgraphExec 
CudaGraphExec = CommandBuffer->CudaGraphExec; UR_CHECK_ERROR(cuGraphExecKernelNodeSetParams(CudaGraphExec, Node, &Params)); return UR_RESULT_SUCCESS; diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp index d83269f2ae..504095612b 100644 --- a/source/adapters/cuda/command_buffer.hpp +++ b/source/adapters/cuda/command_buffer.hpp @@ -42,9 +42,9 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_command_handle_t_( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, - std::shared_ptr &&Node, CUDA_KERNEL_NODE_PARAMS Params, - uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, - const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr); + CUgraphNode Node, CUDA_KERNEL_NODE_PARAMS Params, uint32_t WorkDim, + const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, + const size_t *LocalWorkSizePtr); void setGlobalOffset(const size_t *GlobalWorkOffsetPtr) { const size_t CopySize = sizeof(size_t) * WorkDim; @@ -97,7 +97,7 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_handle_t CommandBuffer; ur_kernel_handle_t Kernel; - std::shared_ptr Node; + CUgraphNode Node; CUDA_KERNEL_NODE_PARAMS Params; uint32_t WorkDim; @@ -118,8 +118,8 @@ struct ur_exp_command_buffer_handle_t_ { ~ur_exp_command_buffer_handle_t_(); void registerSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint, - std::shared_ptr CuNode) { - SyncPoints[SyncPoint] = std::move(CuNode); + CUgraphNode CuNode) { + SyncPoints[SyncPoint] = CuNode; NextSyncPoint++; } @@ -130,8 +130,7 @@ struct ur_exp_command_buffer_handle_t_ { // Helper to register next sync point // @param CuNode Node to register as next sync point // @return Pointer to the sync that registers the Node - ur_exp_command_buffer_sync_point_t - addSyncPoint(std::shared_ptr CuNode) { + ur_exp_command_buffer_sync_point_t addSyncPoint(CUgraphNode CuNode) { ur_exp_command_buffer_sync_point_t SyncPoint = NextSyncPoint; 
registerSyncPoint(SyncPoint, std::move(CuNode)); return SyncPoint; @@ -173,8 +172,7 @@ struct ur_exp_command_buffer_handle_t_ { std::atomic_uint32_t RefCountExternal; // Map of sync_points to ur_events - std::unordered_map> + std::unordered_map SyncPoints; // Next sync_point value (may need to consider ways to reuse values if 32-bits // is not enough) diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index f6e6bbe4b3..bbaaa27cdb 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -1185,27 +1185,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( /// \return TBD UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, - const ur_device_native_properties_t *pProperties, + ur_native_handle_t hNativeDevice, + [[maybe_unused]] ur_adapter_handle_t hAdapter, + [[maybe_unused]] const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { - std::ignore = pProperties; - CUdevice CuDevice = static_cast(hNativeDevice); auto IsDevice = [=](std::unique_ptr &Dev) { return Dev->get() == CuDevice; }; - // If a platform is provided just check if the device is in it - if (hPlatform) { - auto SearchRes = std::find_if(begin(hPlatform->Devices), - end(hPlatform->Devices), IsDevice); - if (SearchRes != end(hPlatform->Devices)) { - *phDevice = SearchRes->get(); - return UR_RESULT_SUCCESS; - } - } - // Get list of platforms uint32_t NumPlatforms = 0; ur_adapter_handle_t AdapterHandle = &adapter; diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp index 58ee98184d..c1154ec9c4 100644 --- a/source/adapters/cuda/image.cpp +++ b/source/adapters/cuda/image.cpp @@ -455,21 +455,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( // Allocate a cuArray if (pImageDesc->numMipLevel == 1) { - CUarray ImageArray; + CUarray ImageArray{}; try { 
UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &array_desc)); *phImageMem = (ur_exp_image_mem_native_handle_t)ImageArray; } catch (ur_result_t Err) { - cuArrayDestroy(ImageArray); + if (ImageArray != CUarray{}) { + UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); + } return Err; } catch (...) { - cuArrayDestroy(ImageArray); + if (ImageArray != CUarray{}) { + UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); + } return UR_RESULT_ERROR_UNKNOWN; } } else // Allocate a cuMipmappedArray { - CUmipmappedArray mip_array; + CUmipmappedArray mip_array{}; array_desc.Flags = CUDA_ARRAY3D_SURFACE_LDST; try { @@ -477,10 +481,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( pImageDesc->numMipLevel)); *phImageMem = (ur_exp_image_mem_native_handle_t)mip_array; } catch (ur_result_t Err) { - cuMipmappedArrayDestroy(mip_array); + if (mip_array) { + UR_CHECK_ERROR(cuMipmappedArrayDestroy(mip_array)); + } return Err; } catch (...) { - cuMipmappedArrayDestroy(mip_array); + if (mip_array) { + UR_CHECK_ERROR(cuMipmappedArrayDestroy(mip_array)); + } return UR_RESULT_ERROR_UNKNOWN; } } @@ -1169,6 +1177,36 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint64_t offset, + uint64_t size, ur_exp_external_mem_handle_t hExternalMem, void **ppRetMem) { + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), + UR_RESULT_ERROR_INVALID_CONTEXT); + + try { + ScopedContext Active(hDevice); + + CUDA_EXTERNAL_MEMORY_BUFFER_DESC BufferDesc = {}; + BufferDesc.size = size; + BufferDesc.offset = offset; + BufferDesc.flags = 0; + + CUdeviceptr retMem; + UR_CHECK_ERROR(cuExternalMemoryGetMappedBuffer( + &retMem, (CUexternalMemory)hExternalMem, &BufferDesc)); + + *ppRetMem = (void *)retMem; + + } catch (ur_result_t Err) { + return 
Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_external_mem_handle_t hExternalMem) { diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 1aefb15a3d..ea55c1669a 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -439,7 +439,7 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, UR_CHECK_ERROR(cuMemAlloc(&DevPtr, Buffer.Size)); } } else { - CUarray ImageArray; + CUarray ImageArray{}; CUsurfObject Surface; try { auto &Image = std::get(Mem->Mem); @@ -465,12 +465,12 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc)); Image.SurfObjs[DeviceIdx] = Surface; } catch (ur_result_t Err) { - if (ImageArray) { + if (ImageArray != CUarray{}) { UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); } return Err; } catch (...) { - if (ImageArray) { + if (ImageArray != CUarray{}) { UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); } return UR_RESULT_ERROR_UNKNOWN; diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index aa992f44bf..6dcaa28414 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -197,20 +197,15 @@ struct SurfaceMem { void *HostPtr) : Arrays(Context->Devices.size(), CUarray{0}), SurfObjs(Context->Devices.size(), CUsurfObject{0}), - OuterMemStruct{OuterMemStruct}, - ImageFormat{ImageFormat}, ImageDesc{ImageDesc}, HostPtr{HostPtr} { + OuterMemStruct{OuterMemStruct}, ImageDesc{ImageDesc}, ArrayDesc{}, + HostPtr{HostPtr} { // We have to use hipArray3DCreate, which has some caveats. The height and // depth parameters must be set to 0 produce 1D or 2D arrays. image_desc // gives a minimum value of 1, so we need to convert the answer. 
ArrayDesc.NumChannels = 4; // Only support 4 channel image - ArrayDesc.Flags = 0; // No flags required ArrayDesc.Width = ImageDesc.width; - if (ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { - ArrayDesc.Height = 0; - ArrayDesc.Depth = 0; - } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { + if (ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { ArrayDesc.Height = ImageDesc.height; - ArrayDesc.Depth = 0; } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { ArrayDesc.Height = ImageDesc.height; ArrayDesc.Depth = ImageDesc.depth; @@ -414,10 +409,14 @@ struct ur_mem_handle_t_ { } ur_result_t clear() { - if (isBuffer()) { - return std::get(Mem).clear(); + try { + if (isBuffer()) { + return std::get(Mem).clear(); + } + return std::get(Mem).clear(); + } catch (const ur_result_t &error) { + return error; } - return std::get(Mem).clear(); } ur_context_handle_t getContext() const noexcept { return Context; } diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp index bd92a01400..e5cce87798 100644 --- a/source/adapters/cuda/queue.cpp +++ b/source/adapters/cuda/queue.cpp @@ -45,7 +45,8 @@ CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { // change NumComputeStreams after that if (NumComputeStreams < ComputeStreams.size()) { UR_CHECK_ERROR(cuStreamCreateWithPriority( - &ComputeStreams[NumComputeStreams++], Flags, Priority)); + &ComputeStreams[NumComputeStreams], Flags, Priority)); + ++NumComputeStreams; } } Token = ComputeStreamIndex++; @@ -110,7 +111,8 @@ CUstream ur_queue_handle_t_::getNextTransferStream() { // change NumTransferStreams after that if (NumTransferStreams < TransferStreams.size()) { UR_CHECK_ERROR(cuStreamCreateWithPriority( - &TransferStreams[NumTransferStreams++], Flags, Priority)); + &TransferStreams[NumTransferStreams], Flags, Priority)); + ++NumTransferStreams; } } uint32_t StreamI = TransferStreamIndex++ % TransferStreams.size(); diff --git a/source/adapters/cuda/ur_interface_loader.cpp 
b/source/adapters/cuda/ur_interface_loader.cpp index 2e01a4b7a3..bb3fb9aee5 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -340,6 +340,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnImportExternalMemoryExp = urBindlessImagesImportExternalMemoryExp; pDdiTable->pfnMapExternalArrayExp = urBindlessImagesMapExternalArrayExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + urBindlessImagesMapExternalLinearMemoryExp; pDdiTable->pfnReleaseExternalMemoryExp = urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index d9438eeb9c..4ff38626af 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -76,12 +76,11 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { ur_exp_command_buffer_command_handle_t_:: ur_exp_command_buffer_command_handle_t_( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, - std::shared_ptr &&Node, hipKernelNodeParams Params, - uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, - const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr) - : CommandBuffer(CommandBuffer), Kernel(Kernel), Node(std::move(Node)), - Params(Params), WorkDim(WorkDim), RefCountInternal(1), - RefCountExternal(1) { + hipGraphNode_t Node, hipKernelNodeParams Params, uint32_t WorkDim, + const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, + const size_t *LocalWorkSizePtr) + : CommandBuffer(CommandBuffer), Kernel(Kernel), Node(Node), Params(Params), + WorkDim(WorkDim), RefCountInternal(1), RefCountExternal(1) { CommandBuffer->incrementInternalReferenceCount(); const size_t CopySize = sizeof(size_t) * WorkDim; @@ -125,7 +124,7 @@ static ur_result_t getNodesFromSyncPoints( for (size_t i = 0; i < NumSyncPointsInWaitList; i++) { if (auto NodeHandle = 
SyncPoints.find(SyncPointWaitList[i]); NodeHandle != SyncPoints.end()) { - HIPNodesList.push_back(*NodeHandle->second.get()); + HIPNodesList.push_back(NodeHandle->second); } else { return UR_RESULT_ERROR_INVALID_VALUE; } @@ -139,29 +138,23 @@ static ur_result_t enqueueCommandBufferFillHelper( const hipMemoryType DstType, const void *Pattern, size_t PatternSize, size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + ur_exp_command_buffer_sync_point_t *RetSyncPoint) { std::vector DepsList; - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, DepsList)); try { + // Graph node added to graph, if multiple nodes are created this will + // be set to the leaf node + hipGraphNode_t GraphNode; + const size_t N = Size / PatternSize; auto DstPtr = DstType == hipMemoryTypeDevice ? *static_cast(DstDevice) : DstDevice; if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) { - // Create a new node - hipGraphNode_t GraphNode; hipMemsetParams NodeParams = {}; NodeParams.dst = DstPtr; NodeParams.elementSize = PatternSize; @@ -192,10 +185,6 @@ static ur_result_t enqueueCommandBufferFillHelper( DepsList.data(), DepsList.size(), &NodeParams)); - // Get sync point and register the node with it. - *SyncPoint = CommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); - } else { // HIP has no memset functions that allow setting values more than 4 // bytes. 
UR API lets you pass an arbitrary "pattern" to the buffer @@ -206,11 +195,6 @@ static ur_result_t enqueueCommandBufferFillHelper( size_t NumberOfSteps = PatternSize / sizeof(uint8_t); - // Shared pointer that will point to the last node created - std::shared_ptr GraphNodePtr; - - // Create a new node - hipGraphNode_t GraphNodeFirst; // Update NodeParam hipMemsetParams NodeParamsStepFirst = {}; NodeParamsStepFirst.dst = DstPtr; @@ -220,16 +204,12 @@ static ur_result_t enqueueCommandBufferFillHelper( NodeParamsStepFirst.value = *(static_cast(Pattern)); NodeParamsStepFirst.width = 1; - UR_CHECK_ERROR(hipGraphAddMemsetNode( - &GraphNodeFirst, CommandBuffer->HIPGraph, DepsList.data(), - DepsList.size(), &NodeParamsStepFirst)); - - // Get sync point and register the node with it. - *SyncPoint = CommandBuffer->addSyncPoint( - std::make_shared(GraphNodeFirst)); + UR_CHECK_ERROR(hipGraphAddMemsetNode(&GraphNode, CommandBuffer->HIPGraph, + DepsList.data(), DepsList.size(), + &NodeParamsStepFirst)); DepsList.clear(); - DepsList.push_back(GraphNodeFirst); + DepsList.push_back(GraphNode); // we walk up the pattern in 1-byte steps, and add Memset node for each // 1-byte chunk of the pattern. @@ -241,8 +221,6 @@ static ur_result_t enqueueCommandBufferFillHelper( auto OffsetPtr = reinterpret_cast( reinterpret_cast(DstPtr) + (Step * sizeof(uint8_t))); - // Create a new node - hipGraphNode_t GraphNode; // Update NodeParam hipMemsetParams NodeParamsStep = {}; NodeParamsStep.dst = reinterpret_cast(OffsetPtr); @@ -256,14 +234,17 @@ static ur_result_t enqueueCommandBufferFillHelper( &GraphNode, CommandBuffer->HIPGraph, DepsList.data(), DepsList.size(), &NodeParamsStep)); - GraphNodePtr = std::make_shared(GraphNode); - // Get sync point and register the node with it. - *SyncPoint = CommandBuffer->addSyncPoint(GraphNodePtr); - DepsList.clear(); - DepsList.push_back(*GraphNodePtr.get()); + DepsList.push_back(GraphNode); } } + + // Get sync point and register the node with it. 
+ auto SyncPoint = CommandBuffer->addSyncPoint(GraphNode); + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; + } + } catch (ur_result_t Err) { return Err; } @@ -346,14 +327,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( hipGraphNode_t GraphNode; std::vector DepsList; - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); if (*pGlobalWorkSize == 0) { try { @@ -362,8 +337,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( DepsList.data(), DepsList.size())); // Get sync point and register the node with it. - *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -377,13 +354,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( uint32_t LocalSize = hKernel->getLocalSize(); hipFunction_t HIPFunc = hKernel->get(); - UR_CALL(setKernelParams(hCommandBuffer->Device, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, hKernel, HIPFunc, - ThreadsPerBlock, BlocksPerGrid), - Result); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(setKernelParams( + hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid)); try { // Set node param structure with the kernel related data @@ -409,14 +382,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( hKernel->clearLocalSize(); // Get sync point and register the node with it. 
- auto NodeSP = std::make_shared(GraphNode); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); if (pSyncPoint) { - *pSyncPoint = hCommandBuffer->addSyncPoint(NodeSP); + *pSyncPoint = SyncPoint; } auto NewCommand = new ur_exp_command_buffer_command_handle_t_{ - hCommandBuffer, hKernel, std::move(NodeSP), NodeParams, - workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize}; + hCommandBuffer, hKernel, GraphNode, NodeParams, + workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize}; NewCommand->incrementInternalReferenceCount(); hCommandBuffer->CommandHandles.push_back(NewCommand); @@ -442,25 +415,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { - UR_CHECK_ERROR(hipGraphAddMemcpyNode1D( - &GraphNode, hCommandBuffer->HIPGraph, DepsList.data(), DepsList.size(), - pDst, pSrc, size, hipMemcpyHostToHost)); + UR_CHECK_ERROR(hipGraphAddMemcpyNode1D(&GraphNode, hCommandBuffer->HIPGraph, + DepsList.data(), DepsList.size(), + pDst, pSrc, size, hipMemcpyDefault)); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -483,16 +450,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( UR_ASSERT(size + srcOffset <= std::get(hSrcMem->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Src = std::get(hSrcMem->Mem) @@ -505,8 +464,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( Dst, Src, size, hipMemcpyDeviceToDevice)); // Get sync point and register the node with it. - *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -527,16 +488,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto SrcPtr = @@ -554,8 +507,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( &NodeParams)); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -575,16 +530,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Dst = std::get(hBuffer->Mem) @@ -595,8 +542,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( Dst, pSrc, size, hipMemcpyHostToDevice)); // Get sync point and register the node with it. - *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -615,16 +564,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Src = std::get(hBuffer->Mem) @@ -635,8 +576,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( pDst, Src, size, hipMemcpyDeviceToHost)); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -658,16 +601,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto DstPtr = @@ -683,8 +618,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( &NodeParams)); // Get sync point and register the node with it. - *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -706,16 +643,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto SrcPtr = @@ -731,8 +660,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( &NodeParams)); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -753,16 +684,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { // Create an empty node if the kernel workload size is zero @@ -770,13 +693,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( DepsList.data(), DepsList.size())); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); - - setErrorMessage("Prefetch hint ignored and replaced with empty node as " - "prefetch is not supported by HIP Graph backend", - UR_RESULT_SUCCESS); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -797,16 +717,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { // Create an empty node if the kernel workload size is zero @@ -814,13 +726,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( DepsList.data(), DepsList.size())); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); - - setErrorMessage("Memory advice ignored and replaced with empty node as " - "memory advice is not supported by HIP Graph backend", - UR_RESULT_SUCCESS); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -878,8 +787,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - try { std::unique_ptr RetImplEvent{nullptr}; ScopedContext Active(hQueue->getDevice()); @@ -888,10 +795,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if ((Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, - phEventWaitList)) != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr( @@ -908,10 +813,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( *phEvent = RetImplEvent.release(); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainCommandExp( @@ -978,12 +883,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( uint32_t ArgIndex = PointerArgDesc.argIndex; const void *ArgValue = PointerArgDesc.pNewPointerArg; - ur_result_t Result = UR_RESULT_SUCCESS; try { Kernel->setKernelArg(ArgIndex, sizeof(ArgValue), ArgValue); } catch (ur_result_t Err) { - Result = Err; - return Result; + return Err; } } @@ -996,7 +899,6 @@ 
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( uint32_t ArgIndex = MemobjArgDesc.argIndex; ur_mem_handle_t ArgValue = MemobjArgDesc.hNewMemObjArg; - ur_result_t Result = UR_RESULT_SUCCESS; try { if (ArgValue == nullptr) { Kernel->setKernelArg(ArgIndex, 0, nullptr); @@ -1005,8 +907,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Kernel->setKernelArg(ArgIndex, sizeof(void *), (void *)&HIPPtr); } } catch (ur_result_t Err) { - Result = Err; - return Result; + return Err; } } @@ -1020,13 +921,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( size_t ArgSize = ValueArgDesc.argSize; const void *ArgValue = ValueArgDesc.pNewValueArg; - ur_result_t Result = UR_RESULT_SUCCESS; - try { Kernel->setKernelArg(ArgIndex, ArgSize, ArgValue); } catch (ur_result_t Err) { - Result = Err; - return Result; + return Err; } } @@ -1064,12 +962,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; hipFunction_t HIPFunc = Kernel->get(); - auto Result = setKernelParams(Device, WorkDim, GlobalWorkOffset, - GlobalWorkSize, LocalWorkSize, Kernel, HIPFunc, - ThreadsPerBlock, BlocksPerGrid); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(setKernelParams(Device, WorkDim, GlobalWorkOffset, + GlobalWorkSize, LocalWorkSize, Kernel, HIPFunc, + ThreadsPerBlock, BlocksPerGrid)); hipKernelNodeParams &Params = hCommand->Params; @@ -1083,7 +978,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.sharedMemBytes = Kernel->getLocalSize(); Params.kernelParams = const_cast(Kernel->getArgIndices().data()); - hipGraphNode_t Node = *(hCommand->Node); + hipGraphNode_t Node = hCommand->Node; hipGraphExec_t HipGraphExec = CommandBuffer->HIPGraphExec; UR_CHECK_ERROR(hipGraphExecKernelNodeSetParams(HipGraphExec, Node, &Params)); return UR_RESULT_SUCCESS; diff --git 
a/source/adapters/hip/command_buffer.hpp b/source/adapters/hip/command_buffer.hpp index 751fde3720..d744a3544d 100644 --- a/source/adapters/hip/command_buffer.hpp +++ b/source/adapters/hip/command_buffer.hpp @@ -41,9 +41,9 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_command_handle_t_( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, - std::shared_ptr &&Node, hipKernelNodeParams Params, - uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, - const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr); + hipGraphNode_t Node, hipKernelNodeParams Params, uint32_t WorkDim, + const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, + const size_t *LocalWorkSizePtr); void setGlobalOffset(const size_t *GlobalWorkOffsetPtr) { const size_t CopySize = sizeof(size_t) * WorkDim; @@ -96,7 +96,7 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_handle_t CommandBuffer; ur_kernel_handle_t Kernel; - std::shared_ptr Node; + hipGraphNode_t Node; hipKernelNodeParams Params; uint32_t WorkDim; @@ -117,7 +117,7 @@ struct ur_exp_command_buffer_handle_t_ { ~ur_exp_command_buffer_handle_t_(); void registerSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint, - std::shared_ptr &&HIPNode) { + hipGraphNode_t HIPNode) { SyncPoints[SyncPoint] = std::move(HIPNode); NextSyncPoint++; } @@ -129,8 +129,7 @@ struct ur_exp_command_buffer_handle_t_ { // Helper to register next sync point // @param HIPNode Node to register as next sync point // @return Pointer to the sync that registers the Node - ur_exp_command_buffer_sync_point_t - addSyncPoint(std::shared_ptr HIPNode) { + ur_exp_command_buffer_sync_point_t addSyncPoint(hipGraphNode_t HIPNode) { ur_exp_command_buffer_sync_point_t SyncPoint = NextSyncPoint; registerSyncPoint(SyncPoint, std::move(HIPNode)); return SyncPoint; @@ -171,8 +170,7 @@ struct ur_exp_command_buffer_handle_t_ { std::atomic_uint32_t RefCountExternal; // Map of sync_points to ur_events - 
std::unordered_map> + std::unordered_map SyncPoints; // Next sync_point value (may need to consider ways to reuse values if 32-bits // is not enough) diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp index da92fa6a87..3ae98e929d 100644 --- a/source/adapters/hip/device.cpp +++ b/source/adapters/hip/device.cpp @@ -988,7 +988,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, + ur_native_handle_t hNativeDevice, + [[maybe_unused]] ur_adapter_handle_t hAdapter, [[maybe_unused]] const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { // We can't cast between ur_native_handle_t and hipDevice_t, so memcpy the @@ -1000,16 +1001,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( return Dev->get() == HIPDevice; }; - // If a platform is provided just check if the device is in it - if (hPlatform) { - auto SearchRes = std::find_if(begin(hPlatform->Devices), - end(hPlatform->Devices), IsDevice); - if (SearchRes != end(hPlatform->Devices)) { - *phDevice = SearchRes->get(); - return UR_RESULT_SUCCESS; - } - } - // Get list of platforms uint32_t NumPlatforms = 0; ur_adapter_handle_t AdapterHandle = &adapter; diff --git a/source/adapters/hip/image.cpp b/source/adapters/hip/image.cpp index 75f93ca4f3..7449c3ba3f 100644 --- a/source/adapters/hip/image.cpp +++ b/source/adapters/hip/image.cpp @@ -132,6 +132,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + [[maybe_unused]] ur_context_handle_t hContext, + [[maybe_unused]] ur_device_handle_t hDevice, + [[maybe_unused]] uint64_t offset, [[maybe_unused]] uint64_t size, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem, + [[maybe_unused]] void 
**phRetMem) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp index 5f06567064..eafce43d1c 100644 --- a/source/adapters/hip/memory.cpp +++ b/source/adapters/hip/memory.cpp @@ -498,7 +498,7 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, UR_CHECK_ERROR(hipMalloc(&DevPtr, Buffer.Size)); } } else { - hipArray *ImageArray; + hipArray *ImageArray{}; hipSurfaceObject_t Surface; try { auto &Image = std::get(Mem->Mem); diff --git a/source/adapters/level_zero/adapter.cpp b/source/adapters/level_zero/adapter.cpp index 0605b9a40c..5ae1d52e7b 100644 --- a/source/adapters/level_zero/adapter.cpp +++ b/source/adapters/level_zero/adapter.cpp @@ -43,15 +43,31 @@ ur_result_t initPlatforms(PlatformVec &platforms) noexcept try { } std::vector ZeDrivers; + std::vector ZeDevices; ZeDrivers.resize(ZeDriverCount); ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data())); for (uint32_t I = 0; I < ZeDriverCount; ++I) { - auto platform = std::make_unique(ZeDrivers[I]); - UR_CALL(platform->initialize()); - - // Save a copy in the cache for future uses. - platforms.push_back(std::move(platform)); + ze_device_properties_t device_properties{}; + device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + uint32_t ZeDeviceCount = 0; + ZE2UR_CALL(zeDeviceGet, (ZeDrivers[I], &ZeDeviceCount, nullptr)); + ZeDevices.resize(ZeDeviceCount); + ZE2UR_CALL(zeDeviceGet, (ZeDrivers[I], &ZeDeviceCount, ZeDevices.data())); + // Check if this driver has GPU Devices + for (uint32_t D = 0; D < ZeDeviceCount; ++D) { + ZE2UR_CALL(zeDeviceGetProperties, (ZeDevices[D], &device_properties)); + + if (ZE_DEVICE_TYPE_GPU == device_properties.type) { + // If this Driver is a GPU, save it as a usable platform. 
+ auto platform = std::make_unique(ZeDrivers[I]); + UR_CALL(platform->initialize()); + + // Save a copy in the cache for future uses. + platforms.push_back(std::move(platform)); + break; + } + } } return UR_RESULT_SUCCESS; } catch (...) { @@ -105,8 +121,16 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() // We must only initialize the driver once, even if urPlatformGet() is // called multiple times. Declaring the return value as "static" ensures // it's only called once. - GlobalAdapter->ZeResult = - ZE_CALL_NOCHECK(zeInit, (ZE_INIT_FLAG_GPU_ONLY)); + + // Init with all flags set to enable for all driver types to be init in + // the application. + ze_init_flags_t L0InitFlags = ZE_INIT_FLAG_GPU_ONLY; + if (UrL0InitAllDrivers) { + L0InitFlags |= ZE_INIT_FLAG_VPU_ONLY; + } + logger::debug("\nzeInit with flags value of {}\n", + static_cast(L0InitFlags)); + GlobalAdapter->ZeResult = ZE_CALL_NOCHECK(zeInit, (L0InitFlags)); } assert(GlobalAdapter->ZeResult != std::nullopt); // verify that level-zero is initialized diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 3b4a91fc0a..ff4f0b56bc 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -610,8 +610,8 @@ ur_result_t createMainCommandList(ur_context_handle_t Context, bool canBeInOrder(ur_context_handle_t Context, const ur_exp_command_buffer_desc_t *CommandBufferDesc) { // In-order command-lists are not available in old driver version. - bool CompatibleDriver = isDriverVersionNewerOrSimilar( - Context->getPlatform()->ZeDriver, 1, 3, L0_DRIVER_INORDER_MIN_VERSION); + bool CompatibleDriver = Context->getPlatform()->isDriverVersionNewerOrSimilar( + 1, 3, L0_DRIVER_INORDER_MIN_VERSION); return CompatibleDriver ? (CommandBufferDesc ? 
CommandBufferDesc->isInOrder : false) : false; @@ -921,6 +921,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( bool PreferCopyEngine = !IsDevicePointer(CommandBuffer->Context, Src) || !IsDevicePointer(CommandBuffer->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. + if (CommandBuffer->Device->isDG2() && + (IsSharedPointer(CommandBuffer->Context, Src) || + IsSharedPointer(CommandBuffer->Context, Dst))) { + PreferCopyEngine = false; + } PreferCopyEngine |= UseCopyEngineForD2DCopy; return enqueueCommandBufferMemCopyHelper( @@ -1293,13 +1300,14 @@ ur_result_t waitForDependencies(ur_exp_command_buffer_handle_t CommandBuffer, * @param[in] CommandBuffer The command buffer. * @param[in] Queue The UR queue used to submit the command buffer. * @param[in] SignalCommandList The command-list to append the barrier to. - * @param[out] Event The host visible event which will be returned to the user. + * @param[out][optional] Event The host visible event which will be returned + * to the user, if user passed an output parameter to the UR API. 
* @return UR_RESULT_SUCCESS or an error code on failure */ ur_result_t createUserEvent(ur_exp_command_buffer_handle_t CommandBuffer, ur_queue_handle_legacy_t Queue, ur_command_list_ptr_t SignalCommandList, - ur_event_handle_t &Event) { + ur_event_handle_t *Event) { // Execution event for this enqueue of the UR command-buffer ur_event_handle_t RetEvent{}; @@ -1335,7 +1343,9 @@ ur_result_t createUserEvent(ur_exp_command_buffer_handle_t CommandBuffer, &(CommandBuffer->SignalEvent->ZeEvent))); } - Event = RetEvent; + if (Event) { + *Event = RetEvent; + } return UR_RESULT_SUCCESS; } @@ -1398,9 +1408,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ZE2UR_CALL(zeCommandListAppendEventReset, (SignalCommandList->first, CommandBuffer->AllResetEvent->ZeEvent)); - if (Event) { - UR_CALL(createUserEvent(CommandBuffer, Queue, SignalCommandList, *Event)); - } + // Appends a wait on the main command-list signal and registers output Event + // parameter with signal command-list completing. + UR_CALL(createUserEvent(CommandBuffer, Queue, SignalCommandList, Event)); UR_CALL(Queue->executeCommandList(SignalCommandList, false, false)); diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp index 9dfb5a2b19..7031bb5f03 100644 --- a/source/adapters/level_zero/common.cpp +++ b/source/adapters/level_zero/common.cpp @@ -67,28 +67,6 @@ ur_result_t ze2urResult(ze_result_t ZeResult) { } } -/// Checks the version of the level-zero driver. -/// @param ZeDriver Level Zero Driver handle -/// @param VersionMajor Major verion number to compare to. -/// @param VersionMinor Minor verion number to compare to. -/// @param VersionBuild Build verion number to compare to. 
-/// @return true is the version of the driver is higher than or equal to the -/// compared version -bool isDriverVersionNewerOrSimilar(ze_driver_handle_t ZeDriver, - uint32_t VersionMajor, uint32_t VersionMinor, - uint32_t VersionBuild) { - ZeStruct ZeDriverProperties; - ZE2UR_CALL(zeDriverGetProperties, (ZeDriver, &ZeDriverProperties)); - uint32_t DriverVersion = ZeDriverProperties.driverVersion; - auto DriverVersionMajor = (DriverVersion & 0xFF000000) >> 24; - auto DriverVersionMinor = (DriverVersion & 0x00FF0000) >> 16; - auto DriverVersionBuild = DriverVersion & 0x0000FFFF; - - return ((DriverVersionMajor >= VersionMajor) && - (DriverVersionMinor >= VersionMinor) && - (DriverVersionBuild >= VersionBuild)); -} - // This function will ensure compatibility with both Linux and Windows for // setting environment variables. bool setEnvVar(const char *name, const char *value) { diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp index a81b852727..5784d5bf78 100644 --- a/source/adapters/level_zero/common.hpp +++ b/source/adapters/level_zero/common.hpp @@ -207,6 +207,15 @@ const int UrL0LeaksDebug = [] { return std::atoi(UrRet); }(); +// Enable for UR L0 Adapter to Init all L0 Drivers on the system with filtering +// in place for only currently used Drivers. +const int UrL0InitAllDrivers = [] { + const char *UrRet = std::getenv("UR_L0_INIT_ALL_DRIVERS"); + if (!UrRet) + return 0; + return std::atoi(UrRet); +}(); + // Controls Level Zero calls serialization to w/a Level Zero driver being not MT // ready. Recognized values (can be used as a bit mask): enum { @@ -317,11 +326,6 @@ bool setEnvVar(const char *name, const char *value); // Map Level Zero runtime error code to UR error code. ur_result_t ze2urResult(ze_result_t ZeResult); -/// Checks the version of the level-zero driver. 
-bool isDriverVersionNewerOrSimilar(ze_driver_handle_t ZeDriver, - uint32_t VersionMajor, uint32_t VersionMinor, - uint32_t VersionBuild); - // Trace a call to Level-Zero RT #define ZE2UR_CALL(ZeName, ZeArgs) \ { \ diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index dc70a2470c..fab54f3783 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -43,6 +43,8 @@ struct ur_context_handle_t_ : _ur_object { ur_context_handle_t_(ze_context_handle_t ZeContext) : ZeContext{ZeContext} {} + virtual ~ur_context_handle_t_() {} + // A L0 context handle is primarily used during creation and management of // resources that may be used by multiple devices. // This field is only set at ur_context_handle_t creation time, and cannot diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 9e832bbb9a..08f13268eb 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -1173,12 +1173,10 @@ bool ur_device_handle_t_::useDriverInOrderLists() { // Use in-order lists implementation from L0 driver instead // of adapter's implementation. - ze_driver_handle_t ZeDriver = this->Platform->ZeDriver; - static const bool UseDriverInOrderLists = [&] { const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS"); - bool CompatibleDriver = isDriverVersionNewerOrSimilar( - ZeDriver, 1, 3, L0_DRIVER_INORDER_MIN_VERSION); + bool CompatibleDriver = this->Platform->isDriverVersionNewerOrSimilar( + 1, 3, L0_DRIVER_INORDER_MIN_VERSION); if (!UrRet) return CompatibleDriver; return std::atoi(UrRet) != 0; @@ -1602,14 +1600,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t NativeDevice, ///< [in] the native handle of the device. 
- ur_platform_handle_t Platform, ///< [in] handle of the platform instance - const ur_device_native_properties_t + [[maybe_unused]] ur_adapter_handle_t + Adapter, ///< [in] handle of the platform instance + [[maybe_unused]] const ur_device_native_properties_t *Properties, ///< [in][optional] pointer to native device properties ///< struct. ur_device_handle_t *Device ///< [out] pointer to the handle of the device object created. ) { - std::ignore = Properties; auto ZeDevice = ur_cast(NativeDevice); // The SYCL spec requires that the set of devices must remain fixed for the @@ -1622,12 +1620,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( if (const auto *platforms = GlobalAdapter->PlatformCache->get_value()) { for (const auto &p : *platforms) { Dev = p->getDeviceFromNativeHandle(ZeDevice); - if (Dev) { - // Check that the input Platform, if was given, matches the found one. - UR_ASSERT(!Platform || Platform == p.get(), - UR_RESULT_ERROR_INVALID_PLATFORM); - break; - } } } else { return GlobalAdapter->PlatformCache->get_error(); diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp index 2d0443512d..898edff779 100644 --- a/source/adapters/level_zero/device.hpp +++ b/source/adapters/level_zero/device.hpp @@ -61,7 +61,7 @@ struct ur_device_handle_t_ : _ur_object { ur_device_handle_t_(ze_device_handle_t Device, ur_platform_handle_t Plt, ur_device_handle_t ParentDevice = nullptr) : ZeDevice{Device}, Platform{Plt}, RootDevice{ParentDevice}, - ZeDeviceProperties{}, ZeDeviceComputeProperties{} { + ZeDeviceProperties{}, ZeDeviceComputeProperties{}, Id(std::nullopt) { // NOTE: one must additionally call initialize() to complete // UR device creation. 
} @@ -189,6 +189,9 @@ struct ur_device_handle_t_ : _ur_object { (ZeDeviceProperties->deviceId & 0xff0) == 0xb60; } + // Checks if this GPU is an Intel Flex GPU or Intel Arc Alchemist + bool isDG2() { return (ZeDeviceProperties->deviceId & 0xff00) == 0x5600; } + bool isIntegrated() { return (ZeDeviceProperties->flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED); } @@ -226,5 +229,5 @@ struct ur_device_handle_t_ : _ur_object { ZeOffsetToImageHandleMap; // unique ephemeral identifer of the device in the adapter - DeviceId Id; + std::optional Id; }; diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 33495f52b8..f4dee0d661 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -228,7 +228,9 @@ ur_queue_handle_legacy_t_::enqueueEventsWaitWithBarrier( ///< [in] handle of the // For in-order queue and wait-list which is empty or has events from // the same queue just use the last command event as the barrier event. - if (Queue->isInOrderQueue() && + // This optimization is disabled when profiling is enabled to ensure + // accurate profiling values & the overhead that profiling incurs. + if (Queue->isInOrderQueue() && !Queue->isProfilingEnabled() && WaitListEmptyOrAllEventsFromSameQueue(Queue, NumEventsInWaitList, EventWaitList) && Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) { @@ -1034,7 +1036,7 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { EndTimeRecording.EventHasDied = true; } else { // Otherwise we evict the entry. 
- Legacy(Event->UrQueue)->EndTimeRecordings.erase(Entry); + Queue->EndTimeRecordings.erase(Entry); } } } diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp index 1537a1d201..f68b2d93be 100644 --- a/source/adapters/level_zero/image.cpp +++ b/source/adapters/level_zero/image.cpp @@ -1033,6 +1033,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( break; case UR_EXP_EXTERNAL_MEM_TYPE_OPAQUE_FD: default: + delete importWin32; + delete externalMemoryData; return UR_RESULT_ERROR_INVALID_VALUE; } importWin32->handle = Win32Handle->handle; @@ -1083,6 +1085,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint64_t offset, + uint64_t size, ur_exp_external_mem_handle_t hExternalMem, void **phRetMem) { + std::ignore = hContext; + std::ignore = hDevice; + std::ignore = size; + std::ignore = offset; + std::ignore = hExternalMem; + std::ignore = phRetMem; + logger::error("[UR][L0] {} function not implemented!", + "{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_external_mem_handle_t hExternalMem) { diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index cb020395ed..9e5670ae5d 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -613,6 +613,11 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueDeviceGlobalVariableWrite( // Copy engine is preferred only for host to device transfer. // Device to device transfers run faster on compute engines. 
bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. + if (Queue->Device->isDG2() && IsSharedPointer(Queue->Context, Src)) { + PreferCopyEngine = false; + } // Temporary option added to use copy engine for D2D copy PreferCopyEngine |= UseCopyEngineForD2DCopy; @@ -663,6 +668,11 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueDeviceGlobalVariableRead( // Copy engine is preferred only for host to device transfer. // Device to device transfers run faster on compute engines. bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. + if (Queue->Device->isDG2() && IsSharedPointer(Queue->Context, Dst)) { + PreferCopyEngine = false; + } // Temporary option added to use copy engine for D2D copy PreferCopyEngine |= UseCopyEngineForD2DCopy; @@ -700,7 +710,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate( ZeKernelDesc.pKernelName = KernelName; ze_kernel_handle_t ZeKernel; - ZE2UR_CALL(zeKernelCreate, (ZeModule, &ZeKernelDesc, &ZeKernel)); + auto ZeResult = + ZE_CALL_NOCHECK(zeKernelCreate, (ZeModule, &ZeKernelDesc, &ZeKernel)); + // Gracefully handle the case that kernel create fails. 
+ if (ZeResult != ZE_RESULT_SUCCESS) { + delete *RetKernel; + *RetKernel = nullptr; + return ze2urResult(ZeResult); + } auto ZeDevice = It.first; @@ -754,20 +771,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( PArgValue = nullptr; } + if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; + } + std::scoped_lock Guard(Kernel->Mutex); + ze_result_t ZeResult = ZE_RESULT_SUCCESS; if (Kernel->ZeKernelMap.empty()) { auto ZeKernel = Kernel->ZeKernel; - ZE2UR_CALL(zeKernelSetArgumentValue, - (ZeKernel, ArgIndex, ArgSize, PArgValue)); + ZeResult = ZE_CALL_NOCHECK(zeKernelSetArgumentValue, + (ZeKernel, ArgIndex, ArgSize, PArgValue)); } else { for (auto It : Kernel->ZeKernelMap) { auto ZeKernel = It.second; - ZE2UR_CALL(zeKernelSetArgumentValue, - (ZeKernel, ArgIndex, ArgSize, PArgValue)); + ZeResult = ZE_CALL_NOCHECK(zeKernelSetArgumentValue, + (ZeKernel, ArgIndex, ArgSize, PArgValue)); } } - return UR_RESULT_SUCCESS; + if (ZeResult == ZE_RESULT_ERROR_INVALID_ARGUMENT) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE; + } + + return ze2urResult(ZeResult); } UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgLocal( @@ -816,6 +842,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo( } catch (...) 
{ return UR_RESULT_ERROR_UNKNOWN; } + case UR_KERNEL_INFO_NUM_REGS: case UR_KERNEL_INFO_NUM_ARGS: return ReturnValue(uint32_t{Kernel->ZeKernelProperties->numKernelArgs}); case UR_KERNEL_INFO_REFERENCE_COUNT: @@ -1066,6 +1093,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler( ) { std::ignore = Properties; std::scoped_lock Guard(Kernel->Mutex); + if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; + } ZE2UR_CALL(zeKernelSetArgumentValue, (Kernel->ZeKernel, ArgIndex, sizeof(void *), &ArgValue->ZeSampler)); @@ -1085,6 +1115,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( // The ArgValue may be a NULL pointer in which case a NULL value is used for // the kernel argument declared as a pointer to global or constant memory. + if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; + } + ur_mem_handle_t_ *UrMem = ur_cast(ArgValue); ur_mem_handle_t_::access_mode_t UrAccessMode = ur_mem_handle_t_::read_write; diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp index 95650a7b94..585a10ef4f 100644 --- a/source/adapters/level_zero/memory.cpp +++ b/source/adapters/level_zero/memory.cpp @@ -42,6 +42,19 @@ bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr) { return (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_DEVICE); } +// Helper function to check if a pointer is a shared pointer. +bool IsSharedPointer(ur_context_handle_t Context, const void *Ptr) { + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + + // Query memory type of the pointer + ZE2UR_CALL(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + return (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_SHARED); +} + // Shared by all memory read/write/copy PI interfaces. 
// PI interfaces must have queue's and destination buffer's mutexes locked for // exclusive use and source buffer's mutex locked for shared use on entry. @@ -1191,6 +1204,12 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueUSMMemcpy( // (versus compute engine). bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) || !IsDevicePointer(Queue->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. + if (Queue->Device->isDG2() && (IsSharedPointer(Queue->Context, Src) || + IsSharedPointer(Queue->Context, Dst))) { + PreferCopyEngine = false; + } // Temporary option added to use copy engine for D2D copy PreferCopyEngine |= UseCopyEngineForD2DCopy; @@ -1390,6 +1409,12 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueUSMMemcpy2D( // (versus compute engine). bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) || !IsDevicePointer(Queue->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. 
+ if (Queue->Device->isDG2() && (IsSharedPointer(Queue->Context, Src) || + IsSharedPointer(Queue->Context, Dst))) { + PreferCopyEngine = false; + } // Temporary option added to use copy engine for D2D copy PreferCopyEngine |= UseCopyEngineForD2DCopy; diff --git a/source/adapters/level_zero/memory.hpp b/source/adapters/level_zero/memory.hpp index b590165947..43d548f16b 100644 --- a/source/adapters/level_zero/memory.hpp +++ b/source/adapters/level_zero/memory.hpp @@ -32,6 +32,7 @@ using ur_queue_handle_legacy_t = ur_queue_handle_legacy_t_ *; struct ur_device_handle_t_; bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr); +bool IsSharedPointer(ur_context_handle_t Context, const void *Ptr); // This is an experimental option to test performance of device to device copy // operations on copy engines (versus compute engine) diff --git a/source/adapters/level_zero/platform.cpp b/source/adapters/level_zero/platform.cpp index 02b3663710..f51a8f1aa7 100644 --- a/source/adapters/level_zero/platform.cpp +++ b/source/adapters/level_zero/platform.cpp @@ -266,6 +266,67 @@ ur_result_t ur_platform_handle_t_::initialize() { return UR_RESULT_SUCCESS; } +/// Checks the version of the level-zero driver. +/// @param VersionMajor Major verion number to compare to. +/// @param VersionMinor Minor verion number to compare to. +/// @param VersionBuild Build verion number to compare to. 
+/// @return true is the version of the driver is higher than or equal to the +/// compared version +bool ur_platform_handle_t_::isDriverVersionNewerOrSimilar( + uint32_t VersionMajor, uint32_t VersionMinor, uint32_t VersionBuild) { + uint32_t DriverVersionMajor = 0; + uint32_t DriverVersionMinor = 0; + uint32_t DriverVersionBuild = 0; + if (!ZeDriverVersionString.Supported) { + ZeStruct ZeDriverProperties; + ZE2UR_CALL(zeDriverGetProperties, (ZeDriver, &ZeDriverProperties)); + uint32_t DriverVersion = ZeDriverProperties.driverVersion; + DriverVersionMajor = (DriverVersion & 0xFF000000) >> 24; + DriverVersionMinor = (DriverVersion & 0x00FF0000) >> 16; + DriverVersionBuild = DriverVersion & 0x0000FFFF; + } else { + std::string ZeDriverVersion; + size_t sizeOfDriverString = 0; + ZeDriverVersionString.getDriverVersionString(ZeDriverHandleExpTranslated, + nullptr, &sizeOfDriverString); + ZeDriverVersion.resize(sizeOfDriverString); + ZeDriverVersionString.getDriverVersionString(ZeDriverHandleExpTranslated, + ZeDriverVersion.data(), + &sizeOfDriverString); + + // Intel driver version string is in the format: + // Major.Minor.Build+Hotfix where hotfix is optional. + std::stringstream VersionString(ZeDriverVersion); + + std::string VersionValue; + std::vector VersionValues; + char VersionDelim = '.'; + char HotfixDelim = '+'; + + while (getline(VersionString, VersionValue, VersionDelim)) { + VersionValues.push_back(VersionValue); + } + // If the extension exists, but the string value comes by empty or + // malformed, assume this is a developer driver. + if (VersionValues.size() >= 3) { + DriverVersionMajor = atoi(VersionValues[0].c_str()); + DriverVersionMinor = atoi(VersionValues[1].c_str()); + std::stringstream HotfixString(VersionValues[2]); + std::vector BuildHotfixVersionValues; + // Check to see if there is a hotfix value and strip it off. 
+ while (getline(HotfixString, VersionValue, HotfixDelim)) { + BuildHotfixVersionValues.push_back(VersionValue); + } + DriverVersionBuild = atoi(BuildHotfixVersionValues[0].c_str()); + } else { + return true; + } + } + return std::make_tuple(DriverVersionMajor, DriverVersionMinor, + DriverVersionBuild) >= + std::make_tuple(VersionMajor, VersionMinor, VersionBuild); +} + // Get the cached PI device created for the L0 device handle. // Return NULL if no such PI device found. ur_device_handle_t diff --git a/source/adapters/level_zero/platform.hpp b/source/adapters/level_zero/platform.hpp index f9fdcb117e..fa15c88bdf 100644 --- a/source/adapters/level_zero/platform.hpp +++ b/source/adapters/level_zero/platform.hpp @@ -62,6 +62,11 @@ struct ur_platform_handle_t_ : public _ur_platform { // If not found, then nullptr is returned. ur_device_handle_t getDeviceFromNativeHandle(ze_device_handle_t); + /// Checks the version of the level-zero driver. + bool isDriverVersionNewerOrSimilar(uint32_t VersionMajor, + uint32_t VersionMinor, + uint32_t VersionBuild); + // Keep track of all contexts in the platform. This is needed to manage // a lifetime of memory allocations in each context when there are kernels // with indirect access. diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp index 26c75aef31..2b40d736c4 100644 --- a/source/adapters/level_zero/program.cpp +++ b/source/adapters/level_zero/program.cpp @@ -58,6 +58,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL( *Program ///< [out] pointer to handle of program object created. 
) { std::ignore = Properties; + UR_ASSERT(Context, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(IL && Program, UR_RESULT_ERROR_INVALID_NULL_POINTER); try { ur_program_handle_t_ *UrProgram = new ur_program_handle_t_(ur_program_handle_t_::IL, Context, IL, Length); @@ -82,8 +84,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( ur_program_handle_t *Program ///< [out] pointer to handle of Program object created. ) { - std::ignore = Device; - std::ignore = Properties; // In OpenCL, clCreateProgramWithBinary() can be used to load any of the // following: "program executable", "compiled program", or "library of // compiled programs". In addition, the loaded program can be either @@ -96,8 +96,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( // information to distinguish the cases. try { - ur_program_handle_t_ *UrProgram = new ur_program_handle_t_( - ur_program_handle_t_::Native, Context, Binary, Size); + ur_program_handle_t_ *UrProgram = + new ur_program_handle_t_(ur_program_handle_t_::Native, Context, Device, + Properties, Binary, Size); *Program = reinterpret_cast(UrProgram); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; @@ -208,8 +209,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp( } } hProgram->ZeModuleMap.insert(std::make_pair(ZeDevice, ZeModuleHandle)); - hProgram->ZeBuildLogMap.insert(std::make_pair(ZeDevice, ZeBuildLog)); } + hProgram->ZeBuildLogMap.insert(std::make_pair(ZeDevice, ZeBuildLog)); } // We no longer need the IL / native code. @@ -597,11 +598,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetGlobalVariablePointer( void **GlobalVariablePointerRet ///< [out] Returns the pointer to the global ///< variable if it is found in the program. 
) { - std::ignore = Device; std::scoped_lock lock(Program->Mutex); + ze_module_handle_t ZeModuleEntry{}; + ZeModuleEntry = Program->ZeModule; + if (!Program->ZeModuleMap.empty()) { + auto It = Program->ZeModuleMap.find(Device->ZeDevice); + if (It != Program->ZeModuleMap.end()) { + ZeModuleEntry = It->second; + } + } + ze_result_t ZeResult = - zeModuleGetGlobalPointer(Program->ZeModule, GlobalVariableName, + zeModuleGetGlobalPointer(ZeModuleEntry, GlobalVariableName, GlobalVariableSizeRet, GlobalVariablePointerRet); if (ZeResult == ZE_RESULT_ERROR_UNSUPPORTED_FEATURE) { @@ -632,11 +641,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( case UR_PROGRAM_INFO_CONTEXT: return ReturnValue(Program->Context); case UR_PROGRAM_INFO_NUM_DEVICES: - // TODO: return true number of devices this program exists for. - return ReturnValue(uint32_t{1}); + if (!Program->ZeModuleMap.empty()) + return ReturnValue( + uint32_t{ur_cast(Program->ZeModuleMap.size())}); + else + return ReturnValue(uint32_t{1}); case UR_PROGRAM_INFO_DEVICES: - // TODO: return all devices this program exists for. 
- return ReturnValue(Program->Context->Devices[0]); + if (!Program->ZeModuleMap.empty()) { + std::vector devices; + for (auto &ZeModulePair : Program->ZeModuleMap) { + auto It = Program->ZeModuleMap.find(ZeModulePair.first); + if (It != Program->ZeModuleMap.end()) { + for (auto &Device : Program->Context->Devices) { + if (Device->ZeDevice == ZeModulePair.first) { + devices.push_back(Device); + } + } + } + } + return ReturnValue(devices.data(), devices.size()); + } else { + return ReturnValue(Program->Context->Devices[0]); + } case UR_PROGRAM_INFO_BINARY_SIZES: { std::shared_lock Guard(Program->Mutex); size_t SzBinary; @@ -645,8 +671,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( Program->State == ur_program_handle_t_::Object) { SzBinary = Program->CodeLength; } else if (Program->State == ur_program_handle_t_::Exe) { - ZE2UR_CALL(zeModuleGetNativeBinary, - (Program->ZeModule, &SzBinary, nullptr)); + if (!Program->ZeModuleMap.empty()) { + std::vector binarySizes; + for (auto &ZeModulePair : Program->ZeModuleMap) { + size_t binarySize = 0; + ZE2UR_CALL(zeModuleGetNativeBinary, + (ZeModulePair.second, &binarySize, nullptr)); + binarySizes.push_back(binarySize); + } + return ReturnValue(binarySizes.data(), binarySizes.size()); + } else { + ZE2UR_CALL(zeModuleGetNativeBinary, + (Program->ZeModule, &SzBinary, nullptr)); + return ReturnValue(SzBinary); + } } else { return UR_RESULT_ERROR_INVALID_PROGRAM; } @@ -655,22 +693,52 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( } case UR_PROGRAM_INFO_BINARIES: { // The caller sets "ParamValue" to an array of pointers, one for each - // device. Since Level Zero supports only one device, there is only one - // pointer. If the pointer is NULL, we don't do anything. Otherwise, we - // copy the program's binary image to the buffer at that pointer. - uint8_t **PBinary = ur_cast(ProgramInfo); - if (!PBinary[0]) - break; - + // device. 
+ uint8_t **PBinary = nullptr; + if (ProgramInfo) { + PBinary = ur_cast(ProgramInfo); + if (!PBinary[0]) { + break; + } + } std::shared_lock Guard(Program->Mutex); + // If the caller is using a Program which is IL, Native or an object, then + // the program has not been built for multiple devices so a single IL is + // returned. if (Program->State == ur_program_handle_t_::IL || Program->State == ur_program_handle_t_::Native || Program->State == ur_program_handle_t_::Object) { - std::memcpy(PBinary[0], Program->Code.get(), Program->CodeLength); + if (PropSizeRet) + *PropSizeRet = Program->CodeLength; + if (PBinary) { + std::memcpy(PBinary[0], Program->Code.get(), Program->CodeLength); + } } else if (Program->State == ur_program_handle_t_::Exe) { + // If the caller is using a Program which is a built binary, then + // the program returned will either be a single module if this is a native + // binary or the native binary for each device will be returned. size_t SzBinary = 0; - ZE2UR_CALL(zeModuleGetNativeBinary, - (Program->ZeModule, &SzBinary, PBinary[0])); + uint8_t *NativeBinaryPtr = nullptr; + if (PBinary) { + NativeBinaryPtr = PBinary[0]; + } + if (!Program->ZeModuleMap.empty()) { + uint32_t deviceIndex = 0; + for (auto &ZeDeviceModule : Program->ZeModuleMap) { + size_t binarySize = 0; + if (PBinary) { + NativeBinaryPtr = PBinary[deviceIndex++]; + } + ZE2UR_CALL(zeModuleGetNativeBinary, + (ZeDeviceModule.second, &binarySize, NativeBinaryPtr)); + SzBinary += binarySize; + } + } else { + ZE2UR_CALL(zeModuleGetNativeBinary, + (Program->ZeModule, &SzBinary, NativeBinaryPtr)); + } + if (PropSizeRet) + *PropSizeRet = SzBinary; } else { return UR_RESULT_ERROR_INVALID_PROGRAM; } @@ -678,15 +746,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( } case UR_PROGRAM_INFO_NUM_KERNELS: { std::shared_lock Guard(Program->Mutex); - uint32_t NumKernels; + uint32_t NumKernels = 0; if (Program->State == ur_program_handle_t_::IL || Program->State == 
ur_program_handle_t_::Native || Program->State == ur_program_handle_t_::Object) { return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; } else if (Program->State == ur_program_handle_t_::Exe) { - NumKernels = 0; - ZE2UR_CALL(zeModuleGetKernelNames, - (Program->ZeModule, &NumKernels, nullptr)); + if (!Program->ZeModuleMap.empty()) { + ZE2UR_CALL( + zeModuleGetKernelNames, + (Program->ZeModuleMap.begin()->second, &NumKernels, nullptr)); + } else { + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &NumKernels, nullptr)); + } } else { return UR_RESULT_ERROR_INVALID_PROGRAM; } @@ -702,11 +775,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; } else if (Program->State == ur_program_handle_t_::Exe) { uint32_t Count = 0; - ZE2UR_CALL(zeModuleGetKernelNames, - (Program->ZeModule, &Count, nullptr)); - std::unique_ptr PNames(new const char *[Count]); - ZE2UR_CALL(zeModuleGetKernelNames, - (Program->ZeModule, &Count, PNames.get())); + std::unique_ptr PNames; + if (!Program->ZeModuleMap.empty()) { + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModuleMap.begin()->second, &Count, nullptr)); + PNames = std::make_unique(Count); + ZE2UR_CALL( + zeModuleGetKernelNames, + (Program->ZeModuleMap.begin()->second, &Count, PNames.get())); + } else { + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &Count, nullptr)); + PNames = std::make_unique(Count); + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &Count, PNames.get())); + } for (uint32_t I = 0; I < Count; ++I) { PINames += (I > 0 ? ";" : ""); PINames += PNames[I]; @@ -720,8 +803,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( } catch (...) 
{ return UR_RESULT_ERROR_UNKNOWN; } + case UR_PROGRAM_INFO_SOURCE: + return ReturnValue(Program->Code.get()); default: - die("urProgramGetInfo: not implemented"); + return UR_RESULT_ERROR_INVALID_ENUMERATION; } return UR_RESULT_SUCCESS; @@ -761,6 +846,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetBuildInfo( // return for programs that were built outside and registered // with urProgramRegister? return ReturnValue(""); + } else if (PropName == UR_PROGRAM_BUILD_INFO_STATUS) { + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } else if (PropName == UR_PROGRAM_BUILD_INFO_LOG) { // Check first to see if the plugin code recorded an error message. if (!Program->ErrorMessage.empty()) { @@ -852,6 +939,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( ///< program object created. ) { std::ignore = Properties; + UR_ASSERT(Context && NativeProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(Program, UR_RESULT_ERROR_INVALID_NULL_POINTER); auto ZeModule = ur_cast(NativeProgram); // We assume here that programs created from a native handle always diff --git a/source/adapters/level_zero/program.hpp b/source/adapters/level_zero/program.hpp index 8d148c8fa2..42330adcbf 100644 --- a/source/adapters/level_zero/program.hpp +++ b/source/adapters/level_zero/program.hpp @@ -65,10 +65,21 @@ struct ur_program_handle_t_ : _ur_object { ze_module_constants_t ZeSpecConstants; }; - // Construct a program in IL or Native state. + // Construct a program in IL. ur_program_handle_t_(state St, ur_context_handle_t Context, const void *Input, size_t Length) - : Context{Context}, + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{true}, State{St}, Code{new uint8_t[Length]}, + CodeLength{Length}, ZeModule{nullptr}, ZeBuildLog{nullptr} { + std::memcpy(Code.get(), Input, Length); + } + + // Construct a program in NATIVE. 
+ ur_program_handle_t_(state St, ur_context_handle_t Context, + ur_device_handle_t Device, + const ur_program_properties_t *Properties, + const void *Input, size_t Length) + : Context{Context}, NativeDevice(Device), NativeProperties(Properties), OwnZeModule{true}, State{St}, Code{new uint8_t[Length]}, CodeLength{Length}, ZeModule{nullptr}, ZeBuildLog{nullptr} { std::memcpy(Code.get(), Input, Length); @@ -78,26 +89,29 @@ struct ur_program_handle_t_ : _ur_object { ur_program_handle_t_(state St, ur_context_handle_t Context, ze_module_handle_t ZeModule, ze_module_build_log_handle_t ZeBuildLog) - : Context{Context}, OwnZeModule{true}, State{St}, ZeModule{ZeModule}, - ZeBuildLog{ZeBuildLog} {} + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{true}, State{St}, ZeModule{ZeModule}, ZeBuildLog{ + ZeBuildLog} {} // Construct a program in Exe state (interop). ur_program_handle_t_(state St, ur_context_handle_t Context, ze_module_handle_t ZeModule, bool OwnZeModule) - : Context{Context}, OwnZeModule{OwnZeModule}, State{St}, - ZeModule{ZeModule}, ZeBuildLog{nullptr} {} + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{OwnZeModule}, State{St}, ZeModule{ZeModule}, ZeBuildLog{ + nullptr} {} // Construct a program from native handle ur_program_handle_t_(state St, ur_context_handle_t Context, ze_module_handle_t ZeModule) - : Context{Context}, OwnZeModule{true}, State{St}, ZeModule{ZeModule}, - ZeBuildLog{nullptr} {} + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{true}, State{St}, ZeModule{ZeModule}, ZeBuildLog{nullptr} {} // Construct a program in Invalid state with a custom error message. 
ur_program_handle_t_(state St, ur_context_handle_t Context, const std::string &ErrorMessage) - : Context{Context}, OwnZeModule{true}, ErrorMessage{ErrorMessage}, - State{St}, ZeModule{nullptr}, ZeBuildLog{nullptr} {} + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{true}, ErrorMessage{ErrorMessage}, State{St}, + ZeModule{nullptr}, ZeBuildLog{nullptr} {} ~ur_program_handle_t_(); void ur_release_program_resources(bool deletion); @@ -108,6 +122,12 @@ struct ur_program_handle_t_ : _ur_object { const ur_context_handle_t Context; // Context of the program. + // Device Handle used for the Native Build + ur_device_handle_t NativeDevice; + + // Properties used for the Native Build + const ur_program_properties_t *NativeProperties; + // Indicates if we own the ZeModule or it came from interop that // asked to not transfer the ownership to SYCL RT. const bool OwnZeModule; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index f467447753..34da252c74 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -607,7 +607,7 @@ ur_result_t ur_queue_handle_legacy_t_::queueRelease() { // internal reference count. When the External Reference count == 0, then // cleanup of the queue begins and the final decrement of the internal // reference count is completed. 
- Queue->RefCount.decrementAndTest(); + static_cast(Queue->RefCount.decrementAndTest()); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index 939a625122..8941f756ea 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -387,6 +387,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnImportExternalMemoryExp = urBindlessImagesImportExternalMemoryExp; pDdiTable->pfnMapExternalArrayExp = urBindlessImagesMapExternalArrayExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + urBindlessImagesMapExternalLinearMemoryExp; pDdiTable->pfnReleaseExternalMemoryExp = urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index a25c57e21b..1069ec78da 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -311,8 +311,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. // L0 spec says that alignment values that are not powers of 2 are invalid. - if (Align > 65536 || (Align & (Align - 1)) != 0) - return UR_RESULT_ERROR_INVALID_VALUE; + // If alignment == 0, then we are allowing the L0 driver to choose the + // alignment so no need to check. + if (Align > 0) { + if (Align > 65536 || (Align & (Align - 1)) != 0) + return UR_RESULT_ERROR_INVALID_VALUE; + } ur_platform_handle_t Plt = Context->getPlatform(); // If indirect access tracking is enabled then lock the mutex which is @@ -381,8 +385,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. 
// L0 spec says that alignment values that are not powers of 2 are invalid. - if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0) - return UR_RESULT_ERROR_INVALID_VALUE; + // If alignment == 0, then we are allowing the L0 driver to choose the + // alignment so no need to check. + if (Alignment > 0) { + if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0) + return UR_RESULT_ERROR_INVALID_VALUE; + } ur_platform_handle_t Plt = Device->Platform; @@ -482,8 +490,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. // L0 spec says that alignment values that are not powers of 2 are invalid. - if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0) - return UR_RESULT_ERROR_INVALID_VALUE; + // If alignment == 0, then we are allowing the L0 driver to choose the + // alignment so no need to check. + if (Alignment > 0) { + if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0) + return UR_RESULT_ERROR_INVALID_VALUE; + } ur_platform_handle_t Plt = Device->Platform; diff --git a/source/adapters/level_zero/v2/event_pool.hpp b/source/adapters/level_zero/v2/event_pool.hpp index 8976daa939..8dd66654ad 100644 --- a/source/adapters/level_zero/v2/event_pool.hpp +++ b/source/adapters/level_zero/v2/event_pool.hpp @@ -37,7 +37,7 @@ class event_pool { event_pool(const event_pool &) = delete; event_pool &operator=(const event_pool &) = delete; - DeviceId Id() { return provider->device()->Id; }; + DeviceId Id() { return provider->device()->Id.value(); }; ur_event *allocate(); void free(ur_event *event); diff --git a/source/adapters/mock/ur_mock.cpp b/source/adapters/mock/ur_mock.cpp index b1fc9c8c29..c72c1e30ed 100644 --- a/source/adapters/mock/ur_mock.cpp +++ b/source/adapters/mock/ur_mock.cpp @@ -17,13 +17,14 @@ namespace driver { context_t d_context; ur_result_t mock_urPlatformGetApiVersion(void *pParams) { - auto params = 
*static_cast(pParams); + const auto ¶ms = + *static_cast(pParams); **params.ppVersion = d_context.version; return UR_RESULT_SUCCESS; } ur_result_t mock_urPlatformGetInfo(void *pParams) { - auto params = *static_cast(pParams); + const auto ¶ms = *static_cast(pParams); if (!*params.phPlatform) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } @@ -49,7 +50,7 @@ ur_result_t mock_urPlatformGetInfo(void *pParams) { ////////////////////////////////////////////////////////////////////////// ur_result_t mock_urDeviceGetInfo(void *pParams) { - auto params = *static_cast(pParams); + const auto ¶ms = *static_cast(pParams); switch (*params.ppropName) { case UR_DEVICE_INFO_TYPE: if (*params.ppPropValue != nullptr) { diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index 1f42e5dbbe..d792c3bd2c 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -921,7 +921,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. 
ur_device_handle_t @@ -930,7 +931,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_result_t result = UR_RESULT_SUCCESS; ur_device_create_with_native_handle_params_t params = { - &hNativeDevice, &hPlatform, &pProperties, &phDevice}; + &hNativeDevice, &hAdapter, &pProperties, &phDevice}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -7805,6 +7806,58 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urBindlessImagesMapExternalLinearMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory + ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_bindless_images_map_external_linear_memory_exp_params_t params = { + &hContext, &hDevice, &offset, &size, &hExternalMem, &ppRetMem}; + + auto beforeCallback = reinterpret_cast( + mock::getCallbacks().get_before_callback( + "urBindlessImagesMapExternalLinearMemoryExp")); + if (beforeCallback) { + result = beforeCallback(¶ms); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast( + mock::getCallbacks().get_replace_callback( + "urBindlessImagesMapExternalLinearMemoryExp")); + if (replaceCallback) { + result = replaceCallback(¶ms); + } else { + + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = 
reinterpret_cast( + mock::getCallbacks().get_after_callback( + "urBindlessImagesMapExternalLinearMemoryExp")); + if (afterCallback) { + return afterCallback(¶ms); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urBindlessImagesReleaseExternalMemoryExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( @@ -10272,6 +10325,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnMapExternalArrayExp = driver::urBindlessImagesMapExternalArrayExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + driver::urBindlessImagesMapExternalLinearMemoryExp; + pDdiTable->pfnReleaseExternalMemoryExp = driver::urBindlessImagesReleaseExternalMemoryExp; diff --git a/source/adapters/native_cpu/context.hpp b/source/adapters/native_cpu/context.hpp index 30bfb31d71..0595f211d9 100644 --- a/source/adapters/native_cpu/context.hpp +++ b/source/adapters/native_cpu/context.hpp @@ -10,13 +10,141 @@ #pragma once +#include +#include #include #include "common.hpp" #include "device.hpp" +#include "ur/ur.hpp" + +namespace native_cpu { +struct usm_alloc_info { + ur_usm_type_t type; + const void *base_ptr; + size_t size; + ur_device_handle_t device; + ur_usm_pool_handle_t pool; + + // We store a pointer to the actual allocation because it is needed when + // freeing memory. 
+ void *base_alloc_ptr; + constexpr usm_alloc_info(ur_usm_type_t type, const void *base_ptr, + size_t size, ur_device_handle_t device, + ur_usm_pool_handle_t pool, void *base_alloc_ptr) + : type(type), base_ptr(base_ptr), size(size), device(device), pool(pool), + base_alloc_ptr(base_alloc_ptr) {} +}; + +constexpr usm_alloc_info usm_alloc_info_null_entry(UR_USM_TYPE_UNKNOWN, nullptr, + 0, nullptr, nullptr, + nullptr); + +constexpr size_t alloc_header_size = sizeof(usm_alloc_info); + +// Computes the padding that we need to add to ensure the +// pointer returned by UR is aligned as the user requested. +static size_t get_padding(uint32_t alignment) { + assert(alignment >= alignof(usm_alloc_info) && + "memory not aligned to usm_alloc_info"); + if (!alignment || alloc_header_size % alignment == 0) + return 0; + size_t padd = 0; + if (alignment <= alloc_header_size) { + padd = alignment - (alloc_header_size % alignment); + } else { + padd = alignment - alloc_header_size; + } + return padd; +} + +// In order to satisfy the MemAllocInfo queries we allocate extra memory +// for the native_cpu::usm_alloc_info struct. +// To satisfy the alignment requirements we "pad" the memory +// allocation so that the pointer returned to the user +// always satisfies (ptr % align) == 0. +static inline void *malloc_impl(uint32_t alignment, size_t size) { + void *ptr = nullptr; + assert(alignment >= alignof(usm_alloc_info) && + "memory not aligned to usm_alloc_info"); +#ifdef _MSC_VER + ptr = _aligned_malloc(alloc_header_size + get_padding(alignment) + size, + alignment); + +#else + ptr = std::aligned_alloc(alignment, + alloc_header_size + get_padding(alignment) + size); +#endif + return ptr; +} + +// The info struct is retrieved by subtracting its size from the pointer +// returned to the user. 
+static inline uint8_t *get_alloc_info_addr(const void *ptr) { + return (uint8_t *)const_cast(ptr) - alloc_header_size; +} + +static usm_alloc_info get_alloc_info(void *ptr) { + return *(usm_alloc_info *)get_alloc_info_addr(ptr); +} + +} // namespace native_cpu struct ur_context_handle_t_ : RefCounted { ur_context_handle_t_(ur_device_handle_t_ *phDevices) : _device{phDevices} {} ur_device_handle_t _device; + + ur_result_t remove_alloc(void *ptr) { + std::lock_guard lock(alloc_mutex); + const native_cpu::usm_alloc_info &info = native_cpu::get_alloc_info(ptr); + UR_ASSERT(info.type != UR_USM_TYPE_UNKNOWN, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); +#ifdef _MSC_VER + _aligned_free(info.base_alloc_ptr); +#else + free(info.base_alloc_ptr); +#endif + allocations.erase(ptr); + return UR_RESULT_SUCCESS; + } + + const native_cpu::usm_alloc_info & + get_alloc_info_entry(const void *ptr) const { + auto it = allocations.find(ptr); + if (it == allocations.end()) { + return native_cpu::usm_alloc_info_null_entry; + } + + return *(native_cpu::usm_alloc_info *)native_cpu::get_alloc_info_addr(ptr); + } + + void *add_alloc(uint32_t alignment, ur_usm_type_t type, size_t size, + ur_usm_pool_handle_t pool) { + std::lock_guard lock(alloc_mutex); + // We need to ensure that we align to at least alignof(usm_alloc_info), + // otherwise its start address may be unaligned. + alignment = + std::max(alignment, alignof(native_cpu::usm_alloc_info)); + void *alloc = native_cpu::malloc_impl(alignment, size); + if (!alloc) + return nullptr; + // Compute the address of the pointer that we'll return to the user. 
+ void *ptr = native_cpu::alloc_header_size + + native_cpu::get_padding(alignment) + (uint8_t *)alloc; + uint8_t *info_addr = native_cpu::get_alloc_info_addr(ptr); + if (!info_addr) + return nullptr; + // Do a placement new of the alloc_info to avoid allocation and copy + auto info = new (info_addr) + native_cpu::usm_alloc_info(type, ptr, size, this->_device, pool, alloc); + if (!info) + return nullptr; + allocations.insert(ptr); + return ptr; + } + +private: + std::mutex alloc_mutex; + std::set allocations; }; diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp index 64d99927ae..4b32c11e37 100644 --- a/source/adapters/native_cpu/device.cpp +++ b/source/adapters/native_cpu/device.cpp @@ -366,11 +366,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, + ur_native_handle_t hNativeDevice, ur_adapter_handle_t hAdapter, const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { std::ignore = hNativeDevice; - std::ignore = hPlatform; + std::ignore = hAdapter; std::ignore = pProperties; std::ignore = phDevice; diff --git a/source/adapters/native_cpu/image.cpp b/source/adapters/native_cpu/image.cpp index a1b862ad9f..d89990ed10 100644 --- a/source/adapters/native_cpu/image.cpp +++ b/source/adapters/native_cpu/image.cpp @@ -132,6 +132,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + [[maybe_unused]] ur_context_handle_t hContext, + [[maybe_unused]] ur_device_handle_t hDevice, + [[maybe_unused]] uint64_t offset, [[maybe_unused]] uint64_t size, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem, + [[maybe_unused]] void **phRetMem) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT 
ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp index 55c32eb84b..2f2f79cd5a 100644 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -81,8 +81,11 @@ class worker_thread { // Waits for all tasks to finish and destroys the worker thread inline void stop() { - m_isRunning.store(false, std::memory_order_release); - m_startWorkCondition.notify_all(); + { + std::lock_guard lock(m_workMutex); + m_isRunning.store(false, std::memory_order_release); + m_startWorkCondition.notify_all(); + } if (m_worker.joinable()) { // Wait for the worker thread to finish handling the task queue m_worker.join(); diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index 7b5f1b923d..ff6c9d8c0f 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -329,6 +329,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnImportExternalMemoryExp = urBindlessImagesImportExternalMemoryExp; pDdiTable->pfnMapExternalArrayExp = urBindlessImagesMapExternalArrayExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + urBindlessImagesMapExternalLinearMemoryExp; pDdiTable->pfnReleaseExternalMemoryExp = urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = diff --git a/source/adapters/native_cpu/usm.cpp b/source/adapters/native_cpu/usm.cpp index 45ac0596f3..dcae1881f1 100644 --- a/source/adapters/native_cpu/usm.cpp +++ b/source/adapters/native_cpu/usm.cpp @@ -8,90 +8,98 @@ // //===----------------------------------------------------------------------===// +#include "ur/ur.hpp" #include "ur_api.h" #include "common.hpp" +#include "context.hpp" +#include -UR_APIEXPORT 
ur_result_t UR_APICALL -urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, - ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - std::ignore = hContext; - std::ignore = pUSMDesc; - std::ignore = pool; +namespace native_cpu { +static ur_result_t alloc_helper(ur_context_handle_t hContext, + const ur_usm_desc_t *pUSMDesc, size_t size, + void **ppMem, ur_usm_type_t type) { + auto alignment = pUSMDesc ? pUSMDesc->align : 1u; + UR_ASSERT((alignment & (alignment - 1)) == 0, UR_RESULT_ERROR_INVALID_VALUE); UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); // TODO: Check Max size when UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE is implemented UR_ASSERT(size > 0, UR_RESULT_ERROR_INVALID_USM_SIZE); - *ppMem = malloc(size); + auto *ptr = hContext->add_alloc(alignment, type, size, nullptr); + UR_ASSERT(ptr != nullptr, UR_RESULT_ERROR_OUT_OF_RESOURCES); + *ppMem = ptr; return UR_RESULT_SUCCESS; } +} // namespace native_cpu + +UR_APIEXPORT ur_result_t UR_APICALL +urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, size_t size, void **ppMem) { + std::ignore = pool; + + return native_cpu::alloc_helper(hContext, pUSMDesc, size, ppMem, + UR_USM_TYPE_HOST); +} + UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - std::ignore = hContext; std::ignore = hDevice; - std::ignore = pUSMDesc; std::ignore = pool; - UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - // TODO: Check Max size when UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE is implemented - UR_ASSERT(size > 0, UR_RESULT_ERROR_INVALID_USM_SIZE); - - *ppMem = malloc(size); - - return UR_RESULT_SUCCESS; + return native_cpu::alloc_helper(hContext, pUSMDesc, size, ppMem, + UR_USM_TYPE_DEVICE); } UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, const 
ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - std::ignore = hContext; std::ignore = hDevice; - std::ignore = pUSMDesc; std::ignore = pool; - UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - // TODO: Check Max size when UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE is implemented - UR_ASSERT(size > 0, UR_RESULT_ERROR_INVALID_USM_SIZE); - - *ppMem = malloc(size); - - return UR_RESULT_SUCCESS; + return native_cpu::alloc_helper(hContext, pUSMDesc, size, ppMem, + UR_USM_TYPE_SHARED); } UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, void *pMem) { - std::ignore = hContext; UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); - free(pMem); + auto res = hContext->remove_alloc(pMem); - return UR_RESULT_SUCCESS; + return res; } UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, ur_usm_alloc_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { - std::ignore = hContext; - std::ignore = pMem; - std::ignore = propName; - std::ignore = propSize; - std::ignore = pPropValue; - std::ignore = pPropSizeRet; + UR_ASSERT(pMem != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + if (propName == UR_USM_ALLOC_INFO_BASE_PTR) { + // TODO: logic to compute base ptr given ptr + DIE_NO_IMPLEMENTATION; + } + const native_cpu::usm_alloc_info &alloc_info = + hContext->get_alloc_info_entry(pMem); switch (propName) { case UR_USM_ALLOC_INFO_TYPE: - // Todo implement this in context - return ReturnValue(UR_USM_TYPE_DEVICE); + return ReturnValue(alloc_info.type); + case UR_USM_ALLOC_INFO_SIZE: + return ReturnValue(alloc_info.size); + case UR_USM_ALLOC_INFO_DEVICE: + return ReturnValue(alloc_info.device); + case UR_USM_ALLOC_INFO_POOL: + return ReturnValue(alloc_info.pool); default: DIE_NO_IMPLEMENTATION; } diff --git 
a/source/adapters/opencl/common.hpp b/source/adapters/opencl/common.hpp index 399f668077..18b08bf095 100644 --- a/source/adapters/opencl/common.hpp +++ b/source/adapters/opencl/common.hpp @@ -319,45 +319,33 @@ cl_int(CL_API_CALL *)(cl_command_buffer_khr command_buffer, template struct FuncPtrCache { std::map Map; std::mutex Mutex; + + void clear(cl_context context) { + std::lock_guard CacheLock{Mutex}; + Map.erase(context); + } }; -// FIXME: There's currently no mechanism for cleaning up this cache, meaning -// that it is invalidated whenever a context is destroyed. This could lead to -// reusing an invalid function pointer if another context happens to have the -// same native handle. struct ExtFuncPtrCacheT { - FuncPtrCache clHostMemAllocINTELCache; - FuncPtrCache clDeviceMemAllocINTELCache; - FuncPtrCache clSharedMemAllocINTELCache; - FuncPtrCache clGetDeviceFunctionPointerCache; - FuncPtrCache - clGetDeviceGlobalVariablePointerCache; - FuncPtrCache - clCreateBufferWithPropertiesINTELCache; - FuncPtrCache clMemBlockingFreeINTELCache; - FuncPtrCache - clSetKernelArgMemPointerINTELCache; - FuncPtrCache clEnqueueMemFillINTELCache; - FuncPtrCache clEnqueueMemcpyINTELCache; - FuncPtrCache clGetMemAllocInfoINTELCache; - FuncPtrCache - clEnqueueWriteGlobalVariableCache; - FuncPtrCache clEnqueueReadGlobalVariableCache; - FuncPtrCache clEnqueueReadHostPipeINTELCache; - FuncPtrCache clEnqueueWriteHostPipeINTELCache; - FuncPtrCache - clSetProgramSpecializationConstantCache; - FuncPtrCache clCreateCommandBufferKHRCache; - FuncPtrCache clRetainCommandBufferKHRCache; - FuncPtrCache clReleaseCommandBufferKHRCache; - FuncPtrCache clFinalizeCommandBufferKHRCache; - FuncPtrCache clCommandNDRangeKernelKHRCache; - FuncPtrCache clCommandCopyBufferKHRCache; - FuncPtrCache clCommandCopyBufferRectKHRCache; - FuncPtrCache clCommandFillBufferKHRCache; - FuncPtrCache clEnqueueCommandBufferKHRCache; - FuncPtrCache clGetCommandBufferInfoKHRCache; - FuncPtrCache 
clUpdateMutableCommandsKHRCache; +#define CL_EXTENSION_FUNC(func) FuncPtrCache func##Cache; + +#include "extension_functions.def" + +#undef CL_EXTENSION_FUNC + + // If a context stored in the current caching mechanism is destroyed by the + // CL driver all of its function pointers are invalidated. This can lead to a + // pathological case where a subsequently created context gets returned with + // a coincidentally identical handle to the destroyed one and ends up being + // used to retrieve bad function pointers. To avoid this we clear the cache + // when contexts are released. + void clearCache(cl_context context) { +#define CL_EXTENSION_FUNC(func) func##Cache.clear(context); + +#include "extension_functions.def" + +#undef CL_EXTENSION_FUNC + } }; // A raw pointer is used here since the lifetime of this map has to be tied to // piTeardown to avoid issues with static destruction order (a user application diff --git a/source/adapters/opencl/context.cpp b/source/adapters/opencl/context.cpp index 1478050cda..38202bbf58 100644 --- a/source/adapters/opencl/context.cpp +++ b/source/adapters/opencl/context.cpp @@ -113,9 +113,30 @@ urContextGetInfo(ur_context_handle_t hContext, ur_context_info_t propName, UR_APIEXPORT ur_result_t UR_APICALL urContextRelease(ur_context_handle_t hContext) { + // If we're reasonably sure this context is about to be destroyed we should + // clear the ext function pointer cache. This isn't foolproof sadly but it + // should drastically reduce the chances of the pathological case described + // in the comments in common.hpp. 
+ static std::mutex contextReleaseMutex; + auto clContext = cl_adapter::cast(hContext); - cl_int Ret = clReleaseContext(cl_adapter::cast(hContext)); - return mapCLErrorToUR(Ret); + { + std::lock_guard lock(contextReleaseMutex); + size_t refCount = 0; + CL_RETURN_ON_FAILURE(clGetContextInfo(clContext, CL_CONTEXT_REFERENCE_COUNT, + sizeof(size_t), &refCount, nullptr)); + + // ExtFuncPtrCache is destroyed in an atexit() callback, so it doesn't + // necessarily outlive the adapter (or all the contexts). + if (refCount == 1 && cl_ext::ExtFuncPtrCache) { + cl_ext::ExtFuncPtrCache->clearCache(clContext); + } + } + + CL_RETURN_ON_FAILURE( + clReleaseContext(cl_adapter::cast(hContext))); + + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index 44262df26a..a31d6580a0 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -1125,7 +1125,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t hNativeDevice, ur_platform_handle_t, + ur_native_handle_t hNativeDevice, ur_adapter_handle_t, const ur_device_native_properties_t *, ur_device_handle_t *phDevice) { *phDevice = reinterpret_cast(hNativeDevice); diff --git a/source/adapters/opencl/extension_functions.def b/source/adapters/opencl/extension_functions.def new file mode 100644 index 0000000000..76771744b2 --- /dev/null +++ b/source/adapters/opencl/extension_functions.def @@ -0,0 +1,27 @@ +CL_EXTENSION_FUNC(clHostMemAllocINTEL) +CL_EXTENSION_FUNC(clDeviceMemAllocINTEL) +CL_EXTENSION_FUNC(clSharedMemAllocINTEL) +CL_EXTENSION_FUNC(clGetDeviceFunctionPointer) +CL_EXTENSION_FUNC(clGetDeviceGlobalVariablePointer) +CL_EXTENSION_FUNC(clCreateBufferWithPropertiesINTEL) +CL_EXTENSION_FUNC(clMemBlockingFreeINTEL) +CL_EXTENSION_FUNC(clSetKernelArgMemPointerINTEL) +CL_EXTENSION_FUNC(clEnqueueMemFillINTEL) 
+CL_EXTENSION_FUNC(clEnqueueMemcpyINTEL) +CL_EXTENSION_FUNC(clGetMemAllocInfoINTEL) +CL_EXTENSION_FUNC(clEnqueueWriteGlobalVariable) +CL_EXTENSION_FUNC(clEnqueueReadGlobalVariable) +CL_EXTENSION_FUNC(clEnqueueReadHostPipeINTEL) +CL_EXTENSION_FUNC(clEnqueueWriteHostPipeINTEL) +CL_EXTENSION_FUNC(clSetProgramSpecializationConstant) +CL_EXTENSION_FUNC(clCreateCommandBufferKHR) +CL_EXTENSION_FUNC(clRetainCommandBufferKHR) +CL_EXTENSION_FUNC(clReleaseCommandBufferKHR) +CL_EXTENSION_FUNC(clFinalizeCommandBufferKHR) +CL_EXTENSION_FUNC(clCommandNDRangeKernelKHR) +CL_EXTENSION_FUNC(clCommandCopyBufferKHR) +CL_EXTENSION_FUNC(clCommandCopyBufferRectKHR) +CL_EXTENSION_FUNC(clCommandFillBufferKHR) +CL_EXTENSION_FUNC(clEnqueueCommandBufferKHR) +CL_EXTENSION_FUNC(clGetCommandBufferInfoKHR) +CL_EXTENSION_FUNC(clUpdateMutableCommandsKHR) diff --git a/source/adapters/opencl/image.cpp b/source/adapters/opencl/image.cpp index c33bb57b0f..0c628594bb 100644 --- a/source/adapters/opencl/image.cpp +++ b/source/adapters/opencl/image.cpp @@ -132,6 +132,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + [[maybe_unused]] ur_context_handle_t hContext, + [[maybe_unused]] ur_device_handle_t hDevice, + [[maybe_unused]] uint64_t offset, [[maybe_unused]] uint64_t size, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem, + [[maybe_unused]] void **phRetMem) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp index 41c6d6de70..9735abefbf 100644 --- a/source/adapters/opencl/kernel.cpp +++ b/source/adapters/opencl/kernel.cpp @@ -206,19 +206,14 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t 
hKernel, ur_device_handle_t hDevice, // Two calls to urDeviceGetInfo are needed: the first determines the size // required to store the result, and the second returns the actual size // values. - ur_result_t URRet = - urDeviceGetInfo(hDevice, UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL, 0, - nullptr, &ResultSize); - if (URRet != UR_RESULT_SUCCESS) { - return URRet; - } - assert(ResultSize % sizeof(size_t) == 0); - std::vector Result(ResultSize / sizeof(size_t)); - URRet = urDeviceGetInfo(hDevice, UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL, - ResultSize, Result.data(), nullptr); - if (URRet != UR_RESULT_SUCCESS) { - return URRet; - } + UR_RETURN_ON_FAILURE(urDeviceGetInfo(hDevice, + UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL, + 0, nullptr, &ResultSize)); + assert(ResultSize % sizeof(uint32_t) == 0); + std::vector Result(ResultSize / sizeof(uint32_t)); + UR_RETURN_ON_FAILURE(urDeviceGetInfo(hDevice, + UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL, + ResultSize, Result.data(), nullptr)); RetVal = *std::max_element(Result.begin(), Result.end()); Ret = CL_SUCCESS; } else if (propName == UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL) { diff --git a/source/adapters/opencl/sampler.cpp b/source/adapters/opencl/sampler.cpp index f05177a987..a47ba7f894 100644 --- a/source/adapters/opencl/sampler.cpp +++ b/source/adapters/opencl/sampler.cpp @@ -158,16 +158,38 @@ urSamplerGetInfo(ur_sampler_handle_t hSampler, ur_sampler_info_t propName, static_assert(sizeof(cl_addressing_mode) == sizeof(ur_sampler_addressing_mode_t)); - size_t CheckPropSize = 0; - ur_result_t Err = mapCLErrorToUR( - clGetSamplerInfo(cl_adapter::cast(hSampler), SamplerInfo, - propSize, pPropValue, &CheckPropSize)); - if (pPropValue && CheckPropSize != propSize) { - return UR_RESULT_ERROR_INVALID_SIZE; - } - UR_RETURN_ON_FAILURE(Err); - if (pPropSizeRet) { - *pPropSizeRet = CheckPropSize; + ur_result_t Err = UR_RESULT_SUCCESS; + // ur_bool_t has the size of uint8_t, but cl_bool has the size of + // uint32_t so this adjusts 
UR_SAMPLER_INFO_NORMALIZED_COORDS info to map + // between them. + if (propName == UR_SAMPLER_INFO_NORMALIZED_COORDS) { + cl_bool normalized_coords = false; + Err = mapCLErrorToUR( + clGetSamplerInfo(cl_adapter::cast(hSampler), SamplerInfo, + sizeof(cl_bool), &normalized_coords, nullptr)); + if (pPropValue && propSize != sizeof(ur_bool_t)) { + return UR_RESULT_ERROR_INVALID_SIZE; + } + UR_RETURN_ON_FAILURE(Err); + if (pPropValue) { + *static_cast(pPropValue) = + static_cast(normalized_coords); + } + if (pPropSizeRet) { + *pPropSizeRet = sizeof(ur_bool_t); + } + } else { + size_t CheckPropSize = 0; + Err = mapCLErrorToUR( + clGetSamplerInfo(cl_adapter::cast(hSampler), SamplerInfo, + propSize, pPropValue, &CheckPropSize)); + if (pPropValue && CheckPropSize != propSize) { + return UR_RESULT_ERROR_INVALID_SIZE; + } + UR_RETURN_ON_FAILURE(Err); + if (pPropSizeRet) { + *pPropSizeRet = CheckPropSize; + } } // Convert OpenCL returns to UR diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index 171c561c28..100bb888cf 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -347,6 +347,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnImportExternalMemoryExp = urBindlessImagesImportExternalMemoryExp; pDdiTable->pfnMapExternalArrayExp = urBindlessImagesMapExternalArrayExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + urBindlessImagesMapExternalLinearMemoryExp; pDdiTable->pfnReleaseExternalMemoryExp = urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 222b7f008b..e327d7672b 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -23,7 +23,8 @@ if (NOT DEFINED UMF_REPO) endif() if (NOT DEFINED UMF_TAG) - set(UMF_TAG 413327815feb1585bfb36b1f34750f1ba961ceed) + # v0.9.x 19.08.2024: 
Merge pull request #688 ... + set(UMF_TAG 59c4150b7120a7af5b3c8eb2d9b8bbb5d2e96aa3) endif() message(STATUS "Will fetch Unified Memory Framework from ${UMF_REPO}") diff --git a/source/common/linux/ur_lib_loader.cpp b/source/common/linux/ur_lib_loader.cpp index 53b6e0bebe..4da7f98bc1 100644 --- a/source/common/linux/ur_lib_loader.cpp +++ b/source/common/linux/ur_lib_loader.cpp @@ -45,7 +45,13 @@ LibLoader::loadAdapterLibrary(const char *name) { } #endif HMODULE handle = dlopen(name, mode); - logger::info("loaded adapter 0x{} ({})", handle, name); + if (!handle) { + char *err = dlerror(); + logger::info("failed to load adapter '{}' with error: {}", name, + err ? err : "unknown error"); + } else { + logger::info("loaded adapter 0x{} ({})", handle, name); + } return std::unique_ptr(handle); } diff --git a/source/common/ur_util.hpp b/source/common/ur_util.hpp index e24c1153c5..0475cf31e4 100644 --- a/source/common/ur_util.hpp +++ b/source/common/ur_util.hpp @@ -462,7 +462,7 @@ template class AtomicSingleton { static int release(std::function deleter) { auto val = instance.acquire(); - int ret = val->release(deleter); + int ret = val->release(std::move(deleter)); instance.release(); return ret; diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt index 5a7a419954..edfd8b055d 100644 --- a/source/loader/CMakeLists.txt +++ b/source/loader/CMakeLists.txt @@ -142,6 +142,8 @@ if(UR_ENABLE_SANITIZER) ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_report.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_shadow_setup.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_shadow_setup.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_validator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_validator.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/common.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/stacktrace.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/stacktrace.hpp @@ -164,7 +166,7 @@ if(UR_ENABLE_SANITIZER) 
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/linux/symbolizer.cpp ) target_include_directories(ur_loader PRIVATE ${LLVM_INCLUDE_DIRS}) - target_link_libraries(ur_loader PRIVATE LLVMSymbolize) + target_link_libraries(ur_loader PRIVATE LLVMSupport LLVMSymbolize) endif() target_include_directories(ur_loader PRIVATE diff --git a/source/loader/layers/sanitizer/asan_buffer.cpp b/source/loader/layers/sanitizer/asan_buffer.cpp index 4cf90c7da4..382d6e3ada 100644 --- a/source/loader/layers/sanitizer/asan_buffer.cpp +++ b/source/loader/layers/sanitizer/asan_buffer.cpp @@ -75,12 +75,14 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { return UR_RESULT_SUCCESS; } + std::scoped_lock Guard(Mutex); auto &Allocation = Allocations[Device]; + ur_result_t URes = UR_RESULT_SUCCESS; if (!Allocation) { ur_usm_desc_t USMDesc{}; USMDesc.align = getAlignment(); ur_usm_pool_handle_t Pool{}; - ur_result_t URes = getContext()->interceptor->allocateMemory( + URes = getContext()->interceptor->allocateMemory( Context, Device, &USMDesc, Pool, Size, AllocType::MEM_BUFFER, ur_cast(&Allocation)); if (URes != UR_RESULT_SUCCESS) { @@ -105,7 +107,60 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { Handle = Allocation; - return UR_RESULT_SUCCESS; + if (!LastSyncedDevice.hDevice) { + LastSyncedDevice = MemBuffer::Device_t{Device, Handle}; + return URes; + } + + // If the device required to allocate memory is not the previous one, we + // need to do data migration. 
+ if (Device != LastSyncedDevice.hDevice) { + auto &HostAllocation = Allocations[nullptr]; + if (!HostAllocation) { + ur_usm_desc_t USMDesc{}; + USMDesc.align = getAlignment(); + ur_usm_pool_handle_t Pool{}; + URes = getContext()->interceptor->allocateMemory( + Context, nullptr, &USMDesc, Pool, Size, AllocType::HOST_USM, + ur_cast(&HostAllocation)); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("Failed to allocate {} bytes host " + "USM for buffer {} migration", + Size, this); + return URes; + } + } + + // Copy data from last synced device to host + { + ManagedQueue Queue(Context, LastSyncedDevice.hDevice); + URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + Queue, true, HostAllocation, LastSyncedDevice.MemHandle, Size, + 0, nullptr, nullptr); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error( + "Failed to migrate memory buffer data"); + return URes; + } + } + + // Sync data back to device + { + ManagedQueue Queue(Context, Device); + URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + Queue, true, Allocation, HostAllocation, Size, 0, nullptr, + nullptr); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error( + "Failed to migrate memory buffer data"); + return URes; + } + } + } + + LastSyncedDevice = MemBuffer::Device_t{Device, Handle}; + + return URes; } ur_result_t MemBuffer::free() { diff --git a/source/loader/layers/sanitizer/asan_buffer.hpp b/source/loader/layers/sanitizer/asan_buffer.hpp index b4eba4e4ba..989ef4249f 100644 --- a/source/loader/layers/sanitizer/asan_buffer.hpp +++ b/source/loader/layers/sanitizer/asan_buffer.hpp @@ -48,6 +48,12 @@ struct MemBuffer { ur_context_handle_t Context; + struct Device_t { + ur_device_handle_t hDevice; + char *MemHandle; + }; + Device_t LastSyncedDevice{}; + size_t Size; char *HostPtr{}; diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index 0deb021a3f..ec1d5e8fad 100644 --- 
a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -16,6 +16,7 @@ #include "asan_quarantine.hpp" #include "asan_report.hpp" #include "asan_shadow_setup.hpp" +#include "asan_validator.hpp" #include "stacktrace.hpp" #include "ur_sanitizer_utils.hpp" @@ -185,6 +186,18 @@ SanitizerInterceptor::~SanitizerInterceptor() { DestroyShadowMemoryOnCPU(); DestroyShadowMemoryOnPVC(); DestroyShadowMemoryOnDG2(); + + // We must release these objects before releasing adapters, since + // they may use the adapter in their destructor + m_Quarantine = nullptr; + m_MemBufferMap.clear(); + m_AllocationMap.clear(); + m_KernelMap.clear(); + m_ContextMap.clear(); + + for (auto Adapter : m_Adapters) { + getContext()->urDdiTable.Global.pfnAdapterRelease(Adapter); + } } /// The memory chunk allocated from the underlying allocator looks like this: @@ -615,6 +628,9 @@ SanitizerInterceptor::insertDevice(ur_device_handle_t Device, DI = std::make_shared(Device); + DI->IsSupportSharedSystemUSM = GetDeviceUSMCapability( + Device, UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT); + // Query alignment UR_CALL(getContext()->urDdiTable.Device.pfnGetInfo( Device, UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN, sizeof(DI->Alignment), @@ -683,8 +699,25 @@ ur_result_t SanitizerInterceptor::prepareLaunch( auto Program = GetProgram(Kernel); do { - // Set membuffer arguments auto KernelInfo = getKernelInfo(Kernel); + + // Validate pointer arguments + if (Options(logger).DetectKernelArguments) { + for (const auto &[ArgIndex, PtrPair] : KernelInfo->PointerArgs) { + auto Ptr = PtrPair.first; + if (Ptr == nullptr) { + continue; + } + if (auto ValidateResult = ValidateUSMPointer( + Context, DeviceInfo->Handle, (uptr)Ptr)) { + ReportInvalidKernelArgument(Kernel, ArgIndex, (uptr)Ptr, + ValidateResult, PtrPair.second); + exit(1); + } + } + } + + // Set membuffer arguments for (const auto &[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) { char *ArgPointer = nullptr; 
UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer)); diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp index 39c7705c99..1c87cdc8e1 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan_interceptor.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace ur_sanitizer_layer { @@ -41,21 +42,16 @@ struct DeviceInfo { uptr ShadowOffset = 0; uptr ShadowOffsetEnd = 0; + // Device features + bool IsSupportSharedSystemUSM = false; + ur_mutex Mutex; std::queue> Quarantine; size_t QuarantineSize = 0; - explicit DeviceInfo(ur_device_handle_t Device) : Handle(Device) { - [[maybe_unused]] auto Result = - getContext()->urDdiTable.Device.pfnRetain(Device); - assert(Result == UR_RESULT_SUCCESS); - } - - ~DeviceInfo() { - [[maybe_unused]] auto Result = - getContext()->urDdiTable.Device.pfnRelease(Handle); - assert(Result == UR_RESULT_SUCCESS); - } + // Device handles are special and alive in the whole process lifetime, + // so we needn't retain&release here. 
+ explicit DeviceInfo(ur_device_handle_t Device) : Handle(Device) {} ur_result_t allocShadowMemory(ur_context_handle_t Context); }; @@ -85,6 +81,8 @@ struct KernelInfo { ur_shared_mutex Mutex; std::atomic RefCount = 1; std::unordered_map> BufferArgs; + std::unordered_map> + PointerArgs; // Need preserve the order of local arguments std::map LocalArgs; @@ -201,6 +199,16 @@ class SanitizerInterceptor { ur_result_t eraseMemBuffer(ur_mem_handle_t MemHandle); std::shared_ptr getMemBuffer(ur_mem_handle_t MemHandle); + ur_result_t holdAdapter(ur_adapter_handle_t Adapter) { + std::scoped_lock Guard(m_AdaptersMutex); + if (m_Adapters.find(Adapter) != m_Adapters.end()) { + return UR_RESULT_SUCCESS; + } + UR_CALL(getContext()->urDdiTable.Global.pfnAdapterRetain(Adapter)); + m_Adapters.insert(Adapter); + return UR_RESULT_SUCCESS; + } + std::optional findAllocInfoByAddress(uptr Address); std::shared_ptr getContextInfo(ur_context_handle_t Context) { @@ -262,6 +270,9 @@ class SanitizerInterceptor { std::unique_ptr m_Quarantine; logger::Logger &logger; + + std::unordered_set m_Adapters; + ur_shared_mutex m_AdaptersMutex; }; } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_options.hpp b/source/loader/layers/sanitizer/asan_options.hpp index ab6ee0c26b..298639b73c 100644 --- a/source/loader/layers/sanitizer/asan_options.hpp +++ b/source/loader/layers/sanitizer/asan_options.hpp @@ -38,6 +38,7 @@ struct AsanOptions { uint32_t MaxQuarantineSizeMB = 0; bool DetectLocals = true; bool DetectPrivates = true; + bool DetectKernelArguments = true; private: AsanOptions(logger::Logger &logger) { @@ -93,10 +94,11 @@ struct AsanOptions { SetBoolOption("debug", Debug); SetBoolOption("detect_locals", DetectLocals); SetBoolOption("detect_privates", DetectPrivates); + SetBoolOption("detect_kernel_arguments", DetectKernelArguments); auto KV = OptionsEnvMap->find("quarantine_size_mb"); if (KV != OptionsEnvMap->end()) { - auto Value = KV->second.front(); + const auto 
&Value = KV->second.front(); try { auto temp_long = std::stoul(Value); if (temp_long > UINT32_MAX) { @@ -112,7 +114,7 @@ struct AsanOptions { KV = OptionsEnvMap->find("redzone"); if (KV != OptionsEnvMap->end()) { - auto Value = KV->second.front(); + const auto &Value = KV->second.front(); try { MinRZSize = std::stoul(Value); if (MinRZSize < 16) { @@ -127,7 +129,7 @@ struct AsanOptions { KV = OptionsEnvMap->find("max_redzone"); if (KV != OptionsEnvMap->end()) { - auto Value = KV->second.front(); + const auto &Value = KV->second.front(); try { MaxRZSize = std::stoul(Value); if (MaxRZSize > 2048) { diff --git a/source/loader/layers/sanitizer/asan_report.cpp b/source/loader/layers/sanitizer/asan_report.cpp index bdae3284b4..a92e93f979 100644 --- a/source/loader/layers/sanitizer/asan_report.cpp +++ b/source/loader/layers/sanitizer/asan_report.cpp @@ -11,16 +11,32 @@ */ #include "asan_report.hpp" -#include "asan_options.hpp" - #include "asan_allocator.hpp" #include "asan_interceptor.hpp" #include "asan_libdevice.hpp" +#include "asan_options.hpp" +#include "asan_validator.hpp" #include "ur_sanitizer_layer.hpp" #include "ur_sanitizer_utils.hpp" namespace ur_sanitizer_layer { +namespace { + +void PrintAllocateInfo(uptr Addr, const AllocInfo *AI) { + getContext()->logger.always("{} is located inside of {} region [{}, {})", + (void *)Addr, ToString(AI->Type), + (void *)AI->UserBegin, (void *)AI->UserEnd); + getContext()->logger.always("allocated here:"); + AI->AllocStack.print(); + if (AI->IsReleased) { + getContext()->logger.always("freed here:"); + AI->ReleaseStack.print(); + } +} + +} // namespace + void ReportBadFree(uptr Addr, const StackTrace &stack, const std::shared_ptr &AI) { getContext()->logger.always( @@ -32,13 +48,9 @@ void ReportBadFree(uptr Addr, const StackTrace &stack, (void *)Addr); } - assert(!AI->IsReleased && "Chunk must be not released"); + assert(AI && !AI->IsReleased && "Chunk must be not released"); - getContext()->logger.always("{} is located inside 
of {} region [{}, {})", - (void *)Addr, ToString(AI->Type), - (void *)AI->UserBegin, (void *)AI->UserEnd); - getContext()->logger.always("allocated here:"); - AI->AllocStack.print(); + PrintAllocateInfo(Addr, AI.get()); } void ReportBadContext(uptr Addr, const StackTrace &stack, @@ -48,16 +60,7 @@ void ReportBadContext(uptr Addr, const StackTrace &stack, (void *)Addr); stack.print(); - getContext()->logger.always("{} is located inside of {} region [{}, {})", - (void *)Addr, ToString(AI->Type), - (void *)AI->UserBegin, (void *)AI->UserEnd); - getContext()->logger.always("allocated here:"); - AI->AllocStack.print(); - - if (AI->IsReleased) { - getContext()->logger.always("freed here:"); - AI->ReleaseStack.print(); - } + PrintAllocateInfo(Addr, AI.get()); } void ReportDoubleFree(uptr Addr, const StackTrace &Stack, @@ -139,16 +142,10 @@ void ReportUseAfterFree(const DeviceSanitizerReport &Report, "Failed to find which chunck {} is allocated", (void *)Report.Address); } - assert(AllocInfo->IsReleased); + assert(AllocInfo->IsReleased && + "It must be released since it's use-after-free"); - getContext()->logger.always( - "{} is located inside of {} region [{}, {})", - (void *)Report.Address, ToString(AllocInfo->Type), - (void *)AllocInfo->UserBegin, (void *)AllocInfo->UserEnd); - getContext()->logger.always("allocated here:"); - AllocInfo->AllocStack.print(); - getContext()->logger.always("released here:"); - AllocInfo->ReleaseStack.print(); + PrintAllocateInfo(Report.Address, AllocInfo.get()); } } else { getContext()->logger.always( @@ -157,4 +154,47 @@ void ReportUseAfterFree(const DeviceSanitizerReport &Report, } } +void ReportInvalidKernelArgument(ur_kernel_handle_t Kernel, uint32_t ArgIndex, + uptr Addr, const ValidateUSMResult &VR, + StackTrace Stack) { + getContext()->logger.always("\n====ERROR: DeviceSanitizer: " + "invalid-argument on kernel <{}>", + DemangleName(GetKernelName(Kernel))); + Stack.print(); + auto &AI = VR.AI; + switch (VR.Type) { + case 
ValidateUSMResult::MAYBE_HOST_POINTER: + getContext()->logger.always("The {}th argument {} is not a USM pointer", + ArgIndex + 1, (void *)Addr); + break; + case ValidateUSMResult::RELEASED_POINTER: + getContext()->logger.always( + "The {}th argument {} is a released USM pointer", ArgIndex, + (void *)Addr); + PrintAllocateInfo(Addr, AI.get()); + break; + case ValidateUSMResult::BAD_CONTEXT: + getContext()->logger.always( + "The {}th argument {} is allocated in other context", ArgIndex, + (void *)Addr); + PrintAllocateInfo(Addr, AI.get()); + break; + case ValidateUSMResult::BAD_DEVICE: + getContext()->logger.always( + "The {}th argument {} is allocated in other device", ArgIndex, + (void *)Addr); + PrintAllocateInfo(Addr, AI.get()); + break; + case ValidateUSMResult::OUT_OF_BOUNDS: + getContext()->logger.always( + "The {}th argument {} is located outside of its region [{}, {})", + ArgIndex, (void *)Addr, (void *)AI->UserBegin, (void *)AI->UserEnd); + getContext()->logger.always("allocated here:"); + AI->AllocStack.print(); + break; + default: + break; + } +} + } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_report.hpp b/source/loader/layers/sanitizer/asan_report.hpp index 77a182b0e6..0dd8f346d0 100644 --- a/source/loader/layers/sanitizer/asan_report.hpp +++ b/source/loader/layers/sanitizer/asan_report.hpp @@ -21,6 +21,7 @@ namespace ur_sanitizer_layer { struct DeviceSanitizerReport; struct AllocInfo; struct StackTrace; +struct ValidateUSMResult; void ReportBadFree(uptr Addr, const StackTrace &stack, const std::shared_ptr &AllocInfo); @@ -40,4 +41,8 @@ void ReportGenericError(const DeviceSanitizerReport &Report, void ReportUseAfterFree(const DeviceSanitizerReport &Report, ur_kernel_handle_t Kernel, ur_context_handle_t Context); +void ReportInvalidKernelArgument(ur_kernel_handle_t Kernel, uint32_t ArgIndex, + uptr Addr, const ValidateUSMResult &VR, + StackTrace Stack); + } // namespace ur_sanitizer_layer diff --git 
a/source/loader/layers/sanitizer/asan_validator.cpp b/source/loader/layers/sanitizer/asan_validator.cpp new file mode 100644 index 0000000000..a9f2bd2b17 --- /dev/null +++ b/source/loader/layers/sanitizer/asan_validator.cpp @@ -0,0 +1,77 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan_validator.cpp + * + */ + +#include "asan_validator.hpp" +#include "asan_interceptor.hpp" +#include "ur_sanitizer_utils.hpp" + +namespace ur_sanitizer_layer { + +namespace { + +bool IsSameDevice(ur_device_handle_t Device1, ur_device_handle_t Device2) { + if (Device1 == Device2) { + return true; + } + auto RootDevice1 = GetParentDevice(Device1); + RootDevice1 = RootDevice1 ? RootDevice1 : Device1; + auto RootDevice2 = GetParentDevice(Device2); + RootDevice2 = RootDevice2 ? RootDevice2 : Device2; + if (RootDevice1 == RootDevice2) { + return true; + } + return false; +} + +} // namespace + +ValidateUSMResult ValidateUSMPointer(ur_context_handle_t Context, + ur_device_handle_t Device, uptr Ptr) { + assert(Ptr != 0 && "Don't validate nullptr here"); + + auto AllocInfoItOp = getContext()->interceptor->findAllocInfoByAddress(Ptr); + if (!AllocInfoItOp) { + auto DI = getContext()->interceptor->getDeviceInfo(Device); + bool IsSupportSharedSystemUSM = DI->IsSupportSharedSystemUSM; + if (IsSupportSharedSystemUSM) { + // maybe it's host pointer + return ValidateUSMResult::success(); + } + return ValidateUSMResult::fail(ValidateUSMResult::MAYBE_HOST_POINTER); + } + + auto AllocInfo = AllocInfoItOp.value()->second; + + if (AllocInfo->Context != Context) { + return ValidateUSMResult::fail(ValidateUSMResult::BAD_CONTEXT, + AllocInfo); + } + + if (AllocInfo->Device && !IsSameDevice(AllocInfo->Device, Device)) { + return ValidateUSMResult::fail(ValidateUSMResult::BAD_DEVICE, + AllocInfo); + } + + if 
(AllocInfo->IsReleased) { + return ValidateUSMResult::fail(ValidateUSMResult::RELEASED_POINTER, + AllocInfo); + } + + if (Ptr < AllocInfo->UserBegin || Ptr >= AllocInfo->UserEnd) { + return ValidateUSMResult::fail(ValidateUSMResult::OUT_OF_BOUNDS, + AllocInfo); + } + + return ValidateUSMResult::success(); +} + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_validator.hpp b/source/loader/layers/sanitizer/asan_validator.hpp new file mode 100644 index 0000000000..52db966562 --- /dev/null +++ b/source/loader/layers/sanitizer/asan_validator.hpp @@ -0,0 +1,50 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan_validator.hpp + * + */ +#pragma once + +#include "asan_allocator.hpp" + +namespace ur_sanitizer_layer { + +struct ValidateUSMResult { + enum ErrorType { + SUCCESS, + NULL_POINTER, + MAYBE_HOST_POINTER, + RELEASED_POINTER, + BAD_CONTEXT, + BAD_DEVICE, + OUT_OF_BOUNDS + }; + ErrorType Type; + std::shared_ptr AI; + + operator bool() { return Type != SUCCESS; } + + static ValidateUSMResult success() { return {SUCCESS, nullptr}; } + + static ValidateUSMResult fail(ErrorType Type, + const std::shared_ptr &AI) { + assert(Type != SUCCESS && "The error type shouldn't be SUCCESS"); + return {Type, AI}; + } + + static ValidateUSMResult fail(ErrorType Type) { + assert(Type != SUCCESS && "The error type shouldn't be SUCCESS"); + return {Type, nullptr}; + } +}; + +ValidateUSMResult ValidateUSMPointer(ur_context_handle_t Context, + ur_device_handle_t Device, uptr Ptr); + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/stacktrace.cpp b/source/loader/layers/sanitizer/stacktrace.cpp index cf28c8b091..e33fcf0416 100644 --- a/source/loader/layers/sanitizer/stacktrace.cpp +++ b/source/loader/layers/sanitizer/stacktrace.cpp @@ 
-99,7 +99,7 @@ void StackTrace::print() const { uptr Offset; ParseBacktraceInfo(BI, ModuleName, Offset); if (SymbolizeCode(ModuleName, Offset, Result)) { - SourceInfo SrcInfo = ParseSymbolizerOutput(Result); + SourceInfo SrcInfo = ParseSymbolizerOutput(std::move(Result)); if (SrcInfo.file != "??") { getContext()->logger.always(" #{} in {} {}:{}:{}", index, SrcInfo.function, SrcInfo.file, diff --git a/source/loader/layers/sanitizer/ur_sanddi.cpp b/source/loader/layers/sanitizer/ur_sanddi.cpp index 7fae0285b8..e5e963806b 100644 --- a/source/loader/layers/sanitizer/ur_sanddi.cpp +++ b/source/loader/layers/sanitizer/ur_sanddi.cpp @@ -11,9 +11,13 @@ */ #include "asan_interceptor.hpp" +#include "asan_options.hpp" +#include "stacktrace.hpp" #include "ur_sanitizer_layer.hpp" #include "ur_sanitizer_utils.hpp" +#include + namespace ur_sanitizer_layer { namespace { @@ -31,7 +35,11 @@ ur_result_t setupContext(ur_context_handle_t Context, uint32_t numDevices, getContext()->logger.error("Unsupport device"); return UR_RESULT_ERROR_INVALID_DEVICE; } - getContext()->logger.info("Add {} into context {}", ToString(DI->Type), + getContext()->logger.info( + "DeviceInfo {} (Type={}, IsSupportSharedSystemUSM={})", + (void *)DI->Handle, ToString(DI->Type), + DI->IsSupportSharedSystemUSM); + getContext()->logger.info("Add {} into context {}", (void *)DI->Handle, (void *)Context); if (!DI->ShadowOffset) { UR_CALL(DI->allocShadowMemory(Context)); @@ -44,6 +52,38 @@ ur_result_t setupContext(ur_context_handle_t Context, uint32_t numDevices, } // namespace +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urAdapterGet +__urdlllocal ur_result_t UR_APICALL urAdapterGet( + uint32_t + NumEntries, ///< [in] the number of adapters to be added to phAdapters. + ///< If phAdapters is not NULL, then NumEntries should be greater than + ///< zero, otherwise ::UR_RESULT_ERROR_INVALID_SIZE, + ///< will be returned. 
+ ur_adapter_handle_t * + phAdapters, ///< [out][optional][range(0, NumEntries)] array of handle of adapters. + ///< If NumEntries is less than the number of adapters available, then + ///< ::urAdapterGet shall only retrieve that number of platforms. + uint32_t * + pNumAdapters ///< [out][optional] returns the total number of adapters available. +) { + auto pfnAdapterGet = getContext()->urDdiTable.Global.pfnAdapterGet; + + if (nullptr == pfnAdapterGet) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_result_t result = pfnAdapterGet(NumEntries, phAdapters, pNumAdapters); + if (result == UR_RESULT_SUCCESS && phAdapters) { + const uint32_t NumAdapters = pNumAdapters ? *pNumAdapters : NumEntries; + for (uint32_t i = 0; i < NumAdapters; ++i) { + UR_CALL(getContext()->interceptor->holdAdapter(phAdapters[i])); + } + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urUSMHostAlloc __urdlllocal ur_result_t UR_APICALL urUSMHostAlloc( @@ -424,6 +464,19 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferCreate( std::shared_ptr pMemBuffer = std::make_shared(hContext, size, hostPtrOrNull); + + if (Host && (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) { + std::shared_ptr CtxInfo = + getContext()->interceptor->getContextInfo(hContext); + for (const auto &hDevice : CtxInfo->DeviceList) { + ManagedQueue InternalQueue(hContext, hDevice); + char *Handle = nullptr; + UR_CALL(pMemBuffer->getHandle(hDevice, Handle)); + UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + InternalQueue, true, Handle, Host, size, 0, nullptr, nullptr)); + } + } + ur_result_t result = getContext()->interceptor->insertMemBuffer(pMemBuffer); *phBuffer = ur_cast(pMemBuffer.get()); @@ -1283,6 +1336,69 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for 
urKernelSetArgPointer +__urdlllocal ur_result_t UR_APICALL urKernelSetArgPointer( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + uint32_t argIndex, ///< [in] argument index in range [0, num args - 1] + const ur_kernel_arg_pointer_properties_t + *pProperties, ///< [in][optional] pointer to USM pointer properties. + const void * + pArgValue ///< [in][optional] Pointer obtained by USM allocation or virtual memory + ///< mapping operation. If null then argument value is considered null. +) { + auto pfnSetArgPointer = getContext()->urDdiTable.Kernel.pfnSetArgPointer; + + if (nullptr == pfnSetArgPointer) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + getContext()->logger.debug( + "==== urKernelSetArgPointer (argIndex={}, pArgValue={})", argIndex, + pArgValue); + + if (Options(getContext()->logger).DetectKernelArguments) { + auto KI = getContext()->interceptor->getKernelInfo(hKernel); + std::scoped_lock Guard(KI->Mutex); + KI->PointerArgs[argIndex] = {pArgValue, GetCurrentBacktrace()}; + } + + ur_result_t result = + pfnSetArgPointer(hKernel, argIndex, pProperties, pArgValue); + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's Global table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +__urdlllocal ur_result_t UR_APICALL urGetGlobalProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_global_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_sanitizer_layer::getContext()->version) > + UR_MINOR_VERSION(version)) { + return 
UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnAdapterGet = ur_sanitizer_layer::urAdapterGet; + + return result; +} /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Context table /// with current process' addresses @@ -1379,6 +1495,7 @@ __urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgValue = ur_sanitizer_layer::urKernelSetArgValue; pDdiTable->pfnSetArgMemObj = ur_sanitizer_layer::urKernelSetArgMemObj; pDdiTable->pfnSetArgLocal = ur_sanitizer_layer::urKernelSetArgLocal; + pDdiTable->pfnSetArgPointer = ur_sanitizer_layer::urKernelSetArgPointer; return result; } @@ -1555,6 +1672,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, urDdiTable = *dditable; + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::urGetGlobalProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->Global); + } + if (UR_RESULT_SUCCESS == result) { result = ur_sanitizer_layer::urGetContextProcAddrTable( UR_API_VERSION_CURRENT, &dditable->Context); diff --git a/source/loader/layers/sanitizer/ur_sanitizer_utils.cpp b/source/loader/layers/sanitizer/ur_sanitizer_utils.cpp index 2dd98b945d..feaff8757a 100644 --- a/source/loader/layers/sanitizer/ur_sanitizer_utils.cpp +++ b/source/loader/layers/sanitizer/ur_sanitizer_utils.cpp @@ -152,6 +152,23 @@ DeviceType GetDeviceType(ur_context_handle_t Context, } } +ur_device_handle_t GetParentDevice(ur_device_handle_t Device) { + ur_device_handle_t ParentDevice{}; + [[maybe_unused]] auto Result = getContext()->urDdiTable.Device.pfnGetInfo( + Device, UR_DEVICE_INFO_PARENT_DEVICE, sizeof(ur_device_handle_t), + &ParentDevice, nullptr); + assert(Result == UR_RESULT_SUCCESS && "getParentDevice() failed"); + return ParentDevice; +} + +bool GetDeviceUSMCapability(ur_device_handle_t Device, + ur_device_info_t USMInfo) { + ur_device_usm_access_capability_flags_t Flag; + 
[[maybe_unused]] auto Result = getContext()->urDdiTable.Device.pfnGetInfo( + Device, USMInfo, sizeof(Flag), &Flag, nullptr); + return (bool)Flag; +} + std::vector GetProgramDevices(ur_program_handle_t Program) { size_t PropSize; [[maybe_unused]] ur_result_t Result = diff --git a/source/loader/layers/sanitizer/ur_sanitizer_utils.hpp b/source/loader/layers/sanitizer/ur_sanitizer_utils.hpp index 92cb4cebc4..44ddf46922 100644 --- a/source/loader/layers/sanitizer/ur_sanitizer_utils.hpp +++ b/source/loader/layers/sanitizer/ur_sanitizer_utils.hpp @@ -36,6 +36,9 @@ ur_context_handle_t GetContext(ur_kernel_handle_t Kernel); ur_device_handle_t GetDevice(ur_queue_handle_t Queue); DeviceType GetDeviceType(ur_context_handle_t Context, ur_device_handle_t Device); +ur_device_handle_t GetParentDevice(ur_device_handle_t Device); +bool GetDeviceUSMCapability(ur_device_handle_t Device, + ur_device_info_t Feature); std::string GetKernelName(ur_kernel_handle_t Kernel); size_t GetDeviceLocalMemorySize(ur_device_handle_t Device); ur_program_handle_t GetProgram(ur_kernel_handle_t Kernel); diff --git a/source/loader/layers/tracing/ur_tracing_layer.cpp b/source/loader/layers/tracing/ur_tracing_layer.cpp index 88aff57526..722ee77faa 100644 --- a/source/loader/layers/tracing/ur_tracing_layer.cpp +++ b/source/loader/layers/tracing/ur_tracing_layer.cpp @@ -21,7 +21,7 @@ namespace ur_tracing_layer { context_t *getContext() { return context_t::get_direct(); } -constexpr auto CALL_STREAM_NAME = "ur"; +constexpr auto CALL_STREAM_NAME = "ur.call"; constexpr auto STREAM_VER_MAJOR = UR_MAJOR_VERSION(UR_API_VERSION_CURRENT); constexpr auto STREAM_VER_MINOR = UR_MINOR_VERSION(UR_API_VERSION_CURRENT); @@ -29,36 +29,19 @@ constexpr auto STREAM_VER_MINOR = UR_MINOR_VERSION(UR_API_VERSION_CURRENT); // Unfortunately this doesn't match the semantics of XPTI, which can be initialized // and finalized exactly once. To workaround this, XPTI is globally initialized on // first use and finalized in the destructor. 
-class XptiContext { - XptiContext() { - xptiFrameworkInitialize(); - inited = true; - } - - ~XptiContext() { - xptiFrameworkFinalize(); - inited = false; - } - - // Accessing this after destruction is technically UB, but if we get there, - // it means something is calling UR after it has been destroyed at program - // exit. - std::atomic_bool inited; - - public: - static bool running() { - static XptiContext context; - return context.inited; - } +struct XptiContextManager { + XptiContextManager() { xptiFrameworkInitialize(); } + ~XptiContextManager() { xptiFrameworkFinalize(); } }; +static std::shared_ptr xptiContextManagerGlobal = [] { + return std::make_shared(); +}(); static thread_local xpti_td *activeEvent; /////////////////////////////////////////////////////////////////////////////// context_t::context_t() : logger(logger::create_logger("tracing", true, true)) { - if (!XptiContext::running()) { - return; - } + this->xptiContextManager = xptiContextManagerGlobal; call_stream_id = xptiRegisterStream(CALL_STREAM_NAME); std::ostringstream streamv; @@ -69,20 +52,12 @@ context_t::context_t() : logger(logger::create_logger("tracing", true, true)) { void context_t::notify(uint16_t trace_type, uint32_t id, const char *name, void *args, ur_result_t *resultp, uint64_t instance) { - if (!XptiContext::running()) { - return; - } - xpti::function_with_args_t payload{id, name, args, resultp, nullptr}; xptiNotifySubscribers(call_stream_id, trace_type, nullptr, activeEvent, instance, &payload); } uint64_t context_t::notify_begin(uint32_t id, const char *name, void *args) { - if (!XptiContext::running()) { - return 0; - } - if (auto loc = codelocData.get_codeloc()) { xpti::payload_t payload = xpti::payload_t(loc->functionName, loc->sourceFile, loc->lineNumber, @@ -101,20 +76,10 @@ uint64_t context_t::notify_begin(uint32_t id, const char *name, void *args) { void context_t::notify_end(uint32_t id, const char *name, void *args, ur_result_t *resultp, uint64_t instance) { - if 
(!XptiContext::running()) { - return; - } - notify((uint16_t)xpti::trace_point_type_t::function_with_args_end, id, name, args, resultp, instance); } /////////////////////////////////////////////////////////////////////////////// -context_t::~context_t() { - if (!XptiContext::running()) { - return; - } - - xptiFinalize(CALL_STREAM_NAME); -} +context_t::~context_t() { xptiFinalize(CALL_STREAM_NAME); } } // namespace ur_tracing_layer diff --git a/source/loader/layers/tracing/ur_tracing_layer.hpp b/source/loader/layers/tracing/ur_tracing_layer.hpp index 1a5c542ee6..b7e3fc0314 100644 --- a/source/loader/layers/tracing/ur_tracing_layer.hpp +++ b/source/loader/layers/tracing/ur_tracing_layer.hpp @@ -21,6 +21,8 @@ #define TRACING_COMP_NAME "tracing layer" namespace ur_tracing_layer { +struct XptiContextManager; + /////////////////////////////////////////////////////////////////////////////// class __urdlllocal context_t : public proxy_layer_context_t, public AtomicSingleton { @@ -47,6 +49,8 @@ class __urdlllocal context_t : public proxy_layer_context_t, uint8_t call_stream_id; inline static const std::string name = "UR_LAYER_TRACING"; + + std::shared_ptr xptiContextManager; }; context_t *getContext(); diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 315be97531..a368ae7b1a 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -705,7 +705,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. 
ur_device_handle_t @@ -719,14 +720,14 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( } ur_device_create_with_native_handle_params_t params = { - &hNativeDevice, &hPlatform, &pProperties, &phDevice}; + &hNativeDevice, &hAdapter, &pProperties, &phDevice}; uint64_t instance = getContext()->notify_begin(UR_FUNCTION_DEVICE_CREATE_WITH_NATIVE_HANDLE, "urDeviceCreateWithNativeHandle", &params); getContext()->logger.info("---> urDeviceCreateWithNativeHandle"); - ur_result_t result = pfnCreateWithNativeHandle(hNativeDevice, hPlatform, + ur_result_t result = pfnCreateWithNativeHandle(hNativeDevice, hAdapter, pProperties, phDevice); getContext()->notify_end(UR_FUNCTION_DEVICE_CREATE_WITH_NATIVE_HANDLE, @@ -6048,6 +6049,51 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urBindlessImagesMapExternalLinearMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory +) { + auto pfnMapExternalLinearMemoryExp = + getContext() + ->urDdiTable.BindlessImagesExp.pfnMapExternalLinearMemoryExp; + + if (nullptr == pfnMapExternalLinearMemoryExp) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_bindless_images_map_external_linear_memory_exp_params_t params = { + &hContext, &hDevice, &offset, &size, &hExternalMem, &ppRetMem}; + uint64_t instance = getContext()->notify_begin( + UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP, + 
"urBindlessImagesMapExternalLinearMemoryExp", &params); + + getContext()->logger.info( + "---> urBindlessImagesMapExternalLinearMemoryExp"); + + ur_result_t result = pfnMapExternalLinearMemoryExp( + hContext, hDevice, offset, size, hExternalMem, ppRetMem); + + getContext()->notify_end( + UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP, + "urBindlessImagesMapExternalLinearMemoryExp", &params, &result, + instance); + + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP, + &params); + getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urBindlessImagesReleaseExternalMemoryExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( @@ -8126,6 +8172,11 @@ __urdlllocal ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnMapExternalArrayExp = ur_tracing_layer::urBindlessImagesMapExternalArrayExp; + dditable.pfnMapExternalLinearMemoryExp = + pDdiTable->pfnMapExternalLinearMemoryExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + ur_tracing_layer::urBindlessImagesMapExternalLinearMemoryExp; + dditable.pfnReleaseExternalMemoryExp = pDdiTable->pfnReleaseExternalMemoryExp; pDdiTable->pfnReleaseExternalMemoryExp = diff --git a/source/loader/layers/validation/ur_leak_check.hpp b/source/loader/layers/validation/ur_leak_check.hpp index 56998797a3..7ce5415d96 100644 --- a/source/loader/layers/validation/ur_leak_check.hpp +++ b/source/loader/layers/validation/ur_leak_check.hpp @@ -109,7 +109,7 @@ struct RefCountContext { // No more active adapters, so any references still held are leaked if (adapterCount == 0) { logInvalidReferences(); - clear(); + counts.clear(); } } @@ -133,9 +133,8 @@ struct RefCountContext { updateRefCount(handle, REFCOUNT_CREATE_OR_INCREASE, isAdapterHandle); } - void clear() { 
counts.clear(); } - template bool isReferenceValid(T handle) { + std::unique_lock lock(mutex); auto it = counts.find(static_cast(handle)); if (it == counts.end() || it->second.refCount < 1) { return false; diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 8539951293..8c178b1091 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -719,7 +719,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. ur_device_handle_t @@ -733,7 +734,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( } if (getContext()->enableParameterValidation) { - if (NULL == hPlatform) { + if (NULL == hAdapter) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } @@ -742,7 +743,12 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( } } - ur_result_t result = pfnCreateWithNativeHandle(hNativeDevice, hPlatform, + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hAdapter)) { + getContext()->refCountContext->logInvalidReference(hAdapter); + } + + ur_result_t result = pfnCreateWithNativeHandle(hNativeDevice, hAdapter, pProperties, phDevice); if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { @@ -4822,9 +4828,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferRead( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hBuffer, offset, size); - boundsError != UR_RESULT_SUCCESS) { - return 
boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, offset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -4902,9 +4910,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWrite( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hBuffer, offset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, offset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5033,9 +5043,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferReadRect( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hBuffer, bufferOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, bufferOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5168,9 +5180,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hBuffer, bufferOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, bufferOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5248,14 +5262,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopy( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hBufferSrc, srcOffset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError 
= bounds(hBufferSrc, srcOffset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = bounds(hBufferDst, dstOffset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBufferDst, dstOffset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5383,14 +5401,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hBufferSrc, srcOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBufferSrc, srcOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = bounds(hBufferDst, dstOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBufferDst, dstOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5492,9 +5514,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferFill( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hBuffer, offset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, offset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5579,9 +5603,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageRead( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = boundsImage(hImage, origin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto 
boundsError = boundsImage(hImage, origin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5667,9 +5693,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageWrite( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = boundsImage(hImage, origin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = boundsImage(hImage, origin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5756,14 +5784,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageCopy( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = boundsImage(hImageSrc, srcOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = boundsImage(hImageSrc, srcOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = boundsImage(hImageDst, dstOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = boundsImage(hImageDst, dstOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5850,9 +5882,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferMap( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hBuffer, offset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, offset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6012,9 +6046,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill( return 
UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pMem, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pMem, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6089,14 +6125,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pDst, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pDst, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = bounds(hQueue, pSrc, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pSrc, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6169,9 +6209,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMPrefetch( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pMem, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pMem, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6230,9 +6272,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMAdvise( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hQueue, pMem, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pMem, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; 
+ } } } @@ -6332,9 +6376,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill2D( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pMem, 0, pitch * height); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pMem, 0, pitch * height); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6431,14 +6477,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pDst, 0, dstPitch * height); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pDst, 0, dstPitch * height); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = bounds(hQueue, pSrc, 0, srcPitch * height); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pSrc, 0, srcPitch * height); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -7549,6 +7599,59 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urBindlessImagesMapExternalLinearMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the 
external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory +) { + auto pfnMapExternalLinearMemoryExp = + getContext() + ->urDdiTable.BindlessImagesExp.pfnMapExternalLinearMemoryExp; + + if (nullptr == pfnMapExternalLinearMemoryExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == hContext) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == hDevice) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == hExternalMem) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == ppRetMem) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hContext)) { + getContext()->refCountContext->logInvalidReference(hContext); + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hDevice)) { + getContext()->refCountContext->logInvalidReference(hDevice); + } + + ur_result_t result = pfnMapExternalLinearMemoryExp( + hContext, hDevice, offset, size, hExternalMem, ppRetMem); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urBindlessImagesReleaseExternalMemoryExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( @@ -9751,6 +9854,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnMapExternalArrayExp = ur_validation_layer::urBindlessImagesMapExternalArrayExp; + dditable.pfnMapExternalLinearMemoryExp = + pDdiTable->pfnMapExternalLinearMemoryExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + ur_validation_layer::urBindlessImagesMapExternalLinearMemoryExp; + dditable.pfnReleaseExternalMemoryExp = pDdiTable->pfnReleaseExternalMemoryExp; pDdiTable->pfnReleaseExternalMemoryExp = @@ -10939,9 +11047,13 @@ ur_result_t context_t::init(ur_dditable_t *dditable, 
if (enabledLayerNames.count(nameFullValidation)) { enableParameterValidation = true; + enableBoundsChecking = true; enableLeakChecking = true; enableLifetimeValidation = true; } else { + if (enabledLayerNames.count(nameBoundsChecking)) { + enableBoundsChecking = true; + } if (enabledLayerNames.count(nameParameterValidation)) { enableParameterValidation = true; } @@ -11069,13 +11181,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, } ur_result_t context_t::tearDown() { - ur_result_t result = UR_RESULT_SUCCESS; - if (enableLeakChecking) { getContext()->refCountContext->logInvalidReferences(); - getContext()->refCountContext->clear(); } - return result; + + return UR_RESULT_SUCCESS; } } // namespace ur_validation_layer diff --git a/source/loader/layers/validation/ur_validation_layer.hpp b/source/loader/layers/validation/ur_validation_layer.hpp index aa3d4629b7..642829f7f0 100644 --- a/source/loader/layers/validation/ur_validation_layer.hpp +++ b/source/loader/layers/validation/ur_validation_layer.hpp @@ -24,6 +24,7 @@ class __urdlllocal context_t : public proxy_layer_context_t, public AtomicSingleton { public: bool enableParameterValidation = false; + bool enableBoundsChecking = false; bool enableLeakChecking = false; bool enableLifetimeValidation = false; logger::Logger logger; @@ -35,7 +36,7 @@ class __urdlllocal context_t : public proxy_layer_context_t, static std::vector getNames() { return {nameFullValidation, nameParameterValidation, nameLeakChecking, - nameLifetimeValidation}; + nameBoundsChecking, nameLifetimeValidation}; } ur_result_t init(ur_dditable_t *dditable, const std::set &enabledLayerNames, @@ -49,6 +50,8 @@ class __urdlllocal context_t : public proxy_layer_context_t, "UR_LAYER_FULL_VALIDATION"; inline static const std::string nameParameterValidation = "UR_LAYER_PARAMETER_VALIDATION"; + inline static const std::string nameBoundsChecking = + "UR_LAYER_BOUNDS_CHECKING"; inline static const std::string nameLeakChecking = 
"UR_LAYER_LEAK_CHECKING"; inline static const std::string nameLifetimeValidation = "UR_LAYER_LIFETIME_VALIDATION"; diff --git a/source/loader/loader.def.in b/source/loader/loader.def.in index b94442c9a7..5e628b4faf 100644 --- a/source/loader/loader.def.in +++ b/source/loader/loader.def.in @@ -12,6 +12,7 @@ EXPORTS urBindlessImagesImportExternalMemoryExp urBindlessImagesImportExternalSemaphoreExp urBindlessImagesMapExternalArrayExp + urBindlessImagesMapExternalLinearMemoryExp urBindlessImagesMipmapFreeExp urBindlessImagesMipmapGetLevelExp urBindlessImagesReleaseExternalMemoryExp @@ -181,6 +182,7 @@ EXPORTS urPrintBindlessImagesImportExternalMemoryExpParams urPrintBindlessImagesImportExternalSemaphoreExpParams urPrintBindlessImagesMapExternalArrayExpParams + urPrintBindlessImagesMapExternalLinearMemoryExpParams urPrintBindlessImagesMipmapFreeExpParams urPrintBindlessImagesMipmapGetLevelExpParams urPrintBindlessImagesReleaseExternalMemoryExpParams diff --git a/source/loader/loader.map.in b/source/loader/loader.map.in index ab5a7c19ab..18e4018aee 100644 --- a/source/loader/loader.map.in +++ b/source/loader/loader.map.in @@ -12,6 +12,7 @@ urBindlessImagesImportExternalMemoryExp; urBindlessImagesImportExternalSemaphoreExp; urBindlessImagesMapExternalArrayExp; + urBindlessImagesMapExternalLinearMemoryExp; urBindlessImagesMipmapFreeExp; urBindlessImagesMipmapGetLevelExp; urBindlessImagesReleaseExternalMemoryExp; @@ -181,6 +182,7 @@ urPrintBindlessImagesImportExternalMemoryExpParams; urPrintBindlessImagesImportExternalSemaphoreExpParams; urPrintBindlessImagesMapExternalArrayExpParams; + urPrintBindlessImagesMapExternalLinearMemoryExpParams; urPrintBindlessImagesMipmapFreeExpParams; urPrintBindlessImagesMipmapGetLevelExpParams; urPrintBindlessImagesReleaseExternalMemoryExpParams; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 1a867fb57d..c1d023af55 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -764,7 +764,8 @@ 
__urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. ur_device_handle_t @@ -775,8 +776,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( [[maybe_unused]] auto context = getContext(); // extract platform's function pointer table - auto dditable = - reinterpret_cast(hPlatform)->dditable; + auto dditable = reinterpret_cast(hAdapter)->dditable; auto pfnCreateWithNativeHandle = dditable->ur.Device.pfnCreateWithNativeHandle; if (nullptr == pfnCreateWithNativeHandle) { @@ -784,10 +784,10 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( } // convert loader handle to platform handle - hPlatform = reinterpret_cast(hPlatform)->handle; + hAdapter = reinterpret_cast(hAdapter)->handle; // forward to device-platform - result = pfnCreateWithNativeHandle(hNativeDevice, hPlatform, pProperties, + result = pfnCreateWithNativeHandle(hNativeDevice, hAdapter, pProperties, phDevice); if (UR_RESULT_SUCCESS != result) { @@ -6641,6 +6641,46 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urBindlessImagesMapExternalLinearMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of 
memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory +) { + ur_result_t result = UR_RESULT_SUCCESS; + + [[maybe_unused]] auto context = getContext(); + + // extract platform's function pointer table + auto dditable = reinterpret_cast(hContext)->dditable; + auto pfnMapExternalLinearMemoryExp = + dditable->ur.BindlessImagesExp.pfnMapExternalLinearMemoryExp; + if (nullptr == pfnMapExternalLinearMemoryExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + // convert loader handle to platform handle + hContext = reinterpret_cast(hContext)->handle; + + // convert loader handle to platform handle + hDevice = reinterpret_cast(hDevice)->handle; + + // convert loader handle to platform handle + hExternalMem = + reinterpret_cast(hExternalMem)->handle; + + // forward to device-platform + result = pfnMapExternalLinearMemoryExp(hContext, hDevice, offset, size, + hExternalMem, ppRetMem); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urBindlessImagesReleaseExternalMemoryExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( @@ -8691,6 +8731,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( ur_loader::urBindlessImagesImportExternalMemoryExp; pDdiTable->pfnMapExternalArrayExp = ur_loader::urBindlessImagesMapExternalArrayExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + ur_loader::urBindlessImagesMapExternalLinearMemoryExp; pDdiTable->pfnReleaseExternalMemoryExp = ur_loader::urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = diff --git a/source/loader/ur_lib.cpp b/source/loader/ur_lib.cpp index 12b159b0e5..9aad7159c3 100644 --- a/source/loader/ur_lib.cpp +++ b/source/loader/ur_lib.cpp @@ -57,7 +57,8 @@ void context_t::initLayers() const { } void 
context_t::tearDownLayers() const { - for (auto &[layer, destroy] : layers) { + for (auto it = layers.rbegin(); it != layers.rend(); ++it) { + auto [layer, destroy] = *it; layer->tearDown(); destroy(); } diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index aadcb5cda1..5ab7c58803 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -1135,7 +1135,7 @@ ur_result_t UR_APICALL urDeviceGetNativeHandle( /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hPlatform` +/// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phDevice` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -1143,7 +1143,8 @@ ur_result_t UR_APICALL urDeviceGetNativeHandle( ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. ur_device_handle_t @@ -1155,7 +1156,7 @@ ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( return UR_RESULT_ERROR_UNINITIALIZED; } - return pfnCreateWithNativeHandle(hNativeDevice, hPlatform, pProperties, + return pfnCreateWithNativeHandle(hNativeDevice, hAdapter, pProperties, phDevice); } catch (...) 
{ return exceptionToResult(std::current_exception()); @@ -7111,6 +7112,48 @@ ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Map an external memory handle to a device memory region described by +/// void* +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// + `NULL == hDevice` +/// + `NULL == hExternalMem` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == ppRetMem` +/// - ::UR_RESULT_ERROR_INVALID_CONTEXT +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory + ) try { + auto pfnMapExternalLinearMemoryExp = + ur_lib::getContext() + ->urDdiTable.BindlessImagesExp.pfnMapExternalLinearMemoryExp; + if (nullptr == pfnMapExternalLinearMemoryExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnMapExternalLinearMemoryExp(hContext, hDevice, offset, size, + hExternalMem, ppRetMem); +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Release external memory /// diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index db31600e2c..f9d510e95d 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -1219,6 +1219,15 @@ ur_result_t urPrintBindlessImagesMapExternalArrayExpParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintBindlessImagesMapExternalLinearMemoryExpParams( + const struct ur_bindless_images_map_external_linear_memory_exp_params_t + *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintBindlessImagesReleaseExternalMemoryExpParams( const struct ur_bindless_images_release_external_memory_exp_params_t *params, diff --git a/source/ur/ur.cpp b/source/ur/ur.cpp index cff431069a..a9da40775e 100644 --- a/source/ur/ur.cpp +++ b/source/ur/ur.cpp @@ -14,10 +14,11 @@ // Controls tracing UR calls from within the UR itself. bool PrintTrace = [] { + const char *UrRet = std::getenv("SYCL_UR_TRACE"); const char *PiRet = std::getenv("SYCL_PI_TRACE"); - const char *Trace = PiRet ? PiRet : nullptr; + const char *Trace = UrRet ? UrRet : (PiRet ? PiRet : nullptr); const int TraceValue = Trace ? 
std::stoi(Trace) : 0; - if (TraceValue == -1 || TraceValue == 2) { // Means print all traces + if (TraceValue == -1 || TraceValue == 2) { return true; } return false; diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 0b3ee0b936..54bfdcda42 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -997,7 +997,7 @@ ur_result_t UR_APICALL urDeviceGetNativeHandle( /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hPlatform` +/// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phDevice` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -1005,7 +1005,8 @@ ur_result_t UR_APICALL urDeviceGetNativeHandle( ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. 
ur_device_handle_t @@ -6051,6 +6052,39 @@ ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Map an external memory handle to a device memory region described by +/// void* +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// + `NULL == hDevice` +/// + `NULL == hExternalMem` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == ppRetMem` +/// - ::UR_RESULT_ERROR_INVALID_CONTEXT +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory +) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Release external memory /// diff --git a/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp b/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp index 6eb502907b..e4ac022507 100644 --- a/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp +++ b/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp @@ -17,6 +17,6 @@ TEST_F(urCudaDeviceCreateWithNativeHandle, Success) { ur_native_handle_t nativeCuda = static_cast(cudaDevice); ur_device_handle_t urDevice; - 
ASSERT_SUCCESS(urDeviceCreateWithNativeHandle(nativeCuda, platform, nullptr, + ASSERT_SUCCESS(urDeviceCreateWithNativeHandle(nativeCuda, adapter, nullptr, &urDevice)); } diff --git a/test/adapters/level_zero/v2/command_list_cache_test.cpp b/test/adapters/level_zero/v2/command_list_cache_test.cpp index b8c7244352..31e40244fd 100644 --- a/test/adapters/level_zero/v2/command_list_cache_test.cpp +++ b/test/adapters/level_zero/v2/command_list_cache_test.cpp @@ -193,10 +193,10 @@ TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) { QueueProps.pNext = &IndexProps; } - ur_queue_handle_t Queue; - ASSERT_EQ( - urQueueCreate(context, device, &QueueProps, &Queue), - UR_RESULT_SUCCESS); + uur::raii::Queue Queue; + ASSERT_EQ(urQueueCreate(context, device, &QueueProps, + Queue.ptr()), + UR_RESULT_SUCCESS); Queues.emplace_back(Queue); } diff --git a/test/adapters/level_zero/v2/event_pool_test.cpp b/test/adapters/level_zero/v2/event_pool_test.cpp index b4f7e46f11..e2aa3dc121 100644 --- a/test/adapters/level_zero/v2/event_pool_test.cpp +++ b/test/adapters/level_zero/v2/event_pool_test.cpp @@ -139,7 +139,7 @@ TEST_P(EventPoolTest, Basic) { ur_event *first; ze_event_handle_t zeFirst; { - auto pool = cache->borrow(device->Id); + auto pool = cache->borrow(device->Id.value()); first = pool->allocate(); zeFirst = first->getZeEvent(); @@ -148,7 +148,7 @@ TEST_P(EventPoolTest, Basic) { ur_event *second; ze_event_handle_t zeSecond; { - auto pool = cache->borrow(device->Id); + auto pool = cache->borrow(device->Id.value()); second = pool->allocate(); zeSecond = second->getZeEvent(); @@ -165,7 +165,7 @@ TEST_P(EventPoolTest, Threaded) { for (int iters = 0; iters < 3; ++iters) { for (int th = 0; th < 10; ++th) { threads.emplace_back([&] { - auto pool = cache->borrow(device->Id); + auto pool = cache->borrow(device->Id.value()); std::vector events; for (int i = 0; i < 100; ++i) { events.push_back(pool->allocate()); @@ -183,7 +183,7 @@ TEST_P(EventPoolTest, Threaded) { } 
TEST_P(EventPoolTest, ProviderNormalUseMostFreePool) { - auto pool = cache->borrow(device->Id); + auto pool = cache->borrow(device->Id.value()); std::list events; for (int i = 0; i < 128; ++i) { events.push_back(pool->allocate()); diff --git a/test/conformance/device/urDeviceCreateWithNativeHandle.cpp b/test/conformance/device/urDeviceCreateWithNativeHandle.cpp index 071183aa9b..5d64b11e09 100644 --- a/test/conformance/device/urDeviceCreateWithNativeHandle.cpp +++ b/test/conformance/device/urDeviceCreateWithNativeHandle.cpp @@ -20,7 +20,7 @@ TEST_F(urDeviceCreateWithNativeHandleTest, Success) { // and perform some query on it to verify that it works. ur_device_handle_t dev = nullptr; UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urDeviceCreateWithNativeHandle( - native_handle, platform, nullptr, &dev)); + native_handle, adapter, nullptr, &dev)); ASSERT_NE(dev, nullptr); uint32_t dev_id = 0; @@ -41,7 +41,7 @@ TEST_F(urDeviceCreateWithNativeHandleTest, SuccessWithOwnedNativeHandle) { ur_device_native_properties_t props{ UR_STRUCTURE_TYPE_DEVICE_NATIVE_PROPERTIES, nullptr, true}; UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urDeviceCreateWithNativeHandle( - native_handle, platform, &props, &dev)); + native_handle, adapter, &props, &dev)); ASSERT_NE(dev, nullptr); uint32_t ref_count = 0; @@ -64,7 +64,7 @@ TEST_F(urDeviceCreateWithNativeHandleTest, SuccessWithUnOwnedNativeHandle) { ur_device_native_properties_t props{ UR_STRUCTURE_TYPE_DEVICE_NATIVE_PROPERTIES, nullptr, false}; UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urDeviceCreateWithNativeHandle( - native_handle, platform, &props, &dev)); + native_handle, adapter, &props, &dev)); ASSERT_NE(dev, nullptr); uint32_t ref_count = 0; @@ -93,7 +93,7 @@ TEST_F(urDeviceCreateWithNativeHandleTest, InvalidNullPointerDevice) { ASSERT_SUCCESS(urDeviceGetNativeHandle(device, &native_handle)); ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, - urDeviceCreateWithNativeHandle(native_handle, platform, + urDeviceCreateWithNativeHandle(native_handle, adapter, 
nullptr, nullptr)); } } diff --git a/test/conformance/exp_command_buffer/CMakeLists.txt b/test/conformance/exp_command_buffer/CMakeLists.txt index a8ecf793ab..a28d692d9b 100644 --- a/test/conformance/exp_command_buffer/CMakeLists.txt +++ b/test/conformance/exp_command_buffer/CMakeLists.txt @@ -12,4 +12,6 @@ add_conformance_test_with_kernels_environment(exp_command_buffer release.cpp retain.cpp invalid_update.cpp + commands.cpp + fill.cpp ) diff --git a/test/conformance/exp_command_buffer/commands.cpp b/test/conformance/exp_command_buffer/commands.cpp new file mode 100644 index 0000000000..412e4ab6de --- /dev/null +++ b/test/conformance/exp_command_buffer/commands.cpp @@ -0,0 +1,204 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" + +struct urCommandBufferCommandsTest + : uur::command_buffer::urCommandBufferExpTest { + + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferExpTest::SetUp()); + + // Allocate USM pointers + for (auto &device_ptr : device_ptrs) { + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + allocation_size, &device_ptr)); + ASSERT_NE(device_ptr, nullptr); + } + + for (auto &buffer : buffers) { + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, + allocation_size, nullptr, + &buffer)); + + ASSERT_NE(buffer, nullptr); + } + } + + void TearDown() override { + for (auto &device_ptr : device_ptrs) { + if (device_ptr) { + EXPECT_SUCCESS(urUSMFree(context, device_ptr)); + } + } + + for (auto &buffer : buffers) { + if (buffer) { + EXPECT_SUCCESS(urMemRelease(buffer)); + } + } + + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferExpTest::TearDown()); + } + + static constexpr unsigned elements = 16; + static constexpr size_t allocation_size = elements * sizeof(uint32_t); + + 
std::array device_ptrs = {nullptr, nullptr}; + std::array buffers = {nullptr, nullptr}; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCommandBufferCommandsTest); + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMMemcpyExp) { + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + cmd_buf_handle, device_ptrs[0], device_ptrs[1], allocation_size, 0, + nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMFillExp) { + uint32_t pattern = 42; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptrs[0], &pattern, sizeof(pattern), + allocation_size, 0, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferCopyExp) { + ASSERT_SUCCESS(urCommandBufferAppendMemBufferCopyExp( + cmd_buf_handle, buffers[0], buffers[1], 0, 0, allocation_size, 0, + nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferCopyRectExp) { + ur_rect_offset_t origin{0, 0, 0}; + ur_rect_region_t region{4, 4, 1}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferCopyRectExp( + cmd_buf_handle, buffers[0], buffers[1], origin, origin, region, 4, 16, + 4, 16, 0, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferReadExp) { + std::array host_data{}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferReadExp( + cmd_buf_handle, buffers[0], 0, allocation_size, host_data.data(), 0, + nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferReadRectExp) { + std::array host_data{}; + ur_rect_offset_t origin{0, 0, 0}; + ur_rect_region_t region{4, 4, 1}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferReadRectExp( + cmd_buf_handle, buffers[0], origin, origin, region, 4, 16, 4, 16, + host_data.data(), 0, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferWriteExp) { + std::array host_data{}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferWriteExp( + cmd_buf_handle, buffers[0], 
0, allocation_size, host_data.data(), 0, + nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, + urCommandBufferAppendMemBufferWriteRectExp) { + std::array host_data{}; + ur_rect_offset_t origin{0, 0, 0}; + ur_rect_region_t region{4, 4, 1}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferWriteRectExp( + cmd_buf_handle, buffers[0], origin, origin, region, 4, 16, 4, 16, + host_data.data(), 0, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferFillExp) { + uint32_t pattern = 42; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp( + cmd_buf_handle, buffers[0], &pattern, sizeof(pattern), 0, + allocation_size, 0, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMPrefetchExp) { + ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( + cmd_buf_handle, device_ptrs[0], allocation_size, 0, 0, nullptr, + nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMAdviseExp) { + ASSERT_SUCCESS(urCommandBufferAppendUSMAdviseExp( + cmd_buf_handle, device_ptrs[0], allocation_size, 0, 0, nullptr, + nullptr)); +} + +struct urCommandBufferAppendKernelLaunchExpTest + : uur::command_buffer::urCommandBufferExpExecutionTest { + virtual void SetUp() override { + program_name = "saxpy_usm"; + UUR_RETURN_ON_FATAL_FAILURE(urCommandBufferExpExecutionTest::SetUp()); + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + } + + int32_t *ptrX = static_cast(shared_ptrs[1]); + int32_t *ptrY = static_cast(shared_ptrs[2]); + for (size_t i = 0; i < global_size; i++) { + ptrX[i] = i; + ptrY[i] = i * 2; + } + + // Index 0 is output + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 0, nullptr, shared_ptrs[0])); + // Index 1 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, 1, sizeof(A), nullptr, &A)); + // Index 2 is X + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 2, 
nullptr, shared_ptrs[1])); + // Index 3 is Y + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 3, nullptr, shared_ptrs[2])); + } + + virtual void TearDown() override { + for (auto &shared_ptr : shared_ptrs) { + if (shared_ptr) { + EXPECT_SUCCESS(urUSMFree(context, shared_ptr)); + } + } + + UUR_RETURN_ON_FATAL_FAILURE( + urCommandBufferExpExecutionTest::TearDown()); + } + + static constexpr size_t local_size = 4; + static constexpr size_t global_size = 32; + static constexpr size_t global_offset = 0; + static constexpr size_t n_dimensions = 1; + static constexpr size_t allocation_size = sizeof(uint32_t) * global_size; + static constexpr uint32_t A = 42; + std::array shared_ptrs = {nullptr, nullptr, nullptr}; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCommandBufferAppendKernelLaunchExpTest); +TEST_P(urCommandBufferAppendKernelLaunchExpTest, Basic) { + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + cmd_buf_handle, kernel, n_dimensions, &global_offset, &global_size, + &local_size, 0, nullptr, nullptr, nullptr)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + int32_t *ptrZ = static_cast(shared_ptrs[0]); + for (size_t i = 0; i < global_size; i++) { + uint32_t result = (A * i) + (i * 2); + ASSERT_EQ(result, ptrZ[i]); + } +} diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match index e69de29bb2..8b13789179 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match @@ -0,0 +1 @@ + diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match index e69de29bb2..8b13789179 100644 --- 
a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match @@ -0,0 +1 @@ + diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero-v2.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero-v2.match index e69de29bb2..afb0fb95c5 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero-v2.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero-v2.match @@ -0,0 +1,15 @@ +urCommandBufferAppendKernelLaunchExpTest.Basic/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} 
+urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match index 0a5a2b1317..2508f92fed 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match @@ -25,3 +25,4 @@ {{OPT}}InvalidUpdateTest.GlobalLocalSizeMistach/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}InvalidUpdateTest.ImplToUserDefinedLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}InvalidUpdateTest.UserToImplDefinedLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urCommandBufferAppendKernelLaunchExpTest.Basic/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} diff --git a/test/conformance/exp_command_buffer/fill.cpp b/test/conformance/exp_command_buffer/fill.cpp new file mode 100644 index 0000000000..2b9a27cf2a --- /dev/null +++ b/test/conformance/exp_command_buffer/fill.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" + +struct testParametersFill { + size_t size; + size_t pattern_size; +}; + +struct urCommandBufferFillCommandsTest + : uur::command_buffer::urCommandBufferExpTestWithParam { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferExpTestWithParam< + testParametersFill>::SetUp()); + + size = std::get<1>(GetParam()).size; + pattern_size = std::get<1>(GetParam()).pattern_size; + pattern = std::vector(pattern_size); + uur::generateMemFillPattern(pattern); + + // Allocate USM pointers + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, size, + &device_ptr)); + ASSERT_NE(device_ptr, nullptr); + + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size, + nullptr, &buffer)); + + ASSERT_NE(buffer, nullptr); + } + + void TearDown() override { + if (device_ptr) { + EXPECT_SUCCESS(urUSMFree(context, device_ptr)); + } + + if (buffer) { + EXPECT_SUCCESS(urMemRelease(buffer)); + } + + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferExpTestWithParam< + testParametersFill>::TearDown()); + } + + void verifyData(std::vector &output, size_t verify_size) { + size_t pattern_index = 0; + for (size_t i = 0; i < verify_size; ++i) { + ASSERT_EQ(output[i], pattern[pattern_index]) + << "Result mismatch at index: " << i; + + ++pattern_index; + if (pattern_index % pattern_size == 0) { + pattern_index = 0; + } + } + } + + static constexpr unsigned elements = 16; + static constexpr size_t allocation_size = elements * sizeof(uint32_t); + + std::vector pattern; + size_t size; + size_t pattern_size; + + ur_exp_command_buffer_sync_point_t sync_point; + void *device_ptr = nullptr; + ur_mem_handle_t buffer = nullptr; +}; + +static std::vector test_cases{ + /* Everything set to 1 */ + {1, 1}, + /* pattern_size == size */ + {256, 256}, + /* pattern_size < size */ + {1024, 256}, + /* pattern sizes corresponding 
to some common scalar and vector types */ + {256, 4}, + {256, 8}, + {256, 16}, + {256, 32}}; + +template +static std::string +printFillTestString(const testing::TestParamInfo &info) { + const auto device_handle = std::get<0>(info.param); + const auto platform_device_name = + uur::GetPlatformAndDeviceName(device_handle); + std::stringstream test_name; + test_name << platform_device_name << "__size__" + << std::get<1>(info.param).size << "__patternSize__" + << std::get<1>(info.param).pattern_size; + return test_name.str(); +} + +UUR_TEST_SUITE_P(urCommandBufferFillCommandsTest, testing::ValuesIn(test_cases), + printFillTestString); + +TEST_P(urCommandBufferFillCommandsTest, Buffer) { + ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp( + cmd_buf_handle, buffer, pattern.data(), pattern_size, 0, size, 0, + nullptr, &sync_point)); + + std::vector output(size, 1); + ASSERT_SUCCESS(urCommandBufferAppendMemBufferReadExp( + cmd_buf_handle, buffer, 0, size, output.data(), 1, &sync_point, + nullptr)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + verifyData(output, size); +} + +TEST_P(urCommandBufferFillCommandsTest, USM) { + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptr, pattern.data(), pattern_size, size, 0, + nullptr, &sync_point)); + + std::vector output(size, 1); + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + cmd_buf_handle, output.data(), device_ptr, size, 1, &sync_point, + nullptr)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + verifyData(output, size); +} diff --git a/test/conformance/exp_command_buffer/fixtures.h b/test/conformance/exp_command_buffer/fixtures.h index eeb0a5d5d8..85457bea97 100644 --- 
a/test/conformance/exp_command_buffer/fixtures.h +++ b/test/conformance/exp_command_buffer/fixtures.h @@ -55,6 +55,46 @@ struct urCommandBufferExpTest : uur::urContextTest { ur_bool_t updatable_command_buffer_support = false; }; +template +struct urCommandBufferExpTestWithParam : urQueueTestWithParam { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTestWithParam::SetUp()); + + size_t returned_size; + ASSERT_SUCCESS(urDeviceGetInfo(this->device, UR_DEVICE_INFO_EXTENSIONS, + 0, nullptr, &returned_size)); + + std::unique_ptr returned_extensions(new char[returned_size]); + + ASSERT_SUCCESS(urDeviceGetInfo(this->device, UR_DEVICE_INFO_EXTENSIONS, + returned_size, returned_extensions.get(), + nullptr)); + + std::string_view extensions_string(returned_extensions.get()); + bool command_buffer_support = + extensions_string.find(UR_COMMAND_BUFFER_EXTENSION_STRING_EXP) != + std::string::npos; + + if (!command_buffer_support) { + GTEST_SKIP() << "EXP command-buffer feature is not supported."; + } + + // Create a command-buffer + ASSERT_SUCCESS(urCommandBufferCreateExp(this->context, this->device, + nullptr, &cmd_buf_handle)); + ASSERT_NE(cmd_buf_handle, nullptr); + } + + void TearDown() override { + if (cmd_buf_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); + } + UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTestWithParam::TearDown()); + } + + ur_exp_command_buffer_handle_t cmd_buf_handle = nullptr; +}; + struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::SetUp()); diff --git a/test/conformance/kernel/kernel_adapter_level_zero-v2.match b/test/conformance/kernel/kernel_adapter_level_zero-v2.match index 77d2096d92..9589496bcb 100644 --- a/test/conformance/kernel/kernel_adapter_level_zero-v2.match +++ b/test/conformance/kernel/kernel_adapter_level_zero-v2.match @@ -1,17 +1,10 @@ 
-urKernelGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_REGS -urKernelSetArgLocalTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgPointerTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgPointerTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgPointerTest.SuccessShared/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgSamplerTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgValueTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgValueTest.InvalidKernelArgumentSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoTest.SuccessIndirectAccess/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoUSMPointersTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoUSMPointersTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoUSMPointersTest.SuccessShared/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelGetSuggestedLocalWorkSizeTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelGetSuggestedLocalWorkSizeTest.Success2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelGetSuggestedLocalWorkSizeTest.Success3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelGetSuggestedLocalWorkSizeTest.Success3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ \ No newline at end of file diff --git 
a/test/conformance/kernel/kernel_adapter_level_zero.match b/test/conformance/kernel/kernel_adapter_level_zero.match index 7b74766ac2..c448f6363a 100644 --- a/test/conformance/kernel/kernel_adapter_level_zero.match +++ b/test/conformance/kernel/kernel_adapter_level_zero.match @@ -1,10 +1,3 @@ -urKernelGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_REGS -urKernelSetArgLocalTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgSamplerTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgValueTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgValueTest.InvalidKernelArgumentSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoTest.SuccessIndirectAccess/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoUSMPointersTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoUSMPointersTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/memory/urMemImageCreate.cpp b/test/conformance/memory/urMemImageCreate.cpp index ea210a921f..28d5d9c4e3 100644 --- a/test/conformance/memory/urMemImageCreate.cpp +++ b/test/conformance/memory/urMemImageCreate.cpp @@ -26,10 +26,10 @@ struct urMemImageCreateTest : public uur::urContextTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(uur::urContextTest::SetUp()); - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; auto ret = urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, - &image_desc, nullptr, &image_handle); + 
&image_desc, nullptr, image_handle.ptr()); if (ret == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { GTEST_SKIP() << "urMemImageCreate not supported"; @@ -50,10 +50,10 @@ struct urMemImageCreateTestWithParam UUR_RETURN_ON_FATAL_FAILURE( uur::urContextTestWithParam::SetUp()); - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; auto ret = urMemImageCreate(this->context, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc, nullptr, - &image_handle); + image_handle.ptr()); if (ret == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { GTEST_SKIP() << "urMemImageCreate not supported"; @@ -89,12 +89,11 @@ TEST_P(urMemImageCreateTestWith1DMemoryTypeParam, Success) { 0 ///< [in] number of samples }; - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc_with_param, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); ASSERT_NE(nullptr, image_handle); - ASSERT_SUCCESS(urMemRelease(image_handle)); } using urMemImageCreateTestWith2DMemoryTypeParam = @@ -120,12 +119,11 @@ TEST_P(urMemImageCreateTestWith2DMemoryTypeParam, Success) { 0 ///< [in] number of samples }; - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc_with_param, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); ASSERT_NE(nullptr, image_handle); - ASSERT_SUCCESS(urMemRelease(image_handle)); } TEST_P(urMemImageCreateTest, SuccessWith3DImageType) { @@ -143,28 +141,27 @@ TEST_P(urMemImageCreateTest, SuccessWith3DImageType) { 0 ///< [in] number of samples }; - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc_with_param, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); ASSERT_NE(nullptr, image_handle); - 
ASSERT_SUCCESS(urMemRelease(image_handle)); } TEST_P(urMemImageCreateTest, InvalidNullHandleContext) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urMemImageCreate(nullptr, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc, nullptr, - &image_handle)); + image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidEnumerationFlags) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_ENUMERATION, urMemImageCreate(context, UR_MEM_FLAG_FORCE_UINT32, &image_format, &image_desc, nullptr, - &image_handle)); + image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidNullPointerBuffer) { @@ -175,23 +172,24 @@ TEST_P(urMemImageCreateTest, InvalidNullPointerBuffer) { } TEST_P(urMemImageCreateTest, InvalidNullPointerImageDesc) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, nullptr, nullptr, - &image_handle)); + image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidNullPointerImageFormat) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, nullptr, - &image_desc, nullptr, &image_handle)); + &image_desc, nullptr, + image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidSize) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.width = std::numeric_limits::max(); @@ -199,7 +197,7 @@ TEST_P(urMemImageCreateTest, InvalidSize) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_SIZE, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, 
image_handle.ptr())); invalid_image_desc = image_desc; invalid_image_desc.height = std::numeric_limits<size_t>::max(); @@ -207,7 +205,7 @@ TEST_P(urMemImageCreateTest, InvalidSize) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_SIZE, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); invalid_image_desc = image_desc; invalid_image_desc.depth = std::numeric_limits<size_t>::max(); @@ -215,21 +213,21 @@ ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_SIZE, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescStype) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.stype = UR_STRUCTURE_TYPE_FORCE_UINT32; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescType) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.type = UR_MEM_TYPE_FORCE_UINT32; @@ -237,11 +235,11 @@ TEST_P(urMemImageCreateTest, InvalidImageDescType) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescNumMipLevel) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.numMipLevel = 1; /* Must be 0 */ @@ -249,11 +247,11 @@
TEST_P(urMemImageCreateTest, InvalidImageDescNumMipLevel) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescNumSamples) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.numSamples = 1; /* Must be 0 */ @@ -261,11 +259,11 @@ TEST_P(urMemImageCreateTest, InvalidImageDescNumSamples) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescRowPitch) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.rowPitch = 1; /* Must be 0 if pHost is NULL */ @@ -273,11 +271,11 @@ TEST_P(urMemImageCreateTest, InvalidImageDescRowPitch) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescSlicePitch) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.slicePitch = 1; /* Must be 0 if pHost is NULL */ @@ -285,7 +283,7 @@ TEST_P(urMemImageCreateTest, InvalidImageDescSlicePitch) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } using urMemImageCreateWithHostPtrFlagsTest = @@ -310,8 +308,9 @@ 
TEST_P(urMemImageCreateWithHostPtrFlagsTest, Success) { } TEST_P(urMemImageCreateWithHostPtrFlagsTest, InvalidHostPtr) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_HOST_PTR, urMemImageCreate(context, getParam(), &image_format, - &image_desc, nullptr, &image_handle)); + &image_desc, nullptr, + image_handle.ptr())); } diff --git a/test/conformance/program/program_adapter_level_zero-v2.match b/test/conformance/program/program_adapter_level_zero-v2.match index 05b71211b8..c25be22424 100644 --- a/test/conformance/program/program_adapter_level_zero-v2.match +++ b/test/conformance/program/program_adapter_level_zero-v2.match @@ -1,6 +1,9 @@ urProgramCreateWithNativeHandleTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urProgramCreateWithNativeHandleTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urProgramCreateWithNativeHandleTest.InvalidNullPointerProgram/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urProgramGetBuildInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROGRAM_BUILD_INFO_STATUS +urProgramGetBuildInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_UR_PROGRAM_BUILD_INFO_STATUS urProgramGetFunctionPointerTest.InvalidKernelName/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -Aborted +urProgramGetNativeHandleTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urProgramLinkErrorTest.LinkFailure/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urProgramLinkErrorTest.SetOutputOnLinkError/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +Segmentation fault \ No newline at end of file diff --git a/test/conformance/program/program_adapter_level_zero.match b/test/conformance/program/program_adapter_level_zero.match index 9e902dca94..f8d65b426e 100644 --- 
a/test/conformance/program/program_adapter_level_zero.match +++ b/test/conformance/program/program_adapter_level_zero.match @@ -3,4 +3,6 @@ urProgramCreateWithNativeHandleTest.InvalidNullHandleContext/Intel_R__oneAPI_Uni urProgramCreateWithNativeHandleTest.InvalidNullPointerProgram/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urProgramGetBuildInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_UR_PROGRAM_BUILD_INFO_STATUS urProgramGetFunctionPointerTest.InvalidKernelName/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -Aborted +urProgramGetNativeHandleTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urProgramLinkErrorTest.LinkFailure/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urProgramLinkErrorTest.SetOutputOnLinkError/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ \ No newline at end of file diff --git a/test/layers/CMakeLists.txt b/test/layers/CMakeLists.txt index 2c10a08518..fbf532c274 100644 --- a/test/layers/CMakeLists.txt +++ b/test/layers/CMakeLists.txt @@ -8,3 +8,7 @@ add_subdirectory(validation) if(UR_ENABLE_TRACING) add_subdirectory(tracing) endif() + +if(UR_ENABLE_SANITIZER) + add_subdirectory(sanitizer) +endif() diff --git a/test/layers/sanitizer/CMakeLists.txt b/test/layers/sanitizer/CMakeLists.txt new file mode 100644 index 0000000000..a9601a89c8 --- /dev/null +++ b/test/layers/sanitizer/CMakeLists.txt @@ -0,0 +1,37 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +set(UR_SANITIZER_TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(SAN_TEST_PREFIX sanitizer_test) + +function(add_sanitizer_test_executable name) + add_ur_executable(${SAN_TEST_PREFIX}-${name} + ${ARGN}) + target_link_libraries(${SAN_TEST_PREFIX}-${name} + PRIVATE + ${PROJECT_NAME}::loader + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::testing + ${PROJECT_NAME}::mock + GTest::gtest_main) +endfunction() + +function(set_sanitizer_test_properties name) + set_tests_properties(${name} PROPERTIES LABELS "sanitizer") + set_property(TEST ${name} PROPERTY ENVIRONMENT + "UR_LOG_SANITIZER=level:debug\;flush:debug\;output:stdout") +endfunction() + +function(add_sanitizer_test name) + add_sanitizer_test_executable(${name} ${ARGN}) + + add_test(NAME ${name} + COMMAND ${SAN_TEST_PREFIX}-${name} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + + set_sanitizer_test_properties(${name}) +endfunction() + +add_sanitizer_test(asan asan.cpp) diff --git a/test/layers/sanitizer/asan.cpp b/test/layers/sanitizer/asan.cpp new file mode 100644 index 0000000000..0fbfe4cefe --- /dev/null +++ b/test/layers/sanitizer/asan.cpp @@ -0,0 +1,58 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan.cpp + * + */ + +#include <gtest/gtest.h> +#include <ur_api.h> + +TEST(DeviceAsan, Initialization) { + ur_result_t status; + + ur_loader_config_handle_t loaderConfig; + status = urLoaderConfigCreate(&loaderConfig); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + status = urLoaderConfigEnableLayer(loaderConfig, "UR_LAYER_ASAN"); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urLoaderInit(0, loaderConfig); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + ur_adapter_handle_t adapter; + status = urAdapterGet(1, &adapter, nullptr); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + ur_platform_handle_t platform; + status = urPlatformGet(&adapter, 1, 1, &platform, nullptr); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + ur_device_handle_t device; + status = urDeviceGet(platform, UR_DEVICE_TYPE_DEFAULT, 1, &device, nullptr); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + ur_context_handle_t context; + status = urContextCreate(1, &device, nullptr, &context); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urContextRelease(context); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urDeviceRelease(device); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urAdapterRelease(adapter); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urLoaderTearDown(); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urLoaderConfigRelease(loaderConfig); + ASSERT_EQ(status, UR_RESULT_SUCCESS); +} diff --git a/test/layers/tracing/test_collector.cpp b/test/layers/tracing/test_collector.cpp index 6c942c63ec..2e412427a7 100644 --- a/test/layers/tracing/test_collector.cpp +++ b/test/layers/tracing/test_collector.cpp @@ -25,7 +25,7 @@ constexpr uint16_t TRACE_FN_BEGIN = static_cast<uint16_t>(xpti::trace_point_type_t::function_with_args_begin); constexpr uint16_t TRACE_FN_END = static_cast<uint16_t>(xpti::trace_point_type_t::function_with_args_end); -constexpr std::string_view UR_STREAM_NAME = "ur"; +constexpr std::string_view UR_STREAM_NAME =
"ur.call"; XPTI_CALLBACK_API void trace_cb(uint16_t trace_type, xpti::trace_event_data_t *, xpti::trace_event_data_t *child, uint64_t, diff --git a/test/layers/validation/fixtures.hpp b/test/layers/validation/fixtures.hpp index 9e261f0a1d..00b983138f 100644 --- a/test/layers/validation/fixtures.hpp +++ b/test/layers/validation/fixtures.hpp @@ -133,7 +133,7 @@ inline ur_result_t genericSuccessCallback(void *) { return UR_RESULT_SUCCESS; }; // This returns valid (non-null) handles that we can safely leak. inline ur_result_t fakeContext_urContextCreate(void *pParams) { static std::atomic_int handle = 42; - auto params = *static_cast<ur_context_create_params_t *>(pParams); + const auto &params = *static_cast<ur_context_create_params_t *>(pParams); // There are two casts because windows doesn't implicitly extend the 32 bit // result of atomic_int::operator++. **params.pphContext = diff --git a/test/layers/validation/leaks.cpp b/test/layers/validation/leaks.cpp index 59b6bdb750..cd4fc4b739 100644 --- a/test/layers/validation/leaks.cpp +++ b/test/layers/validation/leaks.cpp @@ -9,7 +9,7 @@ // We need a fake handle for the below adapter leak test.
inline ur_result_t fakeAdapter_urAdapterGet(void *pParams) { - auto params = *static_cast<ur_adapter_get_params_t *>(pParams); + const auto &params = *static_cast<ur_adapter_get_params_t *>(pParams); **params.pphAdapters = reinterpret_cast<ur_adapter_handle_t>(0x1); return UR_RESULT_SUCCESS; } diff --git a/test/loader/handles/fixtures.hpp b/test/loader/handles/fixtures.hpp index 8044c90414..441433d899 100644 --- a/test/loader/handles/fixtures.hpp +++ b/test/loader/handles/fixtures.hpp @@ -15,7 +15,7 @@ #endif ur_result_t replace_urPlatformGet(void *pParams) { - auto params = *static_cast<ur_platform_get_params_t *>(pParams); + const auto &params = *static_cast<ur_platform_get_params_t *>(pParams); if (*params.ppNumPlatforms) { **params.ppNumPlatforms = 1; @@ -29,7 +29,7 @@ ur_result_t replace_urPlatformGet(void *pParams) { } ur_result_t replace_urDeviceGetInfo(void *pParams) { - auto params = *static_cast<ur_device_get_info_params_t *>(pParams); + const auto &params = *static_cast<ur_device_get_info_params_t *>(pParams); if (*params.ppropName == UR_DEVICE_INFO_PLATFORM) { if (*params.ppPropSizeRet) { **params.ppPropSizeRet = sizeof(ur_platform_handle_t); diff --git a/tools/urtrace/collector.cpp b/tools/urtrace/collector.cpp index 766e7c9dfe..eb8c18d164 100644 --- a/tools/urtrace/collector.cpp +++ b/tools/urtrace/collector.cpp @@ -36,7 +36,7 @@ constexpr uint16_t TRACE_FN_BEGIN = static_cast<uint16_t>(xpti::trace_point_type_t::function_with_args_begin); constexpr uint16_t TRACE_FN_END = static_cast<uint16_t>(xpti::trace_point_type_t::function_with_args_end); -constexpr std::string_view UR_STREAM_NAME = "ur"; +constexpr std::string_view UR_STREAM_NAME = "ur.call"; static logger::Logger out = logger::create_logger("collector", true);