diff --git a/.github/docker/README.md b/.github/docker/README.md index 782dce372e..81adbc8f1a 100644 --- a/.github/docker/README.md +++ b/.github/docker/README.md @@ -8,10 +8,10 @@ development environment. # How to build docker image -To build docker image on local machine execute: +To build docker image on local machine, enter the root dir of the repository and execute: ```sh -docker build -t ur:ubuntu-22.04 -f ./ubuntu-22.04.Dockerfile . +docker build -t ur:ubuntu-22.04 -f .github/docker/ubuntu-22.04.Dockerfile . ``` To set any build time variable (e.g., an optional ARG from docker recipe), add to the command (after `build`), e.g.: diff --git a/.github/docker/fedora-40.Dockerfile b/.github/docker/fedora-40.Dockerfile new file mode 100644 index 0000000000..70f77345fa --- /dev/null +++ b/.github/docker/fedora-40.Dockerfile @@ -0,0 +1,82 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of fedora-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("40") +FROM registry.hub.docker.com/library/fedora@sha256:5ce8497aeea599bf6b54ab3979133923d82aaa4f6ca5ced1812611b197c79eb0 + +# Set environment variables +ENV OS fedora +ENV OS_VER 40 +ENV NOTTY 1 + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). 
+ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + cmake \ + git \ + make" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + ncurses-libs-6.4 \ + passwd \ + sudo \ + wget" + +# Update and install required packages +RUN dnf update -y \ + && dnf install -y \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && dnf clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +# +# It's actively used and tested only on selected distros. Be aware +# they may not work, because pip packages list differ from OS to OS. +COPY third_party/requirements.txt /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' +ENV USER test_user +ENV USERPASS pass +# Change shell to bash with safe pipe usage +SHELL [ "/bin/bash", "-o", "pipefail", "-c" ] +RUN useradd -m ${USER} \ + && echo "${USER}:${USERPASS}" | chpasswd \ + && gpasswd wheel -a ${USER} + +# Change shell back to default and switch to 'test_user' +SHELL ["/bin/sh", "-c"] +USER test_user diff --git a/.github/docker/install_dpcpp.sh b/.github/docker/install_dpcpp.sh index 0aac93eee4..aa5831c734 100755 --- a/.github/docker/install_dpcpp.sh +++ b/.github/docker/install_dpcpp.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
# See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -15,9 +15,6 @@ if [ "${SKIP_DPCPP_BUILD}" ]; then exit fi -apt-get install -y --no-install-recommends \ - libncurses5 - -mkdir -p ${DPCPP_PATH} -wget -O ${DPCPP_PATH}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/sycl-nightly%2F20230626/dpcpp-compiler.tar.gz -tar -xvf ${DPCPP_PATH}/dpcpp_compiler.tar.gz -C ${DPCPP_PATH}/ +mkdir -p ${DPCPP_PATH}/dpcpp_compiler +wget -O ${DPCPP_PATH}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz +tar -xvf ${DPCPP_PATH}/dpcpp_compiler.tar.gz -C ${DPCPP_PATH}/dpcpp_compiler diff --git a/.github/docker/opensuse-leap-15.Dockerfile b/.github/docker/opensuse-leap-15.Dockerfile new file mode 100644 index 0000000000..62a09b27ef --- /dev/null +++ b/.github/docker/opensuse-leap-15.Dockerfile @@ -0,0 +1,92 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of opensuse-leap-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("15") +FROM registry.hub.docker.com/opensuse/leap@sha256:1cf79e78bb69f39fb2f78a7c2c7ebc4b64cf8d82eb1df76cd36767a595ada7a8 + +# Set environment variables +ENV OS opensuse-leap +ENV OS_VER 15 +ENV NOTTY 1 + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). 
+ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + cmake \ + gcc \ + gcc-c++ \ + git \ + glibc-devel \ + libstdc++-devel \ + make" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-devel \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + gzip \ + libncurses5 \ + sudo \ + tar \ + wget" + +# add openSUSE Leap 15.5 Oss repo +RUN zypper ar -f https://download.opensuse.org/distribution/leap/15.5/repo/oss/ oss + +# Update and install required packages +RUN zypper update -y \ + && zypper install -y \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && zypper clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +# +# It's actively used and tested only on selected distros. Be aware +# they may not work, because pip packages list differ from OS to OS. +COPY third_party/requirements.txt /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' and switch to it +ENV USER test_user +ENV USERPASS pass +ENV PFILE ./password +RUN useradd -m ${USER} \ + && echo ${USERPASS} > ${PFILE} \ + && echo ${USERPASS} >> ${PFILE} \ + && passwd ${USER} < ${PFILE} \ + && rm -f ${PFILE} \ + && sed -i 's/# %wheel/%wheel/g' /etc/sudoers \ + && groupadd wheel \ + && gpasswd wheel -a ${USER} +USER test_user diff --git a/.github/docker/rockylinux-8.Dockerfile b/.github/docker/rockylinux-8.Dockerfile new file mode 100644 index 0000000000..7581cf5bd7 --- /dev/null +++ b/.github/docker/rockylinux-8.Dockerfile @@ -0,0 +1,93 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, 
under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of rockylinux-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("8.9") +FROM registry.hub.docker.com/library/rockylinux@sha256:9794037624aaa6212aeada1d28861ef5e0a935adaf93e4ef79837119f2a2d04c + +# Set environment variables +ENV OS rockylinux +ENV OS_VER 8 +ENV NOTTY 1 + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). +ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + cmake \ + git \ + glibc-devel \ + libstdc++-devel \ + make" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Packages required by requirements.txt +ARG PRE_PYTHON_DEPS="\ + libjpeg-turbo-devel \ + python3-devel \ + python3-wheel \ + zlib-devel" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + ncurses-libs-6.1 \ + passwd \ + sudo \ + wget" + +# Update and install required packages +RUN dnf update -y \ + && dnf --enablerepo devel install -y \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${PRE_PYTHON_DEPS} \ + ${MISC_DEPS} \ + && dnf clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +# +# It's actively used and tested only on selected distros. Be aware +# they may not work, because pip packages list differ from OS to OS. 
+COPY third_party/requirements.txt /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' +ENV USER test_user +ENV USERPASS pass +# Change shell to bash with safe pipe usage +SHELL [ "/bin/bash", "-o", "pipefail", "-c" ] +RUN useradd -m $USER \ + && echo "${USERPASS}" | passwd "${USER}" --stdin \ + && gpasswd wheel -a "${USER}" \ + && echo "%wheel ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers + +# Change shell back to default and switch to 'test_user' +SHELL ["/bin/sh", "-c"] +USER test_user diff --git a/.github/docker/rockylinux-9.Dockerfile b/.github/docker/rockylinux-9.Dockerfile new file mode 100644 index 0000000000..171e315cbe --- /dev/null +++ b/.github/docker/rockylinux-9.Dockerfile @@ -0,0 +1,85 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of rockylinux-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("9.3") +FROM registry.hub.docker.com/library/rockylinux@sha256:d7be1c094cc5845ee815d4632fe377514ee6ebcf8efaed6892889657e5ddaaa6 + +# Set environment variables +ENV OS rockylinux +ENV OS_VER 9 +ENV NOTTY 1 + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). 
+ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + cmake \ + git \ + glibc-devel \ + libstdc++-devel \ + make" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + ncurses-libs-6.2 \ + passwd \ + sudo \ + wget" + +# Update and install required packages +RUN dnf update -y \ + && dnf --enablerepo devel install -y \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && dnf clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +# +# It's actively used and tested only on selected distros. Be aware +# they may not work, because pip packages list differ from OS to OS. +COPY third_party/requirements.txt /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' +ENV USER test_user +ENV USERPASS pass +# Change shell to bash with safe pipe usage +SHELL [ "/bin/bash", "-o", "pipefail", "-c" ] +RUN useradd -m $USER \ + && echo "${USERPASS}" | passwd "${USER}" --stdin \ + && gpasswd wheel -a "${USER}" \ + && echo "%wheel ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers + +# Change shell back to default and switch to 'test_user' +SHELL ["/bin/sh", "-c"] +USER test_user diff --git a/.github/docker/ubuntu-20.04.Dockerfile b/.github/docker/ubuntu-20.04.Dockerfile new file mode 100644 index 0000000000..2560bb10b9 --- /dev/null +++ b/.github/docker/ubuntu-20.04.Dockerfile @@ -0,0 +1,75 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of ubuntu-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("20.04") +FROM registry.hub.docker.com/library/ubuntu@sha256:d86db849e59626d94f768c679aba441163c996caf7a3426f44924d0239ffe03f + +# Set environment variables +ENV OS ubuntu +ENV OS_VER 20.04 +ENV NOTTY 1 +ENV DEBIAN_FRONTEND noninteractive + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). +ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + build-essential \ + git" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + g++-7 \ + libncurses5 \ + sudo \ + wget \ + whois" + +# Update and install required packages +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean all + +# Install CMake from source (the version in apt is too old) +RUN wget https://cmake.org/files/v3.20/cmake-3.20.0-linux-x86_64.sh -O cmake.sh \ + && chmod +x cmake.sh \ + && ./cmake.sh --skip-license --prefix=/usr/local + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. 
dependencies (installed via pip) +COPY third_party/requirements.txt /opt/ur/requirements.txt +RUN pip3 install --no-cache-dir -r /opt/ur/requirements.txt + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' and switch to it +ENV USER test_user +ENV USERPASS pass +RUN useradd -m "${USER}" -g sudo -p "$(mkpasswd ${USERPASS})" +USER test_user diff --git a/.github/docker/ubuntu-22.04.Dockerfile b/.github/docker/ubuntu-22.04.Dockerfile index 55e63f2c03..d4b3a828fc 100644 --- a/.github/docker/ubuntu-22.04.Dockerfile +++ b/.github/docker/ubuntu-22.04.Dockerfile @@ -4,11 +4,12 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Dockerfile - image with all Unified Runtime dependencies. +# Dockerfile - a 'recipe' for Docker to build an image of ubuntu-based +# environment for building the Unified Runtime project. # -# Pull base image -FROM registry.hub.docker.com/library/ubuntu:22.04 +# Pull base image ("22.04") +FROM registry.hub.docker.com/library/ubuntu@sha256:0eb0f877e1c869a300c442c41120e778db7161419244ee5cbc6fa5f134e74736 # Set environment variables ENV OS ubuntu @@ -35,13 +36,10 @@ ARG UR_DEPS="\ python3-pip \ libhwloc-dev" -# Unified Runtime's dependencies (installed via pip) -ARG UR_PYTHON_DEPS="\ - clang-format==15.0.7" - # Miscellaneous for our builds/CI (optional) ARG MISC_DEPS="\ clang \ + libncurses5 \ sudo \ wget \ whois" @@ -55,18 +53,21 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean all -# pip package is pinned to a version, but it's probably improperly parsed here -# hadolint ignore=DL3013 -RUN pip3 install --no-cache-dir ${UR_PYTHON_DEPS} +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. 
dependencies (installed via pip) +COPY third_party/requirements.txt /opt/ur/requirements.txt +RUN pip3 install --no-cache-dir -r /opt/ur/requirements.txt # Install DPC++ -COPY install_dpcpp.sh /opt/install_dpcpp.sh +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh ENV DPCPP_PATH=/opt/dpcpp -RUN /opt/install_dpcpp.sh +RUN /opt/ur/install_dpcpp.sh # Install libbacktrace -COPY install_libbacktrace.sh /opt/install_libbacktrace.sh -RUN /opt/install_libbacktrace.sh +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh # Add a new (non-root) 'test_user' and switch to it ENV USER test_user diff --git a/.github/docker/ubuntu-24.04.Dockerfile b/.github/docker/ubuntu-24.04.Dockerfile new file mode 100644 index 0000000000..6d232e1296 --- /dev/null +++ b/.github/docker/ubuntu-24.04.Dockerfile @@ -0,0 +1,75 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of ubuntu-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("24.04") +FROM registry.hub.docker.com/library/ubuntu@sha256:340d9b015b194dc6e2a13938944e0d016e57b9679963fdeb9ce021daac430221 + +# Set environment variables +ENV OS ubuntu +ENV OS_VER 24.04 +ENV NOTTY 1 +ENV DEBIAN_FRONTEND noninteractive + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). 
+ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + build-essential \ + cmake \ + git" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + libncurses5 \ + sudo \ + wget \ + whois" + +# Update and install required packages +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +COPY third_party/requirements.txt /opt/ur/requirements.txt +RUN pip3 install --no-cache-dir -r /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' and switch to it +ENV USER test_user +ENV USERPASS pass +RUN useradd -m "${USER}" -g sudo -p "$(mkpasswd ${USERPASS})" +USER test_user diff --git a/.github/workflows/benchmarks_compute.yml b/.github/workflows/benchmarks_compute.yml index 619784b263..86fbb1ddc8 100644 --- a/.github/workflows/benchmarks_compute.yml +++ b/.github/workflows/benchmarks_compute.yml @@ -34,6 +34,16 @@ on: type: string required: false default: '' + sycl_repo: + description: 'Compiler repo' + type: string + required: true + default: 'intel/llvm' + sycl_commit: + description: 'Compiler commit' + type: string + required: false + default: '' permissions: contents: read @@ -41,8 +51,6 @@ permissions: jobs: e2e-build-hw: - # Run only on upstream; forks will not have the HW - # if: github.repository == 'oneapi-src/unified-runtime' name: Build SYCL, UR, run Compute Benchmarks strategy: 
matrix: @@ -105,12 +113,19 @@ jobs: - name: Checkout SYCL uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: - repository: intel/llvm + repository: ${{inputs.sycl_repo}} ref: refs/heads/sycl path: sycl-repo fetch-depth: 1 fetch-tags: false + - name: Fetch specific SYCL commit + if: inputs.sycl_commit != '' + working-directory: ./sycl-repo + run: | + git fetch --depth=1 origin ${{ inputs.sycl_commit }} + git checkout ${{ inputs.sycl_commit }} + - name: Set CUDA env vars if: matrix.adapter.str_name == 'cuda' run: | diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 3b5ef70e19..41d57bb591 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -36,8 +36,7 @@ jobs: - os: 'ubuntu-20.04' build_type: Release compiler: {c: gcc-7, cxx: g++-7} - - runs-on: ${{matrix.os}} + runs-on: ${{ (matrix.os == 'ubuntu-22.04' && github.repository_owner == 'oneapi-src') && 'intel-ubuntu-22.04' || matrix.os }} steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -50,6 +49,9 @@ jobs: - name: Install libhwloc run: .github/scripts/install_hwloc.sh + - name: Setup PATH + run: echo "$HOME/.local/bin" >> $GITHUB_PATH + - name: Install g++-7 if: matrix.compiler.cxx == 'g++-7' run: | diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 2ad96fb348..fdc5d0c0c0 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -12,7 +12,7 @@ permissions: jobs: analyze-ubuntu: name: Analyze on Ubuntu - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} permissions: security-events: write diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 0704038829..710aa659c8 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -23,7 +23,7 @@ concurrency: jobs: # Build job build: - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 
'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} steps: - name: Checkout uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -57,7 +57,7 @@ jobs: environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} needs: build steps: - name: Deploy to GitHub Pages diff --git a/.github/workflows/e2e_core.yml b/.github/workflows/e2e_core.yml index 00055638df..f606b4a8fa 100644 --- a/.github/workflows/e2e_core.yml +++ b/.github/workflows/e2e_core.yml @@ -54,7 +54,7 @@ permissions: jobs: changed-files: name: Check for changed files - runs-on: ubuntu-22.04 + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} outputs: any_changed: ${{ steps.get-changed.outputs.any_changed }} steps: @@ -66,6 +66,7 @@ jobs: files: | source/adapters/${{inputs.str_name}}/** source/loader/** + .github/workflows/e2e* e2e-build-hw: # We want to run the job only if there are changes in the specific adapter @@ -168,17 +169,24 @@ jobs: -DCMAKE_CXX_COMPILER="$(which clang++)" -DLLVM_LIT="${{github.workspace}}/sycl-repo/llvm/utils/lit/lit.py" - - name: Set test filters for L0 - if: matrix.adapter.name == 'L0' - run: | - echo "LIT_XFAIL_NOT=${{inputs.xfail_not}}" >> $GITHUB_ENV - echo "LIT_XFAIL=${{inputs.xfail}}" >> $GITHUB_ENV - echo "LIT_FILTER_OUT=${{inputs.filter_out}}" >> $GITHUB_ENV + - name: Set LIT_XFAIL_NOT + if: inputs.xfail_not != '' + run: echo "LIT_XFAIL_NOT=${{inputs.xfail_not}}" >> $GITHUB_ENV + + - name: Set LIT_XFAIL + if: inputs.xfail != '' + run: echo "LIT_XFAIL=${{inputs.xfail}}" >> $GITHUB_ENV + + - name: Set LIT_FILTER_OUT + if: inputs.filter_out != '' + run: echo "LIT_FILTER_OUT=${{inputs.filter_out}}" >> $GITHUB_ENV # TODO: remove once intel/llvm lit tests can properly recognize the GPU - name: Configure hardware platform feature for L0 if: matrix.adapter.name 
== 'L0' - run: sed -i '/import lit.llvm/i config.available_features.add("gpu-intel-pvc-1T")' build-e2e/lit.site.cfg.py + run: | + sed -i '/import lit.llvm/i config.available_features.add("gpu-intel-pvc-1T")' build-e2e/lit.site.cfg.py + sed -i '/import lit.llvm/i config.available_features.add("gpu-intel-pvc")' build-e2e/lit.site.cfg.py - name: Run e2e tests id: tests diff --git a/.github/workflows/e2e_level_zero.yml b/.github/workflows/e2e_level_zero.yml index 39f4a3082c..3fff36bb9c 100644 --- a/.github/workflows/e2e_level_zero.yml +++ b/.github/workflows/e2e_level_zero.yml @@ -21,9 +21,9 @@ jobs: config: "" unit: "gpu" # Failing tests - xfail: "Matrix/SG32/get_coord_int8_matB.cpp;Matrix/element_wise_all_ops_1d.cpp;Matrix/element_wise_all_ops_1d_cont.cpp;Matrix/element_wise_all_ops_scalar.cpp;Matrix/get_coord_int8_matB.cpp;Matrix/joint_matrix_rowmajorA_rowmajorB.cpp;Plugin/level_zero_barrier_optimization.cpp" + xfail: "DeviceCodeSplit/grf.cpp;ESIMD/grf.cpp;ESIMD/named_barriers/loop_extended.cpp;KernelAndProgram/target_register_alloc_mode.cpp;Matrix/SG32/get_coord_int8_matB.cpp;Matrix/get_coord_int8_matB.cpp;Matrix/joint_matrix_prefetch.cpp;Matrix/joint_matrix_rowmajorA_rowmajorB.cpp" # Flaky tests - filter_out: "UserDefinedReductions/user_defined_reductions.cpp" + filter_out: "" # These runners by default spawn upwards of 260 workers. 
# We also add a time out just in case some test hangs - extra_lit_flags: "--param gpu-intel-pvc=True -sv -j 100 --max-time 600" + extra_lit_flags: "--param gpu-intel-pvc=True --param gpu-intel-pvc-1T=True -sv -j 100 --max-time 600" diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index d0cb335d96..faf7060503 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -18,6 +18,6 @@ jobs: permissions: contents: read pull-requests: write - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} steps: - uses: actions/labeler@8558fd74291d67161a8a78ce36a881fa63b766a9 # v5.0.0 diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index f1436fc46a..f466cc693e 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -11,7 +11,7 @@ permissions: jobs: weekly-prerelease: - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} permissions: contents: write steps: diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 2efb04c86a..c2ef1d47e7 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -19,7 +19,7 @@ permissions: jobs: linux: name: Trivy - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} permissions: security-events: write diff --git a/README.md b/README.md index ae61b76b09..7ba72b43d3 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # Unified Runtime [![Build and test](https://github.com/oneapi-src/unified-runtime/actions/workflows/cmake.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/cmake.yml) -[![E2E 
Cuda](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_cuda.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_cuda.yml) -[![E2E OpenCL](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_opencl.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_opencl.yml) -[![E2E Level Zero](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_level_zero.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_level_zero.yml) -[![CodeQL](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml) [![Bandit](https://github.com/oneapi-src/unified-runtime/actions/workflows/bandit.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/bandit.yml) +[![CodeQL](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml) [![Coverity](https://scan.coverity.com/projects/28213/badge.svg)](https://scan.coverity.com/projects/oneapi-src-unified-runtime) +[![Nightly](https://github.com/oneapi-src/unified-runtime/actions/workflows/nightly.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/nightly.yml) [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/oneapi-src/unified-runtime/badge)](https://securityscorecards.dev/viewer/?uri=github.com/oneapi-src/unified-runtime) +[![Trivy](https://github.com/oneapi-src/unified-runtime/actions/workflows/trivy.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/trivy.yml) +[![Deploy documentation to Pages](https://github.com/oneapi-src/unified-runtime/actions/workflows/docs.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/docs.yml) diff --git a/examples/collector/collector.cpp 
b/examples/collector/collector.cpp index 910964e02c..cc9580bc4f 100644 --- a/examples/collector/collector.cpp +++ b/examples/collector/collector.cpp @@ -31,7 +31,7 @@ constexpr uint16_t TRACE_FN_BEGIN = static_cast(xpti::trace_point_type_t::function_with_args_begin); constexpr uint16_t TRACE_FN_END = static_cast(xpti::trace_point_type_t::function_with_args_end); -constexpr std::string_view UR_STREAM_NAME = "ur"; +constexpr std::string_view UR_STREAM_NAME = "ur.call"; /** * @brief Formats the function parameters and arguments for urAdapterGet diff --git a/include/ur_api.h b/include/ur_api.h index 8dd757afd2..5be733a429 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -227,6 +227,7 @@ typedef enum ur_function_t { UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP = 228, ///< Enumerator for ::urEnqueueNativeCommandExp UR_FUNCTION_LOADER_CONFIG_SET_MOCKING_ENABLED = 229, ///< Enumerator for ::urLoaderConfigSetMockingEnabled UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP = 230, ///< Enumerator for ::urBindlessImagesReleaseExternalMemoryExp + UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP = 231, ///< Enumerator for ::urBindlessImagesMapExternalLinearMemoryExp /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -2061,7 +2062,7 @@ typedef struct ur_device_native_properties_t { /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hPlatform` +/// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phDevice` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -2069,7 +2070,7 @@ typedef struct ur_device_native_properties_t { UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. 
- ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t *pProperties, ///< [in][optional] pointer to native device properties struct. ur_device_handle_t *phDevice ///< [out] pointer to the handle of the device object created. ); @@ -3796,7 +3797,7 @@ urUSMPoolGetInfo( #endif // Intel 'oneAPI' Unified Runtime APIs #if !defined(__GNUC__) -#pragma region virtual memory +#pragma region virtual_memory #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Virtual memory granularity info @@ -7353,7 +7354,7 @@ urEnqueueWriteHostPipe( #endif // Bindless Images Extension APIs #if !defined(__GNUC__) -#pragma region bindless images(experimental) +#pragma region bindless_images_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Handle of bindless image @@ -7962,6 +7963,36 @@ urBindlessImagesMapExternalArrayExp( ur_exp_image_mem_native_handle_t *phImageMem ///< [out] image memory handle to the externally allocated memory ); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Map an external memory handle to a device memory region described by +/// void* +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// + `NULL == hDevice` +/// + `NULL == hExternalMem` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == ppRetMem` +/// - ::UR_RESULT_ERROR_INVALID_CONTEXT +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +UR_APIEXPORT ur_result_t UR_APICALL 
+urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory +); + /////////////////////////////////////////////////////////////////////////////// /// @brief Release external memory /// @@ -8121,7 +8152,7 @@ urBindlessImagesSignalExternalSemaphoreExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for Command-Buffers #if !defined(__GNUC__) -#pragma region command buffer(experimental) +#pragma region command_buffer_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Command-buffer query information type @@ -8974,7 +9005,7 @@ urCommandBufferCommandGetInfoExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for Cooperative Kernels #if !defined(__GNUC__) -#pragma region cooperative kernels(experimental) +#pragma region cooperative_kernels_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// #ifndef UR_COOPERATIVE_KERNELS_EXTENSION_STRING_EXP @@ -9062,7 +9093,7 @@ urKernelSuggestMaxCooperativeGroupCountExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for enqueuing timestamp recordings #if !defined(__GNUC__) -#pragma region enqueue timestamp recording(experimental) +#pragma region enqueue_timestamp_recording_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a command for recording the device timestamp @@ -9104,7 +9135,7 @@ urEnqueueTimestampRecordingExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for (kernel) Launch Properties #if 
!defined(__GNUC__) -#pragma region launch properties(experimental) +#pragma region launch_properties_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// #ifndef UR_LAUNCH_PROPERTIES_EXTENSION_STRING_EXP @@ -9231,7 +9262,7 @@ urEnqueueKernelLaunchCustomExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for multi-device compile #if !defined(__GNUC__) -#pragma region multi device compile(experimental) +#pragma region multi_device_compile_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// #ifndef UR_MULTI_DEVICE_COMPILE_EXTENSION_STRING_EXP @@ -9361,7 +9392,7 @@ urProgramLinkExp( #endif // Intel 'oneAPI' USM Import/Release Extension APIs #if !defined(__GNUC__) -#pragma region usm import release(experimental) +#pragma region usm_import_release_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Import memory into USM @@ -9414,7 +9445,7 @@ urUSMReleaseExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for USM P2P #if !defined(__GNUC__) -#pragma region usm p2p(experimental) +#pragma region usm_p2p_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// #ifndef UR_USM_P2P_EXTENSION_STRING_EXP @@ -9570,7 +9601,7 @@ urUsmP2PPeerAccessGetInfoExp( #endif // Intel 'oneAPI' Unified Runtime Experimental API for enqueuing work through native APIs #if !defined(__GNUC__) -#pragma region native enqueue(experimental) +#pragma region native_enqueue_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Native enqueue properties @@ -11231,6 +11262,19 @@ typedef struct ur_bindless_images_map_external_array_exp_params_t { ur_exp_image_mem_native_handle_t **pphImageMem; } ur_bindless_images_map_external_array_exp_params_t; +/////////////////////////////////////////////////////////////////////////////// 
+/// @brief Function parameters for urBindlessImagesMapExternalLinearMemoryExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_bindless_images_map_external_linear_memory_exp_params_t { + ur_context_handle_t *phContext; + ur_device_handle_t *phDevice; + uint64_t *poffset; + uint64_t *psize; + ur_exp_external_mem_handle_t *phExternalMem; + void ***pppRetMem; +} ur_bindless_images_map_external_linear_memory_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urBindlessImagesReleaseExternalMemoryExp /// @details Each entry is a pointer to the parameter passed to the function; @@ -11928,7 +11972,7 @@ typedef struct ur_device_get_native_handle_params_t { /// allowing the callback the ability to modify the parameter's value typedef struct ur_device_create_with_native_handle_params_t { ur_native_handle_t *phNativeDevice; - ur_platform_handle_t *phPlatform; + ur_adapter_handle_t *phAdapter; const ur_device_native_properties_t **ppProperties; ur_device_handle_t **pphDevice; } ur_device_create_with_native_handle_params_t; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index ac47d7559f..13785a2d65 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1639,6 +1639,16 @@ typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesMapExternalArrayExp_t)( ur_exp_external_mem_handle_t, ur_exp_image_mem_native_handle_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urBindlessImagesMapExternalLinearMemoryExp +typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesMapExternalLinearMemoryExp_t)( + ur_context_handle_t, + ur_device_handle_t, + uint64_t, + uint64_t, + ur_exp_external_mem_handle_t, + void **); + /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for 
urBindlessImagesReleaseExternalMemoryExp typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesReleaseExternalMemoryExp_t)( @@ -1699,6 +1709,7 @@ typedef struct ur_bindless_images_exp_dditable_t { ur_pfnBindlessImagesMipmapFreeExp_t pfnMipmapFreeExp; ur_pfnBindlessImagesImportExternalMemoryExp_t pfnImportExternalMemoryExp; ur_pfnBindlessImagesMapExternalArrayExp_t pfnMapExternalArrayExp; + ur_pfnBindlessImagesMapExternalLinearMemoryExp_t pfnMapExternalLinearMemoryExp; ur_pfnBindlessImagesReleaseExternalMemoryExp_t pfnReleaseExternalMemoryExp; ur_pfnBindlessImagesImportExternalSemaphoreExp_t pfnImportExternalSemaphoreExp; ur_pfnBindlessImagesReleaseExternalSemaphoreExp_t pfnReleaseExternalSemaphoreExp; @@ -2362,7 +2373,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnDeviceGetNativeHandle_t)( /// @brief Function-pointer for urDeviceCreateWithNativeHandle typedef ur_result_t(UR_APICALL *ur_pfnDeviceCreateWithNativeHandle_t)( ur_native_handle_t, - ur_platform_handle_t, + ur_adapter_handle_t, const ur_device_native_properties_t *, ur_device_handle_t *); diff --git a/include/ur_print.h b/include/ur_print.h index a3a915827b..c70e661fb1 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -2146,6 +2146,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintBindlessImagesImportExternalMemoryExp /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintBindlessImagesMapExternalArrayExpParams(const struct ur_bindless_images_map_external_array_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_bindless_images_map_external_linear_memory_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintBindlessImagesMapExternalLinearMemoryExpParams(const struct ur_bindless_images_map_external_linear_memory_exp_params_t *params, char 
*buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_bindless_images_release_external_memory_exp_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index d8ac521bdc..9aeb5e3341 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -942,6 +942,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP: os << "UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP"; break; + case UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP: + os << "UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP"; + break; default: os << "unknown enumerator"; break; @@ -15190,6 +15193,48 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_bindless_images_map_external_linear_memory_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_bindless_images_map_external_linear_memory_exp_params_t *params) { + + os << ".hContext = "; + + ur::details::printPtr(os, + *(params->phContext)); + + os << ", "; + os << ".hDevice = "; + + ur::details::printPtr(os, + *(params->phDevice)); + + os << ", "; + os << ".offset = "; + + os << *(params->poffset); + + os << ", "; + os << ".size = "; + + os << *(params->psize); + + os << ", "; + os << ".hExternalMem = "; + + ur::details::printPtr(os, + *(params->phExternalMem)); + + os << ", "; + os << ".ppRetMem = "; + + ur::details::printPtr(os, + *(params->pppRetMem)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_bindless_images_release_external_memory_exp_params_t type /// @returns @@ 
-17312,10 +17357,10 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct *(params->phNativeDevice))); os << ", "; - os << ".hPlatform = "; + os << ".hAdapter = "; ur::details::printPtr(os, - *(params->phPlatform)); + *(params->phAdapter)); os << ", "; os << ".pProperties = "; @@ -17804,6 +17849,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_ARRAY_EXP: { os << (const struct ur_bindless_images_map_external_array_exp_params_t *)params; } break; + case UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP: { + os << (const struct ur_bindless_images_map_external_linear_memory_exp_params_t *)params; + } break; case UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP: { os << (const struct ur_bindless_images_release_external_memory_exp_params_t *)params; } break; diff --git a/scripts/benchmarks/benches/SobelFilter.py b/scripts/benchmarks/benches/SobelFilter.py index e976bfaee8..b28681c2ee 100644 --- a/scripts/benchmarks/benches/SobelFilter.py +++ b/scripts/benchmarks/benches/SobelFilter.py @@ -12,7 +12,10 @@ class SobelFilter(VelocityBase): def __init__(self, vb: VelocityBench): super().__init__("sobel_filter", "sobel_filter", vb) + + def download_deps(self): self.download_untar("sobel_filter", "https://github.com/oneapi-src/Velocity-Bench/raw/main/sobel_filter/res/sobel_filter_data.tgz?download=", "sobel_filter_data.tgz") + return def name(self): return "Velocity-Bench Sobel Filter" diff --git a/scripts/benchmarks/benches/api_overhead.py b/scripts/benchmarks/benches/api_overhead.py deleted file mode 100644 index d34f4c4ee8..0000000000 --- a/scripts/benchmarks/benches/api_overhead.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
-# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import os -import csv -import io -from utils.utils import run, git_clone -from .base import Benchmark -from .result import Result -from .options import options - -class APIOverheadSYCL(Benchmark): - def __init__(self, directory): - super().__init__(directory) - - def name(self): - return "api_overhead_benchmark_sycl, mean execution time per 10 kernels" - - def unit(self): - return "μs" - - def setup(self): - repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "0f758021dce9ba32341a503739b69db057433c59") - build_path = self.create_build_path('compute-benchmarks-build') - - configure_command = [ - "cmake", - f"-B {build_path}", - f"-S {repo_path}", - f"-DCMAKE_BUILD_TYPE=Release", - f"-DBUILD_SYCL=ON", - f"-DSYCL_COMPILER_ROOT={options.sycl}", - f"-DALLOW_WARNINGS=ON" - ] - run(configure_command, add_sycl=True) - - run(f"cmake --build {build_path} -j", add_sycl=True) - self.benchmark_bin = f"{build_path}/bin/api_overhead_benchmark_sycl" - - def run_internal(self, ioq, env_vars): - command = [ - f"{self.benchmark_bin}", - "--test=SubmitKernel", - f"--Ioq={ioq}", - "--DiscardEvents=0", - "--MeasureCompletion=0", - "--iterations=100000", - "--Profiling=0", - "--NumKernels=10", - "--KernelExecTime=1", - "--csv", - "--noHeaders" - ] - result = self.run_bench(command, env_vars) - (label, mean) = self.parse_output(result) - return Result(label=label, value=mean, command=command, env=env_vars, stdout=result) - - def run(self, env_vars) -> list[Result]: - results = [] - for ioq in [0, 1]: - results.append(self.run_internal(ioq, env_vars)) - - return results - - def parse_output(self, output): - csv_file = io.StringIO(output) - reader = csv.reader(csv_file) - next(reader, None) - data_row = next(reader, None) - if data_row is None: - raise ValueError("Benchmark output does not contain data.") - try: - label = data_row[0] - mean = 
float(data_row[1]) - return (label, mean) - except (ValueError, IndexError) as e: - raise ValueError(f"Error parsing output: {e}") - - def teardown(self): - return diff --git a/scripts/benchmarks/benches/base.py b/scripts/benchmarks/benches/base.py index 25b5d2619f..c7f263c253 100644 --- a/scripts/benchmarks/benches/base.py +++ b/scripts/benchmarks/benches/base.py @@ -20,16 +20,6 @@ def __init__(self, directory): def run_bench(self, command, env_vars): return run(command=command, env_vars=env_vars, add_sycl=True, cwd=options.benchmark_cwd).stdout.decode() - def create_build_path(self, name): - build_path = os.path.join(self.directory, name) - - if options.rebuild and Path(build_path).exists(): - shutil.rmtree(build_path) - - Path(build_path).mkdir(parents=True, exist_ok=True) - - return build_path - def create_data_path(self, name): data_path = os.path.join(self.directory, "data", name) @@ -58,10 +48,13 @@ def name(self): def unit(self): raise NotImplementedError() + def lower_is_better(self): + return True + def setup(self): raise NotImplementedError() - def run(self, env_vars): + def run(self, env_vars) -> Result: raise NotImplementedError() def teardown(self): diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py new file mode 100644 index 0000000000..19bc0b7fd0 --- /dev/null +++ b/scripts/benchmarks/benches/compute.py @@ -0,0 +1,212 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +import csv +import io +from utils.utils import run, git_clone, create_build_path +from .base import Benchmark +from .result import Result +from .options import options + +class ComputeBench: + def __init__(self, directory): + self.directory = directory + self.built = False + return + + def setup(self): + if self.built: + return + + repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "0f758021dce9ba32341a503739b69db057433c59") + build_path = create_build_path(self.directory, 'compute-benchmarks-build') + + configure_command = [ + "cmake", + f"-B {build_path}", + f"-S {repo_path}", + f"-DCMAKE_BUILD_TYPE=Release", + f"-DBUILD_SYCL=ON", + f"-DSYCL_COMPILER_ROOT={options.sycl}", + f"-DALLOW_WARNINGS=ON" + ] + run(configure_command, add_sycl=True) + + run(f"cmake --build {build_path} -j", add_sycl=True) + + self.built = True + self.bins = os.path.join(build_path, 'bin') + +class ComputeBenchmark(Benchmark): + def __init__(self, bench, name, test): + self.bench = bench + self.bench_name = name + self.test = test + super().__init__(bench.directory) + + def bin_args(self) -> list[str]: + return [] + + def extra_env_vars(self) -> dict: + return {} + + def unit(self): + return "μs" + + def setup(self): + self.bench.setup() + self.benchmark_bin = os.path.join(self.bench.bins, self.bench_name) + + def run(self, env_vars) -> Result: + command = [ + f"{self.benchmark_bin}", + f"--test={self.test}", + "--csv", + "--noHeaders" + ] + + command += self.bin_args() + env_vars.update(self.extra_env_vars()) + + result = self.run_bench(command, env_vars) + (label, mean) = self.parse_output(result) + return Result(label=label, value=mean, command=command, env=env_vars, stdout=result) + + def parse_output(self, output): + csv_file = io.StringIO(output) + reader = csv.reader(csv_file) + next(reader, None) + data_row = next(reader, None) 
+ if data_row is None: + raise ValueError("Benchmark output does not contain data.") + try: + label = data_row[0] + mean = float(data_row[1]) + return (label, mean) + except (ValueError, IndexError) as e: + raise ValueError(f"Error parsing output: {e}") + + def teardown(self): + return + +class SubmitKernelSYCL(ComputeBenchmark): + def __init__(self, bench, ioq): + self.ioq = ioq + super().__init__(bench, "api_overhead_benchmark_sycl", "SubmitKernel") + + def name(self): + order = "in order" if self.ioq else "out of order" + return f"api_overhead_benchmark_sycl SubmitKernel {order}" + + def bin_args(self) -> list[str]: + return [ + f"--Ioq={self.ioq}", + "--DiscardEvents=0", + "--MeasureCompletion=0", + "--iterations=100000", + "--Profiling=0", + "--NumKernels=10", + "--KernelExecTime=1" + ] + +class ExecImmediateCopyQueue(ComputeBenchmark): + def __init__(self, bench, ioq, isCopyOnly, source, destination, size): + self.ioq = ioq + self.isCopyOnly = isCopyOnly + self.source = source + self.destination = destination + self.size = size + super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue") + + def name(self): + order = "in order" if self.ioq else "out of order" + return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}" + + def bin_args(self) -> list[str]: + return [ + "--iterations=100000", + f"--ioq={self.ioq}", + f"--IsCopyOnly={self.isCopyOnly}", + "--MeasureCompletionTime=0", + f"--src={self.source}", + f"--dst={self.destination}", + f"--size={self.size}" + ] + +class QueueInOrderMemcpy(ComputeBenchmark): + def __init__(self, bench, isCopyOnly, source, destination, size): + self.isCopyOnly = isCopyOnly + self.source = source + self.destination = destination + self.size = size + super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy") + + def name(self): + return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, 
size {self.size}" + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--IsCopyOnly={self.isCopyOnly}", + f"--sourcePlacement={self.source}", + f"--destinationPlacement={self.destination}", + f"--size={self.size}", + "--count=100" + ] + +class QueueMemcpy(ComputeBenchmark): + def __init__(self, bench, source, destination, size): + self.source = source + self.destination = destination + self.size = size + super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy") + + def name(self): + return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}" + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--sourcePlacement={self.source}", + f"--destinationPlacement={self.destination}", + f"--size={self.size}", + ] + +class StreamMemory(ComputeBenchmark): + def __init__(self, bench, type, size, placement): + self.type = type + self.size = size + self.placement = placement + super().__init__(bench, "memory_benchmark_sycl", "StreamMemory") + + def name(self): + return f"memory_benchmark_sycl StreamMemory, placement {self.placement}, type {self.type}, size {self.size}" + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--type={self.type}", + f"--size={self.size}", + f"--memoryPlacement={self.placement}", + "--useEvents=0", + "--contents=Zeros", + ] + +class VectorSum(ComputeBenchmark): + def __init__(self, bench): + super().__init__(bench, "miscellaneous_benchmark_sycl", "VectorSum") + + def name(self): + return f"miscellaneous_benchmark_sycl VectorSum" + + def bin_args(self) -> list[str]: + return [ + "--iterations=1000", + "--numberOfElementsX=512", + "--numberOfElementsY=256", + "--numberOfElementsZ=256", + ] + diff --git a/scripts/benchmarks/benches/cudaSift.py b/scripts/benchmarks/benches/cudaSift.py index 6f9c19040e..482d258052 100644 --- a/scripts/benchmarks/benches/cudaSift.py +++ b/scripts/benchmarks/benches/cudaSift.py @@ -9,11 +9,18 @@ from utils.utils 
import run import os import re +import shutil class CudaSift(VelocityBase): def __init__(self, vb: VelocityBench): super().__init__("cudaSift", "cudaSift", vb) + def download_deps(self): + images = os.path.join(self.vb.repo_path, self.bench_name, 'inputData') + dest = os.path.join(self.directory, 'inputData') + if not os.path.exists(dest): + shutil.copytree(images, dest) + def name(self): return "Velocity-Bench CudaSift" diff --git a/scripts/benchmarks/benches/easywave.py b/scripts/benchmarks/benches/easywave.py index 2fa4d95685..2f89482329 100644 --- a/scripts/benchmarks/benches/easywave.py +++ b/scripts/benchmarks/benches/easywave.py @@ -14,6 +14,8 @@ class Easywave(VelocityBase): def __init__(self, vb: VelocityBench): super().__init__("easywave", "easyWave_sycl", vb) + + def download_deps(self): self.download_untar("easywave", "https://git.gfz-potsdam.de/id2/geoperil/easyWave/-/raw/master/data/examples.tar.gz", "examples.tar.gz") def name(self): diff --git a/scripts/benchmarks/benches/hashtable.py b/scripts/benchmarks/benches/hashtable.py index c8cb0bdb03..7558183bf0 100644 --- a/scripts/benchmarks/benches/hashtable.py +++ b/scripts/benchmarks/benches/hashtable.py @@ -23,6 +23,9 @@ def unit(self): def bin_args(self) -> list[str]: return ["--no-verify"] + def lower_is_better(self): + return False + def parse_output(self, stdout: str) -> float: match = re.search(r'(\d+\.\d+) million keys/second', stdout) if match: diff --git a/scripts/benchmarks/benches/options.py b/scripts/benchmarks/benches/options.py index c990a44d5f..c035ce6800 100644 --- a/scripts/benchmarks/benches/options.py +++ b/scripts/benchmarks/benches/options.py @@ -5,6 +5,9 @@ class Options: sycl: str = "" rebuild: bool = True benchmark_cwd: str = "INVALID" + timeout: float = 600 + iterations: int = 5 + verbose: bool = False options = Options() diff --git a/scripts/benchmarks/benches/quicksilver.py b/scripts/benchmarks/benches/quicksilver.py index 383c8dd5be..7e1f65ee1d 100644 --- 
a/scripts/benchmarks/benches/quicksilver.py +++ b/scripts/benchmarks/benches/quicksilver.py @@ -15,10 +15,10 @@ def __init__(self, vb: VelocityBench): super().__init__("QuickSilver", "qs", vb) self.data_path = os.path.join(vb.repo_path, "QuickSilver", "Examples", "AllScattering") - def run(self, env_vars) -> list[Result]: + def run(self, env_vars) -> Result: # TODO: fix the crash in QuickSilver when UR_L0_USE_IMMEDIATE_COMMANDLISTS=0 if 'UR_L0_USE_IMMEDIATE_COMMANDLISTS' in env_vars and env_vars['UR_L0_USE_IMMEDIATE_COMMANDLISTS'] == '0': - return [] + return None return super().run(env_vars) @@ -28,6 +28,9 @@ def name(self): def unit(self): return "MMS/CTT" + def lower_is_better(self): + return False + def bin_args(self) -> list[str]: return ["-i", f"{self.data_path}/scatteringOnly.inp"] diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py index 8dd2f4ba9c..896ff4da98 100644 --- a/scripts/benchmarks/benches/result.py +++ b/scripts/benchmarks/benches/result.py @@ -16,3 +16,4 @@ class Result: stdout: str unit: str = "" name: str = "" + lower_is_better: bool = True diff --git a/scripts/benchmarks/benches/velocity.py b/scripts/benchmarks/benches/velocity.py index fec3abb842..e5601c6563 100644 --- a/scripts/benchmarks/benches/velocity.py +++ b/scripts/benchmarks/benches/velocity.py @@ -6,15 +6,14 @@ from utils.utils import git_clone from .base import Benchmark from .result import Result -from utils.utils import run +from utils.utils import run, create_build_path import os import re class VelocityBench: def __init__(self, directory): self.directory = directory - # TODO: replace with https://github.com/oneapi-src/Velocity-Bench once all fixes land upstream - self.repo_path = git_clone(self.directory, "velocity-bench-repo", "https://github.com/pbalcer/Velocity-Bench.git", "ae0ae05c7fd1469779ecea4f36e4741b1d956eb4") + self.repo_path = git_clone(self.directory, "velocity-bench-repo", "https://github.com/oneapi-src/Velocity-Bench", 
"34ee4ebe18d91dfdd38b7d798fd986b41874fcbc") class VelocityBase(Benchmark): def __init__(self, name: str, bin_name: str, vb: VelocityBench): @@ -24,8 +23,13 @@ def __init__(self, name: str, bin_name: str, vb: VelocityBench): self.bin_name = bin_name self.code_path = os.path.join(self.vb.repo_path, self.bench_name, 'SYCL') + def download_deps(self): + return + def setup(self): - build_path = self.create_build_path(self.bench_name) + self.download_deps() + + build_path = create_build_path(self.directory, self.bench_name) configure_command = [ "cmake", @@ -47,7 +51,7 @@ def extra_env_vars(self) -> dict: def parse_output(self, stdout: str) -> float: raise NotImplementedError() - def run(self, env_vars) -> list[Result]: + def run(self, env_vars) -> Result: env_vars.update(self.extra_env_vars()) command = [ @@ -57,7 +61,7 @@ def run(self, env_vars) -> list[Result]: result = self.run_bench(command, env_vars) - return [Result(label=self.bench_name, value=self.parse_output(result), command=command, env=env_vars, stdout=result)] + return Result(label=self.bench_name, value=self.parse_output(result), command=command, env=env_vars, stdout=result) def teardown(self): return diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index 5dad40c7fe..34238f773c 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -5,9 +5,8 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import os from utils.utils import prepare_workdir, load_benchmark_results, save_benchmark_results; -from benches.api_overhead import APIOverheadSYCL +from benches.compute import * from benches.hashtable import Hashtable from benches.bitcracker import Bitcracker from benches.cudaSift import CudaSift @@ -18,46 +17,72 @@ from benches.options import options from output import generate_markdown import argparse +import re # Update this if you are changing the layout of the results files -INTERNAL_WORKDIR_VERSION = '1.0' - -def main(directory, 
additional_env_vars, save_name, compare_names): - variants = [ - ({'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '0'}, "Imm-CmdLists-OFF"), - ({'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '1'}, ""), - ] +INTERNAL_WORKDIR_VERSION = '1.6' +def main(directory, additional_env_vars, save_name, compare_names, filter): prepare_workdir(directory, INTERNAL_WORKDIR_VERSION) vb = VelocityBench(directory) + cb = ComputeBench(directory) benchmarks = [ - APIOverheadSYCL(directory), + SubmitKernelSYCL(cb, 0), + SubmitKernelSYCL(cb, 1), + QueueInOrderMemcpy(cb, 0, 'Device', 'Device', 1024), + QueueInOrderMemcpy(cb, 0, 'Host', 'Device', 1024), + QueueMemcpy(cb, 'Device', 'Device', 1024), + StreamMemory(cb, 'Triad', 10 * 1024, 'Device'), + ExecImmediateCopyQueue(cb, 0, 1, 'Device', 'Device', 1024), + ExecImmediateCopyQueue(cb, 1, 1, 'Device', 'Host', 1024), + VectorSum(cb), Hashtable(vb), Bitcracker(vb), - #CudaSift(vb), TODO: the benchmark is passing, but is outputting "Failed to allocate device data" + CudaSift(vb), Easywave(vb), QuickSilver(vb), SobelFilter(vb) ] + if filter: + benchmarks = [benchmark for benchmark in benchmarks if filter.search(benchmark.name())] + for benchmark in benchmarks: + print(f"setting up {benchmark.name()}... ", end='', flush=True) benchmark.setup() + print("complete.") results = [] for benchmark in benchmarks: - for env_vars, extra_label in variants: - merged_env_vars = {**env_vars, **additional_env_vars} + merged_env_vars = {**additional_env_vars} + iteration_results = [] + for iter in range(options.iterations): + print(f"running {benchmark.name()}, iteration {iter}... 
", end='', flush=True) bench_results = benchmark.run(merged_env_vars) - for res in bench_results: - res.unit = benchmark.unit() - res.name = benchmark.name() - res.label += f" {extra_label}" - results.append(res) + if bench_results is not None: + print(f"complete ({bench_results.value} {benchmark.unit()}).") + iteration_results.append(bench_results) + else: + print(f"did not finish.") + + if len(iteration_results) == 0: + continue + + iteration_results.sort(key=lambda res: res.value) + median_index = len(iteration_results) // 2 + median_result = iteration_results[median_index] + + median_result.unit = benchmark.unit() + median_result.name = benchmark.name() + + results.append(median_result) for benchmark in benchmarks: + print(f"tearing down {benchmark.name()}... ", end='', flush=True) benchmark.teardown() + print("complete.") chart_data = {"This PR" : results} @@ -93,11 +118,20 @@ def validate_and_parse_env_args(env_args): parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[]) parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.') parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"]) + parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=5) + parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=600) + parser.add_argument("--filter", type=str, help='Regex pattern to filter benchmarks by name.', default=None) + parser.add_argument("--verbose", help='Print output of all the commands.', action="store_true") args = parser.parse_args() additional_env_vars = validate_and_parse_env_args(args.env) + options.verbose = args.verbose options.rebuild = not args.no_rebuild options.sycl = args.sycl + options.iterations = args.iterations + options.timeout = args.timeout 
+ + benchmark_filter = re.compile(args.filter) if args.filter else None - main(args.benchmark_directory, additional_env_vars, args.save, args.compare) + main(args.benchmark_directory, additional_env_vars, args.save, args.compare, benchmark_filter) diff --git a/scripts/benchmarks/output.py b/scripts/benchmarks/output.py index 9cfee303b1..26deabe099 100644 --- a/scripts/benchmarks/output.py +++ b/scripts/benchmarks/output.py @@ -5,6 +5,7 @@ import collections from benches.base import Result +import math # Function to generate the mermaid bar chart script def generate_mermaid_script(chart_data: dict[str, list[Result]]): @@ -19,6 +20,9 @@ def generate_mermaid_script(chart_data: dict[str, list[Result]]): # remove duplicates labels = list(dict.fromkeys(labels)) mermaid_script += f""" +
+{bname} + ```mermaid --- config: @@ -57,6 +61,8 @@ def generate_mermaid_script(chart_data: dict[str, list[Result]]): """ mermaid_script += f""" ``` + +
""" return mermaid_script @@ -83,44 +89,52 @@ def generate_markdown_details(results: list[Result]): """) return "\n".join(markdown_sections) -def generate_summary(chart_data: dict[str, list[Result]]) -> str: - # Calculate the mean value of "This PR" for each benchmark - this_pr_means = {} - for res in chart_data["This PR"]: - if res.name not in this_pr_means: - this_pr_means[res.name] = [] - this_pr_means[res.name].append(res.value) - for bname in this_pr_means: - this_pr_means[bname] = sum(this_pr_means[bname]) / len(this_pr_means[bname]) - - # Calculate the percentage for each entry relative to "This PR" - summary_data = {"This PR": 100} - for entry_name, results in chart_data.items(): - if entry_name == "This PR": - continue - entry_sum = 0 - for res in results: - if res.name in this_pr_means: - percentage = (res.value / this_pr_means[res.name]) * 100 - entry_sum += percentage - - entry_average = entry_sum / len(results) if results else 0 - summary_data[entry_name] = entry_average +def generate_summary_table(chart_data: dict[str, list[Result]]): + summary_table = "| Benchmark | " + " | ".join(chart_data.keys()) + " |\n" + summary_table += "|---" * (len(chart_data) + 1) + "|\n" - markdown_table = "| Name | Result % |\n| --- | --- |\n" - for entry_name, percentage in summary_data.items(): - markdown_table += f"| {entry_name} | {percentage:.2f}% |\n" - - return markdown_table + # Collect all benchmarks and their results + benchmark_results = collections.defaultdict(dict) + for key, results in chart_data.items(): + for res in results: + benchmark_results[res.name][key] = res + + # Generate the table rows + for bname, results in benchmark_results.items(): + row = f"| {bname} |" + best_value = None + best_key = None + + # Determine the best value + for key, res in results.items(): + if best_value is None or (res.lower_is_better and res.value < best_value) or (not res.lower_is_better and res.value > best_value): + best_value = res.value + best_key = key + + # Generate 
the row with the best value highlighted + for key in chart_data.keys(): + if key in results: + value = results[key].value + if key == best_key: + row += f" `**{value}**` |" # Highlight the best value + else: + row += f" {value} |" + else: + row += " - |" + + summary_table += row + "\n" + + return summary_table def generate_markdown(chart_data: dict[str, list[Result]]): mermaid_script = generate_mermaid_script(chart_data) + summary_table = generate_summary_table(chart_data) return f""" # Summary -{generate_summary(chart_data)} -# Benchmark Results +{summary_table} +# Charts {mermaid_script} -## Details +# Details {generate_markdown_details(chart_data["This PR"])} """ diff --git a/scripts/benchmarks/utils/utils.py b/scripts/benchmarks/utils/utils.py index 9dc3f23a9b..5c7beb95d0 100644 --- a/scripts/benchmarks/utils/utils.py +++ b/scripts/benchmarks/utils/utils.py @@ -28,9 +28,12 @@ def run(command, env_vars={}, cwd=None, add_sycl=False): env['LD_LIBRARY_PATH'] = sycl_lib_path + os.pathsep + env.get('LD_LIBRARY_PATH', '') env.update(env_vars) - result = subprocess.run(command, cwd=cwd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) # nosec B603 - print(result.stdout.decode()) - print(result.stderr.decode()) + result = subprocess.run(command, cwd=cwd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, timeout=options.timeout) # nosec B603 + + if options.verbose: + print(result.stdout.decode()) + print(result.stderr.decode()) + return result except subprocess.CalledProcessError as e: print(e.stdout.decode()) @@ -70,7 +73,8 @@ def load_benchmark_results(dir, compare_name) -> list[Result]: return None def prepare_bench_cwd(dir): - options.benchmark_cwd = os.path.join(dir, 'bcwd') + # we need 2 deep to workaround a problem with a fixed relative path in cudaSift + options.benchmark_cwd = os.path.join(dir, 'bcwd', 'bcwd') if os.path.exists(options.benchmark_cwd): shutil.rmtree(options.benchmark_cwd) os.makedirs(options.benchmark_cwd) 
@@ -97,3 +101,13 @@ def prepare_workdir(dir, version): with open(version_file_path, 'w') as version_file: version_file.write(version) + +def create_build_path(directory, name): + build_path = os.path.join(directory, name) + + if options.rebuild and Path(build_path).exists(): + shutil.rmtree(build_path) + + Path(build_path).mkdir(parents=True, exist_ok=True) + + return build_path diff --git a/scripts/core/EXP-BINDLESS-IMAGES.rst b/scripts/core/EXP-BINDLESS-IMAGES.rst index 398d3ba06c..c2b3d1114e 100644 --- a/scripts/core/EXP-BINDLESS-IMAGES.rst +++ b/scripts/core/EXP-BINDLESS-IMAGES.rst @@ -181,6 +181,7 @@ Functions * Interop * ${x}BindlessImagesImportExternalMemoryExp * ${x}BindlessImagesMapExternalArrayExp + * ${x}BindlessImagesMapExternalLinearMemoryExp * ${x}BindlessImagesReleaseExternalMemoryExp * ${x}BindlessImagesImportExternalSemaphoreExp * ${x}BindlessImagesReleaseExternalSemaphoreExp @@ -250,6 +251,8 @@ Changelog | 17.0 || Rename interop related structs and funcs with "external" | | || keyword over "interop". | +----------+-------------------------------------------------------------+ +| 18.0 | Added BindlessImagesMapExternalLinearMemoryExp function. | ++----------+-------------------------------------------------------------+ Contributors -------------------------------------------------------------------------------- diff --git a/scripts/core/INTRO.rst b/scripts/core/INTRO.rst index 1f319d6884..448e3569e2 100644 --- a/scripts/core/INTRO.rst +++ b/scripts/core/INTRO.rst @@ -161,7 +161,7 @@ Tracing Unified Runtime loader implements tracing support through the `XPTI framework `__. -.. list-table:: UR Stream `"ur"` Notification Signatures +.. list-table:: UR Stream `"ur.call"` Notification Signatures :header-rows: 1 * - Trace Point Type @@ -295,6 +295,8 @@ Layers currently included with the runtime are as follows: - Description * - UR_LAYER_PARAMETER_VALIDATION - Enables non-adapter-specific parameter validation (e.g. checking for null values). 
+ * - UR_LAYER_BOUNDS_CHECKING + - Enables non-adapter-specific bounds checking of USM allocations for enqueued commands. Automatically enables UR_LAYER_PARAMETER_VALIDATION. * - UR_LAYER_LEAK_CHECKING - Performs some leak checking for API calls involving object creation/destruction. * - UR_LAYER_LIFETIME_VALIDATION diff --git a/scripts/core/device.yml b/scripts/core/device.yml index ead3ceeb8d..23c0233ef7 100644 --- a/scripts/core/device.yml +++ b/scripts/core/device.yml @@ -820,9 +820,9 @@ params: - type: $x_native_handle_t name: hNativeDevice desc: "[in][nocheck] the native handle of the device." - - type: $x_platform_handle_t - name: hPlatform - desc: "[in] handle of the platform instance" + - type: $x_adapter_handle_t + name: hAdapter + desc: "[in] handle of the adapter to which `hNativeDevice` belongs" - type: const $x_device_native_properties_t* name: pProperties desc: "[in][optional] pointer to native device properties struct." diff --git a/scripts/core/exp-bindless-images.yml b/scripts/core/exp-bindless-images.yml index 52cd754644..d78583ac39 100644 --- a/scripts/core/exp-bindless-images.yml +++ b/scripts/core/exp-bindless-images.yml @@ -737,6 +737,37 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function +desc: "Map an external memory handle to a device memory region described by void*" +class: $xBindlessImages +name: MapExternalLinearMemoryExp +ordinal: "0" +params: + - type: $x_context_handle_t + name: hContext + desc: "[in] handle of the context object" + - type: $x_device_handle_t + name: hDevice + desc: "[in] handle of the device object" + - type: uint64_t + name: offset + desc: "[in] offset into memory region to map" + - type: uint64_t + name: size + desc: "[in] size of memory region to map" + - type: $x_exp_external_mem_handle_t + name: hExternalMem + desc: "[in] external memory handle to the external memory" + - type: void** + name: ppRetMem + desc: "[out] 
pointer of the externally allocated memory" +returns: + - $X_RESULT_ERROR_INVALID_CONTEXT + - $X_RESULT_ERROR_INVALID_VALUE + - $X_RESULT_ERROR_INVALID_IMAGE_SIZE + - $X_RESULT_ERROR_INVALID_OPERATION + - $X_RESULT_ERROR_OUT_OF_RESOURCES +--- #-------------------------------------------------------------------------- +type: function desc: "Release external memory" class: $xBindlessImages name: ReleaseExternalMemoryExp diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index d80c56b500..ab59404bb4 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -595,6 +595,9 @@ etors: - name: BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP desc: Enumerator for $xBindlessImagesReleaseExternalMemoryExp value: '230' +- name: BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP + desc: Enumerator for $xBindlessImagesMapExternalLinearMemoryExp + value: '231' --- type: enum desc: Defines structure types diff --git a/scripts/templates/api.h.mako b/scripts/templates/api.h.mako index 9fc9944b47..85b8a78c2a 100644 --- a/scripts/templates/api.h.mako +++ b/scripts/templates/api.h.mako @@ -42,7 +42,7 @@ extern "C" { %if len(spec['objects']): // ${th.subt(n, tags, spec['header']['desc'])} #if !defined(__GNUC__) -#pragma region ${spec['name']} +#pragma region ${spec['name'].replace(' ', '_')} #endif %endif %for obj in spec['objects']: diff --git a/scripts/templates/valddi.cpp.mako b/scripts/templates/valddi.cpp.mako index 778595b052..8cc4a9dc0f 100644 --- a/scripts/templates/valddi.cpp.mako +++ b/scripts/templates/valddi.cpp.mako @@ -57,8 +57,16 @@ namespace ur_validation_layer { %for key, values in sorted_param_checks: %for val in values: - if( ${val} ) + %if 'boundsError' in val: + if ( getContext()->enableBoundsChecking ) { + if ( ${val} ) { + return ${key}; + } + } + %else: + if ( ${val} ) return ${key}; + %endif %endfor %endfor @@ -178,9 +186,13 @@ namespace ur_validation_layer if (enabledLayerNames.count(nameFullValidation)) { enableParameterValidation = true; 
+ enableBoundsChecking = true; enableLeakChecking = true; enableLifetimeValidation = true; } else { + if (enabledLayerNames.count(nameBoundsChecking)) { + enableBoundsChecking = true; + } if (enabledLayerNames.count(nameParameterValidation)) { enableParameterValidation = true; } @@ -209,13 +221,11 @@ namespace ur_validation_layer } ${x}_result_t context_t::tearDown() { - ${x}_result_t result = ${X}_RESULT_SUCCESS; - if (enableLeakChecking) { getContext()->refCountContext->logInvalidReferences(); - getContext()->refCountContext->clear(); } - return result; + + return ${X}_RESULT_SUCCESS; } } // namespace ur_validation_layer diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt index 013bd9b4d9..ca27eaa422 100644 --- a/source/adapters/cuda/CMakeLists.txt +++ b/source/adapters/cuda/CMakeLists.txt @@ -98,6 +98,7 @@ if (UR_ENABLE_TRACING) endif() target_compile_definitions(${TARGET_NAME} PRIVATE XPTI_ENABLE_INSTRUMENTATION + XPTI_STATIC_LIBRARY ) target_include_directories(${TARGET_NAME} PUBLIC ${XPTI_INCLUDES} diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 8d21a93c75..2fdb6b08a3 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -74,12 +74,11 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { ur_exp_command_buffer_command_handle_t_:: ur_exp_command_buffer_command_handle_t_( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, - std::shared_ptr &&Node, CUDA_KERNEL_NODE_PARAMS Params, - uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, - const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr) - : CommandBuffer(CommandBuffer), Kernel(Kernel), Node{std::move(Node)}, - Params(Params), WorkDim(WorkDim), RefCountInternal(1), - RefCountExternal(1) { + CUgraphNode Node, CUDA_KERNEL_NODE_PARAMS Params, uint32_t WorkDim, + const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, + 
const size_t *LocalWorkSizePtr) + : CommandBuffer(CommandBuffer), Kernel(Kernel), Node(Node), Params(Params), + WorkDim(WorkDim), RefCountInternal(1), RefCountExternal(1) { CommandBuffer->incrementInternalReferenceCount(); const size_t CopySize = sizeof(size_t) * WorkDim; @@ -124,7 +123,7 @@ static ur_result_t getNodesFromSyncPoints( for (size_t i = 0; i < NumSyncPointsInWaitList; i++) { if (auto NodeHandle = SyncPoints.find(SyncPointWaitList[i]); NodeHandle != SyncPoints.end()) { - CuNodesList.push_back(*NodeHandle->second.get()); + CuNodesList.push_back(NodeHandle->second); } else { return UR_RESULT_ERROR_INVALID_VALUE; } @@ -161,22 +160,22 @@ static ur_result_t enqueueCommandBufferFillHelper( const CUmemorytype_enum DstType, const void *Pattern, size_t PatternSize, size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; + ur_exp_command_buffer_sync_point_t *RetSyncPoint) { std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, DepsList)); try { + // Graph node added to graph, if multiple nodes are created this will + // be set to the leaf node + CUgraphNode GraphNode; + const size_t N = Size / PatternSize; auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE ? 
*static_cast(DstDevice) : (CUdeviceptr)DstDevice; if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) { - // Create a new node - CUgraphNode GraphNode; CUDA_MEMSET_NODE_PARAMS NodeParams = {}; NodeParams.dst = DstPtr; NodeParams.elementSize = PatternSize; @@ -207,11 +206,6 @@ static ur_result_t enqueueCommandBufferFillHelper( cuGraphAddMemsetNode(&GraphNode, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParams, CommandBuffer->Device->getNativeContext())); - - // Get sync point and register the cuNode with it. - *SyncPoint = - CommandBuffer->addSyncPoint(std::make_shared(GraphNode)); - } else { // CUDA has no memset functions that allow setting values more than 4 // bytes. UR API lets you pass an arbitrary "pattern" to the buffer @@ -222,10 +216,6 @@ static ur_result_t enqueueCommandBufferFillHelper( size_t NumberOfSteps = PatternSize / sizeof(uint8_t); - // Shared pointer that will point to the last node created - std::shared_ptr GraphNodePtr; - // Create a new node - CUgraphNode GraphNodeFirst; // Update NodeParam CUDA_MEMSET_NODE_PARAMS NodeParamsStepFirst = {}; NodeParamsStepFirst.dst = DstPtr; @@ -236,16 +226,12 @@ static ur_result_t enqueueCommandBufferFillHelper( NodeParamsStepFirst.width = 1; UR_CHECK_ERROR(cuGraphAddMemsetNode( - &GraphNodeFirst, CommandBuffer->CudaGraph, DepsList.data(), + &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParamsStepFirst, CommandBuffer->Device->getNativeContext())); - // Get sync point and register the cuNode with it. - *SyncPoint = CommandBuffer->addSyncPoint( - std::make_shared(GraphNodeFirst)); - DepsList.clear(); - DepsList.push_back(GraphNodeFirst); + DepsList.push_back(GraphNode); // we walk up the pattern in 1-byte steps, and call cuMemset for each // 1-byte chunk of the pattern. 
@@ -256,8 +242,6 @@ static ur_result_t enqueueCommandBufferFillHelper( // offset the pointer to the part of the buffer we want to write to auto OffsetPtr = DstPtr + (Step * sizeof(uint8_t)); - // Create a new node - CUgraphNode GraphNode; // Update NodeParam CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {}; NodeParamsStep.dst = (CUdeviceptr)OffsetPtr; @@ -272,18 +256,20 @@ static ur_result_t enqueueCommandBufferFillHelper( DepsList.size(), &NodeParamsStep, CommandBuffer->Device->getNativeContext())); - GraphNodePtr = std::make_shared(GraphNode); - // Get sync point and register the cuNode with it. - *SyncPoint = CommandBuffer->addSyncPoint(GraphNodePtr); - DepsList.clear(); - DepsList.push_back(*GraphNodePtr.get()); + DepsList.push_back(GraphNode); } } + + // Get sync point and register the cuNode with it. + auto SyncPoint = CommandBuffer->addSyncPoint(GraphNode); + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( @@ -368,18 +354,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); if (*pGlobalWorkSize == 0) { try { @@ -388,12 +367,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( DepsList.data(), DepsList.size())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } // Set the number of threads per block to the number of threads per warp @@ -403,13 +384,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( uint32_t LocalSize = hKernel->getLocalSize(); CUfunction CuFunc = hKernel->get(); - Result = + UR_CHECK_ERROR( setKernelParams(hCommandBuffer->Context, hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid)); try { // Set node param structure with the kernel related data @@ -434,14 +412,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( hKernel->clearLocalSize(); // Get sync point and register the cuNode with it. 
- auto NodeSP = std::make_shared(GraphNode); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); if (pSyncPoint) { - *pSyncPoint = hCommandBuffer->addSyncPoint(NodeSP); + *pSyncPoint = SyncPoint; } auto NewCommand = new ur_exp_command_buffer_command_handle_t_{ - hCommandBuffer, hKernel, std::move(NodeSP), NodeParams, - workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize}; + hCommandBuffer, hKernel, GraphNode, NodeParams, + workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize}; NewCommand->incrementInternalReferenceCount(); hCommandBuffer->CommandHandles.push_back(NewCommand); @@ -451,9 +429,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( @@ -461,16 +439,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( size_t size, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { CUDA_MEMCPY3D NodeParams = {}; @@ -482,12 +454,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( @@ -496,7 +470,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; @@ -505,13 +478,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( UR_ASSERT(size + srcOffset <= std::get(hSrcMem->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Src = std::get(hSrcMem->Mem) @@ -528,12 +496,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( @@ -544,16 +514,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto SrcPtr = @@ -571,12 +535,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT @@ -586,16 +552,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Dst = std::get(hBuffer->Mem) @@ -610,12 +570,14 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT @@ -624,16 +586,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( size_t offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Src = std::get(hBuffer->Mem) @@ -648,12 +604,14 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT @@ -665,16 +623,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto DstPtr = @@ -691,12 +643,14 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT @@ -708,16 +662,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto SrcPtr = @@ -734,12 +682,14 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. - *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( @@ -750,13 +700,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( ur_exp_command_buffer_sync_point_t *pSyncPoint) { // Prefetch cmd is not supported by Cuda Graph. // We implement it as an empty node to enforce dependencies. 
- ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { // Add an empty node to preserve dependencies. @@ -764,17 +712,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( DepsList.data(), DepsList.size())); // Get sync point and register the cuNode with it. - *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } - setErrorMessage("Prefetch hint ignored and replaced with empty node as " - "prefetch is not supported by CUDA Graph backend", - UR_RESULT_SUCCESS); - Result = UR_RESULT_ERROR_ADAPTER_SPECIFIC; } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( @@ -785,13 +731,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( ur_exp_command_buffer_sync_point_t *pSyncPoint) { // Mem-Advise cmd is not supported by Cuda Graph. // We implement it as an empty node to enforce dependencies. - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { // Add an empty node to preserve dependencies. @@ -799,18 +743,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( DepsList.data(), DepsList.size())); // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); - - setErrorMessage("Memory advice ignored and replaced with empty node as " - "memory advice is not supported by CUDA Graph backend", - UR_RESULT_SUCCESS); - Result = UR_RESULT_ERROR_ADAPTER_SPECIFIC; + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( @@ -860,7 +801,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; try { std::unique_ptr RetImplEvent{nullptr}; @@ -870,10 +810,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( CUstream CuStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if ((Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList)) != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr( @@ -890,10 +828,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( *phEvent = RetImplEvent.release(); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainCommandExp( @@ -1067,7 +1005,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.sharedMemBytes = Kernel->getLocalSize(); Params.kernelParams = const_cast(Kernel->getArgIndices().data()); - CUgraphNode Node = *(hCommand->Node); + CUgraphNode Node = hCommand->Node; CUgraphExec 
CudaGraphExec = CommandBuffer->CudaGraphExec; UR_CHECK_ERROR(cuGraphExecKernelNodeSetParams(CudaGraphExec, Node, &Params)); return UR_RESULT_SUCCESS; diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp index d83269f2ae..504095612b 100644 --- a/source/adapters/cuda/command_buffer.hpp +++ b/source/adapters/cuda/command_buffer.hpp @@ -42,9 +42,9 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_command_handle_t_( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, - std::shared_ptr &&Node, CUDA_KERNEL_NODE_PARAMS Params, - uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, - const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr); + CUgraphNode Node, CUDA_KERNEL_NODE_PARAMS Params, uint32_t WorkDim, + const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, + const size_t *LocalWorkSizePtr); void setGlobalOffset(const size_t *GlobalWorkOffsetPtr) { const size_t CopySize = sizeof(size_t) * WorkDim; @@ -97,7 +97,7 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_handle_t CommandBuffer; ur_kernel_handle_t Kernel; - std::shared_ptr Node; + CUgraphNode Node; CUDA_KERNEL_NODE_PARAMS Params; uint32_t WorkDim; @@ -118,8 +118,8 @@ struct ur_exp_command_buffer_handle_t_ { ~ur_exp_command_buffer_handle_t_(); void registerSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint, - std::shared_ptr CuNode) { - SyncPoints[SyncPoint] = std::move(CuNode); + CUgraphNode CuNode) { + SyncPoints[SyncPoint] = CuNode; NextSyncPoint++; } @@ -130,8 +130,7 @@ struct ur_exp_command_buffer_handle_t_ { // Helper to register next sync point // @param CuNode Node to register as next sync point // @return Pointer to the sync that registers the Node - ur_exp_command_buffer_sync_point_t - addSyncPoint(std::shared_ptr CuNode) { + ur_exp_command_buffer_sync_point_t addSyncPoint(CUgraphNode CuNode) { ur_exp_command_buffer_sync_point_t SyncPoint = NextSyncPoint; 
registerSyncPoint(SyncPoint, std::move(CuNode)); return SyncPoint; @@ -173,8 +172,7 @@ struct ur_exp_command_buffer_handle_t_ { std::atomic_uint32_t RefCountExternal; // Map of sync_points to ur_events - std::unordered_map> + std::unordered_map SyncPoints; // Next sync_point value (may need to consider ways to reuse values if 32-bits // is not enough) diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index f6e6bbe4b3..bbaaa27cdb 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -1185,27 +1185,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( /// \return TBD UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, - const ur_device_native_properties_t *pProperties, + ur_native_handle_t hNativeDevice, + [[maybe_unused]] ur_adapter_handle_t hAdapter, + [[maybe_unused]] const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { - std::ignore = pProperties; - CUdevice CuDevice = static_cast(hNativeDevice); auto IsDevice = [=](std::unique_ptr &Dev) { return Dev->get() == CuDevice; }; - // If a platform is provided just check if the device is in it - if (hPlatform) { - auto SearchRes = std::find_if(begin(hPlatform->Devices), - end(hPlatform->Devices), IsDevice); - if (SearchRes != end(hPlatform->Devices)) { - *phDevice = SearchRes->get(); - return UR_RESULT_SUCCESS; - } - } - // Get list of platforms uint32_t NumPlatforms = 0; ur_adapter_handle_t AdapterHandle = &adapter; diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp index 58ee98184d..c1154ec9c4 100644 --- a/source/adapters/cuda/image.cpp +++ b/source/adapters/cuda/image.cpp @@ -455,21 +455,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( // Allocate a cuArray if (pImageDesc->numMipLevel == 1) { - CUarray ImageArray; + CUarray ImageArray{}; try { 
UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &array_desc)); *phImageMem = (ur_exp_image_mem_native_handle_t)ImageArray; } catch (ur_result_t Err) { - cuArrayDestroy(ImageArray); + if (ImageArray != CUarray{}) { + UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); + } return Err; } catch (...) { - cuArrayDestroy(ImageArray); + if (ImageArray != CUarray{}) { + UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); + } return UR_RESULT_ERROR_UNKNOWN; } } else // Allocate a cuMipmappedArray { - CUmipmappedArray mip_array; + CUmipmappedArray mip_array{}; array_desc.Flags = CUDA_ARRAY3D_SURFACE_LDST; try { @@ -477,10 +481,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( pImageDesc->numMipLevel)); *phImageMem = (ur_exp_image_mem_native_handle_t)mip_array; } catch (ur_result_t Err) { - cuMipmappedArrayDestroy(mip_array); + if (mip_array) { + UR_CHECK_ERROR(cuMipmappedArrayDestroy(mip_array)); + } return Err; } catch (...) { - cuMipmappedArrayDestroy(mip_array); + if (mip_array) { + UR_CHECK_ERROR(cuMipmappedArrayDestroy(mip_array)); + } return UR_RESULT_ERROR_UNKNOWN; } } @@ -1169,6 +1177,36 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint64_t offset, + uint64_t size, ur_exp_external_mem_handle_t hExternalMem, void **ppRetMem) { + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), + UR_RESULT_ERROR_INVALID_CONTEXT); + + try { + ScopedContext Active(hDevice); + + CUDA_EXTERNAL_MEMORY_BUFFER_DESC BufferDesc = {}; + BufferDesc.size = size; + BufferDesc.offset = offset; + BufferDesc.flags = 0; + + CUdeviceptr retMem; + UR_CHECK_ERROR(cuExternalMemoryGetMappedBuffer( + &retMem, (CUexternalMemory)hExternalMem, &BufferDesc)); + + *ppRetMem = (void *)retMem; + + } catch (ur_result_t Err) { + return 
Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_external_mem_handle_t hExternalMem) { diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 1aefb15a3d..ea55c1669a 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -439,7 +439,7 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, UR_CHECK_ERROR(cuMemAlloc(&DevPtr, Buffer.Size)); } } else { - CUarray ImageArray; + CUarray ImageArray{}; CUsurfObject Surface; try { auto &Image = std::get(Mem->Mem); @@ -465,12 +465,12 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc)); Image.SurfObjs[DeviceIdx] = Surface; } catch (ur_result_t Err) { - if (ImageArray) { + if (ImageArray != CUarray{}) { UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); } return Err; } catch (...) { - if (ImageArray) { + if (ImageArray != CUarray{}) { UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); } return UR_RESULT_ERROR_UNKNOWN; diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index aa992f44bf..6dcaa28414 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -197,20 +197,15 @@ struct SurfaceMem { void *HostPtr) : Arrays(Context->Devices.size(), CUarray{0}), SurfObjs(Context->Devices.size(), CUsurfObject{0}), - OuterMemStruct{OuterMemStruct}, - ImageFormat{ImageFormat}, ImageDesc{ImageDesc}, HostPtr{HostPtr} { + OuterMemStruct{OuterMemStruct}, ImageDesc{ImageDesc}, ArrayDesc{}, + HostPtr{HostPtr} { // We have to use hipArray3DCreate, which has some caveats. The height and // depth parameters must be set to 0 produce 1D or 2D arrays. image_desc // gives a minimum value of 1, so we need to convert the answer. 
ArrayDesc.NumChannels = 4; // Only support 4 channel image - ArrayDesc.Flags = 0; // No flags required ArrayDesc.Width = ImageDesc.width; - if (ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { - ArrayDesc.Height = 0; - ArrayDesc.Depth = 0; - } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { + if (ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { ArrayDesc.Height = ImageDesc.height; - ArrayDesc.Depth = 0; } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { ArrayDesc.Height = ImageDesc.height; ArrayDesc.Depth = ImageDesc.depth; @@ -414,10 +409,14 @@ struct ur_mem_handle_t_ { } ur_result_t clear() { - if (isBuffer()) { - return std::get(Mem).clear(); + try { + if (isBuffer()) { + return std::get(Mem).clear(); + } + return std::get(Mem).clear(); + } catch (const ur_result_t &error) { + return error; } - return std::get(Mem).clear(); } ur_context_handle_t getContext() const noexcept { return Context; } diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp index bd92a01400..e5cce87798 100644 --- a/source/adapters/cuda/queue.cpp +++ b/source/adapters/cuda/queue.cpp @@ -45,7 +45,8 @@ CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { // change NumComputeStreams after that if (NumComputeStreams < ComputeStreams.size()) { UR_CHECK_ERROR(cuStreamCreateWithPriority( - &ComputeStreams[NumComputeStreams++], Flags, Priority)); + &ComputeStreams[NumComputeStreams], Flags, Priority)); + ++NumComputeStreams; } } Token = ComputeStreamIndex++; @@ -110,7 +111,8 @@ CUstream ur_queue_handle_t_::getNextTransferStream() { // change NumTransferStreams after that if (NumTransferStreams < TransferStreams.size()) { UR_CHECK_ERROR(cuStreamCreateWithPriority( - &TransferStreams[NumTransferStreams++], Flags, Priority)); + &TransferStreams[NumTransferStreams], Flags, Priority)); + ++NumTransferStreams; } } uint32_t StreamI = TransferStreamIndex++ % TransferStreams.size(); diff --git a/source/adapters/cuda/ur_interface_loader.cpp 
b/source/adapters/cuda/ur_interface_loader.cpp index 2e01a4b7a3..bb3fb9aee5 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -340,6 +340,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnImportExternalMemoryExp = urBindlessImagesImportExternalMemoryExp; pDdiTable->pfnMapExternalArrayExp = urBindlessImagesMapExternalArrayExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + urBindlessImagesMapExternalLinearMemoryExp; pDdiTable->pfnReleaseExternalMemoryExp = urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index d9438eeb9c..4ff38626af 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -76,12 +76,11 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { ur_exp_command_buffer_command_handle_t_:: ur_exp_command_buffer_command_handle_t_( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, - std::shared_ptr &&Node, hipKernelNodeParams Params, - uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, - const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr) - : CommandBuffer(CommandBuffer), Kernel(Kernel), Node(std::move(Node)), - Params(Params), WorkDim(WorkDim), RefCountInternal(1), - RefCountExternal(1) { + hipGraphNode_t Node, hipKernelNodeParams Params, uint32_t WorkDim, + const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, + const size_t *LocalWorkSizePtr) + : CommandBuffer(CommandBuffer), Kernel(Kernel), Node(Node), Params(Params), + WorkDim(WorkDim), RefCountInternal(1), RefCountExternal(1) { CommandBuffer->incrementInternalReferenceCount(); const size_t CopySize = sizeof(size_t) * WorkDim; @@ -125,7 +124,7 @@ static ur_result_t getNodesFromSyncPoints( for (size_t i = 0; i < NumSyncPointsInWaitList; i++) { if (auto NodeHandle = 
SyncPoints.find(SyncPointWaitList[i]); NodeHandle != SyncPoints.end()) { - HIPNodesList.push_back(*NodeHandle->second.get()); + HIPNodesList.push_back(NodeHandle->second); } else { return UR_RESULT_ERROR_INVALID_VALUE; } @@ -139,29 +138,23 @@ static ur_result_t enqueueCommandBufferFillHelper( const hipMemoryType DstType, const void *Pattern, size_t PatternSize, size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + ur_exp_command_buffer_sync_point_t *RetSyncPoint) { std::vector DepsList; - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, DepsList)); try { + // Graph node added to graph, if multiple nodes are created this will + // be set to the leaf node + hipGraphNode_t GraphNode; + const size_t N = Size / PatternSize; auto DstPtr = DstType == hipMemoryTypeDevice ? *static_cast(DstDevice) : DstDevice; if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) { - // Create a new node - hipGraphNode_t GraphNode; hipMemsetParams NodeParams = {}; NodeParams.dst = DstPtr; NodeParams.elementSize = PatternSize; @@ -192,10 +185,6 @@ static ur_result_t enqueueCommandBufferFillHelper( DepsList.data(), DepsList.size(), &NodeParams)); - // Get sync point and register the node with it. - *SyncPoint = CommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); - } else { // HIP has no memset functions that allow setting values more than 4 // bytes. 
UR API lets you pass an arbitrary "pattern" to the buffer @@ -206,11 +195,6 @@ static ur_result_t enqueueCommandBufferFillHelper( size_t NumberOfSteps = PatternSize / sizeof(uint8_t); - // Shared pointer that will point to the last node created - std::shared_ptr GraphNodePtr; - - // Create a new node - hipGraphNode_t GraphNodeFirst; // Update NodeParam hipMemsetParams NodeParamsStepFirst = {}; NodeParamsStepFirst.dst = DstPtr; @@ -220,16 +204,12 @@ static ur_result_t enqueueCommandBufferFillHelper( NodeParamsStepFirst.value = *(static_cast(Pattern)); NodeParamsStepFirst.width = 1; - UR_CHECK_ERROR(hipGraphAddMemsetNode( - &GraphNodeFirst, CommandBuffer->HIPGraph, DepsList.data(), - DepsList.size(), &NodeParamsStepFirst)); - - // Get sync point and register the node with it. - *SyncPoint = CommandBuffer->addSyncPoint( - std::make_shared(GraphNodeFirst)); + UR_CHECK_ERROR(hipGraphAddMemsetNode(&GraphNode, CommandBuffer->HIPGraph, + DepsList.data(), DepsList.size(), + &NodeParamsStepFirst)); DepsList.clear(); - DepsList.push_back(GraphNodeFirst); + DepsList.push_back(GraphNode); // we walk up the pattern in 1-byte steps, and add Memset node for each // 1-byte chunk of the pattern. @@ -241,8 +221,6 @@ static ur_result_t enqueueCommandBufferFillHelper( auto OffsetPtr = reinterpret_cast( reinterpret_cast(DstPtr) + (Step * sizeof(uint8_t))); - // Create a new node - hipGraphNode_t GraphNode; // Update NodeParam hipMemsetParams NodeParamsStep = {}; NodeParamsStep.dst = reinterpret_cast(OffsetPtr); @@ -256,14 +234,17 @@ static ur_result_t enqueueCommandBufferFillHelper( &GraphNode, CommandBuffer->HIPGraph, DepsList.data(), DepsList.size(), &NodeParamsStep)); - GraphNodePtr = std::make_shared(GraphNode); - // Get sync point and register the node with it. - *SyncPoint = CommandBuffer->addSyncPoint(GraphNodePtr); - DepsList.clear(); - DepsList.push_back(*GraphNodePtr.get()); + DepsList.push_back(GraphNode); } } + + // Get sync point and register the node with it. 
+ auto SyncPoint = CommandBuffer->addSyncPoint(GraphNode); + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; + } + } catch (ur_result_t Err) { return Err; } @@ -346,14 +327,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( hipGraphNode_t GraphNode; std::vector DepsList; - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); if (*pGlobalWorkSize == 0) { try { @@ -362,8 +337,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( DepsList.data(), DepsList.size())); // Get sync point and register the node with it. - *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -377,13 +354,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( uint32_t LocalSize = hKernel->getLocalSize(); hipFunction_t HIPFunc = hKernel->get(); - UR_CALL(setKernelParams(hCommandBuffer->Device, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, hKernel, HIPFunc, - ThreadsPerBlock, BlocksPerGrid), - Result); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(setKernelParams( + hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid)); try { // Set node param structure with the kernel related data @@ -409,14 +382,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( hKernel->clearLocalSize(); // Get sync point and register the node with it. 
- auto NodeSP = std::make_shared(GraphNode); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); if (pSyncPoint) { - *pSyncPoint = hCommandBuffer->addSyncPoint(NodeSP); + *pSyncPoint = SyncPoint; } auto NewCommand = new ur_exp_command_buffer_command_handle_t_{ - hCommandBuffer, hKernel, std::move(NodeSP), NodeParams, - workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize}; + hCommandBuffer, hKernel, GraphNode, NodeParams, + workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize}; NewCommand->incrementInternalReferenceCount(); hCommandBuffer->CommandHandles.push_back(NewCommand); @@ -442,25 +415,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { - UR_CHECK_ERROR(hipGraphAddMemcpyNode1D( - &GraphNode, hCommandBuffer->HIPGraph, DepsList.data(), DepsList.size(), - pDst, pSrc, size, hipMemcpyHostToHost)); + UR_CHECK_ERROR(hipGraphAddMemcpyNode1D(&GraphNode, hCommandBuffer->HIPGraph, + DepsList.data(), DepsList.size(), + pDst, pSrc, size, hipMemcpyDefault)); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -483,16 +450,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( UR_ASSERT(size + srcOffset <= std::get(hSrcMem->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Src = std::get(hSrcMem->Mem) @@ -505,8 +464,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( Dst, Src, size, hipMemcpyDeviceToDevice)); // Get sync point and register the node with it. - *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -527,16 +488,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto SrcPtr = @@ -554,8 +507,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( &NodeParams)); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -575,16 +530,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Dst = std::get(hBuffer->Mem) @@ -595,8 +542,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( Dst, pSrc, size, hipMemcpyHostToDevice)); // Get sync point and register the node with it. - *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -615,16 +564,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Src = std::get(hBuffer->Mem) @@ -635,8 +576,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( pDst, Src, size, hipMemcpyDeviceToHost)); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -658,16 +601,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto DstPtr = @@ -683,8 +618,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( &NodeParams)); // Get sync point and register the node with it. - *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -706,16 +643,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto SrcPtr = @@ -731,8 +660,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( &NodeParams)); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -753,16 +684,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { // Create an empty node if the kernel workload size is zero @@ -770,13 +693,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( DepsList.data(), DepsList.size())); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); - - setErrorMessage("Prefetch hint ignored and replaced with empty node as " - "prefetch is not supported by HIP Graph backend", - UR_RESULT_SUCCESS); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -797,16 +717,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { // Create an empty node if the kernel workload size is zero @@ -814,13 +726,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( DepsList.data(), DepsList.size())); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); - - setErrorMessage("Memory advice ignored and replaced with empty node as " - "memory advice is not supported by HIP Graph backend", - UR_RESULT_SUCCESS); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -878,8 +787,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - try { std::unique_ptr RetImplEvent{nullptr}; ScopedContext Active(hQueue->getDevice()); @@ -888,10 +795,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if ((Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, - phEventWaitList)) != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr( @@ -908,10 +813,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( *phEvent = RetImplEvent.release(); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainCommandExp( @@ -978,12 +883,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( uint32_t ArgIndex = PointerArgDesc.argIndex; const void *ArgValue = PointerArgDesc.pNewPointerArg; - ur_result_t Result = UR_RESULT_SUCCESS; try { Kernel->setKernelArg(ArgIndex, sizeof(ArgValue), ArgValue); } catch (ur_result_t Err) { - Result = Err; - return Result; + return Err; } } @@ -996,7 +899,6 @@ 
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( uint32_t ArgIndex = MemobjArgDesc.argIndex; ur_mem_handle_t ArgValue = MemobjArgDesc.hNewMemObjArg; - ur_result_t Result = UR_RESULT_SUCCESS; try { if (ArgValue == nullptr) { Kernel->setKernelArg(ArgIndex, 0, nullptr); @@ -1005,8 +907,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Kernel->setKernelArg(ArgIndex, sizeof(void *), (void *)&HIPPtr); } } catch (ur_result_t Err) { - Result = Err; - return Result; + return Err; } } @@ -1020,13 +921,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( size_t ArgSize = ValueArgDesc.argSize; const void *ArgValue = ValueArgDesc.pNewValueArg; - ur_result_t Result = UR_RESULT_SUCCESS; - try { Kernel->setKernelArg(ArgIndex, ArgSize, ArgValue); } catch (ur_result_t Err) { - Result = Err; - return Result; + return Err; } } @@ -1064,12 +962,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; hipFunction_t HIPFunc = Kernel->get(); - auto Result = setKernelParams(Device, WorkDim, GlobalWorkOffset, - GlobalWorkSize, LocalWorkSize, Kernel, HIPFunc, - ThreadsPerBlock, BlocksPerGrid); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(setKernelParams(Device, WorkDim, GlobalWorkOffset, + GlobalWorkSize, LocalWorkSize, Kernel, HIPFunc, + ThreadsPerBlock, BlocksPerGrid)); hipKernelNodeParams &Params = hCommand->Params; @@ -1083,7 +978,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.sharedMemBytes = Kernel->getLocalSize(); Params.kernelParams = const_cast(Kernel->getArgIndices().data()); - hipGraphNode_t Node = *(hCommand->Node); + hipGraphNode_t Node = hCommand->Node; hipGraphExec_t HipGraphExec = CommandBuffer->HIPGraphExec; UR_CHECK_ERROR(hipGraphExecKernelNodeSetParams(HipGraphExec, Node, &Params)); return UR_RESULT_SUCCESS; diff --git 
a/source/adapters/hip/command_buffer.hpp b/source/adapters/hip/command_buffer.hpp index 751fde3720..d744a3544d 100644 --- a/source/adapters/hip/command_buffer.hpp +++ b/source/adapters/hip/command_buffer.hpp @@ -41,9 +41,9 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_command_handle_t_( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, - std::shared_ptr &&Node, hipKernelNodeParams Params, - uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, - const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr); + hipGraphNode_t Node, hipKernelNodeParams Params, uint32_t WorkDim, + const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, + const size_t *LocalWorkSizePtr); void setGlobalOffset(const size_t *GlobalWorkOffsetPtr) { const size_t CopySize = sizeof(size_t) * WorkDim; @@ -96,7 +96,7 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_handle_t CommandBuffer; ur_kernel_handle_t Kernel; - std::shared_ptr Node; + hipGraphNode_t Node; hipKernelNodeParams Params; uint32_t WorkDim; @@ -117,7 +117,7 @@ struct ur_exp_command_buffer_handle_t_ { ~ur_exp_command_buffer_handle_t_(); void registerSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint, - std::shared_ptr &&HIPNode) { + hipGraphNode_t HIPNode) { SyncPoints[SyncPoint] = std::move(HIPNode); NextSyncPoint++; } @@ -129,8 +129,7 @@ struct ur_exp_command_buffer_handle_t_ { // Helper to register next sync point // @param HIPNode Node to register as next sync point // @return Pointer to the sync that registers the Node - ur_exp_command_buffer_sync_point_t - addSyncPoint(std::shared_ptr HIPNode) { + ur_exp_command_buffer_sync_point_t addSyncPoint(hipGraphNode_t HIPNode) { ur_exp_command_buffer_sync_point_t SyncPoint = NextSyncPoint; registerSyncPoint(SyncPoint, std::move(HIPNode)); return SyncPoint; @@ -171,8 +170,7 @@ struct ur_exp_command_buffer_handle_t_ { std::atomic_uint32_t RefCountExternal; // Map of sync_points to ur_events - 
std::unordered_map> + std::unordered_map SyncPoints; // Next sync_point value (may need to consider ways to reuse values if 32-bits // is not enough) diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp index da92fa6a87..3ae98e929d 100644 --- a/source/adapters/hip/device.cpp +++ b/source/adapters/hip/device.cpp @@ -988,7 +988,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, + ur_native_handle_t hNativeDevice, + [[maybe_unused]] ur_adapter_handle_t hAdapter, [[maybe_unused]] const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { // We can't cast between ur_native_handle_t and hipDevice_t, so memcpy the @@ -1000,16 +1001,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( return Dev->get() == HIPDevice; }; - // If a platform is provided just check if the device is in it - if (hPlatform) { - auto SearchRes = std::find_if(begin(hPlatform->Devices), - end(hPlatform->Devices), IsDevice); - if (SearchRes != end(hPlatform->Devices)) { - *phDevice = SearchRes->get(); - return UR_RESULT_SUCCESS; - } - } - // Get list of platforms uint32_t NumPlatforms = 0; ur_adapter_handle_t AdapterHandle = &adapter; diff --git a/source/adapters/hip/image.cpp b/source/adapters/hip/image.cpp index 75f93ca4f3..7449c3ba3f 100644 --- a/source/adapters/hip/image.cpp +++ b/source/adapters/hip/image.cpp @@ -132,6 +132,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + [[maybe_unused]] ur_context_handle_t hContext, + [[maybe_unused]] ur_device_handle_t hDevice, + [[maybe_unused]] uint64_t offset, [[maybe_unused]] uint64_t size, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem, + [[maybe_unused]] void 
**phRetMem) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp index 5f06567064..eafce43d1c 100644 --- a/source/adapters/hip/memory.cpp +++ b/source/adapters/hip/memory.cpp @@ -498,7 +498,7 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, UR_CHECK_ERROR(hipMalloc(&DevPtr, Buffer.Size)); } } else { - hipArray *ImageArray; + hipArray *ImageArray{}; hipSurfaceObject_t Surface; try { auto &Image = std::get(Mem->Mem); diff --git a/source/adapters/level_zero/adapter.cpp b/source/adapters/level_zero/adapter.cpp index 0605b9a40c..5ae1d52e7b 100644 --- a/source/adapters/level_zero/adapter.cpp +++ b/source/adapters/level_zero/adapter.cpp @@ -43,15 +43,31 @@ ur_result_t initPlatforms(PlatformVec &platforms) noexcept try { } std::vector ZeDrivers; + std::vector ZeDevices; ZeDrivers.resize(ZeDriverCount); ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data())); for (uint32_t I = 0; I < ZeDriverCount; ++I) { - auto platform = std::make_unique(ZeDrivers[I]); - UR_CALL(platform->initialize()); - - // Save a copy in the cache for future uses. - platforms.push_back(std::move(platform)); + ze_device_properties_t device_properties{}; + device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + uint32_t ZeDeviceCount = 0; + ZE2UR_CALL(zeDeviceGet, (ZeDrivers[I], &ZeDeviceCount, nullptr)); + ZeDevices.resize(ZeDeviceCount); + ZE2UR_CALL(zeDeviceGet, (ZeDrivers[I], &ZeDeviceCount, ZeDevices.data())); + // Check if this driver has GPU Devices + for (uint32_t D = 0; D < ZeDeviceCount; ++D) { + ZE2UR_CALL(zeDeviceGetProperties, (ZeDevices[D], &device_properties)); + + if (ZE_DEVICE_TYPE_GPU == device_properties.type) { + // If this Driver is a GPU, save it as a usable platform. 
+ auto platform = std::make_unique(ZeDrivers[I]); + UR_CALL(platform->initialize()); + + // Save a copy in the cache for future uses. + platforms.push_back(std::move(platform)); + break; + } + } } return UR_RESULT_SUCCESS; } catch (...) { @@ -105,8 +121,16 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() // We must only initialize the driver once, even if urPlatformGet() is // called multiple times. Declaring the return value as "static" ensures // it's only called once. - GlobalAdapter->ZeResult = - ZE_CALL_NOCHECK(zeInit, (ZE_INIT_FLAG_GPU_ONLY)); + + // Init with all flags set to enable for all driver types to be init in + // the application. + ze_init_flags_t L0InitFlags = ZE_INIT_FLAG_GPU_ONLY; + if (UrL0InitAllDrivers) { + L0InitFlags |= ZE_INIT_FLAG_VPU_ONLY; + } + logger::debug("\nzeInit with flags value of {}\n", + static_cast(L0InitFlags)); + GlobalAdapter->ZeResult = ZE_CALL_NOCHECK(zeInit, (L0InitFlags)); } assert(GlobalAdapter->ZeResult != std::nullopt); // verify that level-zero is initialized diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 3b4a91fc0a..ff4f0b56bc 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -610,8 +610,8 @@ ur_result_t createMainCommandList(ur_context_handle_t Context, bool canBeInOrder(ur_context_handle_t Context, const ur_exp_command_buffer_desc_t *CommandBufferDesc) { // In-order command-lists are not available in old driver version. - bool CompatibleDriver = isDriverVersionNewerOrSimilar( - Context->getPlatform()->ZeDriver, 1, 3, L0_DRIVER_INORDER_MIN_VERSION); + bool CompatibleDriver = Context->getPlatform()->isDriverVersionNewerOrSimilar( + 1, 3, L0_DRIVER_INORDER_MIN_VERSION); return CompatibleDriver ? (CommandBufferDesc ? 
CommandBufferDesc->isInOrder : false) : false; @@ -921,6 +921,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( bool PreferCopyEngine = !IsDevicePointer(CommandBuffer->Context, Src) || !IsDevicePointer(CommandBuffer->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. + if (CommandBuffer->Device->isDG2() && + (IsSharedPointer(CommandBuffer->Context, Src) || + IsSharedPointer(CommandBuffer->Context, Dst))) { + PreferCopyEngine = false; + } PreferCopyEngine |= UseCopyEngineForD2DCopy; return enqueueCommandBufferMemCopyHelper( @@ -1293,13 +1300,14 @@ ur_result_t waitForDependencies(ur_exp_command_buffer_handle_t CommandBuffer, * @param[in] CommandBuffer The command buffer. * @param[in] Queue The UR queue used to submit the command buffer. * @param[in] SignalCommandList The command-list to append the barrier to. - * @param[out] Event The host visible event which will be returned to the user. + * @param[out][optional] Event The host visible event which will be returned + * to the user, if user passed an output parameter to the UR API. 
* @return UR_RESULT_SUCCESS or an error code on failure */ ur_result_t createUserEvent(ur_exp_command_buffer_handle_t CommandBuffer, ur_queue_handle_legacy_t Queue, ur_command_list_ptr_t SignalCommandList, - ur_event_handle_t &Event) { + ur_event_handle_t *Event) { // Execution event for this enqueue of the UR command-buffer ur_event_handle_t RetEvent{}; @@ -1335,7 +1343,9 @@ ur_result_t createUserEvent(ur_exp_command_buffer_handle_t CommandBuffer, &(CommandBuffer->SignalEvent->ZeEvent))); } - Event = RetEvent; + if (Event) { + *Event = RetEvent; + } return UR_RESULT_SUCCESS; } @@ -1398,9 +1408,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ZE2UR_CALL(zeCommandListAppendEventReset, (SignalCommandList->first, CommandBuffer->AllResetEvent->ZeEvent)); - if (Event) { - UR_CALL(createUserEvent(CommandBuffer, Queue, SignalCommandList, *Event)); - } + // Appends a wait on the main command-list signal and registers output Event + // parameter with signal command-list completing. + UR_CALL(createUserEvent(CommandBuffer, Queue, SignalCommandList, Event)); UR_CALL(Queue->executeCommandList(SignalCommandList, false, false)); diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp index 9dfb5a2b19..7031bb5f03 100644 --- a/source/adapters/level_zero/common.cpp +++ b/source/adapters/level_zero/common.cpp @@ -67,28 +67,6 @@ ur_result_t ze2urResult(ze_result_t ZeResult) { } } -/// Checks the version of the level-zero driver. -/// @param ZeDriver Level Zero Driver handle -/// @param VersionMajor Major verion number to compare to. -/// @param VersionMinor Minor verion number to compare to. -/// @param VersionBuild Build verion number to compare to. 
-/// @return true is the version of the driver is higher than or equal to the -/// compared version -bool isDriverVersionNewerOrSimilar(ze_driver_handle_t ZeDriver, - uint32_t VersionMajor, uint32_t VersionMinor, - uint32_t VersionBuild) { - ZeStruct ZeDriverProperties; - ZE2UR_CALL(zeDriverGetProperties, (ZeDriver, &ZeDriverProperties)); - uint32_t DriverVersion = ZeDriverProperties.driverVersion; - auto DriverVersionMajor = (DriverVersion & 0xFF000000) >> 24; - auto DriverVersionMinor = (DriverVersion & 0x00FF0000) >> 16; - auto DriverVersionBuild = DriverVersion & 0x0000FFFF; - - return ((DriverVersionMajor >= VersionMajor) && - (DriverVersionMinor >= VersionMinor) && - (DriverVersionBuild >= VersionBuild)); -} - // This function will ensure compatibility with both Linux and Windows for // setting environment variables. bool setEnvVar(const char *name, const char *value) { diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp index a81b852727..5784d5bf78 100644 --- a/source/adapters/level_zero/common.hpp +++ b/source/adapters/level_zero/common.hpp @@ -207,6 +207,15 @@ const int UrL0LeaksDebug = [] { return std::atoi(UrRet); }(); +// Enable for UR L0 Adapter to Init all L0 Drivers on the system with filtering +// in place for only currently used Drivers. +const int UrL0InitAllDrivers = [] { + const char *UrRet = std::getenv("UR_L0_INIT_ALL_DRIVERS"); + if (!UrRet) + return 0; + return std::atoi(UrRet); +}(); + // Controls Level Zero calls serialization to w/a Level Zero driver being not MT // ready. Recognized values (can be used as a bit mask): enum { @@ -317,11 +326,6 @@ bool setEnvVar(const char *name, const char *value); // Map Level Zero runtime error code to UR error code. ur_result_t ze2urResult(ze_result_t ZeResult); -/// Checks the version of the level-zero driver. 
-bool isDriverVersionNewerOrSimilar(ze_driver_handle_t ZeDriver, - uint32_t VersionMajor, uint32_t VersionMinor, - uint32_t VersionBuild); - // Trace a call to Level-Zero RT #define ZE2UR_CALL(ZeName, ZeArgs) \ { \ diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index dc70a2470c..fab54f3783 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -43,6 +43,8 @@ struct ur_context_handle_t_ : _ur_object { ur_context_handle_t_(ze_context_handle_t ZeContext) : ZeContext{ZeContext} {} + virtual ~ur_context_handle_t_() {} + // A L0 context handle is primarily used during creation and management of // resources that may be used by multiple devices. // This field is only set at ur_context_handle_t creation time, and cannot diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 9e832bbb9a..08f13268eb 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -1173,12 +1173,10 @@ bool ur_device_handle_t_::useDriverInOrderLists() { // Use in-order lists implementation from L0 driver instead // of adapter's implementation. - ze_driver_handle_t ZeDriver = this->Platform->ZeDriver; - static const bool UseDriverInOrderLists = [&] { const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS"); - bool CompatibleDriver = isDriverVersionNewerOrSimilar( - ZeDriver, 1, 3, L0_DRIVER_INORDER_MIN_VERSION); + bool CompatibleDriver = this->Platform->isDriverVersionNewerOrSimilar( + 1, 3, L0_DRIVER_INORDER_MIN_VERSION); if (!UrRet) return CompatibleDriver; return std::atoi(UrRet) != 0; @@ -1602,14 +1600,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t NativeDevice, ///< [in] the native handle of the device. 
- ur_platform_handle_t Platform, ///< [in] handle of the platform instance - const ur_device_native_properties_t + [[maybe_unused]] ur_adapter_handle_t + Adapter, ///< [in] handle of the platform instance + [[maybe_unused]] const ur_device_native_properties_t *Properties, ///< [in][optional] pointer to native device properties ///< struct. ur_device_handle_t *Device ///< [out] pointer to the handle of the device object created. ) { - std::ignore = Properties; auto ZeDevice = ur_cast(NativeDevice); // The SYCL spec requires that the set of devices must remain fixed for the @@ -1622,12 +1620,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( if (const auto *platforms = GlobalAdapter->PlatformCache->get_value()) { for (const auto &p : *platforms) { Dev = p->getDeviceFromNativeHandle(ZeDevice); - if (Dev) { - // Check that the input Platform, if was given, matches the found one. - UR_ASSERT(!Platform || Platform == p.get(), - UR_RESULT_ERROR_INVALID_PLATFORM); - break; - } } } else { return GlobalAdapter->PlatformCache->get_error(); diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp index 2d0443512d..898edff779 100644 --- a/source/adapters/level_zero/device.hpp +++ b/source/adapters/level_zero/device.hpp @@ -61,7 +61,7 @@ struct ur_device_handle_t_ : _ur_object { ur_device_handle_t_(ze_device_handle_t Device, ur_platform_handle_t Plt, ur_device_handle_t ParentDevice = nullptr) : ZeDevice{Device}, Platform{Plt}, RootDevice{ParentDevice}, - ZeDeviceProperties{}, ZeDeviceComputeProperties{} { + ZeDeviceProperties{}, ZeDeviceComputeProperties{}, Id(std::nullopt) { // NOTE: one must additionally call initialize() to complete // UR device creation. 
} @@ -189,6 +189,9 @@ struct ur_device_handle_t_ : _ur_object { (ZeDeviceProperties->deviceId & 0xff0) == 0xb60; } + // Checks if this GPU is an Intel Flex GPU or Intel Arc Alchemist + bool isDG2() { return (ZeDeviceProperties->deviceId & 0xff00) == 0x5600; } + bool isIntegrated() { return (ZeDeviceProperties->flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED); } @@ -226,5 +229,5 @@ struct ur_device_handle_t_ : _ur_object { ZeOffsetToImageHandleMap; // unique ephemeral identifer of the device in the adapter - DeviceId Id; + std::optional Id; }; diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 33495f52b8..f4dee0d661 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -228,7 +228,9 @@ ur_queue_handle_legacy_t_::enqueueEventsWaitWithBarrier( ///< [in] handle of the // For in-order queue and wait-list which is empty or has events from // the same queue just use the last command event as the barrier event. - if (Queue->isInOrderQueue() && + // This optimization is disabled when profiling is enabled to ensure + // accurate profiling values & the overhead that profiling incurs. + if (Queue->isInOrderQueue() && !Queue->isProfilingEnabled() && WaitListEmptyOrAllEventsFromSameQueue(Queue, NumEventsInWaitList, EventWaitList) && Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) { @@ -1034,7 +1036,7 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { EndTimeRecording.EventHasDied = true; } else { // Otherwise we evict the entry. 
- Legacy(Event->UrQueue)->EndTimeRecordings.erase(Entry); + Queue->EndTimeRecordings.erase(Entry); } } } diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp index 1537a1d201..f68b2d93be 100644 --- a/source/adapters/level_zero/image.cpp +++ b/source/adapters/level_zero/image.cpp @@ -1033,6 +1033,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( break; case UR_EXP_EXTERNAL_MEM_TYPE_OPAQUE_FD: default: + delete importWin32; + delete externalMemoryData; return UR_RESULT_ERROR_INVALID_VALUE; } importWin32->handle = Win32Handle->handle; @@ -1083,6 +1085,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint64_t offset, + uint64_t size, ur_exp_external_mem_handle_t hExternalMem, void **phRetMem) { + std::ignore = hContext; + std::ignore = hDevice; + std::ignore = size; + std::ignore = offset; + std::ignore = hExternalMem; + std::ignore = phRetMem; + logger::error("[UR][L0] {} function not implemented!", + "{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_external_mem_handle_t hExternalMem) { diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index cb020395ed..9e5670ae5d 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -613,6 +613,11 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueDeviceGlobalVariableWrite( // Copy engine is preferred only for host to device transfer. // Device to device transfers run faster on compute engines. 
bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. + if (Queue->Device->isDG2() && IsSharedPointer(Queue->Context, Src)) { + PreferCopyEngine = false; + } // Temporary option added to use copy engine for D2D copy PreferCopyEngine |= UseCopyEngineForD2DCopy; @@ -663,6 +668,11 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueDeviceGlobalVariableRead( // Copy engine is preferred only for host to device transfer. // Device to device transfers run faster on compute engines. bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. + if (Queue->Device->isDG2() && IsSharedPointer(Queue->Context, Dst)) { + PreferCopyEngine = false; + } // Temporary option added to use copy engine for D2D copy PreferCopyEngine |= UseCopyEngineForD2DCopy; @@ -700,7 +710,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate( ZeKernelDesc.pKernelName = KernelName; ze_kernel_handle_t ZeKernel; - ZE2UR_CALL(zeKernelCreate, (ZeModule, &ZeKernelDesc, &ZeKernel)); + auto ZeResult = + ZE_CALL_NOCHECK(zeKernelCreate, (ZeModule, &ZeKernelDesc, &ZeKernel)); + // Gracefully handle the case that kernel create fails. 
+ if (ZeResult != ZE_RESULT_SUCCESS) { + delete *RetKernel; + *RetKernel = nullptr; + return ze2urResult(ZeResult); + } auto ZeDevice = It.first; @@ -754,20 +771,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( PArgValue = nullptr; } + if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; + } + std::scoped_lock Guard(Kernel->Mutex); + ze_result_t ZeResult = ZE_RESULT_SUCCESS; if (Kernel->ZeKernelMap.empty()) { auto ZeKernel = Kernel->ZeKernel; - ZE2UR_CALL(zeKernelSetArgumentValue, - (ZeKernel, ArgIndex, ArgSize, PArgValue)); + ZeResult = ZE_CALL_NOCHECK(zeKernelSetArgumentValue, + (ZeKernel, ArgIndex, ArgSize, PArgValue)); } else { for (auto It : Kernel->ZeKernelMap) { auto ZeKernel = It.second; - ZE2UR_CALL(zeKernelSetArgumentValue, - (ZeKernel, ArgIndex, ArgSize, PArgValue)); + ZeResult = ZE_CALL_NOCHECK(zeKernelSetArgumentValue, + (ZeKernel, ArgIndex, ArgSize, PArgValue)); } } - return UR_RESULT_SUCCESS; + if (ZeResult == ZE_RESULT_ERROR_INVALID_ARGUMENT) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE; + } + + return ze2urResult(ZeResult); } UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgLocal( @@ -816,6 +842,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo( } catch (...) 
{ return UR_RESULT_ERROR_UNKNOWN; } + case UR_KERNEL_INFO_NUM_REGS: case UR_KERNEL_INFO_NUM_ARGS: return ReturnValue(uint32_t{Kernel->ZeKernelProperties->numKernelArgs}); case UR_KERNEL_INFO_REFERENCE_COUNT: @@ -1066,6 +1093,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler( ) { std::ignore = Properties; std::scoped_lock Guard(Kernel->Mutex); + if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; + } ZE2UR_CALL(zeKernelSetArgumentValue, (Kernel->ZeKernel, ArgIndex, sizeof(void *), &ArgValue->ZeSampler)); @@ -1085,6 +1115,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( // The ArgValue may be a NULL pointer in which case a NULL value is used for // the kernel argument declared as a pointer to global or constant memory. + if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; + } + ur_mem_handle_t_ *UrMem = ur_cast(ArgValue); ur_mem_handle_t_::access_mode_t UrAccessMode = ur_mem_handle_t_::read_write; diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp index 95650a7b94..585a10ef4f 100644 --- a/source/adapters/level_zero/memory.cpp +++ b/source/adapters/level_zero/memory.cpp @@ -42,6 +42,19 @@ bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr) { return (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_DEVICE); } +// Helper function to check if a pointer is a shared pointer. +bool IsSharedPointer(ur_context_handle_t Context, const void *Ptr) { + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + + // Query memory type of the pointer + ZE2UR_CALL(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + return (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_SHARED); +} + // Shared by all memory read/write/copy PI interfaces. 
// PI interfaces must have queue's and destination buffer's mutexes locked for // exclusive use and source buffer's mutex locked for shared use on entry. @@ -1191,6 +1204,12 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueUSMMemcpy( // (versus compute engine). bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) || !IsDevicePointer(Queue->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. + if (Queue->Device->isDG2() && (IsSharedPointer(Queue->Context, Src) || + IsSharedPointer(Queue->Context, Dst))) { + PreferCopyEngine = false; + } // Temporary option added to use copy engine for D2D copy PreferCopyEngine |= UseCopyEngineForD2DCopy; @@ -1390,6 +1409,12 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueUSMMemcpy2D( // (versus compute engine). bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) || !IsDevicePointer(Queue->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. 
+ if (Queue->Device->isDG2() && (IsSharedPointer(Queue->Context, Src) || + IsSharedPointer(Queue->Context, Dst))) { + PreferCopyEngine = false; + } // Temporary option added to use copy engine for D2D copy PreferCopyEngine |= UseCopyEngineForD2DCopy; diff --git a/source/adapters/level_zero/memory.hpp b/source/adapters/level_zero/memory.hpp index b590165947..43d548f16b 100644 --- a/source/adapters/level_zero/memory.hpp +++ b/source/adapters/level_zero/memory.hpp @@ -32,6 +32,7 @@ using ur_queue_handle_legacy_t = ur_queue_handle_legacy_t_ *; struct ur_device_handle_t_; bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr); +bool IsSharedPointer(ur_context_handle_t Context, const void *Ptr); // This is an experimental option to test performance of device to device copy // operations on copy engines (versus compute engine) diff --git a/source/adapters/level_zero/platform.cpp b/source/adapters/level_zero/platform.cpp index 02b3663710..f51a8f1aa7 100644 --- a/source/adapters/level_zero/platform.cpp +++ b/source/adapters/level_zero/platform.cpp @@ -266,6 +266,67 @@ ur_result_t ur_platform_handle_t_::initialize() { return UR_RESULT_SUCCESS; } +/// Checks the version of the level-zero driver. +/// @param VersionMajor Major verion number to compare to. +/// @param VersionMinor Minor verion number to compare to. +/// @param VersionBuild Build verion number to compare to. 
+/// @return true is the version of the driver is higher than or equal to the +/// compared version +bool ur_platform_handle_t_::isDriverVersionNewerOrSimilar( + uint32_t VersionMajor, uint32_t VersionMinor, uint32_t VersionBuild) { + uint32_t DriverVersionMajor = 0; + uint32_t DriverVersionMinor = 0; + uint32_t DriverVersionBuild = 0; + if (!ZeDriverVersionString.Supported) { + ZeStruct ZeDriverProperties; + ZE2UR_CALL(zeDriverGetProperties, (ZeDriver, &ZeDriverProperties)); + uint32_t DriverVersion = ZeDriverProperties.driverVersion; + DriverVersionMajor = (DriverVersion & 0xFF000000) >> 24; + DriverVersionMinor = (DriverVersion & 0x00FF0000) >> 16; + DriverVersionBuild = DriverVersion & 0x0000FFFF; + } else { + std::string ZeDriverVersion; + size_t sizeOfDriverString = 0; + ZeDriverVersionString.getDriverVersionString(ZeDriverHandleExpTranslated, + nullptr, &sizeOfDriverString); + ZeDriverVersion.resize(sizeOfDriverString); + ZeDriverVersionString.getDriverVersionString(ZeDriverHandleExpTranslated, + ZeDriverVersion.data(), + &sizeOfDriverString); + + // Intel driver version string is in the format: + // Major.Minor.Build+Hotfix where hotfix is optional. + std::stringstream VersionString(ZeDriverVersion); + + std::string VersionValue; + std::vector VersionValues; + char VersionDelim = '.'; + char HotfixDelim = '+'; + + while (getline(VersionString, VersionValue, VersionDelim)) { + VersionValues.push_back(VersionValue); + } + // If the extension exists, but the string value comes by empty or + // malformed, assume this is a developer driver. + if (VersionValues.size() >= 3) { + DriverVersionMajor = atoi(VersionValues[0].c_str()); + DriverVersionMinor = atoi(VersionValues[1].c_str()); + std::stringstream HotfixString(VersionValues[2]); + std::vector BuildHotfixVersionValues; + // Check to see if there is a hotfix value and strip it off. 
+ while (getline(HotfixString, VersionValue, HotfixDelim)) { + BuildHotfixVersionValues.push_back(VersionValue); + } + DriverVersionBuild = atoi(BuildHotfixVersionValues[0].c_str()); + } else { + return true; + } + } + return std::make_tuple(DriverVersionMajor, DriverVersionMinor, + DriverVersionBuild) >= + std::make_tuple(VersionMajor, VersionMinor, VersionBuild); +} + // Get the cached PI device created for the L0 device handle. // Return NULL if no such PI device found. ur_device_handle_t diff --git a/source/adapters/level_zero/platform.hpp b/source/adapters/level_zero/platform.hpp index f9fdcb117e..fa15c88bdf 100644 --- a/source/adapters/level_zero/platform.hpp +++ b/source/adapters/level_zero/platform.hpp @@ -62,6 +62,11 @@ struct ur_platform_handle_t_ : public _ur_platform { // If not found, then nullptr is returned. ur_device_handle_t getDeviceFromNativeHandle(ze_device_handle_t); + /// Checks the version of the level-zero driver. + bool isDriverVersionNewerOrSimilar(uint32_t VersionMajor, + uint32_t VersionMinor, + uint32_t VersionBuild); + // Keep track of all contexts in the platform. This is needed to manage // a lifetime of memory allocations in each context when there are kernels // with indirect access. diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp index 26c75aef31..2b40d736c4 100644 --- a/source/adapters/level_zero/program.cpp +++ b/source/adapters/level_zero/program.cpp @@ -58,6 +58,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL( *Program ///< [out] pointer to handle of program object created. 
) { std::ignore = Properties; + UR_ASSERT(Context, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(IL && Program, UR_RESULT_ERROR_INVALID_NULL_POINTER); try { ur_program_handle_t_ *UrProgram = new ur_program_handle_t_(ur_program_handle_t_::IL, Context, IL, Length); @@ -82,8 +84,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( ur_program_handle_t *Program ///< [out] pointer to handle of Program object created. ) { - std::ignore = Device; - std::ignore = Properties; // In OpenCL, clCreateProgramWithBinary() can be used to load any of the // following: "program executable", "compiled program", or "library of // compiled programs". In addition, the loaded program can be either @@ -96,8 +96,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( // information to distinguish the cases. try { - ur_program_handle_t_ *UrProgram = new ur_program_handle_t_( - ur_program_handle_t_::Native, Context, Binary, Size); + ur_program_handle_t_ *UrProgram = + new ur_program_handle_t_(ur_program_handle_t_::Native, Context, Device, + Properties, Binary, Size); *Program = reinterpret_cast(UrProgram); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; @@ -208,8 +209,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp( } } hProgram->ZeModuleMap.insert(std::make_pair(ZeDevice, ZeModuleHandle)); - hProgram->ZeBuildLogMap.insert(std::make_pair(ZeDevice, ZeBuildLog)); } + hProgram->ZeBuildLogMap.insert(std::make_pair(ZeDevice, ZeBuildLog)); } // We no longer need the IL / native code. @@ -597,11 +598,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetGlobalVariablePointer( void **GlobalVariablePointerRet ///< [out] Returns the pointer to the global ///< variable if it is found in the program. 
) { - std::ignore = Device; std::scoped_lock lock(Program->Mutex); + ze_module_handle_t ZeModuleEntry{}; + ZeModuleEntry = Program->ZeModule; + if (!Program->ZeModuleMap.empty()) { + auto It = Program->ZeModuleMap.find(Device->ZeDevice); + if (It != Program->ZeModuleMap.end()) { + ZeModuleEntry = It->second; + } + } + ze_result_t ZeResult = - zeModuleGetGlobalPointer(Program->ZeModule, GlobalVariableName, + zeModuleGetGlobalPointer(ZeModuleEntry, GlobalVariableName, GlobalVariableSizeRet, GlobalVariablePointerRet); if (ZeResult == ZE_RESULT_ERROR_UNSUPPORTED_FEATURE) { @@ -632,11 +641,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( case UR_PROGRAM_INFO_CONTEXT: return ReturnValue(Program->Context); case UR_PROGRAM_INFO_NUM_DEVICES: - // TODO: return true number of devices this program exists for. - return ReturnValue(uint32_t{1}); + if (!Program->ZeModuleMap.empty()) + return ReturnValue( + uint32_t{ur_cast(Program->ZeModuleMap.size())}); + else + return ReturnValue(uint32_t{1}); case UR_PROGRAM_INFO_DEVICES: - // TODO: return all devices this program exists for. 
- return ReturnValue(Program->Context->Devices[0]); + if (!Program->ZeModuleMap.empty()) { + std::vector devices; + for (auto &ZeModulePair : Program->ZeModuleMap) { + auto It = Program->ZeModuleMap.find(ZeModulePair.first); + if (It != Program->ZeModuleMap.end()) { + for (auto &Device : Program->Context->Devices) { + if (Device->ZeDevice == ZeModulePair.first) { + devices.push_back(Device); + } + } + } + } + return ReturnValue(devices.data(), devices.size()); + } else { + return ReturnValue(Program->Context->Devices[0]); + } case UR_PROGRAM_INFO_BINARY_SIZES: { std::shared_lock Guard(Program->Mutex); size_t SzBinary; @@ -645,8 +671,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( Program->State == ur_program_handle_t_::Object) { SzBinary = Program->CodeLength; } else if (Program->State == ur_program_handle_t_::Exe) { - ZE2UR_CALL(zeModuleGetNativeBinary, - (Program->ZeModule, &SzBinary, nullptr)); + if (!Program->ZeModuleMap.empty()) { + std::vector binarySizes; + for (auto &ZeModulePair : Program->ZeModuleMap) { + size_t binarySize = 0; + ZE2UR_CALL(zeModuleGetNativeBinary, + (ZeModulePair.second, &binarySize, nullptr)); + binarySizes.push_back(binarySize); + } + return ReturnValue(binarySizes.data(), binarySizes.size()); + } else { + ZE2UR_CALL(zeModuleGetNativeBinary, + (Program->ZeModule, &SzBinary, nullptr)); + return ReturnValue(SzBinary); + } } else { return UR_RESULT_ERROR_INVALID_PROGRAM; } @@ -655,22 +693,52 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( } case UR_PROGRAM_INFO_BINARIES: { // The caller sets "ParamValue" to an array of pointers, one for each - // device. Since Level Zero supports only one device, there is only one - // pointer. If the pointer is NULL, we don't do anything. Otherwise, we - // copy the program's binary image to the buffer at that pointer. - uint8_t **PBinary = ur_cast(ProgramInfo); - if (!PBinary[0]) - break; - + // device. 
+ uint8_t **PBinary = nullptr; + if (ProgramInfo) { + PBinary = ur_cast(ProgramInfo); + if (!PBinary[0]) { + break; + } + } std::shared_lock Guard(Program->Mutex); + // If the caller is using a Program which is IL, Native or an object, then + // the program has not been built for multiple devices so a single IL is + // returned. if (Program->State == ur_program_handle_t_::IL || Program->State == ur_program_handle_t_::Native || Program->State == ur_program_handle_t_::Object) { - std::memcpy(PBinary[0], Program->Code.get(), Program->CodeLength); + if (PropSizeRet) + *PropSizeRet = Program->CodeLength; + if (PBinary) { + std::memcpy(PBinary[0], Program->Code.get(), Program->CodeLength); + } } else if (Program->State == ur_program_handle_t_::Exe) { + // If the caller is using a Program which is a built binary, then + // the program returned will either be a single module if this is a native + // binary or the native binary for each device will be returned. size_t SzBinary = 0; - ZE2UR_CALL(zeModuleGetNativeBinary, - (Program->ZeModule, &SzBinary, PBinary[0])); + uint8_t *NativeBinaryPtr = nullptr; + if (PBinary) { + NativeBinaryPtr = PBinary[0]; + } + if (!Program->ZeModuleMap.empty()) { + uint32_t deviceIndex = 0; + for (auto &ZeDeviceModule : Program->ZeModuleMap) { + size_t binarySize = 0; + if (PBinary) { + NativeBinaryPtr = PBinary[deviceIndex++]; + } + ZE2UR_CALL(zeModuleGetNativeBinary, + (ZeDeviceModule.second, &binarySize, NativeBinaryPtr)); + SzBinary += binarySize; + } + } else { + ZE2UR_CALL(zeModuleGetNativeBinary, + (Program->ZeModule, &SzBinary, NativeBinaryPtr)); + } + if (PropSizeRet) + *PropSizeRet = SzBinary; } else { return UR_RESULT_ERROR_INVALID_PROGRAM; } @@ -678,15 +746,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( } case UR_PROGRAM_INFO_NUM_KERNELS: { std::shared_lock Guard(Program->Mutex); - uint32_t NumKernels; + uint32_t NumKernels = 0; if (Program->State == ur_program_handle_t_::IL || Program->State == 
ur_program_handle_t_::Native || Program->State == ur_program_handle_t_::Object) { return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; } else if (Program->State == ur_program_handle_t_::Exe) { - NumKernels = 0; - ZE2UR_CALL(zeModuleGetKernelNames, - (Program->ZeModule, &NumKernels, nullptr)); + if (!Program->ZeModuleMap.empty()) { + ZE2UR_CALL( + zeModuleGetKernelNames, + (Program->ZeModuleMap.begin()->second, &NumKernels, nullptr)); + } else { + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &NumKernels, nullptr)); + } } else { return UR_RESULT_ERROR_INVALID_PROGRAM; } @@ -702,11 +775,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; } else if (Program->State == ur_program_handle_t_::Exe) { uint32_t Count = 0; - ZE2UR_CALL(zeModuleGetKernelNames, - (Program->ZeModule, &Count, nullptr)); - std::unique_ptr PNames(new const char *[Count]); - ZE2UR_CALL(zeModuleGetKernelNames, - (Program->ZeModule, &Count, PNames.get())); + std::unique_ptr PNames; + if (!Program->ZeModuleMap.empty()) { + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModuleMap.begin()->second, &Count, nullptr)); + PNames = std::make_unique(Count); + ZE2UR_CALL( + zeModuleGetKernelNames, + (Program->ZeModuleMap.begin()->second, &Count, PNames.get())); + } else { + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &Count, nullptr)); + PNames = std::make_unique(Count); + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &Count, PNames.get())); + } for (uint32_t I = 0; I < Count; ++I) { PINames += (I > 0 ? ";" : ""); PINames += PNames[I]; @@ -720,8 +803,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( } catch (...) 
{ return UR_RESULT_ERROR_UNKNOWN; } + case UR_PROGRAM_INFO_SOURCE: + return ReturnValue(Program->Code.get()); default: - die("urProgramGetInfo: not implemented"); + return UR_RESULT_ERROR_INVALID_ENUMERATION; } return UR_RESULT_SUCCESS; @@ -761,6 +846,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetBuildInfo( // return for programs that were built outside and registered // with urProgramRegister? return ReturnValue(""); + } else if (PropName == UR_PROGRAM_BUILD_INFO_STATUS) { + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } else if (PropName == UR_PROGRAM_BUILD_INFO_LOG) { // Check first to see if the plugin code recorded an error message. if (!Program->ErrorMessage.empty()) { @@ -852,6 +939,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( ///< program object created. ) { std::ignore = Properties; + UR_ASSERT(Context && NativeProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(Program, UR_RESULT_ERROR_INVALID_NULL_POINTER); auto ZeModule = ur_cast(NativeProgram); // We assume here that programs created from a native handle always diff --git a/source/adapters/level_zero/program.hpp b/source/adapters/level_zero/program.hpp index 8d148c8fa2..42330adcbf 100644 --- a/source/adapters/level_zero/program.hpp +++ b/source/adapters/level_zero/program.hpp @@ -65,10 +65,21 @@ struct ur_program_handle_t_ : _ur_object { ze_module_constants_t ZeSpecConstants; }; - // Construct a program in IL or Native state. + // Construct a program in IL. ur_program_handle_t_(state St, ur_context_handle_t Context, const void *Input, size_t Length) - : Context{Context}, + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{true}, State{St}, Code{new uint8_t[Length]}, + CodeLength{Length}, ZeModule{nullptr}, ZeBuildLog{nullptr} { + std::memcpy(Code.get(), Input, Length); + } + + // Construct a program in NATIVE. 
+ ur_program_handle_t_(state St, ur_context_handle_t Context, + ur_device_handle_t Device, + const ur_program_properties_t *Properties, + const void *Input, size_t Length) + : Context{Context}, NativeDevice(Device), NativeProperties(Properties), OwnZeModule{true}, State{St}, Code{new uint8_t[Length]}, CodeLength{Length}, ZeModule{nullptr}, ZeBuildLog{nullptr} { std::memcpy(Code.get(), Input, Length); @@ -78,26 +89,29 @@ struct ur_program_handle_t_ : _ur_object { ur_program_handle_t_(state St, ur_context_handle_t Context, ze_module_handle_t ZeModule, ze_module_build_log_handle_t ZeBuildLog) - : Context{Context}, OwnZeModule{true}, State{St}, ZeModule{ZeModule}, - ZeBuildLog{ZeBuildLog} {} + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{true}, State{St}, ZeModule{ZeModule}, ZeBuildLog{ + ZeBuildLog} {} // Construct a program in Exe state (interop). ur_program_handle_t_(state St, ur_context_handle_t Context, ze_module_handle_t ZeModule, bool OwnZeModule) - : Context{Context}, OwnZeModule{OwnZeModule}, State{St}, - ZeModule{ZeModule}, ZeBuildLog{nullptr} {} + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{OwnZeModule}, State{St}, ZeModule{ZeModule}, ZeBuildLog{ + nullptr} {} // Construct a program from native handle ur_program_handle_t_(state St, ur_context_handle_t Context, ze_module_handle_t ZeModule) - : Context{Context}, OwnZeModule{true}, State{St}, ZeModule{ZeModule}, - ZeBuildLog{nullptr} {} + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{true}, State{St}, ZeModule{ZeModule}, ZeBuildLog{nullptr} {} // Construct a program in Invalid state with a custom error message. 
ur_program_handle_t_(state St, ur_context_handle_t Context, const std::string &ErrorMessage) - : Context{Context}, OwnZeModule{true}, ErrorMessage{ErrorMessage}, - State{St}, ZeModule{nullptr}, ZeBuildLog{nullptr} {} + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{true}, ErrorMessage{ErrorMessage}, State{St}, + ZeModule{nullptr}, ZeBuildLog{nullptr} {} ~ur_program_handle_t_(); void ur_release_program_resources(bool deletion); @@ -108,6 +122,12 @@ struct ur_program_handle_t_ : _ur_object { const ur_context_handle_t Context; // Context of the program. + // Device Handle used for the Native Build + ur_device_handle_t NativeDevice; + + // Properties used for the Native Build + const ur_program_properties_t *NativeProperties; + // Indicates if we own the ZeModule or it came from interop that // asked to not transfer the ownership to SYCL RT. const bool OwnZeModule; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index f467447753..34da252c74 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -607,7 +607,7 @@ ur_result_t ur_queue_handle_legacy_t_::queueRelease() { // internal reference count. When the External Reference count == 0, then // cleanup of the queue begins and the final decrement of the internal // reference count is completed. 
- Queue->RefCount.decrementAndTest(); + static_cast(Queue->RefCount.decrementAndTest()); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index 939a625122..8941f756ea 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -387,6 +387,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnImportExternalMemoryExp = urBindlessImagesImportExternalMemoryExp; pDdiTable->pfnMapExternalArrayExp = urBindlessImagesMapExternalArrayExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + urBindlessImagesMapExternalLinearMemoryExp; pDdiTable->pfnReleaseExternalMemoryExp = urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index a25c57e21b..1069ec78da 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -311,8 +311,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. // L0 spec says that alignment values that are not powers of 2 are invalid. - if (Align > 65536 || (Align & (Align - 1)) != 0) - return UR_RESULT_ERROR_INVALID_VALUE; + // If alignment == 0, then we are allowing the L0 driver to choose the + // alignment so no need to check. + if (Align > 0) { + if (Align > 65536 || (Align & (Align - 1)) != 0) + return UR_RESULT_ERROR_INVALID_VALUE; + } ur_platform_handle_t Plt = Context->getPlatform(); // If indirect access tracking is enabled then lock the mutex which is @@ -381,8 +385,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. 
// L0 spec says that alignment values that are not powers of 2 are invalid. - if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0) - return UR_RESULT_ERROR_INVALID_VALUE; + // If alignment == 0, then we are allowing the L0 driver to choose the + // alignment so no need to check. + if (Alignment > 0) { + if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0) + return UR_RESULT_ERROR_INVALID_VALUE; + } ur_platform_handle_t Plt = Device->Platform; @@ -482,8 +490,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. // L0 spec says that alignment values that are not powers of 2 are invalid. - if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0) - return UR_RESULT_ERROR_INVALID_VALUE; + // If alignment == 0, then we are allowing the L0 driver to choose the + // alignment so no need to check. + if (Alignment > 0) { + if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0) + return UR_RESULT_ERROR_INVALID_VALUE; + } ur_platform_handle_t Plt = Device->Platform; diff --git a/source/adapters/level_zero/v2/event_pool.hpp b/source/adapters/level_zero/v2/event_pool.hpp index 8976daa939..8dd66654ad 100644 --- a/source/adapters/level_zero/v2/event_pool.hpp +++ b/source/adapters/level_zero/v2/event_pool.hpp @@ -37,7 +37,7 @@ class event_pool { event_pool(const event_pool &) = delete; event_pool &operator=(const event_pool &) = delete; - DeviceId Id() { return provider->device()->Id; }; + DeviceId Id() { return provider->device()->Id.value(); }; ur_event *allocate(); void free(ur_event *event); diff --git a/source/adapters/mock/ur_mock.cpp b/source/adapters/mock/ur_mock.cpp index b1fc9c8c29..c72c1e30ed 100644 --- a/source/adapters/mock/ur_mock.cpp +++ b/source/adapters/mock/ur_mock.cpp @@ -17,13 +17,14 @@ namespace driver { context_t d_context; ur_result_t mock_urPlatformGetApiVersion(void *pParams) { - auto params = 
*static_cast(pParams); + const auto ¶ms = + *static_cast(pParams); **params.ppVersion = d_context.version; return UR_RESULT_SUCCESS; } ur_result_t mock_urPlatformGetInfo(void *pParams) { - auto params = *static_cast(pParams); + const auto ¶ms = *static_cast(pParams); if (!*params.phPlatform) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } @@ -49,7 +50,7 @@ ur_result_t mock_urPlatformGetInfo(void *pParams) { ////////////////////////////////////////////////////////////////////////// ur_result_t mock_urDeviceGetInfo(void *pParams) { - auto params = *static_cast(pParams); + const auto ¶ms = *static_cast(pParams); switch (*params.ppropName) { case UR_DEVICE_INFO_TYPE: if (*params.ppPropValue != nullptr) { diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index 1f42e5dbbe..d792c3bd2c 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -921,7 +921,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. 
ur_device_handle_t @@ -930,7 +931,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_result_t result = UR_RESULT_SUCCESS; ur_device_create_with_native_handle_params_t params = { - &hNativeDevice, &hPlatform, &pProperties, &phDevice}; + &hNativeDevice, &hAdapter, &pProperties, &phDevice}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -7805,6 +7806,58 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urBindlessImagesMapExternalLinearMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory + ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_bindless_images_map_external_linear_memory_exp_params_t params = { + &hContext, &hDevice, &offset, &size, &hExternalMem, &ppRetMem}; + + auto beforeCallback = reinterpret_cast( + mock::getCallbacks().get_before_callback( + "urBindlessImagesMapExternalLinearMemoryExp")); + if (beforeCallback) { + result = beforeCallback(¶ms); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast( + mock::getCallbacks().get_replace_callback( + "urBindlessImagesMapExternalLinearMemoryExp")); + if (replaceCallback) { + result = replaceCallback(¶ms); + } else { + + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = 
reinterpret_cast( + mock::getCallbacks().get_after_callback( + "urBindlessImagesMapExternalLinearMemoryExp")); + if (afterCallback) { + return afterCallback(¶ms); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urBindlessImagesReleaseExternalMemoryExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( @@ -10272,6 +10325,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnMapExternalArrayExp = driver::urBindlessImagesMapExternalArrayExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + driver::urBindlessImagesMapExternalLinearMemoryExp; + pDdiTable->pfnReleaseExternalMemoryExp = driver::urBindlessImagesReleaseExternalMemoryExp; diff --git a/source/adapters/native_cpu/context.hpp b/source/adapters/native_cpu/context.hpp index 30bfb31d71..0595f211d9 100644 --- a/source/adapters/native_cpu/context.hpp +++ b/source/adapters/native_cpu/context.hpp @@ -10,13 +10,141 @@ #pragma once +#include +#include #include #include "common.hpp" #include "device.hpp" +#include "ur/ur.hpp" + +namespace native_cpu { +struct usm_alloc_info { + ur_usm_type_t type; + const void *base_ptr; + size_t size; + ur_device_handle_t device; + ur_usm_pool_handle_t pool; + + // We store a pointer to the actual allocation because it is needed when + // freeing memory. 
+ void *base_alloc_ptr; + constexpr usm_alloc_info(ur_usm_type_t type, const void *base_ptr, + size_t size, ur_device_handle_t device, + ur_usm_pool_handle_t pool, void *base_alloc_ptr) + : type(type), base_ptr(base_ptr), size(size), device(device), pool(pool), + base_alloc_ptr(base_alloc_ptr) {} +}; + +constexpr usm_alloc_info usm_alloc_info_null_entry(UR_USM_TYPE_UNKNOWN, nullptr, + 0, nullptr, nullptr, + nullptr); + +constexpr size_t alloc_header_size = sizeof(usm_alloc_info); + +// Computes the padding that we need to add to ensure the +// pointer returned by UR is aligned as the user requested. +static size_t get_padding(uint32_t alignment) { + assert(alignment >= alignof(usm_alloc_info) && + "memory not aligned to usm_alloc_info"); + if (!alignment || alloc_header_size % alignment == 0) + return 0; + size_t padd = 0; + if (alignment <= alloc_header_size) { + padd = alignment - (alloc_header_size % alignment); + } else { + padd = alignment - alloc_header_size; + } + return padd; +} + +// In order to satisfy the MemAllocInfo queries we allocate extra memory +// for the native_cpu::usm_alloc_info struct. +// To satisfy the alignment requirements we "pad" the memory +// allocation so that the pointer returned to the user +// always satisfies (ptr % align) == 0. +static inline void *malloc_impl(uint32_t alignment, size_t size) { + void *ptr = nullptr; + assert(alignment >= alignof(usm_alloc_info) && + "memory not aligned to usm_alloc_info"); +#ifdef _MSC_VER + ptr = _aligned_malloc(alloc_header_size + get_padding(alignment) + size, + alignment); + +#else + ptr = std::aligned_alloc(alignment, + alloc_header_size + get_padding(alignment) + size); +#endif + return ptr; +} + +// The info struct is retrieved by subtracting its size from the pointer +// returned to the user. 
+static inline uint8_t *get_alloc_info_addr(const void *ptr) { + return (uint8_t *)const_cast(ptr) - alloc_header_size; +} + +static usm_alloc_info get_alloc_info(void *ptr) { + return *(usm_alloc_info *)get_alloc_info_addr(ptr); +} + +} // namespace native_cpu struct ur_context_handle_t_ : RefCounted { ur_context_handle_t_(ur_device_handle_t_ *phDevices) : _device{phDevices} {} ur_device_handle_t _device; + + ur_result_t remove_alloc(void *ptr) { + std::lock_guard lock(alloc_mutex); + const native_cpu::usm_alloc_info &info = native_cpu::get_alloc_info(ptr); + UR_ASSERT(info.type != UR_USM_TYPE_UNKNOWN, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); +#ifdef _MSC_VER + _aligned_free(info.base_alloc_ptr); +#else + free(info.base_alloc_ptr); +#endif + allocations.erase(ptr); + return UR_RESULT_SUCCESS; + } + + const native_cpu::usm_alloc_info & + get_alloc_info_entry(const void *ptr) const { + auto it = allocations.find(ptr); + if (it == allocations.end()) { + return native_cpu::usm_alloc_info_null_entry; + } + + return *(native_cpu::usm_alloc_info *)native_cpu::get_alloc_info_addr(ptr); + } + + void *add_alloc(uint32_t alignment, ur_usm_type_t type, size_t size, + ur_usm_pool_handle_t pool) { + std::lock_guard lock(alloc_mutex); + // We need to ensure that we align to at least alignof(usm_alloc_info), + // otherwise its start address may be unaligned. + alignment = + std::max(alignment, alignof(native_cpu::usm_alloc_info)); + void *alloc = native_cpu::malloc_impl(alignment, size); + if (!alloc) + return nullptr; + // Compute the address of the pointer that we'll return to the user. 
+ void *ptr = native_cpu::alloc_header_size + + native_cpu::get_padding(alignment) + (uint8_t *)alloc; + uint8_t *info_addr = native_cpu::get_alloc_info_addr(ptr); + if (!info_addr) + return nullptr; + // Do a placement new of the alloc_info to avoid allocation and copy + auto info = new (info_addr) + native_cpu::usm_alloc_info(type, ptr, size, this->_device, pool, alloc); + if (!info) + return nullptr; + allocations.insert(ptr); + return ptr; + } + +private: + std::mutex alloc_mutex; + std::set allocations; }; diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp index 64d99927ae..4b32c11e37 100644 --- a/source/adapters/native_cpu/device.cpp +++ b/source/adapters/native_cpu/device.cpp @@ -366,11 +366,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, + ur_native_handle_t hNativeDevice, ur_adapter_handle_t hAdapter, const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { std::ignore = hNativeDevice; - std::ignore = hPlatform; + std::ignore = hAdapter; std::ignore = pProperties; std::ignore = phDevice; diff --git a/source/adapters/native_cpu/image.cpp b/source/adapters/native_cpu/image.cpp index a1b862ad9f..d89990ed10 100644 --- a/source/adapters/native_cpu/image.cpp +++ b/source/adapters/native_cpu/image.cpp @@ -132,6 +132,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + [[maybe_unused]] ur_context_handle_t hContext, + [[maybe_unused]] ur_device_handle_t hDevice, + [[maybe_unused]] uint64_t offset, [[maybe_unused]] uint64_t size, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem, + [[maybe_unused]] void **phRetMem) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT 
ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp index 55c32eb84b..2f2f79cd5a 100644 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -81,8 +81,11 @@ class worker_thread { // Waits for all tasks to finish and destroys the worker thread inline void stop() { - m_isRunning.store(false, std::memory_order_release); - m_startWorkCondition.notify_all(); + { + std::lock_guard lock(m_workMutex); + m_isRunning.store(false, std::memory_order_release); + m_startWorkCondition.notify_all(); + } if (m_worker.joinable()) { // Wait for the worker thread to finish handling the task queue m_worker.join(); diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index 7b5f1b923d..ff6c9d8c0f 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -329,6 +329,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnImportExternalMemoryExp = urBindlessImagesImportExternalMemoryExp; pDdiTable->pfnMapExternalArrayExp = urBindlessImagesMapExternalArrayExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + urBindlessImagesMapExternalLinearMemoryExp; pDdiTable->pfnReleaseExternalMemoryExp = urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = diff --git a/source/adapters/native_cpu/usm.cpp b/source/adapters/native_cpu/usm.cpp index 45ac0596f3..dcae1881f1 100644 --- a/source/adapters/native_cpu/usm.cpp +++ b/source/adapters/native_cpu/usm.cpp @@ -8,90 +8,98 @@ // //===----------------------------------------------------------------------===// +#include "ur/ur.hpp" #include "ur_api.h" #include "common.hpp" +#include "context.hpp" +#include -UR_APIEXPORT 
ur_result_t UR_APICALL -urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, - ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - std::ignore = hContext; - std::ignore = pUSMDesc; - std::ignore = pool; +namespace native_cpu { +static ur_result_t alloc_helper(ur_context_handle_t hContext, + const ur_usm_desc_t *pUSMDesc, size_t size, + void **ppMem, ur_usm_type_t type) { + auto alignment = pUSMDesc ? pUSMDesc->align : 1u; + UR_ASSERT((alignment & (alignment - 1)) == 0, UR_RESULT_ERROR_INVALID_VALUE); UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); // TODO: Check Max size when UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE is implemented UR_ASSERT(size > 0, UR_RESULT_ERROR_INVALID_USM_SIZE); - *ppMem = malloc(size); + auto *ptr = hContext->add_alloc(alignment, type, size, nullptr); + UR_ASSERT(ptr != nullptr, UR_RESULT_ERROR_OUT_OF_RESOURCES); + *ppMem = ptr; return UR_RESULT_SUCCESS; } +} // namespace native_cpu + +UR_APIEXPORT ur_result_t UR_APICALL +urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, size_t size, void **ppMem) { + std::ignore = pool; + + return native_cpu::alloc_helper(hContext, pUSMDesc, size, ppMem, + UR_USM_TYPE_HOST); +} + UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - std::ignore = hContext; std::ignore = hDevice; - std::ignore = pUSMDesc; std::ignore = pool; - UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - // TODO: Check Max size when UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE is implemented - UR_ASSERT(size > 0, UR_RESULT_ERROR_INVALID_USM_SIZE); - - *ppMem = malloc(size); - - return UR_RESULT_SUCCESS; + return native_cpu::alloc_helper(hContext, pUSMDesc, size, ppMem, + UR_USM_TYPE_DEVICE); } UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, const 
ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - std::ignore = hContext; std::ignore = hDevice; - std::ignore = pUSMDesc; std::ignore = pool; - UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - // TODO: Check Max size when UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE is implemented - UR_ASSERT(size > 0, UR_RESULT_ERROR_INVALID_USM_SIZE); - - *ppMem = malloc(size); - - return UR_RESULT_SUCCESS; + return native_cpu::alloc_helper(hContext, pUSMDesc, size, ppMem, + UR_USM_TYPE_SHARED); } UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, void *pMem) { - std::ignore = hContext; UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); - free(pMem); + auto res = hContext->remove_alloc(pMem); - return UR_RESULT_SUCCESS; + return res; } UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, ur_usm_alloc_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { - std::ignore = hContext; - std::ignore = pMem; - std::ignore = propName; - std::ignore = propSize; - std::ignore = pPropValue; - std::ignore = pPropSizeRet; + UR_ASSERT(pMem != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + if (propName == UR_USM_ALLOC_INFO_BASE_PTR) { + // TODO: logic to compute base ptr given ptr + DIE_NO_IMPLEMENTATION; + } + const native_cpu::usm_alloc_info &alloc_info = + hContext->get_alloc_info_entry(pMem); switch (propName) { case UR_USM_ALLOC_INFO_TYPE: - // Todo implement this in context - return ReturnValue(UR_USM_TYPE_DEVICE); + return ReturnValue(alloc_info.type); + case UR_USM_ALLOC_INFO_SIZE: + return ReturnValue(alloc_info.size); + case UR_USM_ALLOC_INFO_DEVICE: + return ReturnValue(alloc_info.device); + case UR_USM_ALLOC_INFO_POOL: + return ReturnValue(alloc_info.pool); default: DIE_NO_IMPLEMENTATION; } diff --git 
a/source/adapters/opencl/common.hpp b/source/adapters/opencl/common.hpp index 399f668077..18b08bf095 100644 --- a/source/adapters/opencl/common.hpp +++ b/source/adapters/opencl/common.hpp @@ -319,45 +319,33 @@ cl_int(CL_API_CALL *)(cl_command_buffer_khr command_buffer, template struct FuncPtrCache { std::map Map; std::mutex Mutex; + + void clear(cl_context context) { + std::lock_guard CacheLock{Mutex}; + Map.erase(context); + } }; -// FIXME: There's currently no mechanism for cleaning up this cache, meaning -// that it is invalidated whenever a context is destroyed. This could lead to -// reusing an invalid function pointer if another context happens to have the -// same native handle. struct ExtFuncPtrCacheT { - FuncPtrCache clHostMemAllocINTELCache; - FuncPtrCache clDeviceMemAllocINTELCache; - FuncPtrCache clSharedMemAllocINTELCache; - FuncPtrCache clGetDeviceFunctionPointerCache; - FuncPtrCache - clGetDeviceGlobalVariablePointerCache; - FuncPtrCache - clCreateBufferWithPropertiesINTELCache; - FuncPtrCache clMemBlockingFreeINTELCache; - FuncPtrCache - clSetKernelArgMemPointerINTELCache; - FuncPtrCache clEnqueueMemFillINTELCache; - FuncPtrCache clEnqueueMemcpyINTELCache; - FuncPtrCache clGetMemAllocInfoINTELCache; - FuncPtrCache - clEnqueueWriteGlobalVariableCache; - FuncPtrCache clEnqueueReadGlobalVariableCache; - FuncPtrCache clEnqueueReadHostPipeINTELCache; - FuncPtrCache clEnqueueWriteHostPipeINTELCache; - FuncPtrCache - clSetProgramSpecializationConstantCache; - FuncPtrCache clCreateCommandBufferKHRCache; - FuncPtrCache clRetainCommandBufferKHRCache; - FuncPtrCache clReleaseCommandBufferKHRCache; - FuncPtrCache clFinalizeCommandBufferKHRCache; - FuncPtrCache clCommandNDRangeKernelKHRCache; - FuncPtrCache clCommandCopyBufferKHRCache; - FuncPtrCache clCommandCopyBufferRectKHRCache; - FuncPtrCache clCommandFillBufferKHRCache; - FuncPtrCache clEnqueueCommandBufferKHRCache; - FuncPtrCache clGetCommandBufferInfoKHRCache; - FuncPtrCache 
clUpdateMutableCommandsKHRCache; +#define CL_EXTENSION_FUNC(func) FuncPtrCache func##Cache; + +#include "extension_functions.def" + +#undef CL_EXTENSION_FUNC + + // If a context stored in the current caching mechanism is destroyed by the + // CL driver all of its function pointers are invalidated. This can lead to a + // pathological case where a subsequently created context gets returned with + // a coincidentally identical handle to the destroyed one and ends up being + // used to retrieve bad function pointers. To avoid this we clear the cache + // when contexts are released. + void clearCache(cl_context context) { +#define CL_EXTENSION_FUNC(func) func##Cache.clear(context); + +#include "extension_functions.def" + +#undef CL_EXTENSION_FUNC + } }; // A raw pointer is used here since the lifetime of this map has to be tied to // piTeardown to avoid issues with static destruction order (a user application diff --git a/source/adapters/opencl/context.cpp b/source/adapters/opencl/context.cpp index 1478050cda..38202bbf58 100644 --- a/source/adapters/opencl/context.cpp +++ b/source/adapters/opencl/context.cpp @@ -113,9 +113,30 @@ urContextGetInfo(ur_context_handle_t hContext, ur_context_info_t propName, UR_APIEXPORT ur_result_t UR_APICALL urContextRelease(ur_context_handle_t hContext) { + // If we're reasonably sure this context is about to be destroyed we should + // clear the ext function pointer cache. This isn't foolproof sadly but it + // should drastically reduce the chances of the pathological case described + // in the comments in common.hpp. 
+ static std::mutex contextReleaseMutex; + auto clContext = cl_adapter::cast(hContext); - cl_int Ret = clReleaseContext(cl_adapter::cast(hContext)); - return mapCLErrorToUR(Ret); + { + std::lock_guard lock(contextReleaseMutex); + size_t refCount = 0; + CL_RETURN_ON_FAILURE(clGetContextInfo(clContext, CL_CONTEXT_REFERENCE_COUNT, + sizeof(size_t), &refCount, nullptr)); + + // ExtFuncPtrCache is destroyed in an atexit() callback, so it doesn't + // necessarily outlive the adapter (or all the contexts). + if (refCount == 1 && cl_ext::ExtFuncPtrCache) { + cl_ext::ExtFuncPtrCache->clearCache(clContext); + } + } + + CL_RETURN_ON_FAILURE( + clReleaseContext(cl_adapter::cast(hContext))); + + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index 44262df26a..a31d6580a0 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -1125,7 +1125,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t hNativeDevice, ur_platform_handle_t, + ur_native_handle_t hNativeDevice, ur_adapter_handle_t, const ur_device_native_properties_t *, ur_device_handle_t *phDevice) { *phDevice = reinterpret_cast(hNativeDevice); diff --git a/source/adapters/opencl/extension_functions.def b/source/adapters/opencl/extension_functions.def new file mode 100644 index 0000000000..76771744b2 --- /dev/null +++ b/source/adapters/opencl/extension_functions.def @@ -0,0 +1,27 @@ +CL_EXTENSION_FUNC(clHostMemAllocINTEL) +CL_EXTENSION_FUNC(clDeviceMemAllocINTEL) +CL_EXTENSION_FUNC(clSharedMemAllocINTEL) +CL_EXTENSION_FUNC(clGetDeviceFunctionPointer) +CL_EXTENSION_FUNC(clGetDeviceGlobalVariablePointer) +CL_EXTENSION_FUNC(clCreateBufferWithPropertiesINTEL) +CL_EXTENSION_FUNC(clMemBlockingFreeINTEL) +CL_EXTENSION_FUNC(clSetKernelArgMemPointerINTEL) +CL_EXTENSION_FUNC(clEnqueueMemFillINTEL) 
+CL_EXTENSION_FUNC(clEnqueueMemcpyINTEL) +CL_EXTENSION_FUNC(clGetMemAllocInfoINTEL) +CL_EXTENSION_FUNC(clEnqueueWriteGlobalVariable) +CL_EXTENSION_FUNC(clEnqueueReadGlobalVariable) +CL_EXTENSION_FUNC(clEnqueueReadHostPipeINTEL) +CL_EXTENSION_FUNC(clEnqueueWriteHostPipeINTEL) +CL_EXTENSION_FUNC(clSetProgramSpecializationConstant) +CL_EXTENSION_FUNC(clCreateCommandBufferKHR) +CL_EXTENSION_FUNC(clRetainCommandBufferKHR) +CL_EXTENSION_FUNC(clReleaseCommandBufferKHR) +CL_EXTENSION_FUNC(clFinalizeCommandBufferKHR) +CL_EXTENSION_FUNC(clCommandNDRangeKernelKHR) +CL_EXTENSION_FUNC(clCommandCopyBufferKHR) +CL_EXTENSION_FUNC(clCommandCopyBufferRectKHR) +CL_EXTENSION_FUNC(clCommandFillBufferKHR) +CL_EXTENSION_FUNC(clEnqueueCommandBufferKHR) +CL_EXTENSION_FUNC(clGetCommandBufferInfoKHR) +CL_EXTENSION_FUNC(clUpdateMutableCommandsKHR) diff --git a/source/adapters/opencl/image.cpp b/source/adapters/opencl/image.cpp index c33bb57b0f..0c628594bb 100644 --- a/source/adapters/opencl/image.cpp +++ b/source/adapters/opencl/image.cpp @@ -132,6 +132,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + [[maybe_unused]] ur_context_handle_t hContext, + [[maybe_unused]] ur_device_handle_t hDevice, + [[maybe_unused]] uint64_t offset, [[maybe_unused]] uint64_t size, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem, + [[maybe_unused]] void **phRetMem) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp index 41c6d6de70..9735abefbf 100644 --- a/source/adapters/opencl/kernel.cpp +++ b/source/adapters/opencl/kernel.cpp @@ -206,19 +206,14 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t 
hKernel, ur_device_handle_t hDevice, // Two calls to urDeviceGetInfo are needed: the first determines the size // required to store the result, and the second returns the actual size // values. - ur_result_t URRet = - urDeviceGetInfo(hDevice, UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL, 0, - nullptr, &ResultSize); - if (URRet != UR_RESULT_SUCCESS) { - return URRet; - } - assert(ResultSize % sizeof(size_t) == 0); - std::vector Result(ResultSize / sizeof(size_t)); - URRet = urDeviceGetInfo(hDevice, UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL, - ResultSize, Result.data(), nullptr); - if (URRet != UR_RESULT_SUCCESS) { - return URRet; - } + UR_RETURN_ON_FAILURE(urDeviceGetInfo(hDevice, + UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL, + 0, nullptr, &ResultSize)); + assert(ResultSize % sizeof(uint32_t) == 0); + std::vector Result(ResultSize / sizeof(uint32_t)); + UR_RETURN_ON_FAILURE(urDeviceGetInfo(hDevice, + UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL, + ResultSize, Result.data(), nullptr)); RetVal = *std::max_element(Result.begin(), Result.end()); Ret = CL_SUCCESS; } else if (propName == UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL) { diff --git a/source/adapters/opencl/sampler.cpp b/source/adapters/opencl/sampler.cpp index f05177a987..a47ba7f894 100644 --- a/source/adapters/opencl/sampler.cpp +++ b/source/adapters/opencl/sampler.cpp @@ -158,16 +158,38 @@ urSamplerGetInfo(ur_sampler_handle_t hSampler, ur_sampler_info_t propName, static_assert(sizeof(cl_addressing_mode) == sizeof(ur_sampler_addressing_mode_t)); - size_t CheckPropSize = 0; - ur_result_t Err = mapCLErrorToUR( - clGetSamplerInfo(cl_adapter::cast(hSampler), SamplerInfo, - propSize, pPropValue, &CheckPropSize)); - if (pPropValue && CheckPropSize != propSize) { - return UR_RESULT_ERROR_INVALID_SIZE; - } - UR_RETURN_ON_FAILURE(Err); - if (pPropSizeRet) { - *pPropSizeRet = CheckPropSize; + ur_result_t Err = UR_RESULT_SUCCESS; + // ur_bool_t has the size of uint8_t, but cl_bool has the size of + // uint32_t so this adjusts 
UR_SAMPLER_INFO_NORMALIZED_COORDS info to map + // between them. + if (propName == UR_SAMPLER_INFO_NORMALIZED_COORDS) { + cl_bool normalized_coords = false; + Err = mapCLErrorToUR( + clGetSamplerInfo(cl_adapter::cast(hSampler), SamplerInfo, + sizeof(cl_bool), &normalized_coords, nullptr)); + if (pPropValue && propSize != sizeof(ur_bool_t)) { + return UR_RESULT_ERROR_INVALID_SIZE; + } + UR_RETURN_ON_FAILURE(Err); + if (pPropValue) { + *static_cast(pPropValue) = + static_cast(normalized_coords); + } + if (pPropSizeRet) { + *pPropSizeRet = sizeof(ur_bool_t); + } + } else { + size_t CheckPropSize = 0; + Err = mapCLErrorToUR( + clGetSamplerInfo(cl_adapter::cast(hSampler), SamplerInfo, + propSize, pPropValue, &CheckPropSize)); + if (pPropValue && CheckPropSize != propSize) { + return UR_RESULT_ERROR_INVALID_SIZE; + } + UR_RETURN_ON_FAILURE(Err); + if (pPropSizeRet) { + *pPropSizeRet = CheckPropSize; + } } // Convert OpenCL returns to UR diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index 171c561c28..100bb888cf 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -347,6 +347,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnImportExternalMemoryExp = urBindlessImagesImportExternalMemoryExp; pDdiTable->pfnMapExternalArrayExp = urBindlessImagesMapExternalArrayExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + urBindlessImagesMapExternalLinearMemoryExp; pDdiTable->pfnReleaseExternalMemoryExp = urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 222b7f008b..e327d7672b 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -23,7 +23,8 @@ if (NOT DEFINED UMF_REPO) endif() if (NOT DEFINED UMF_TAG) - set(UMF_TAG 413327815feb1585bfb36b1f34750f1ba961ceed) + # v0.9.x 19.08.2024: 
Merge pull request #688 ... + set(UMF_TAG 59c4150b7120a7af5b3c8eb2d9b8bbb5d2e96aa3) endif() message(STATUS "Will fetch Unified Memory Framework from ${UMF_REPO}") diff --git a/source/common/linux/ur_lib_loader.cpp b/source/common/linux/ur_lib_loader.cpp index 53b6e0bebe..4da7f98bc1 100644 --- a/source/common/linux/ur_lib_loader.cpp +++ b/source/common/linux/ur_lib_loader.cpp @@ -45,7 +45,13 @@ LibLoader::loadAdapterLibrary(const char *name) { } #endif HMODULE handle = dlopen(name, mode); - logger::info("loaded adapter 0x{} ({})", handle, name); + if (!handle) { + char *err = dlerror(); + logger::info("failed to load adapter '{}' with error: {}", name, + err ? err : "unknown error"); + } else { + logger::info("loaded adapter 0x{} ({})", handle, name); + } return std::unique_ptr(handle); } diff --git a/source/common/ur_util.hpp b/source/common/ur_util.hpp index e24c1153c5..0475cf31e4 100644 --- a/source/common/ur_util.hpp +++ b/source/common/ur_util.hpp @@ -462,7 +462,7 @@ template class AtomicSingleton { static int release(std::function deleter) { auto val = instance.acquire(); - int ret = val->release(deleter); + int ret = val->release(std::move(deleter)); instance.release(); return ret; diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt index 5a7a419954..edfd8b055d 100644 --- a/source/loader/CMakeLists.txt +++ b/source/loader/CMakeLists.txt @@ -142,6 +142,8 @@ if(UR_ENABLE_SANITIZER) ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_report.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_shadow_setup.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_shadow_setup.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_validator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_validator.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/common.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/stacktrace.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/stacktrace.hpp @@ -164,7 +166,7 @@ if(UR_ENABLE_SANITIZER) 
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/linux/symbolizer.cpp ) target_include_directories(ur_loader PRIVATE ${LLVM_INCLUDE_DIRS}) - target_link_libraries(ur_loader PRIVATE LLVMSymbolize) + target_link_libraries(ur_loader PRIVATE LLVMSupport LLVMSymbolize) endif() target_include_directories(ur_loader PRIVATE diff --git a/source/loader/layers/sanitizer/asan_buffer.cpp b/source/loader/layers/sanitizer/asan_buffer.cpp index 4cf90c7da4..382d6e3ada 100644 --- a/source/loader/layers/sanitizer/asan_buffer.cpp +++ b/source/loader/layers/sanitizer/asan_buffer.cpp @@ -75,12 +75,14 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { return UR_RESULT_SUCCESS; } + std::scoped_lock Guard(Mutex); auto &Allocation = Allocations[Device]; + ur_result_t URes = UR_RESULT_SUCCESS; if (!Allocation) { ur_usm_desc_t USMDesc{}; USMDesc.align = getAlignment(); ur_usm_pool_handle_t Pool{}; - ur_result_t URes = getContext()->interceptor->allocateMemory( + URes = getContext()->interceptor->allocateMemory( Context, Device, &USMDesc, Pool, Size, AllocType::MEM_BUFFER, ur_cast(&Allocation)); if (URes != UR_RESULT_SUCCESS) { @@ -105,7 +107,60 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { Handle = Allocation; - return UR_RESULT_SUCCESS; + if (!LastSyncedDevice.hDevice) { + LastSyncedDevice = MemBuffer::Device_t{Device, Handle}; + return URes; + } + + // If the device required to allocate memory is not the previous one, we + // need to do data migration. 
+ if (Device != LastSyncedDevice.hDevice) { + auto &HostAllocation = Allocations[nullptr]; + if (!HostAllocation) { + ur_usm_desc_t USMDesc{}; + USMDesc.align = getAlignment(); + ur_usm_pool_handle_t Pool{}; + URes = getContext()->interceptor->allocateMemory( + Context, nullptr, &USMDesc, Pool, Size, AllocType::HOST_USM, + ur_cast(&HostAllocation)); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("Failed to allocate {} bytes host " + "USM for buffer {} migration", + Size, this); + return URes; + } + } + + // Copy data from last synced device to host + { + ManagedQueue Queue(Context, LastSyncedDevice.hDevice); + URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + Queue, true, HostAllocation, LastSyncedDevice.MemHandle, Size, + 0, nullptr, nullptr); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error( + "Failed to migrate memory buffer data"); + return URes; + } + } + + // Sync data back to device + { + ManagedQueue Queue(Context, Device); + URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + Queue, true, Allocation, HostAllocation, Size, 0, nullptr, + nullptr); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error( + "Failed to migrate memory buffer data"); + return URes; + } + } + } + + LastSyncedDevice = MemBuffer::Device_t{Device, Handle}; + + return URes; } ur_result_t MemBuffer::free() { diff --git a/source/loader/layers/sanitizer/asan_buffer.hpp b/source/loader/layers/sanitizer/asan_buffer.hpp index b4eba4e4ba..989ef4249f 100644 --- a/source/loader/layers/sanitizer/asan_buffer.hpp +++ b/source/loader/layers/sanitizer/asan_buffer.hpp @@ -48,6 +48,12 @@ struct MemBuffer { ur_context_handle_t Context; + struct Device_t { + ur_device_handle_t hDevice; + char *MemHandle; + }; + Device_t LastSyncedDevice{}; + size_t Size; char *HostPtr{}; diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index 0deb021a3f..ec1d5e8fad 100644 --- 
a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -16,6 +16,7 @@ #include "asan_quarantine.hpp" #include "asan_report.hpp" #include "asan_shadow_setup.hpp" +#include "asan_validator.hpp" #include "stacktrace.hpp" #include "ur_sanitizer_utils.hpp" @@ -185,6 +186,18 @@ SanitizerInterceptor::~SanitizerInterceptor() { DestroyShadowMemoryOnCPU(); DestroyShadowMemoryOnPVC(); DestroyShadowMemoryOnDG2(); + + // We must release these objects before releasing adapters, since + // they may use the adapter in their destructor + m_Quarantine = nullptr; + m_MemBufferMap.clear(); + m_AllocationMap.clear(); + m_KernelMap.clear(); + m_ContextMap.clear(); + + for (auto Adapter : m_Adapters) { + getContext()->urDdiTable.Global.pfnAdapterRelease(Adapter); + } } /// The memory chunk allocated from the underlying allocator looks like this: @@ -615,6 +628,9 @@ SanitizerInterceptor::insertDevice(ur_device_handle_t Device, DI = std::make_shared(Device); + DI->IsSupportSharedSystemUSM = GetDeviceUSMCapability( + Device, UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT); + // Query alignment UR_CALL(getContext()->urDdiTable.Device.pfnGetInfo( Device, UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN, sizeof(DI->Alignment), @@ -683,8 +699,25 @@ ur_result_t SanitizerInterceptor::prepareLaunch( auto Program = GetProgram(Kernel); do { - // Set membuffer arguments auto KernelInfo = getKernelInfo(Kernel); + + // Validate pointer arguments + if (Options(logger).DetectKernelArguments) { + for (const auto &[ArgIndex, PtrPair] : KernelInfo->PointerArgs) { + auto Ptr = PtrPair.first; + if (Ptr == nullptr) { + continue; + } + if (auto ValidateResult = ValidateUSMPointer( + Context, DeviceInfo->Handle, (uptr)Ptr)) { + ReportInvalidKernelArgument(Kernel, ArgIndex, (uptr)Ptr, + ValidateResult, PtrPair.second); + exit(1); + } + } + } + + // Set membuffer arguments for (const auto &[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) { char *ArgPointer = nullptr; 
UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer)); diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp index 39c7705c99..1c87cdc8e1 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan_interceptor.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace ur_sanitizer_layer { @@ -41,21 +42,16 @@ struct DeviceInfo { uptr ShadowOffset = 0; uptr ShadowOffsetEnd = 0; + // Device features + bool IsSupportSharedSystemUSM = false; + ur_mutex Mutex; std::queue> Quarantine; size_t QuarantineSize = 0; - explicit DeviceInfo(ur_device_handle_t Device) : Handle(Device) { - [[maybe_unused]] auto Result = - getContext()->urDdiTable.Device.pfnRetain(Device); - assert(Result == UR_RESULT_SUCCESS); - } - - ~DeviceInfo() { - [[maybe_unused]] auto Result = - getContext()->urDdiTable.Device.pfnRelease(Handle); - assert(Result == UR_RESULT_SUCCESS); - } + // Device handles are special and alive in the whole process lifetime, + // so we needn't retain&release here. 
+ explicit DeviceInfo(ur_device_handle_t Device) : Handle(Device) {} ur_result_t allocShadowMemory(ur_context_handle_t Context); }; @@ -85,6 +81,8 @@ struct KernelInfo { ur_shared_mutex Mutex; std::atomic RefCount = 1; std::unordered_map> BufferArgs; + std::unordered_map> + PointerArgs; // Need preserve the order of local arguments std::map LocalArgs; @@ -201,6 +199,16 @@ class SanitizerInterceptor { ur_result_t eraseMemBuffer(ur_mem_handle_t MemHandle); std::shared_ptr getMemBuffer(ur_mem_handle_t MemHandle); + ur_result_t holdAdapter(ur_adapter_handle_t Adapter) { + std::scoped_lock Guard(m_AdaptersMutex); + if (m_Adapters.find(Adapter) != m_Adapters.end()) { + return UR_RESULT_SUCCESS; + } + UR_CALL(getContext()->urDdiTable.Global.pfnAdapterRetain(Adapter)); + m_Adapters.insert(Adapter); + return UR_RESULT_SUCCESS; + } + std::optional findAllocInfoByAddress(uptr Address); std::shared_ptr getContextInfo(ur_context_handle_t Context) { @@ -262,6 +270,9 @@ class SanitizerInterceptor { std::unique_ptr m_Quarantine; logger::Logger &logger; + + std::unordered_set m_Adapters; + ur_shared_mutex m_AdaptersMutex; }; } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_options.hpp b/source/loader/layers/sanitizer/asan_options.hpp index ab6ee0c26b..298639b73c 100644 --- a/source/loader/layers/sanitizer/asan_options.hpp +++ b/source/loader/layers/sanitizer/asan_options.hpp @@ -38,6 +38,7 @@ struct AsanOptions { uint32_t MaxQuarantineSizeMB = 0; bool DetectLocals = true; bool DetectPrivates = true; + bool DetectKernelArguments = true; private: AsanOptions(logger::Logger &logger) { @@ -93,10 +94,11 @@ struct AsanOptions { SetBoolOption("debug", Debug); SetBoolOption("detect_locals", DetectLocals); SetBoolOption("detect_privates", DetectPrivates); + SetBoolOption("detect_kernel_arguments", DetectKernelArguments); auto KV = OptionsEnvMap->find("quarantine_size_mb"); if (KV != OptionsEnvMap->end()) { - auto Value = KV->second.front(); + const auto 
&Value = KV->second.front(); try { auto temp_long = std::stoul(Value); if (temp_long > UINT32_MAX) { @@ -112,7 +114,7 @@ struct AsanOptions { KV = OptionsEnvMap->find("redzone"); if (KV != OptionsEnvMap->end()) { - auto Value = KV->second.front(); + const auto &Value = KV->second.front(); try { MinRZSize = std::stoul(Value); if (MinRZSize < 16) { @@ -127,7 +129,7 @@ struct AsanOptions { KV = OptionsEnvMap->find("max_redzone"); if (KV != OptionsEnvMap->end()) { - auto Value = KV->second.front(); + const auto &Value = KV->second.front(); try { MaxRZSize = std::stoul(Value); if (MaxRZSize > 2048) { diff --git a/source/loader/layers/sanitizer/asan_report.cpp b/source/loader/layers/sanitizer/asan_report.cpp index bdae3284b4..a92e93f979 100644 --- a/source/loader/layers/sanitizer/asan_report.cpp +++ b/source/loader/layers/sanitizer/asan_report.cpp @@ -11,16 +11,32 @@ */ #include "asan_report.hpp" -#include "asan_options.hpp" - #include "asan_allocator.hpp" #include "asan_interceptor.hpp" #include "asan_libdevice.hpp" +#include "asan_options.hpp" +#include "asan_validator.hpp" #include "ur_sanitizer_layer.hpp" #include "ur_sanitizer_utils.hpp" namespace ur_sanitizer_layer { +namespace { + +void PrintAllocateInfo(uptr Addr, const AllocInfo *AI) { + getContext()->logger.always("{} is located inside of {} region [{}, {})", + (void *)Addr, ToString(AI->Type), + (void *)AI->UserBegin, (void *)AI->UserEnd); + getContext()->logger.always("allocated here:"); + AI->AllocStack.print(); + if (AI->IsReleased) { + getContext()->logger.always("freed here:"); + AI->ReleaseStack.print(); + } +} + +} // namespace + void ReportBadFree(uptr Addr, const StackTrace &stack, const std::shared_ptr &AI) { getContext()->logger.always( @@ -32,13 +48,9 @@ void ReportBadFree(uptr Addr, const StackTrace &stack, (void *)Addr); } - assert(!AI->IsReleased && "Chunk must be not released"); + assert(AI && !AI->IsReleased && "Chunk must be not released"); - getContext()->logger.always("{} is located inside 
of {} region [{}, {})", - (void *)Addr, ToString(AI->Type), - (void *)AI->UserBegin, (void *)AI->UserEnd); - getContext()->logger.always("allocated here:"); - AI->AllocStack.print(); + PrintAllocateInfo(Addr, AI.get()); } void ReportBadContext(uptr Addr, const StackTrace &stack, @@ -48,16 +60,7 @@ void ReportBadContext(uptr Addr, const StackTrace &stack, (void *)Addr); stack.print(); - getContext()->logger.always("{} is located inside of {} region [{}, {})", - (void *)Addr, ToString(AI->Type), - (void *)AI->UserBegin, (void *)AI->UserEnd); - getContext()->logger.always("allocated here:"); - AI->AllocStack.print(); - - if (AI->IsReleased) { - getContext()->logger.always("freed here:"); - AI->ReleaseStack.print(); - } + PrintAllocateInfo(Addr, AI.get()); } void ReportDoubleFree(uptr Addr, const StackTrace &Stack, @@ -139,16 +142,10 @@ void ReportUseAfterFree(const DeviceSanitizerReport &Report, "Failed to find which chunck {} is allocated", (void *)Report.Address); } - assert(AllocInfo->IsReleased); + assert(AllocInfo->IsReleased && + "It must be released since it's use-after-free"); - getContext()->logger.always( - "{} is located inside of {} region [{}, {})", - (void *)Report.Address, ToString(AllocInfo->Type), - (void *)AllocInfo->UserBegin, (void *)AllocInfo->UserEnd); - getContext()->logger.always("allocated here:"); - AllocInfo->AllocStack.print(); - getContext()->logger.always("released here:"); - AllocInfo->ReleaseStack.print(); + PrintAllocateInfo(Report.Address, AllocInfo.get()); } } else { getContext()->logger.always( @@ -157,4 +154,47 @@ void ReportUseAfterFree(const DeviceSanitizerReport &Report, } } +void ReportInvalidKernelArgument(ur_kernel_handle_t Kernel, uint32_t ArgIndex, + uptr Addr, const ValidateUSMResult &VR, + StackTrace Stack) { + getContext()->logger.always("\n====ERROR: DeviceSanitizer: " + "invalid-argument on kernel <{}>", + DemangleName(GetKernelName(Kernel))); + Stack.print(); + auto &AI = VR.AI; + switch (VR.Type) { + case 
ValidateUSMResult::MAYBE_HOST_POINTER: + getContext()->logger.always("The {}th argument {} is not a USM pointer", + ArgIndex + 1, (void *)Addr); + break; + case ValidateUSMResult::RELEASED_POINTER: + getContext()->logger.always( + "The {}th argument {} is a released USM pointer", ArgIndex, + (void *)Addr); + PrintAllocateInfo(Addr, AI.get()); + break; + case ValidateUSMResult::BAD_CONTEXT: + getContext()->logger.always( + "The {}th argument {} is allocated in other context", ArgIndex, + (void *)Addr); + PrintAllocateInfo(Addr, AI.get()); + break; + case ValidateUSMResult::BAD_DEVICE: + getContext()->logger.always( + "The {}th argument {} is allocated in other device", ArgIndex, + (void *)Addr); + PrintAllocateInfo(Addr, AI.get()); + break; + case ValidateUSMResult::OUT_OF_BOUNDS: + getContext()->logger.always( + "The {}th argument {} is located outside of its region [{}, {})", + ArgIndex, (void *)Addr, (void *)AI->UserBegin, (void *)AI->UserEnd); + getContext()->logger.always("allocated here:"); + AI->AllocStack.print(); + break; + default: + break; + } +} + } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_report.hpp b/source/loader/layers/sanitizer/asan_report.hpp index 77a182b0e6..0dd8f346d0 100644 --- a/source/loader/layers/sanitizer/asan_report.hpp +++ b/source/loader/layers/sanitizer/asan_report.hpp @@ -21,6 +21,7 @@ namespace ur_sanitizer_layer { struct DeviceSanitizerReport; struct AllocInfo; struct StackTrace; +struct ValidateUSMResult; void ReportBadFree(uptr Addr, const StackTrace &stack, const std::shared_ptr &AllocInfo); @@ -40,4 +41,8 @@ void ReportGenericError(const DeviceSanitizerReport &Report, void ReportUseAfterFree(const DeviceSanitizerReport &Report, ur_kernel_handle_t Kernel, ur_context_handle_t Context); +void ReportInvalidKernelArgument(ur_kernel_handle_t Kernel, uint32_t ArgIndex, + uptr Addr, const ValidateUSMResult &VR, + StackTrace Stack); + } // namespace ur_sanitizer_layer diff --git 
a/source/loader/layers/sanitizer/asan_validator.cpp b/source/loader/layers/sanitizer/asan_validator.cpp new file mode 100644 index 0000000000..a9f2bd2b17 --- /dev/null +++ b/source/loader/layers/sanitizer/asan_validator.cpp @@ -0,0 +1,77 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan_validator.cpp + * + */ + +#include "asan_validator.hpp" +#include "asan_interceptor.hpp" +#include "ur_sanitizer_utils.hpp" + +namespace ur_sanitizer_layer { + +namespace { + +bool IsSameDevice(ur_device_handle_t Device1, ur_device_handle_t Device2) { + if (Device1 == Device2) { + return true; + } + auto RootDevice1 = GetParentDevice(Device1); + RootDevice1 = RootDevice1 ? RootDevice1 : Device1; + auto RootDevice2 = GetParentDevice(Device2); + RootDevice2 = RootDevice2 ? RootDevice2 : Device2; + if (RootDevice1 == RootDevice2) { + return true; + } + return false; +} + +} // namespace + +ValidateUSMResult ValidateUSMPointer(ur_context_handle_t Context, + ur_device_handle_t Device, uptr Ptr) { + assert(Ptr != 0 && "Don't validate nullptr here"); + + auto AllocInfoItOp = getContext()->interceptor->findAllocInfoByAddress(Ptr); + if (!AllocInfoItOp) { + auto DI = getContext()->interceptor->getDeviceInfo(Device); + bool IsSupportSharedSystemUSM = DI->IsSupportSharedSystemUSM; + if (IsSupportSharedSystemUSM) { + // maybe it's host pointer + return ValidateUSMResult::success(); + } + return ValidateUSMResult::fail(ValidateUSMResult::MAYBE_HOST_POINTER); + } + + auto AllocInfo = AllocInfoItOp.value()->second; + + if (AllocInfo->Context != Context) { + return ValidateUSMResult::fail(ValidateUSMResult::BAD_CONTEXT, + AllocInfo); + } + + if (AllocInfo->Device && !IsSameDevice(AllocInfo->Device, Device)) { + return ValidateUSMResult::fail(ValidateUSMResult::BAD_DEVICE, + AllocInfo); + } + + if 
(AllocInfo->IsReleased) { + return ValidateUSMResult::fail(ValidateUSMResult::RELEASED_POINTER, + AllocInfo); + } + + if (Ptr < AllocInfo->UserBegin || Ptr >= AllocInfo->UserEnd) { + return ValidateUSMResult::fail(ValidateUSMResult::OUT_OF_BOUNDS, + AllocInfo); + } + + return ValidateUSMResult::success(); +} + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_validator.hpp b/source/loader/layers/sanitizer/asan_validator.hpp new file mode 100644 index 0000000000..52db966562 --- /dev/null +++ b/source/loader/layers/sanitizer/asan_validator.hpp @@ -0,0 +1,50 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan_validator.hpp + * + */ +#pragma once + +#include "asan_allocator.hpp" + +namespace ur_sanitizer_layer { + +struct ValidateUSMResult { + enum ErrorType { + SUCCESS, + NULL_POINTER, + MAYBE_HOST_POINTER, + RELEASED_POINTER, + BAD_CONTEXT, + BAD_DEVICE, + OUT_OF_BOUNDS + }; + ErrorType Type; + std::shared_ptr AI; + + operator bool() { return Type != SUCCESS; } + + static ValidateUSMResult success() { return {SUCCESS, nullptr}; } + + static ValidateUSMResult fail(ErrorType Type, + const std::shared_ptr &AI) { + assert(Type != SUCCESS && "The error type shouldn't be SUCCESS"); + return {Type, AI}; + } + + static ValidateUSMResult fail(ErrorType Type) { + assert(Type != SUCCESS && "The error type shouldn't be SUCCESS"); + return {Type, nullptr}; + } +}; + +ValidateUSMResult ValidateUSMPointer(ur_context_handle_t Context, + ur_device_handle_t Device, uptr Ptr); + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/stacktrace.cpp b/source/loader/layers/sanitizer/stacktrace.cpp index cf28c8b091..e33fcf0416 100644 --- a/source/loader/layers/sanitizer/stacktrace.cpp +++ b/source/loader/layers/sanitizer/stacktrace.cpp @@ 
-99,7 +99,7 @@ void StackTrace::print() const { uptr Offset; ParseBacktraceInfo(BI, ModuleName, Offset); if (SymbolizeCode(ModuleName, Offset, Result)) { - SourceInfo SrcInfo = ParseSymbolizerOutput(Result); + SourceInfo SrcInfo = ParseSymbolizerOutput(std::move(Result)); if (SrcInfo.file != "??") { getContext()->logger.always(" #{} in {} {}:{}:{}", index, SrcInfo.function, SrcInfo.file, diff --git a/source/loader/layers/sanitizer/ur_sanddi.cpp b/source/loader/layers/sanitizer/ur_sanddi.cpp index 7fae0285b8..e5e963806b 100644 --- a/source/loader/layers/sanitizer/ur_sanddi.cpp +++ b/source/loader/layers/sanitizer/ur_sanddi.cpp @@ -11,9 +11,13 @@ */ #include "asan_interceptor.hpp" +#include "asan_options.hpp" +#include "stacktrace.hpp" #include "ur_sanitizer_layer.hpp" #include "ur_sanitizer_utils.hpp" +#include + namespace ur_sanitizer_layer { namespace { @@ -31,7 +35,11 @@ ur_result_t setupContext(ur_context_handle_t Context, uint32_t numDevices, getContext()->logger.error("Unsupport device"); return UR_RESULT_ERROR_INVALID_DEVICE; } - getContext()->logger.info("Add {} into context {}", ToString(DI->Type), + getContext()->logger.info( + "DeviceInfo {} (Type={}, IsSupportSharedSystemUSM={})", + (void *)DI->Handle, ToString(DI->Type), + DI->IsSupportSharedSystemUSM); + getContext()->logger.info("Add {} into context {}", (void *)DI->Handle, (void *)Context); if (!DI->ShadowOffset) { UR_CALL(DI->allocShadowMemory(Context)); @@ -44,6 +52,38 @@ ur_result_t setupContext(ur_context_handle_t Context, uint32_t numDevices, } // namespace +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urAdapterGet +__urdlllocal ur_result_t UR_APICALL urAdapterGet( + uint32_t + NumEntries, ///< [in] the number of adapters to be added to phAdapters. + ///< If phAdapters is not NULL, then NumEntries should be greater than + ///< zero, otherwise ::UR_RESULT_ERROR_INVALID_SIZE, + ///< will be returned. 
+ ur_adapter_handle_t * + phAdapters, ///< [out][optional][range(0, NumEntries)] array of handle of adapters. + ///< If NumEntries is less than the number of adapters available, then + ///< ::urAdapterGet shall only retrieve that number of platforms. + uint32_t * + pNumAdapters ///< [out][optional] returns the total number of adapters available. +) { + auto pfnAdapterGet = getContext()->urDdiTable.Global.pfnAdapterGet; + + if (nullptr == pfnAdapterGet) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_result_t result = pfnAdapterGet(NumEntries, phAdapters, pNumAdapters); + if (result == UR_RESULT_SUCCESS && phAdapters) { + const uint32_t NumAdapters = pNumAdapters ? *pNumAdapters : NumEntries; + for (uint32_t i = 0; i < NumAdapters; ++i) { + UR_CALL(getContext()->interceptor->holdAdapter(phAdapters[i])); + } + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urUSMHostAlloc __urdlllocal ur_result_t UR_APICALL urUSMHostAlloc( @@ -424,6 +464,19 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferCreate( std::shared_ptr pMemBuffer = std::make_shared(hContext, size, hostPtrOrNull); + + if (Host && (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) { + std::shared_ptr CtxInfo = + getContext()->interceptor->getContextInfo(hContext); + for (const auto &hDevice : CtxInfo->DeviceList) { + ManagedQueue InternalQueue(hContext, hDevice); + char *Handle = nullptr; + UR_CALL(pMemBuffer->getHandle(hDevice, Handle)); + UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + InternalQueue, true, Handle, Host, size, 0, nullptr, nullptr)); + } + } + ur_result_t result = getContext()->interceptor->insertMemBuffer(pMemBuffer); *phBuffer = ur_cast(pMemBuffer.get()); @@ -1283,6 +1336,69 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for 
urKernelSetArgPointer +__urdlllocal ur_result_t UR_APICALL urKernelSetArgPointer( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + uint32_t argIndex, ///< [in] argument index in range [0, num args - 1] + const ur_kernel_arg_pointer_properties_t + *pProperties, ///< [in][optional] pointer to USM pointer properties. + const void * + pArgValue ///< [in][optional] Pointer obtained by USM allocation or virtual memory + ///< mapping operation. If null then argument value is considered null. +) { + auto pfnSetArgPointer = getContext()->urDdiTable.Kernel.pfnSetArgPointer; + + if (nullptr == pfnSetArgPointer) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + getContext()->logger.debug( + "==== urKernelSetArgPointer (argIndex={}, pArgValue={})", argIndex, + pArgValue); + + if (Options(getContext()->logger).DetectKernelArguments) { + auto KI = getContext()->interceptor->getKernelInfo(hKernel); + std::scoped_lock Guard(KI->Mutex); + KI->PointerArgs[argIndex] = {pArgValue, GetCurrentBacktrace()}; + } + + ur_result_t result = + pfnSetArgPointer(hKernel, argIndex, pProperties, pArgValue); + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's Global table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +__urdlllocal ur_result_t UR_APICALL urGetGlobalProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_global_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_sanitizer_layer::getContext()->version) > + UR_MINOR_VERSION(version)) { + return 
UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnAdapterGet = ur_sanitizer_layer::urAdapterGet; + + return result; +} /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Context table /// with current process' addresses @@ -1379,6 +1495,7 @@ __urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgValue = ur_sanitizer_layer::urKernelSetArgValue; pDdiTable->pfnSetArgMemObj = ur_sanitizer_layer::urKernelSetArgMemObj; pDdiTable->pfnSetArgLocal = ur_sanitizer_layer::urKernelSetArgLocal; + pDdiTable->pfnSetArgPointer = ur_sanitizer_layer::urKernelSetArgPointer; return result; } @@ -1555,6 +1672,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, urDdiTable = *dditable; + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::urGetGlobalProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->Global); + } + if (UR_RESULT_SUCCESS == result) { result = ur_sanitizer_layer::urGetContextProcAddrTable( UR_API_VERSION_CURRENT, &dditable->Context); diff --git a/source/loader/layers/sanitizer/ur_sanitizer_utils.cpp b/source/loader/layers/sanitizer/ur_sanitizer_utils.cpp index 2dd98b945d..feaff8757a 100644 --- a/source/loader/layers/sanitizer/ur_sanitizer_utils.cpp +++ b/source/loader/layers/sanitizer/ur_sanitizer_utils.cpp @@ -152,6 +152,23 @@ DeviceType GetDeviceType(ur_context_handle_t Context, } } +ur_device_handle_t GetParentDevice(ur_device_handle_t Device) { + ur_device_handle_t ParentDevice{}; + [[maybe_unused]] auto Result = getContext()->urDdiTable.Device.pfnGetInfo( + Device, UR_DEVICE_INFO_PARENT_DEVICE, sizeof(ur_device_handle_t), + &ParentDevice, nullptr); + assert(Result == UR_RESULT_SUCCESS && "getParentDevice() failed"); + return ParentDevice; +} + +bool GetDeviceUSMCapability(ur_device_handle_t Device, + ur_device_info_t USMInfo) { + ur_device_usm_access_capability_flags_t Flag; + 
[[maybe_unused]] auto Result = getContext()->urDdiTable.Device.pfnGetInfo( + Device, USMInfo, sizeof(Flag), &Flag, nullptr); + return (bool)Flag; +} + std::vector GetProgramDevices(ur_program_handle_t Program) { size_t PropSize; [[maybe_unused]] ur_result_t Result = diff --git a/source/loader/layers/sanitizer/ur_sanitizer_utils.hpp b/source/loader/layers/sanitizer/ur_sanitizer_utils.hpp index 92cb4cebc4..44ddf46922 100644 --- a/source/loader/layers/sanitizer/ur_sanitizer_utils.hpp +++ b/source/loader/layers/sanitizer/ur_sanitizer_utils.hpp @@ -36,6 +36,9 @@ ur_context_handle_t GetContext(ur_kernel_handle_t Kernel); ur_device_handle_t GetDevice(ur_queue_handle_t Queue); DeviceType GetDeviceType(ur_context_handle_t Context, ur_device_handle_t Device); +ur_device_handle_t GetParentDevice(ur_device_handle_t Device); +bool GetDeviceUSMCapability(ur_device_handle_t Device, + ur_device_info_t Feature); std::string GetKernelName(ur_kernel_handle_t Kernel); size_t GetDeviceLocalMemorySize(ur_device_handle_t Device); ur_program_handle_t GetProgram(ur_kernel_handle_t Kernel); diff --git a/source/loader/layers/tracing/ur_tracing_layer.cpp b/source/loader/layers/tracing/ur_tracing_layer.cpp index 88aff57526..722ee77faa 100644 --- a/source/loader/layers/tracing/ur_tracing_layer.cpp +++ b/source/loader/layers/tracing/ur_tracing_layer.cpp @@ -21,7 +21,7 @@ namespace ur_tracing_layer { context_t *getContext() { return context_t::get_direct(); } -constexpr auto CALL_STREAM_NAME = "ur"; +constexpr auto CALL_STREAM_NAME = "ur.call"; constexpr auto STREAM_VER_MAJOR = UR_MAJOR_VERSION(UR_API_VERSION_CURRENT); constexpr auto STREAM_VER_MINOR = UR_MINOR_VERSION(UR_API_VERSION_CURRENT); @@ -29,36 +29,19 @@ constexpr auto STREAM_VER_MINOR = UR_MINOR_VERSION(UR_API_VERSION_CURRENT); // Unfortunately this doesn't match the semantics of XPTI, which can be initialized // and finalized exactly once. To workaround this, XPTI is globally initialized on // first use and finalized in the destructor. 
-class XptiContext { - XptiContext() { - xptiFrameworkInitialize(); - inited = true; - } - - ~XptiContext() { - xptiFrameworkFinalize(); - inited = false; - } - - // Accessing this after destruction is technically UB, but if we get there, - // it means something is calling UR after it has been destroyed at program - // exit. - std::atomic_bool inited; - - public: - static bool running() { - static XptiContext context; - return context.inited; - } +struct XptiContextManager { + XptiContextManager() { xptiFrameworkInitialize(); } + ~XptiContextManager() { xptiFrameworkFinalize(); } }; +static std::shared_ptr xptiContextManagerGlobal = [] { + return std::make_shared(); +}(); static thread_local xpti_td *activeEvent; /////////////////////////////////////////////////////////////////////////////// context_t::context_t() : logger(logger::create_logger("tracing", true, true)) { - if (!XptiContext::running()) { - return; - } + this->xptiContextManager = xptiContextManagerGlobal; call_stream_id = xptiRegisterStream(CALL_STREAM_NAME); std::ostringstream streamv; @@ -69,20 +52,12 @@ context_t::context_t() : logger(logger::create_logger("tracing", true, true)) { void context_t::notify(uint16_t trace_type, uint32_t id, const char *name, void *args, ur_result_t *resultp, uint64_t instance) { - if (!XptiContext::running()) { - return; - } - xpti::function_with_args_t payload{id, name, args, resultp, nullptr}; xptiNotifySubscribers(call_stream_id, trace_type, nullptr, activeEvent, instance, &payload); } uint64_t context_t::notify_begin(uint32_t id, const char *name, void *args) { - if (!XptiContext::running()) { - return 0; - } - if (auto loc = codelocData.get_codeloc()) { xpti::payload_t payload = xpti::payload_t(loc->functionName, loc->sourceFile, loc->lineNumber, @@ -101,20 +76,10 @@ uint64_t context_t::notify_begin(uint32_t id, const char *name, void *args) { void context_t::notify_end(uint32_t id, const char *name, void *args, ur_result_t *resultp, uint64_t instance) { - if 
(!XptiContext::running()) { - return; - } - notify((uint16_t)xpti::trace_point_type_t::function_with_args_end, id, name, args, resultp, instance); } /////////////////////////////////////////////////////////////////////////////// -context_t::~context_t() { - if (!XptiContext::running()) { - return; - } - - xptiFinalize(CALL_STREAM_NAME); -} +context_t::~context_t() { xptiFinalize(CALL_STREAM_NAME); } } // namespace ur_tracing_layer diff --git a/source/loader/layers/tracing/ur_tracing_layer.hpp b/source/loader/layers/tracing/ur_tracing_layer.hpp index 1a5c542ee6..b7e3fc0314 100644 --- a/source/loader/layers/tracing/ur_tracing_layer.hpp +++ b/source/loader/layers/tracing/ur_tracing_layer.hpp @@ -21,6 +21,8 @@ #define TRACING_COMP_NAME "tracing layer" namespace ur_tracing_layer { +struct XptiContextManager; + /////////////////////////////////////////////////////////////////////////////// class __urdlllocal context_t : public proxy_layer_context_t, public AtomicSingleton { @@ -47,6 +49,8 @@ class __urdlllocal context_t : public proxy_layer_context_t, uint8_t call_stream_id; inline static const std::string name = "UR_LAYER_TRACING"; + + std::shared_ptr xptiContextManager; }; context_t *getContext(); diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 315be97531..a368ae7b1a 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -705,7 +705,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. 
ur_device_handle_t @@ -719,14 +720,14 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( } ur_device_create_with_native_handle_params_t params = { - &hNativeDevice, &hPlatform, &pProperties, &phDevice}; + &hNativeDevice, &hAdapter, &pProperties, &phDevice}; uint64_t instance = getContext()->notify_begin(UR_FUNCTION_DEVICE_CREATE_WITH_NATIVE_HANDLE, "urDeviceCreateWithNativeHandle", &params); getContext()->logger.info("---> urDeviceCreateWithNativeHandle"); - ur_result_t result = pfnCreateWithNativeHandle(hNativeDevice, hPlatform, + ur_result_t result = pfnCreateWithNativeHandle(hNativeDevice, hAdapter, pProperties, phDevice); getContext()->notify_end(UR_FUNCTION_DEVICE_CREATE_WITH_NATIVE_HANDLE, @@ -6048,6 +6049,51 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urBindlessImagesMapExternalLinearMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory +) { + auto pfnMapExternalLinearMemoryExp = + getContext() + ->urDdiTable.BindlessImagesExp.pfnMapExternalLinearMemoryExp; + + if (nullptr == pfnMapExternalLinearMemoryExp) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_bindless_images_map_external_linear_memory_exp_params_t params = { + &hContext, &hDevice, &offset, &size, &hExternalMem, &ppRetMem}; + uint64_t instance = getContext()->notify_begin( + UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP, + 
"urBindlessImagesMapExternalLinearMemoryExp", &params); + + getContext()->logger.info( + "---> urBindlessImagesMapExternalLinearMemoryExp"); + + ur_result_t result = pfnMapExternalLinearMemoryExp( + hContext, hDevice, offset, size, hExternalMem, ppRetMem); + + getContext()->notify_end( + UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP, + "urBindlessImagesMapExternalLinearMemoryExp", &params, &result, + instance); + + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP, + &params); + getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urBindlessImagesReleaseExternalMemoryExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( @@ -8126,6 +8172,11 @@ __urdlllocal ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnMapExternalArrayExp = ur_tracing_layer::urBindlessImagesMapExternalArrayExp; + dditable.pfnMapExternalLinearMemoryExp = + pDdiTable->pfnMapExternalLinearMemoryExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + ur_tracing_layer::urBindlessImagesMapExternalLinearMemoryExp; + dditable.pfnReleaseExternalMemoryExp = pDdiTable->pfnReleaseExternalMemoryExp; pDdiTable->pfnReleaseExternalMemoryExp = diff --git a/source/loader/layers/validation/ur_leak_check.hpp b/source/loader/layers/validation/ur_leak_check.hpp index 56998797a3..7ce5415d96 100644 --- a/source/loader/layers/validation/ur_leak_check.hpp +++ b/source/loader/layers/validation/ur_leak_check.hpp @@ -109,7 +109,7 @@ struct RefCountContext { // No more active adapters, so any references still held are leaked if (adapterCount == 0) { logInvalidReferences(); - clear(); + counts.clear(); } } @@ -133,9 +133,8 @@ struct RefCountContext { updateRefCount(handle, REFCOUNT_CREATE_OR_INCREASE, isAdapterHandle); } - void clear() { 
counts.clear(); } - template bool isReferenceValid(T handle) { + std::unique_lock lock(mutex); auto it = counts.find(static_cast(handle)); if (it == counts.end() || it->second.refCount < 1) { return false; diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 8539951293..8c178b1091 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -719,7 +719,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. ur_device_handle_t @@ -733,7 +734,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( } if (getContext()->enableParameterValidation) { - if (NULL == hPlatform) { + if (NULL == hAdapter) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } @@ -742,7 +743,12 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( } } - ur_result_t result = pfnCreateWithNativeHandle(hNativeDevice, hPlatform, + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hAdapter)) { + getContext()->refCountContext->logInvalidReference(hAdapter); + } + + ur_result_t result = pfnCreateWithNativeHandle(hNativeDevice, hAdapter, pProperties, phDevice); if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { @@ -4822,9 +4828,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferRead( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hBuffer, offset, size); - boundsError != UR_RESULT_SUCCESS) { - return 
boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, offset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -4902,9 +4910,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWrite( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hBuffer, offset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, offset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5033,9 +5043,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferReadRect( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hBuffer, bufferOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, bufferOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5168,9 +5180,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hBuffer, bufferOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, bufferOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5248,14 +5262,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopy( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hBufferSrc, srcOffset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError 
= bounds(hBufferSrc, srcOffset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = bounds(hBufferDst, dstOffset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBufferDst, dstOffset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5383,14 +5401,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hBufferSrc, srcOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBufferSrc, srcOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = bounds(hBufferDst, dstOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBufferDst, dstOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5492,9 +5514,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferFill( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hBuffer, offset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, offset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5579,9 +5603,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageRead( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = boundsImage(hImage, origin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto 
boundsError = boundsImage(hImage, origin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5667,9 +5693,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageWrite( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = boundsImage(hImage, origin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = boundsImage(hImage, origin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5756,14 +5784,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageCopy( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = boundsImage(hImageSrc, srcOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = boundsImage(hImageSrc, srcOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = boundsImage(hImageDst, dstOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = boundsImage(hImageDst, dstOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5850,9 +5882,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferMap( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hBuffer, offset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, offset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6012,9 +6046,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill( return 
UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pMem, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pMem, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6089,14 +6125,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pDst, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pDst, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = bounds(hQueue, pSrc, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pSrc, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6169,9 +6209,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMPrefetch( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pMem, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pMem, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6230,9 +6272,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMAdvise( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hQueue, pMem, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pMem, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; 
+ } } } @@ -6332,9 +6376,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill2D( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pMem, 0, pitch * height); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pMem, 0, pitch * height); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6431,14 +6477,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pDst, 0, dstPitch * height); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pDst, 0, dstPitch * height); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = bounds(hQueue, pSrc, 0, srcPitch * height); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pSrc, 0, srcPitch * height); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -7549,6 +7599,59 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urBindlessImagesMapExternalLinearMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the 
external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory +) { + auto pfnMapExternalLinearMemoryExp = + getContext() + ->urDdiTable.BindlessImagesExp.pfnMapExternalLinearMemoryExp; + + if (nullptr == pfnMapExternalLinearMemoryExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == hContext) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == hDevice) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == hExternalMem) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == ppRetMem) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hContext)) { + getContext()->refCountContext->logInvalidReference(hContext); + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hDevice)) { + getContext()->refCountContext->logInvalidReference(hDevice); + } + + ur_result_t result = pfnMapExternalLinearMemoryExp( + hContext, hDevice, offset, size, hExternalMem, ppRetMem); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urBindlessImagesReleaseExternalMemoryExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( @@ -9751,6 +9854,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnMapExternalArrayExp = ur_validation_layer::urBindlessImagesMapExternalArrayExp; + dditable.pfnMapExternalLinearMemoryExp = + pDdiTable->pfnMapExternalLinearMemoryExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + ur_validation_layer::urBindlessImagesMapExternalLinearMemoryExp; + dditable.pfnReleaseExternalMemoryExp = pDdiTable->pfnReleaseExternalMemoryExp; pDdiTable->pfnReleaseExternalMemoryExp = @@ -10939,9 +11047,13 @@ ur_result_t context_t::init(ur_dditable_t *dditable, 
if (enabledLayerNames.count(nameFullValidation)) { enableParameterValidation = true; + enableBoundsChecking = true; enableLeakChecking = true; enableLifetimeValidation = true; } else { + if (enabledLayerNames.count(nameBoundsChecking)) { + enableBoundsChecking = true; + } if (enabledLayerNames.count(nameParameterValidation)) { enableParameterValidation = true; } @@ -11069,13 +11181,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, } ur_result_t context_t::tearDown() { - ur_result_t result = UR_RESULT_SUCCESS; - if (enableLeakChecking) { getContext()->refCountContext->logInvalidReferences(); - getContext()->refCountContext->clear(); } - return result; + + return UR_RESULT_SUCCESS; } } // namespace ur_validation_layer diff --git a/source/loader/layers/validation/ur_validation_layer.hpp b/source/loader/layers/validation/ur_validation_layer.hpp index aa3d4629b7..642829f7f0 100644 --- a/source/loader/layers/validation/ur_validation_layer.hpp +++ b/source/loader/layers/validation/ur_validation_layer.hpp @@ -24,6 +24,7 @@ class __urdlllocal context_t : public proxy_layer_context_t, public AtomicSingleton { public: bool enableParameterValidation = false; + bool enableBoundsChecking = false; bool enableLeakChecking = false; bool enableLifetimeValidation = false; logger::Logger logger; @@ -35,7 +36,7 @@ class __urdlllocal context_t : public proxy_layer_context_t, static std::vector getNames() { return {nameFullValidation, nameParameterValidation, nameLeakChecking, - nameLifetimeValidation}; + nameBoundsChecking, nameLifetimeValidation}; } ur_result_t init(ur_dditable_t *dditable, const std::set &enabledLayerNames, @@ -49,6 +50,8 @@ class __urdlllocal context_t : public proxy_layer_context_t, "UR_LAYER_FULL_VALIDATION"; inline static const std::string nameParameterValidation = "UR_LAYER_PARAMETER_VALIDATION"; + inline static const std::string nameBoundsChecking = + "UR_LAYER_BOUNDS_CHECKING"; inline static const std::string nameLeakChecking = 
"UR_LAYER_LEAK_CHECKING"; inline static const std::string nameLifetimeValidation = "UR_LAYER_LIFETIME_VALIDATION"; diff --git a/source/loader/loader.def.in b/source/loader/loader.def.in index b94442c9a7..5e628b4faf 100644 --- a/source/loader/loader.def.in +++ b/source/loader/loader.def.in @@ -12,6 +12,7 @@ EXPORTS urBindlessImagesImportExternalMemoryExp urBindlessImagesImportExternalSemaphoreExp urBindlessImagesMapExternalArrayExp + urBindlessImagesMapExternalLinearMemoryExp urBindlessImagesMipmapFreeExp urBindlessImagesMipmapGetLevelExp urBindlessImagesReleaseExternalMemoryExp @@ -181,6 +182,7 @@ EXPORTS urPrintBindlessImagesImportExternalMemoryExpParams urPrintBindlessImagesImportExternalSemaphoreExpParams urPrintBindlessImagesMapExternalArrayExpParams + urPrintBindlessImagesMapExternalLinearMemoryExpParams urPrintBindlessImagesMipmapFreeExpParams urPrintBindlessImagesMipmapGetLevelExpParams urPrintBindlessImagesReleaseExternalMemoryExpParams diff --git a/source/loader/loader.map.in b/source/loader/loader.map.in index ab5a7c19ab..18e4018aee 100644 --- a/source/loader/loader.map.in +++ b/source/loader/loader.map.in @@ -12,6 +12,7 @@ urBindlessImagesImportExternalMemoryExp; urBindlessImagesImportExternalSemaphoreExp; urBindlessImagesMapExternalArrayExp; + urBindlessImagesMapExternalLinearMemoryExp; urBindlessImagesMipmapFreeExp; urBindlessImagesMipmapGetLevelExp; urBindlessImagesReleaseExternalMemoryExp; @@ -181,6 +182,7 @@ urPrintBindlessImagesImportExternalMemoryExpParams; urPrintBindlessImagesImportExternalSemaphoreExpParams; urPrintBindlessImagesMapExternalArrayExpParams; + urPrintBindlessImagesMapExternalLinearMemoryExpParams; urPrintBindlessImagesMipmapFreeExpParams; urPrintBindlessImagesMipmapGetLevelExpParams; urPrintBindlessImagesReleaseExternalMemoryExpParams; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 1a867fb57d..c1d023af55 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -764,7 +764,8 @@ 
__urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. ur_device_handle_t @@ -775,8 +776,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( [[maybe_unused]] auto context = getContext(); // extract platform's function pointer table - auto dditable = - reinterpret_cast(hPlatform)->dditable; + auto dditable = reinterpret_cast(hAdapter)->dditable; auto pfnCreateWithNativeHandle = dditable->ur.Device.pfnCreateWithNativeHandle; if (nullptr == pfnCreateWithNativeHandle) { @@ -784,10 +784,10 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( } // convert loader handle to platform handle - hPlatform = reinterpret_cast(hPlatform)->handle; + hAdapter = reinterpret_cast(hAdapter)->handle; // forward to device-platform - result = pfnCreateWithNativeHandle(hNativeDevice, hPlatform, pProperties, + result = pfnCreateWithNativeHandle(hNativeDevice, hAdapter, pProperties, phDevice); if (UR_RESULT_SUCCESS != result) { @@ -6641,6 +6641,46 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urBindlessImagesMapExternalLinearMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of 
memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory +) { + ur_result_t result = UR_RESULT_SUCCESS; + + [[maybe_unused]] auto context = getContext(); + + // extract platform's function pointer table + auto dditable = reinterpret_cast(hContext)->dditable; + auto pfnMapExternalLinearMemoryExp = + dditable->ur.BindlessImagesExp.pfnMapExternalLinearMemoryExp; + if (nullptr == pfnMapExternalLinearMemoryExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + // convert loader handle to platform handle + hContext = reinterpret_cast(hContext)->handle; + + // convert loader handle to platform handle + hDevice = reinterpret_cast(hDevice)->handle; + + // convert loader handle to platform handle + hExternalMem = + reinterpret_cast(hExternalMem)->handle; + + // forward to device-platform + result = pfnMapExternalLinearMemoryExp(hContext, hDevice, offset, size, + hExternalMem, ppRetMem); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urBindlessImagesReleaseExternalMemoryExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( @@ -8691,6 +8731,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( ur_loader::urBindlessImagesImportExternalMemoryExp; pDdiTable->pfnMapExternalArrayExp = ur_loader::urBindlessImagesMapExternalArrayExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + ur_loader::urBindlessImagesMapExternalLinearMemoryExp; pDdiTable->pfnReleaseExternalMemoryExp = ur_loader::urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = diff --git a/source/loader/ur_lib.cpp b/source/loader/ur_lib.cpp index 12b159b0e5..9aad7159c3 100644 --- a/source/loader/ur_lib.cpp +++ b/source/loader/ur_lib.cpp @@ -57,7 +57,8 @@ void context_t::initLayers() const { } void 
context_t::tearDownLayers() const { - for (auto &[layer, destroy] : layers) { + for (auto it = layers.rbegin(); it != layers.rend(); ++it) { + auto [layer, destroy] = *it; layer->tearDown(); destroy(); } diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index aadcb5cda1..5ab7c58803 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -1135,7 +1135,7 @@ ur_result_t UR_APICALL urDeviceGetNativeHandle( /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hPlatform` +/// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phDevice` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -1143,7 +1143,8 @@ ur_result_t UR_APICALL urDeviceGetNativeHandle( ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. ur_device_handle_t @@ -1155,7 +1156,7 @@ ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( return UR_RESULT_ERROR_UNINITIALIZED; } - return pfnCreateWithNativeHandle(hNativeDevice, hPlatform, pProperties, + return pfnCreateWithNativeHandle(hNativeDevice, hAdapter, pProperties, phDevice); } catch (...) 
{ return exceptionToResult(std::current_exception()); @@ -7111,6 +7112,48 @@ ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Map an external memory handle to a device memory region described by +/// void* +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// + `NULL == hDevice` +/// + `NULL == hExternalMem` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == ppRetMem` +/// - ::UR_RESULT_ERROR_INVALID_CONTEXT +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory + ) try { + auto pfnMapExternalLinearMemoryExp = + ur_lib::getContext() + ->urDdiTable.BindlessImagesExp.pfnMapExternalLinearMemoryExp; + if (nullptr == pfnMapExternalLinearMemoryExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnMapExternalLinearMemoryExp(hContext, hDevice, offset, size, + hExternalMem, ppRetMem); +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Release external memory /// diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index db31600e2c..f9d510e95d 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -1219,6 +1219,15 @@ ur_result_t urPrintBindlessImagesMapExternalArrayExpParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintBindlessImagesMapExternalLinearMemoryExpParams( + const struct ur_bindless_images_map_external_linear_memory_exp_params_t + *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintBindlessImagesReleaseExternalMemoryExpParams( const struct ur_bindless_images_release_external_memory_exp_params_t *params, diff --git a/source/ur/ur.cpp b/source/ur/ur.cpp index cff431069a..a9da40775e 100644 --- a/source/ur/ur.cpp +++ b/source/ur/ur.cpp @@ -14,10 +14,11 @@ // Controls tracing UR calls from within the UR itself. bool PrintTrace = [] { + const char *UrRet = std::getenv("SYCL_UR_TRACE"); const char *PiRet = std::getenv("SYCL_PI_TRACE"); - const char *Trace = PiRet ? PiRet : nullptr; + const char *Trace = UrRet ? UrRet : (PiRet ? PiRet : nullptr); const int TraceValue = Trace ? 
std::stoi(Trace) : 0; - if (TraceValue == -1 || TraceValue == 2) { // Means print all traces + if (TraceValue == -1 || TraceValue == 2) { return true; } return false; diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 0b3ee0b936..54bfdcda42 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -997,7 +997,7 @@ ur_result_t UR_APICALL urDeviceGetNativeHandle( /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hPlatform` +/// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phDevice` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -1005,7 +1005,8 @@ ur_result_t UR_APICALL urDeviceGetNativeHandle( ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. 
ur_device_handle_t @@ -6051,6 +6052,39 @@ ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Map an external memory handle to a device memory region described by +/// void* +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// + `NULL == hDevice` +/// + `NULL == hExternalMem` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == ppRetMem` +/// - ::UR_RESULT_ERROR_INVALID_CONTEXT +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory +) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Release external memory /// diff --git a/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp b/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp index 6eb502907b..e4ac022507 100644 --- a/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp +++ b/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp @@ -17,6 +17,6 @@ TEST_F(urCudaDeviceCreateWithNativeHandle, Success) { ur_native_handle_t nativeCuda = static_cast(cudaDevice); ur_device_handle_t urDevice; - 
ASSERT_SUCCESS(urDeviceCreateWithNativeHandle(nativeCuda, platform, nullptr, + ASSERT_SUCCESS(urDeviceCreateWithNativeHandle(nativeCuda, adapter, nullptr, &urDevice)); } diff --git a/test/adapters/level_zero/v2/command_list_cache_test.cpp b/test/adapters/level_zero/v2/command_list_cache_test.cpp index b8c7244352..31e40244fd 100644 --- a/test/adapters/level_zero/v2/command_list_cache_test.cpp +++ b/test/adapters/level_zero/v2/command_list_cache_test.cpp @@ -193,10 +193,10 @@ TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) { QueueProps.pNext = &IndexProps; } - ur_queue_handle_t Queue; - ASSERT_EQ( - urQueueCreate(context, device, &QueueProps, &Queue), - UR_RESULT_SUCCESS); + uur::raii::Queue Queue; + ASSERT_EQ(urQueueCreate(context, device, &QueueProps, + Queue.ptr()), + UR_RESULT_SUCCESS); Queues.emplace_back(Queue); } diff --git a/test/adapters/level_zero/v2/event_pool_test.cpp b/test/adapters/level_zero/v2/event_pool_test.cpp index b4f7e46f11..e2aa3dc121 100644 --- a/test/adapters/level_zero/v2/event_pool_test.cpp +++ b/test/adapters/level_zero/v2/event_pool_test.cpp @@ -139,7 +139,7 @@ TEST_P(EventPoolTest, Basic) { ur_event *first; ze_event_handle_t zeFirst; { - auto pool = cache->borrow(device->Id); + auto pool = cache->borrow(device->Id.value()); first = pool->allocate(); zeFirst = first->getZeEvent(); @@ -148,7 +148,7 @@ TEST_P(EventPoolTest, Basic) { ur_event *second; ze_event_handle_t zeSecond; { - auto pool = cache->borrow(device->Id); + auto pool = cache->borrow(device->Id.value()); second = pool->allocate(); zeSecond = second->getZeEvent(); @@ -165,7 +165,7 @@ TEST_P(EventPoolTest, Threaded) { for (int iters = 0; iters < 3; ++iters) { for (int th = 0; th < 10; ++th) { threads.emplace_back([&] { - auto pool = cache->borrow(device->Id); + auto pool = cache->borrow(device->Id.value()); std::vector events; for (int i = 0; i < 100; ++i) { events.push_back(pool->allocate()); @@ -183,7 +183,7 @@ TEST_P(EventPoolTest, Threaded) { } 
TEST_P(EventPoolTest, ProviderNormalUseMostFreePool) { - auto pool = cache->borrow(device->Id); + auto pool = cache->borrow(device->Id.value()); std::list events; for (int i = 0; i < 128; ++i) { events.push_back(pool->allocate()); diff --git a/test/conformance/device/urDeviceCreateWithNativeHandle.cpp b/test/conformance/device/urDeviceCreateWithNativeHandle.cpp index 071183aa9b..5d64b11e09 100644 --- a/test/conformance/device/urDeviceCreateWithNativeHandle.cpp +++ b/test/conformance/device/urDeviceCreateWithNativeHandle.cpp @@ -20,7 +20,7 @@ TEST_F(urDeviceCreateWithNativeHandleTest, Success) { // and perform some query on it to verify that it works. ur_device_handle_t dev = nullptr; UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urDeviceCreateWithNativeHandle( - native_handle, platform, nullptr, &dev)); + native_handle, adapter, nullptr, &dev)); ASSERT_NE(dev, nullptr); uint32_t dev_id = 0; @@ -41,7 +41,7 @@ TEST_F(urDeviceCreateWithNativeHandleTest, SuccessWithOwnedNativeHandle) { ur_device_native_properties_t props{ UR_STRUCTURE_TYPE_DEVICE_NATIVE_PROPERTIES, nullptr, true}; UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urDeviceCreateWithNativeHandle( - native_handle, platform, &props, &dev)); + native_handle, adapter, &props, &dev)); ASSERT_NE(dev, nullptr); uint32_t ref_count = 0; @@ -64,7 +64,7 @@ TEST_F(urDeviceCreateWithNativeHandleTest, SuccessWithUnOwnedNativeHandle) { ur_device_native_properties_t props{ UR_STRUCTURE_TYPE_DEVICE_NATIVE_PROPERTIES, nullptr, false}; UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urDeviceCreateWithNativeHandle( - native_handle, platform, &props, &dev)); + native_handle, adapter, &props, &dev)); ASSERT_NE(dev, nullptr); uint32_t ref_count = 0; @@ -93,7 +93,7 @@ TEST_F(urDeviceCreateWithNativeHandleTest, InvalidNullPointerDevice) { ASSERT_SUCCESS(urDeviceGetNativeHandle(device, &native_handle)); ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, - urDeviceCreateWithNativeHandle(native_handle, platform, + urDeviceCreateWithNativeHandle(native_handle, adapter, 
nullptr, nullptr)); } } diff --git a/test/conformance/exp_command_buffer/CMakeLists.txt b/test/conformance/exp_command_buffer/CMakeLists.txt index a8ecf793ab..a28d692d9b 100644 --- a/test/conformance/exp_command_buffer/CMakeLists.txt +++ b/test/conformance/exp_command_buffer/CMakeLists.txt @@ -12,4 +12,6 @@ add_conformance_test_with_kernels_environment(exp_command_buffer release.cpp retain.cpp invalid_update.cpp + commands.cpp + fill.cpp ) diff --git a/test/conformance/exp_command_buffer/commands.cpp b/test/conformance/exp_command_buffer/commands.cpp new file mode 100644 index 0000000000..412e4ab6de --- /dev/null +++ b/test/conformance/exp_command_buffer/commands.cpp @@ -0,0 +1,204 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" + +struct urCommandBufferCommandsTest + : uur::command_buffer::urCommandBufferExpTest { + + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferExpTest::SetUp()); + + // Allocate USM pointers + for (auto &device_ptr : device_ptrs) { + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + allocation_size, &device_ptr)); + ASSERT_NE(device_ptr, nullptr); + } + + for (auto &buffer : buffers) { + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, + allocation_size, nullptr, + &buffer)); + + ASSERT_NE(buffer, nullptr); + } + } + + void TearDown() override { + for (auto &device_ptr : device_ptrs) { + if (device_ptr) { + EXPECT_SUCCESS(urUSMFree(context, device_ptr)); + } + } + + for (auto &buffer : buffers) { + if (buffer) { + EXPECT_SUCCESS(urMemRelease(buffer)); + } + } + + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferExpTest::TearDown()); + } + + static constexpr unsigned elements = 16; + static constexpr size_t allocation_size = elements * sizeof(uint32_t); + + 
std::array device_ptrs = {nullptr, nullptr}; + std::array buffers = {nullptr, nullptr}; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCommandBufferCommandsTest); + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMMemcpyExp) { + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + cmd_buf_handle, device_ptrs[0], device_ptrs[1], allocation_size, 0, + nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMFillExp) { + uint32_t pattern = 42; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptrs[0], &pattern, sizeof(pattern), + allocation_size, 0, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferCopyExp) { + ASSERT_SUCCESS(urCommandBufferAppendMemBufferCopyExp( + cmd_buf_handle, buffers[0], buffers[1], 0, 0, allocation_size, 0, + nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferCopyRectExp) { + ur_rect_offset_t origin{0, 0, 0}; + ur_rect_region_t region{4, 4, 1}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferCopyRectExp( + cmd_buf_handle, buffers[0], buffers[1], origin, origin, region, 4, 16, + 4, 16, 0, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferReadExp) { + std::array host_data{}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferReadExp( + cmd_buf_handle, buffers[0], 0, allocation_size, host_data.data(), 0, + nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferReadRectExp) { + std::array host_data{}; + ur_rect_offset_t origin{0, 0, 0}; + ur_rect_region_t region{4, 4, 1}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferReadRectExp( + cmd_buf_handle, buffers[0], origin, origin, region, 4, 16, 4, 16, + host_data.data(), 0, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferWriteExp) { + std::array host_data{}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferWriteExp( + cmd_buf_handle, buffers[0], 
0, allocation_size, host_data.data(), 0, + nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, + urCommandBufferAppendMemBufferWriteRectExp) { + std::array host_data{}; + ur_rect_offset_t origin{0, 0, 0}; + ur_rect_region_t region{4, 4, 1}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferWriteRectExp( + cmd_buf_handle, buffers[0], origin, origin, region, 4, 16, 4, 16, + host_data.data(), 0, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferFillExp) { + uint32_t pattern = 42; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp( + cmd_buf_handle, buffers[0], &pattern, sizeof(pattern), 0, + allocation_size, 0, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMPrefetchExp) { + ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( + cmd_buf_handle, device_ptrs[0], allocation_size, 0, 0, nullptr, + nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMAdviseExp) { + ASSERT_SUCCESS(urCommandBufferAppendUSMAdviseExp( + cmd_buf_handle, device_ptrs[0], allocation_size, 0, 0, nullptr, + nullptr)); +} + +struct urCommandBufferAppendKernelLaunchExpTest + : uur::command_buffer::urCommandBufferExpExecutionTest { + virtual void SetUp() override { + program_name = "saxpy_usm"; + UUR_RETURN_ON_FATAL_FAILURE(urCommandBufferExpExecutionTest::SetUp()); + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + } + + int32_t *ptrX = static_cast(shared_ptrs[1]); + int32_t *ptrY = static_cast(shared_ptrs[2]); + for (size_t i = 0; i < global_size; i++) { + ptrX[i] = i; + ptrY[i] = i * 2; + } + + // Index 0 is output + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 0, nullptr, shared_ptrs[0])); + // Index 1 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, 1, sizeof(A), nullptr, &A)); + // Index 2 is X + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 2, 
nullptr, shared_ptrs[1])); + // Index 3 is Y + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 3, nullptr, shared_ptrs[2])); + } + + virtual void TearDown() override { + for (auto &shared_ptr : shared_ptrs) { + if (shared_ptr) { + EXPECT_SUCCESS(urUSMFree(context, shared_ptr)); + } + } + + UUR_RETURN_ON_FATAL_FAILURE( + urCommandBufferExpExecutionTest::TearDown()); + } + + static constexpr size_t local_size = 4; + static constexpr size_t global_size = 32; + static constexpr size_t global_offset = 0; + static constexpr size_t n_dimensions = 1; + static constexpr size_t allocation_size = sizeof(uint32_t) * global_size; + static constexpr uint32_t A = 42; + std::array shared_ptrs = {nullptr, nullptr, nullptr}; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCommandBufferAppendKernelLaunchExpTest); +TEST_P(urCommandBufferAppendKernelLaunchExpTest, Basic) { + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + cmd_buf_handle, kernel, n_dimensions, &global_offset, &global_size, + &local_size, 0, nullptr, nullptr, nullptr)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + int32_t *ptrZ = static_cast(shared_ptrs[0]); + for (size_t i = 0; i < global_size; i++) { + uint32_t result = (A * i) + (i * 2); + ASSERT_EQ(result, ptrZ[i]); + } +} diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match index e69de29bb2..8b13789179 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match @@ -0,0 +1 @@ + diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match index e69de29bb2..8b13789179 100644 --- 
a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match @@ -0,0 +1 @@ + diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero-v2.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero-v2.match index e69de29bb2..afb0fb95c5 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero-v2.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero-v2.match @@ -0,0 +1,15 @@ +urCommandBufferAppendKernelLaunchExpTest.Basic/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} 
+urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero__{{.*}} diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match index 0a5a2b1317..2508f92fed 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match @@ -25,3 +25,4 @@ {{OPT}}InvalidUpdateTest.GlobalLocalSizeMistach/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}InvalidUpdateTest.ImplToUserDefinedLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}InvalidUpdateTest.UserToImplDefinedLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urCommandBufferAppendKernelLaunchExpTest.Basic/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} diff --git a/test/conformance/exp_command_buffer/fill.cpp b/test/conformance/exp_command_buffer/fill.cpp new file mode 100644 index 0000000000..2b9a27cf2a --- /dev/null +++ b/test/conformance/exp_command_buffer/fill.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" + +struct testParametersFill { + size_t size; + size_t pattern_size; +}; + +struct urCommandBufferFillCommandsTest + : uur::command_buffer::urCommandBufferExpTestWithParam { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferExpTestWithParam< + testParametersFill>::SetUp()); + + size = std::get<1>(GetParam()).size; + pattern_size = std::get<1>(GetParam()).pattern_size; + pattern = std::vector(pattern_size); + uur::generateMemFillPattern(pattern); + + // Allocate USM pointers + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, size, + &device_ptr)); + ASSERT_NE(device_ptr, nullptr); + + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size, + nullptr, &buffer)); + + ASSERT_NE(buffer, nullptr); + } + + void TearDown() override { + if (device_ptr) { + EXPECT_SUCCESS(urUSMFree(context, device_ptr)); + } + + if (buffer) { + EXPECT_SUCCESS(urMemRelease(buffer)); + } + + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferExpTestWithParam< + testParametersFill>::TearDown()); + } + + void verifyData(std::vector &output, size_t verify_size) { + size_t pattern_index = 0; + for (size_t i = 0; i < verify_size; ++i) { + ASSERT_EQ(output[i], pattern[pattern_index]) + << "Result mismatch at index: " << i; + + ++pattern_index; + if (pattern_index % pattern_size == 0) { + pattern_index = 0; + } + } + } + + static constexpr unsigned elements = 16; + static constexpr size_t allocation_size = elements * sizeof(uint32_t); + + std::vector pattern; + size_t size; + size_t pattern_size; + + ur_exp_command_buffer_sync_point_t sync_point; + void *device_ptr = nullptr; + ur_mem_handle_t buffer = nullptr; +}; + +static std::vector test_cases{ + /* Everything set to 1 */ + {1, 1}, + /* pattern_size == size */ + {256, 256}, + /* pattern_size < size */ + {1024, 256}, + /* pattern sizes corresponding 
to some common scalar and vector types */ + {256, 4}, + {256, 8}, + {256, 16}, + {256, 32}}; + +template +static std::string +printFillTestString(const testing::TestParamInfo &info) { + const auto device_handle = std::get<0>(info.param); + const auto platform_device_name = + uur::GetPlatformAndDeviceName(device_handle); + std::stringstream test_name; + test_name << platform_device_name << "__size__" + << std::get<1>(info.param).size << "__patternSize__" + << std::get<1>(info.param).pattern_size; + return test_name.str(); +} + +UUR_TEST_SUITE_P(urCommandBufferFillCommandsTest, testing::ValuesIn(test_cases), + printFillTestString); + +TEST_P(urCommandBufferFillCommandsTest, Buffer) { + ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp( + cmd_buf_handle, buffer, pattern.data(), pattern_size, 0, size, 0, + nullptr, &sync_point)); + + std::vector output(size, 1); + ASSERT_SUCCESS(urCommandBufferAppendMemBufferReadExp( + cmd_buf_handle, buffer, 0, size, output.data(), 1, &sync_point, + nullptr)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + verifyData(output, size); +} + +TEST_P(urCommandBufferFillCommandsTest, USM) { + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptr, pattern.data(), pattern_size, size, 0, + nullptr, &sync_point)); + + std::vector output(size, 1); + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + cmd_buf_handle, output.data(), device_ptr, size, 1, &sync_point, + nullptr)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + verifyData(output, size); +} diff --git a/test/conformance/exp_command_buffer/fixtures.h b/test/conformance/exp_command_buffer/fixtures.h index eeb0a5d5d8..85457bea97 100644 --- 
a/test/conformance/exp_command_buffer/fixtures.h +++ b/test/conformance/exp_command_buffer/fixtures.h @@ -55,6 +55,46 @@ struct urCommandBufferExpTest : uur::urContextTest { ur_bool_t updatable_command_buffer_support = false; }; +template +struct urCommandBufferExpTestWithParam : urQueueTestWithParam { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTestWithParam::SetUp()); + + size_t returned_size; + ASSERT_SUCCESS(urDeviceGetInfo(this->device, UR_DEVICE_INFO_EXTENSIONS, + 0, nullptr, &returned_size)); + + std::unique_ptr returned_extensions(new char[returned_size]); + + ASSERT_SUCCESS(urDeviceGetInfo(this->device, UR_DEVICE_INFO_EXTENSIONS, + returned_size, returned_extensions.get(), + nullptr)); + + std::string_view extensions_string(returned_extensions.get()); + bool command_buffer_support = + extensions_string.find(UR_COMMAND_BUFFER_EXTENSION_STRING_EXP) != + std::string::npos; + + if (!command_buffer_support) { + GTEST_SKIP() << "EXP command-buffer feature is not supported."; + } + + // Create a command-buffer + ASSERT_SUCCESS(urCommandBufferCreateExp(this->context, this->device, + nullptr, &cmd_buf_handle)); + ASSERT_NE(cmd_buf_handle, nullptr); + } + + void TearDown() override { + if (cmd_buf_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); + } + UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTestWithParam::TearDown()); + } + + ur_exp_command_buffer_handle_t cmd_buf_handle = nullptr; +}; + struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::SetUp()); diff --git a/test/conformance/kernel/kernel_adapter_level_zero-v2.match b/test/conformance/kernel/kernel_adapter_level_zero-v2.match index 77d2096d92..9589496bcb 100644 --- a/test/conformance/kernel/kernel_adapter_level_zero-v2.match +++ b/test/conformance/kernel/kernel_adapter_level_zero-v2.match @@ -1,17 +1,10 @@ 
-urKernelGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_REGS -urKernelSetArgLocalTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgPointerTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgPointerTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgPointerTest.SuccessShared/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgSamplerTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgValueTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgValueTest.InvalidKernelArgumentSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoTest.SuccessIndirectAccess/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoUSMPointersTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoUSMPointersTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoUSMPointersTest.SuccessShared/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelGetSuggestedLocalWorkSizeTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelGetSuggestedLocalWorkSizeTest.Success2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelGetSuggestedLocalWorkSizeTest.Success3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelGetSuggestedLocalWorkSizeTest.Success3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ \ No newline at end of file diff --git 
a/test/conformance/kernel/kernel_adapter_level_zero.match b/test/conformance/kernel/kernel_adapter_level_zero.match index 7b74766ac2..c448f6363a 100644 --- a/test/conformance/kernel/kernel_adapter_level_zero.match +++ b/test/conformance/kernel/kernel_adapter_level_zero.match @@ -1,10 +1,3 @@ -urKernelGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_REGS -urKernelSetArgLocalTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgSamplerTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgValueTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgValueTest.InvalidKernelArgumentSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoTest.SuccessIndirectAccess/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoUSMPointersTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoUSMPointersTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/memory/urMemImageCreate.cpp b/test/conformance/memory/urMemImageCreate.cpp index ea210a921f..28d5d9c4e3 100644 --- a/test/conformance/memory/urMemImageCreate.cpp +++ b/test/conformance/memory/urMemImageCreate.cpp @@ -26,10 +26,10 @@ struct urMemImageCreateTest : public uur::urContextTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(uur::urContextTest::SetUp()); - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; auto ret = urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, - &image_desc, nullptr, &image_handle); + 
&image_desc, nullptr, image_handle.ptr()); if (ret == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { GTEST_SKIP() << "urMemImageCreate not supported"; @@ -50,10 +50,10 @@ struct urMemImageCreateTestWithParam UUR_RETURN_ON_FATAL_FAILURE( uur::urContextTestWithParam::SetUp()); - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; auto ret = urMemImageCreate(this->context, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc, nullptr, - &image_handle); + image_handle.ptr()); if (ret == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { GTEST_SKIP() << "urMemImageCreate not supported"; @@ -89,12 +89,11 @@ TEST_P(urMemImageCreateTestWith1DMemoryTypeParam, Success) { 0 ///< [in] number of samples }; - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc_with_param, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); ASSERT_NE(nullptr, image_handle); - ASSERT_SUCCESS(urMemRelease(image_handle)); } using urMemImageCreateTestWith2DMemoryTypeParam = @@ -120,12 +119,11 @@ TEST_P(urMemImageCreateTestWith2DMemoryTypeParam, Success) { 0 ///< [in] number of samples }; - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc_with_param, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); ASSERT_NE(nullptr, image_handle); - ASSERT_SUCCESS(urMemRelease(image_handle)); } TEST_P(urMemImageCreateTest, SuccessWith3DImageType) { @@ -143,28 +141,27 @@ TEST_P(urMemImageCreateTest, SuccessWith3DImageType) { 0 ///< [in] number of samples }; - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc_with_param, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); ASSERT_NE(nullptr, image_handle); - 
ASSERT_SUCCESS(urMemRelease(image_handle)); } TEST_P(urMemImageCreateTest, InvalidNullHandleContext) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urMemImageCreate(nullptr, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc, nullptr, - &image_handle)); + image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidEnumerationFlags) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_ENUMERATION, urMemImageCreate(context, UR_MEM_FLAG_FORCE_UINT32, &image_format, &image_desc, nullptr, - &image_handle)); + image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidNullPointerBuffer) { @@ -175,23 +172,24 @@ TEST_P(urMemImageCreateTest, InvalidNullPointerBuffer) { } TEST_P(urMemImageCreateTest, InvalidNullPointerImageDesc) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, nullptr, nullptr, - &image_handle)); + image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidNullPointerImageFormat) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, nullptr, - &image_desc, nullptr, &image_handle)); + &image_desc, nullptr, + image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidSize) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.width = std::numeric_limits::max(); @@ -199,7 +197,7 @@ TEST_P(urMemImageCreateTest, InvalidSize) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_SIZE, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, 
image_handle.ptr())); invalid_image_desc = image_desc; invalid_image_desc.height = std::numeric_limits<size_t>::max(); @@ -207,7 +205,7 @@ TEST_P(urMemImageCreateTest, InvalidSize) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_SIZE, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); invalid_image_desc = image_desc; invalid_image_desc.depth = std::numeric_limits<size_t>::max(); @@ -215,21 +213,21 @@ ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_SIZE, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescStype) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.stype = UR_STRUCTURE_TYPE_FORCE_UINT32; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescType) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.type = UR_MEM_TYPE_FORCE_UINT32; @@ -237,11 +235,11 @@ TEST_P(urMemImageCreateTest, InvalidImageDescType) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescNumMipLevel) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.numMipLevel = 1; /* Must be 0 */ @@ -249,11 +247,11 @@
TEST_P(urMemImageCreateTest, InvalidImageDescNumMipLevel) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescNumSamples) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.numSamples = 1; /* Must be 0 */ @@ -261,11 +259,11 @@ TEST_P(urMemImageCreateTest, InvalidImageDescNumSamples) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescRowPitch) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.rowPitch = 1; /* Must be 0 if pHost is NULL */ @@ -273,11 +271,11 @@ TEST_P(urMemImageCreateTest, InvalidImageDescRowPitch) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescSlicePitch) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.slicePitch = 1; /* Must be 0 if pHost is NULL */ @@ -285,7 +283,7 @@ TEST_P(urMemImageCreateTest, InvalidImageDescSlicePitch) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } using urMemImageCreateWithHostPtrFlagsTest = @@ -310,8 +308,9 @@ 
TEST_P(urMemImageCreateWithHostPtrFlagsTest, Success) { } TEST_P(urMemImageCreateWithHostPtrFlagsTest, InvalidHostPtr) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_HOST_PTR, urMemImageCreate(context, getParam(), &image_format, - &image_desc, nullptr, &image_handle)); + &image_desc, nullptr, + image_handle.ptr())); } diff --git a/test/conformance/program/program_adapter_level_zero-v2.match b/test/conformance/program/program_adapter_level_zero-v2.match index 05b71211b8..c25be22424 100644 --- a/test/conformance/program/program_adapter_level_zero-v2.match +++ b/test/conformance/program/program_adapter_level_zero-v2.match @@ -1,6 +1,9 @@ urProgramCreateWithNativeHandleTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urProgramCreateWithNativeHandleTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urProgramCreateWithNativeHandleTest.InvalidNullPointerProgram/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urProgramGetBuildInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROGRAM_BUILD_INFO_STATUS +urProgramGetBuildInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_UR_PROGRAM_BUILD_INFO_STATUS urProgramGetFunctionPointerTest.InvalidKernelName/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -Aborted +urProgramGetNativeHandleTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urProgramLinkErrorTest.LinkFailure/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urProgramLinkErrorTest.SetOutputOnLinkError/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +Segmentation fault \ No newline at end of file diff --git a/test/conformance/program/program_adapter_level_zero.match b/test/conformance/program/program_adapter_level_zero.match index 9e902dca94..f8d65b426e 100644 --- 
a/test/conformance/program/program_adapter_level_zero.match +++ b/test/conformance/program/program_adapter_level_zero.match @@ -3,4 +3,6 @@ urProgramCreateWithNativeHandleTest.InvalidNullHandleContext/Intel_R__oneAPI_Uni urProgramCreateWithNativeHandleTest.InvalidNullPointerProgram/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urProgramGetBuildInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_UR_PROGRAM_BUILD_INFO_STATUS urProgramGetFunctionPointerTest.InvalidKernelName/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -Aborted +urProgramGetNativeHandleTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urProgramLinkErrorTest.LinkFailure/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urProgramLinkErrorTest.SetOutputOnLinkError/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ \ No newline at end of file diff --git a/test/layers/CMakeLists.txt b/test/layers/CMakeLists.txt index 2c10a08518..fbf532c274 100644 --- a/test/layers/CMakeLists.txt +++ b/test/layers/CMakeLists.txt @@ -8,3 +8,7 @@ add_subdirectory(validation) if(UR_ENABLE_TRACING) add_subdirectory(tracing) endif() + +if(UR_ENABLE_SANITIZER) + add_subdirectory(sanitizer) +endif() diff --git a/test/layers/sanitizer/CMakeLists.txt b/test/layers/sanitizer/CMakeLists.txt new file mode 100644 index 0000000000..a9601a89c8 --- /dev/null +++ b/test/layers/sanitizer/CMakeLists.txt @@ -0,0 +1,37 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +set(UR_SANITIZER_TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(SAN_TEST_PREFIX sanitizer_test) + +function(add_sanitizer_test_executable name) + add_ur_executable(${SAN_TEST_PREFIX}-${name} + ${ARGN}) + target_link_libraries(${SAN_TEST_PREFIX}-${name} + PRIVATE + ${PROJECT_NAME}::loader + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::testing + ${PROJECT_NAME}::mock + GTest::gtest_main) +endfunction() + +function(set_sanitizer_test_properties name) + set_tests_properties(${name} PROPERTIES LABELS "sanitizer") + set_property(TEST ${name} PROPERTY ENVIRONMENT + "UR_LOG_SANITIZER=level:debug\;flush:debug\;output:stdout") +endfunction() + +function(add_sanitizer_test name) + add_sanitizer_test_executable(${name} ${ARGN}) + + add_test(NAME ${name} + COMMAND ${SAN_TEST_PREFIX}-${name} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + + set_sanitizer_test_properties(${name}) +endfunction() + +add_sanitizer_test(asan asan.cpp) diff --git a/test/layers/sanitizer/asan.cpp b/test/layers/sanitizer/asan.cpp new file mode 100644 index 0000000000..0fbfe4cefe --- /dev/null +++ b/test/layers/sanitizer/asan.cpp @@ -0,0 +1,58 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan.cpp + * + */ + +#include <gtest/gtest.h> +#include <ur_api.h> + +TEST(DeviceAsan, Initialization) { + ur_result_t status; + + ur_loader_config_handle_t loaderConfig; + status = urLoaderConfigCreate(&loaderConfig); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + status = urLoaderConfigEnableLayer(loaderConfig, "UR_LAYER_ASAN"); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urLoaderInit(0, loaderConfig); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + ur_adapter_handle_t adapter; + status = urAdapterGet(1, &adapter, nullptr); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + ur_platform_handle_t platform; + status = urPlatformGet(&adapter, 1, 1, &platform, nullptr); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + ur_device_handle_t device; + status = urDeviceGet(platform, UR_DEVICE_TYPE_DEFAULT, 1, &device, nullptr); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + ur_context_handle_t context; + status = urContextCreate(1, &device, nullptr, &context); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urContextRelease(context); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urDeviceRelease(device); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urAdapterRelease(adapter); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urLoaderTearDown(); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urLoaderConfigRelease(loaderConfig); + ASSERT_EQ(status, UR_RESULT_SUCCESS); +} diff --git a/test/layers/tracing/test_collector.cpp b/test/layers/tracing/test_collector.cpp index 6c942c63ec..2e412427a7 100644 --- a/test/layers/tracing/test_collector.cpp +++ b/test/layers/tracing/test_collector.cpp @@ -25,7 +25,7 @@ constexpr uint16_t TRACE_FN_BEGIN = static_cast<uint16_t>(xpti::trace_point_type_t::function_with_args_begin); constexpr uint16_t TRACE_FN_END = static_cast<uint16_t>(xpti::trace_point_type_t::function_with_args_end); -constexpr std::string_view UR_STREAM_NAME = "ur"; +constexpr std::string_view UR_STREAM_NAME =
"ur.call"; XPTI_CALLBACK_API void trace_cb(uint16_t trace_type, xpti::trace_event_data_t *, xpti::trace_event_data_t *child, uint64_t, diff --git a/test/layers/validation/fixtures.hpp b/test/layers/validation/fixtures.hpp index 9e261f0a1d..00b983138f 100644 --- a/test/layers/validation/fixtures.hpp +++ b/test/layers/validation/fixtures.hpp @@ -133,7 +133,7 @@ inline ur_result_t genericSuccessCallback(void *) { return UR_RESULT_SUCCESS; }; // This returns valid (non-null) handles that we can safely leak. inline ur_result_t fakeContext_urContextCreate(void *pParams) { static std::atomic_int handle = 42; - auto params = *static_cast<ur_context_create_params_t *>(pParams); + const auto &params = *static_cast<ur_context_create_params_t *>(pParams); // There are two casts because windows doesn't implicitly extend the 32 bit // result of atomic_int::operator++. **params.pphContext = diff --git a/test/layers/validation/leaks.cpp b/test/layers/validation/leaks.cpp index 59b6bdb750..cd4fc4b739 100644 --- a/test/layers/validation/leaks.cpp +++ b/test/layers/validation/leaks.cpp @@ -9,7 +9,7 @@ // We need a fake handle for the below adapter leak test.
inline ur_result_t fakeAdapter_urAdapterGet(void *pParams) { - auto params = *static_cast<ur_adapter_get_params_t *>(pParams); + const auto &params = *static_cast<ur_adapter_get_params_t *>(pParams); **params.pphAdapters = reinterpret_cast<ur_adapter_handle_t>(0x1); return UR_RESULT_SUCCESS; } diff --git a/test/loader/handles/fixtures.hpp b/test/loader/handles/fixtures.hpp index 8044c90414..441433d899 100644 --- a/test/loader/handles/fixtures.hpp +++ b/test/loader/handles/fixtures.hpp @@ -15,7 +15,7 @@ #endif ur_result_t replace_urPlatformGet(void *pParams) { - auto params = *static_cast<ur_platform_get_params_t *>(pParams); + const auto &params = *static_cast<ur_platform_get_params_t *>(pParams); if (*params.ppNumPlatforms) { **params.ppNumPlatforms = 1; @@ -29,7 +29,7 @@ ur_result_t replace_urPlatformGet(void *pParams) { } ur_result_t replace_urDeviceGetInfo(void *pParams) { - auto params = *static_cast<ur_device_get_info_params_t *>(pParams); + const auto &params = *static_cast<ur_device_get_info_params_t *>(pParams); if (*params.ppropName == UR_DEVICE_INFO_PLATFORM) { if (*params.ppPropSizeRet) { **params.ppPropSizeRet = sizeof(ur_platform_handle_t); diff --git a/tools/urtrace/collector.cpp b/tools/urtrace/collector.cpp index 766e7c9dfe..eb8c18d164 100644 --- a/tools/urtrace/collector.cpp +++ b/tools/urtrace/collector.cpp @@ -36,7 +36,7 @@ constexpr uint16_t TRACE_FN_BEGIN = static_cast<uint16_t>(xpti::trace_point_type_t::function_with_args_begin); constexpr uint16_t TRACE_FN_END = static_cast<uint16_t>(xpti::trace_point_type_t::function_with_args_end); -constexpr std::string_view UR_STREAM_NAME = "ur"; +constexpr std::string_view UR_STREAM_NAME = "ur.call"; static logger::Logger out = logger::create_logger("collector", true);