From 2fd9dea2d254140f0a12a28c06ef4c29dc23a8b2 Mon Sep 17 00:00:00 2001 From: Victor Lomuller Date: Mon, 9 Oct 2023 20:55:05 +0100 Subject: [PATCH] [HIP] Enable kernel finalization using comgr For kernel fusion support for hip, we need to finalize the kernels using comgr. The patch finalizes tagged binaries during buildProgram before handing it over to the hip runtime. Signed-off-by: Victor Lomuller --- README.md | 1 + source/adapters/hip/CMakeLists.txt | 14 ++- source/adapters/hip/common.cpp | 63 +++++++++++ source/adapters/hip/common.hpp | 7 ++ source/adapters/hip/program.cpp | 165 ++++++++++++++++++++++++++++- source/adapters/hip/program.hpp | 7 ++ source/ur/ur.hpp | 1 + 7 files changed, 256 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6d3c4345e2..4917add660 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,7 @@ List of options provided by CMake: | UR_BUILD_ADAPTER_CUDA | Fetch and use cuda adapter from SYCL | ON/OFF | OFF | | UR_BUILD_ADAPTER_HIP | Fetch and use hip adapter from SYCL | ON/OFF | OFF | | UR_HIP_PLATFORM | Build hip adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD | +| UR_ENABLE_COMGR | Enable comgr lib usage | ON/OFF | OFF | ### Additional make targets diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt index b29b1becf7..4595bfbf84 100644 --- a/source/adapters/hip/CMakeLists.txt +++ b/source/adapters/hip/CMakeLists.txt @@ -18,7 +18,7 @@ set(UR_HIP_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/include") set(UR_HIP_HSA_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/hsa/include") # Set HIP lib dir -set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/hip/lib") +set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/lib") # Check if HIP library path exists (AMD platform only) if("${UR_HIP_PLATFORM}" STREQUAL "AMD") @@ -99,6 +99,18 @@ if("${UR_HIP_PLATFORM}" STREQUAL "AMD") INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" ) + if(UR_ENABLE_COMGR) + add_library(amd_comgr SHARED IMPORTED GLOBAL) + set_target_properties( + amd_comgr 
PROPERTIES + IMPORTED_LOCATION "${UR_HIP_LIB_DIR}/libamd_comgr.so" + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + target_link_libraries(pi_hip PUBLIC amd_comgr) + target_compile_definitions(pi_hip PRIVATE SYCL_ENABLE_KERNEL_FUSION) + endif(UR_ENABLE_COMGR) + target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common diff --git a/source/adapters/hip/common.cpp b/source/adapters/hip/common.cpp index c5bd92047e..f1f8ec4fbb 100644 --- a/source/adapters/hip/common.cpp +++ b/source/adapters/hip/common.cpp @@ -11,6 +11,23 @@ #include +#ifdef SYCL_ENABLE_KERNEL_FUSION +ur_result_t mapErrorUR(amd_comgr_status_t Result) { + switch (Result) { + case AMD_COMGR_STATUS_SUCCESS: + return UR_RESULT_SUCCESS; + case AMD_COMGR_STATUS_ERROR: + return UR_RESULT_ERROR_UNKNOWN; + case AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT: + return UR_RESULT_ERROR_INVALID_ARGUMENT; + case AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES: + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + default: + return UR_RESULT_ERROR_UNKNOWN; + } +} +#endif + ur_result_t mapErrorUR(hipError_t Result) { switch (Result) { case hipSuccess: @@ -30,6 +47,52 @@ ur_result_t mapErrorUR(hipError_t Result) { } } +#ifdef SYCL_ENABLE_KERNEL_FUSION +void checkErrorUR(amd_comgr_status_t Result, const char *Function, int Line, + const char *File) { + if (Result == AMD_COMGR_STATUS_SUCCESS) { + return; + } + + if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr || + std::getenv("UR_SUPPRESS_ERROR_MESSAGE") == nullptr) { + const char *ErrorString = nullptr; + const char *ErrorName = nullptr; + switch (Result) { + case AMD_COMGR_STATUS_ERROR: + ErrorName = "AMD_COMGR_STATUS_ERROR"; + ErrorString = "Generic error"; + break; + case AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT: + ErrorName = "AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT"; + ErrorString = + "One of the actual arguments does not meet a precondition stated in " + "the documentation of the 
corresponding formal argument."; + break; + case AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES: + ErrorName = "AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES"; + ErrorString = "Failed to allocate the necessary resources"; + break; + default: + break; + } + std::cerr << "\nUR HIP ERROR:" + << "\n\tValue: " << Result + << "\n\tName: " << ErrorName + << "\n\tDescription: " << ErrorString + << "\n\tFunction: " << Function + << "\n\tSource Location: " << File << ":" << Line << "\n\n"; + } + + if (std::getenv("PI_HIP_ABORT") != nullptr || + std::getenv("UR_HIP_ABORT") != nullptr) { + std::abort(); + } + + throw mapErrorUR(Result); +} +#endif + void checkErrorUR(hipError_t Result, const char *Function, int Line, const char *File) { if (Result == hipSuccess) { diff --git a/source/adapters/hip/common.hpp b/source/adapters/hip/common.hpp index edf5867c01..2649657f47 100644 --- a/source/adapters/hip/common.hpp +++ b/source/adapters/hip/common.hpp @@ -9,6 +9,9 @@ //===----------------------------------------------------------------------===// #pragma once +#ifdef SYCL_ENABLE_KERNEL_FUSION +#include +#endif #include #include @@ -69,6 +72,10 @@ typedef hipArray *hipCUarray; ur_result_t mapErrorUR(hipError_t Result); +#ifdef SYCL_ENABLE_KERNEL_FUSION +void checkErrorUR(amd_comgr_status_t Result, const char *Function, int Line, + const char *File); +#endif void checkErrorUR(hipError_t Result, const char *Function, int Line, const char *File); void checkErrorUR(ur_result_t Result, const char *Function, int Line, diff --git a/source/adapters/hip/program.cpp b/source/adapters/hip/program.cpp index aa208b82cd..10d3080007 100644 --- a/source/adapters/hip/program.cpp +++ b/source/adapters/hip/program.cpp @@ -10,6 +10,70 @@ #include "program.hpp" +#ifdef SYCL_ENABLE_KERNEL_FUSION +#include +namespace { +template +struct COMgrObjCleanUp { + COMgrObjCleanUp(T Obj) : Obj{Obj} {} + ~COMgrObjCleanUp() { Release(Obj); } + T Obj; +}; + +using COMgrDataTCleanUp = + COMgrObjCleanUp; +using 
COMgrDataSetTCleanUp = + COMgrObjCleanUp; +using COMgrActionInfoCleanUp = + COMgrObjCleanUp; + +void getCoMgrBuildLog(const amd_comgr_data_set_t BuildDataSet, char *BuildLog, + size_t MaxLogSize) { + size_t count = 0; + amd_comgr_status_t status = amd_comgr_action_data_count( + BuildDataSet, AMD_COMGR_DATA_KIND_LOG, &count); + + if (status != AMD_COMGR_STATUS_SUCCESS || count == 0) { + std::strcpy(BuildLog, "extracting build log failed (no log)."); + return; + } + + amd_comgr_data_t LogBinaryData; + + if (amd_comgr_action_data_get_data(BuildDataSet, AMD_COMGR_DATA_KIND_LOG, 0, + &LogBinaryData) != + AMD_COMGR_STATUS_SUCCESS) { + std::strcpy(BuildLog, "extracting build log failed (no data)."); + return; + } + COMgrDataTCleanUp LogDataCleanup{LogBinaryData}; + + size_t binarySize = 0; + if (amd_comgr_get_data(LogBinaryData, &binarySize, NULL) != + AMD_COMGR_STATUS_SUCCESS) { + std::strcpy(BuildLog, "extracting build log failed (no log size)."); + return; + } + + if (binarySize == 0) { + std::strcpy(BuildLog, "no log."); + return; + } + + size_t bufSize = binarySize < MaxLogSize ? 
binarySize : MaxLogSize; + + if (amd_comgr_get_data(LogBinaryData, &bufSize, BuildLog) != + AMD_COMGR_STATUS_SUCCESS) { + std::strcpy(BuildLog, "extracting build log failed (cannot copy log)."); + return; + } +} +} // namespace +#endif + ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt) : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, Context{ Ctxt} { @@ -18,6 +82,22 @@ ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt) ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); } +ur_result_t +ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, + size_t Length) { + for (size_t i = 0; i < Length; ++i) { + const ur_program_metadata_t MetadataElement = Metadata[i]; + std::string MetadataElementName{MetadataElement.pName}; + + if (MetadataElementName == + __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION) { + assert(MetadataElement.type == UR_PROGRAM_METADATA_TYPE_UINT32); + IsRelocatable = MetadataElement.value.data32; + } + } + return UR_RESULT_SUCCESS; +} + ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) { // Do not re-set program binary data which has already been set as that will // delete the old binary data. 
@@ -28,7 +108,80 @@ ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) { return UR_RESULT_SUCCESS; } +ur_result_t ur_program_handle_t_::finalizeRelocatable() { +#ifndef SYCL_ENABLE_KERNEL_FUSION + assert(false && "Relocation only available with fusion"); + return UR_RESULT_ERROR_UNKNOWN; +#else + assert(IsRelocatable && "Not a relocatable input"); + amd_comgr_data_t ComgrData; + amd_comgr_data_set_t RelocatableData; + UR_CHECK_ERROR(amd_comgr_create_data_set(&RelocatableData)); + COMgrDataSetTCleanUp RelocatableDataCleanup{RelocatableData}; + + UR_CHECK_ERROR( + amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &ComgrData)); + // RAII for auto clean-up + COMgrDataTCleanUp DataCleanup{ComgrData}; + UR_CHECK_ERROR(amd_comgr_set_data(ComgrData, BinarySizeInBytes, Binary)); + UR_CHECK_ERROR(amd_comgr_set_data_name(ComgrData, "jit_obj.o")); + + UR_CHECK_ERROR(amd_comgr_data_set_add(RelocatableData, ComgrData)); + + amd_comgr_action_info_t Action; + + UR_CHECK_ERROR(amd_comgr_create_action_info(&Action)); + COMgrActionInfoCleanUp ActionCleanUp{Action}; + + std::string ISA = "amdgcn-amd-amdhsa--"; + hipDeviceProp_t Props; + detail::ur::assertion(hipGetDeviceProperties( + &Props, Context->getDevice()->get()) == hipSuccess); + ISA += Props.gcnArchName; + UR_CHECK_ERROR(amd_comgr_action_info_set_isa_name(Action, ISA.data())); + + UR_CHECK_ERROR(amd_comgr_action_info_set_logging(Action, true)); + + amd_comgr_data_set_t Output; + UR_CHECK_ERROR(amd_comgr_create_data_set(&Output)); + COMgrDataSetTCleanUp OutputDataCleanup{Output}; + + if (amd_comgr_do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, + Action, RelocatableData, + Output) != AMD_COMGR_STATUS_SUCCESS) { + getCoMgrBuildLog(Output, ErrorLog, MAX_LOG_SIZE); + return UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE; + } + amd_comgr_data_t binaryData; + + UR_CHECK_ERROR(amd_comgr_action_data_get_data( + Output, AMD_COMGR_DATA_KIND_EXECUTABLE, 0, &binaryData)); + { + COMgrDataTCleanUp 
binaryDataCleanUp{binaryData}; + + size_t binarySize = 0; + UR_CHECK_ERROR(amd_comgr_get_data(binaryData, &binarySize, NULL)); + + ExecutableCache.resize(binarySize); + + UR_CHECK_ERROR( + amd_comgr_get_data(binaryData, &binarySize, ExecutableCache.data())); + } + Binary = ExecutableCache.data(); + BinarySizeInBytes = ExecutableCache.size(); + return UR_RESULT_SUCCESS; +#endif +} + ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) { + if (IsRelocatable) { + if (finalizeRelocatable() != UR_RESULT_SUCCESS) { + BuildStatus = UR_PROGRAM_BUILD_STATUS_ERROR; + return UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE; + } + IsRelocatable = false; + } + if (BuildOptions) { this->BuildOptions = BuildOptions; } @@ -246,7 +399,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( /// Note: Only supports one device UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, - const uint8_t *pBinary, const ur_program_properties_t *, + const uint8_t *pBinary, const ur_program_properties_t *pProperties, ur_program_handle_t *phProgram) { UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY); UR_ASSERT(hContext->getDevice()->get() == hDevice->get(), @@ -259,6 +412,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( // TODO: Set metadata here and use reqd_work_group_size information. // See urProgramCreateWithBinary in CUDA adapter. 
+ if (pProperties) { + if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) { + return UR_RESULT_ERROR_INVALID_SIZE; + } + Result = + RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count); + } + UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); auto pBinary_string = reinterpret_cast(pBinary); if (size == 0) { diff --git a/source/adapters/hip/program.hpp b/source/adapters/hip/program.hpp index 50064381d9..ff9b68fc92 100644 --- a/source/adapters/hip/program.hpp +++ b/source/adapters/hip/program.hpp @@ -23,6 +23,10 @@ struct ur_program_handle_t_ { size_t BinarySizeInBytes; std::atomic_uint32_t RefCount; ur_context_handle_t Context; + std::string ExecutableCache; + + // Metadata + bool IsRelocatable = false; constexpr static size_t MAX_LOG_SIZE = 8192u; @@ -33,9 +37,12 @@ struct ur_program_handle_t_ { ur_program_handle_t_(ur_context_handle_t Ctxt); ~ur_program_handle_t_(); + ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length); + ur_result_t setBinary(const char *Binary, size_t BinarySizeInBytes); ur_result_t buildProgram(const char *BuildOptions); + ur_result_t finalizeRelocatable(); ur_context_handle_t getContext() const { return Context; }; native_type get() const noexcept { return Module; }; diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp index 0f7d589ef3..0437d719ba 100644 --- a/source/ur/ur.hpp +++ b/source/ur/ur.hpp @@ -49,6 +49,7 @@ const ur_command_t UR_EXT_COMMAND_TYPE_USER = #define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \ "@reqd_work_group_size" #define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping" +#define __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization" // Terminates the process with a catastrophic error message. [[noreturn]] inline void die(const char *Message) {