diff --git a/README.md b/README.md
index 6d3c4345e2..4917add660 100644
--- a/README.md
+++ b/README.md
@@ -132,6 +132,7 @@ List of options provided by CMake:
 | UR_BUILD_ADAPTER_CUDA | Fetch and use cuda adapter from SYCL | ON/OFF | OFF |
 | UR_BUILD_ADAPTER_HIP | Fetch and use hip adapter from SYCL | ON/OFF | OFF |
 | UR_HIP_PLATFORM | Build hip adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD |
+| UR_ENABLE_COMGR | Enable comgr lib usage | ON/OFF | OFF |
 
 ### Additional make targets
 
diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt
index b29b1becf7..4595bfbf84 100644
--- a/source/adapters/hip/CMakeLists.txt
+++ b/source/adapters/hip/CMakeLists.txt
@@ -18,7 +18,7 @@ set(UR_HIP_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/include")
 set(UR_HIP_HSA_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/hsa/include")
 
 # Set HIP lib dir
-set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/hip/lib")
+set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/lib")
 
 # Check if HIP library path exists (AMD platform only)
 if("${UR_HIP_PLATFORM}" STREQUAL "AMD")
@@ -99,6 +99,20 @@ if("${UR_HIP_PLATFORM}" STREQUAL "AMD")
     INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
   )
 
+  # Optionally import the comgr library from the HIP lib dir and enable the
+  # adapter code guarded by SYCL_ENABLE_KERNEL_FUSION.
+  if(UR_ENABLE_COMGR)
+    add_library(amd_comgr SHARED IMPORTED GLOBAL)
+    set_target_properties(
+      amd_comgr PROPERTIES
+        IMPORTED_LOCATION "${UR_HIP_LIB_DIR}/libamd_comgr.so"
+        INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
+        INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
+    )
+    target_link_libraries(${TARGET_NAME} PUBLIC amd_comgr)
+    target_compile_definitions(${TARGET_NAME} PRIVATE SYCL_ENABLE_KERNEL_FUSION)
+  endif(UR_ENABLE_COMGR)
+
   target_link_libraries(${TARGET_NAME} PRIVATE
     ${PROJECT_NAME}::headers
     ${PROJECT_NAME}::common
diff --git a/source/adapters/hip/common.cpp b/source/adapters/hip/common.cpp
index c5bd92047e..f1f8ec4fbb 100644
--- a/source/adapters/hip/common.cpp
+++ b/source/adapters/hip/common.cpp
@@ -11,6 +11,25 @@
 
 #include <sstream>
 
+#ifdef SYCL_ENABLE_KERNEL_FUSION
+/// Translates a comgr status code into the corresponding UR result code.
+/// Statuses without a dedicated mapping become UR_RESULT_ERROR_UNKNOWN.
+ur_result_t mapErrorUR(amd_comgr_status_t Result) {
+  switch (Result) {
+  case AMD_COMGR_STATUS_SUCCESS:
+    return UR_RESULT_SUCCESS;
+  case AMD_COMGR_STATUS_ERROR:
+    return UR_RESULT_ERROR_UNKNOWN;
+  case AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT:
+    return UR_RESULT_ERROR_INVALID_ARGUMENT;
+  case AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES:
+    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+  default:
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+}
+#endif
+
 ur_result_t mapErrorUR(hipError_t Result) {
   switch (Result) {
   case hipSuccess:
@@ -30,6 +49,61 @@ ur_result_t mapErrorUR(hipError_t Result) {
   }
 }
 
+#ifdef SYCL_ENABLE_KERNEL_FUSION
+/// Checks the status returned by a comgr call.
+///
+/// Returns immediately on AMD_COMGR_STATUS_SUCCESS. Otherwise the failure is
+/// printed to stderr unless SYCL_PI_SUPPRESS_ERROR_MESSAGE or
+/// UR_SUPPRESS_ERROR_MESSAGE is set, the process aborts if PI_HIP_ABORT or
+/// UR_HIP_ABORT is set, and otherwise the mapped ur_result_t is thrown.
+void checkErrorUR(amd_comgr_status_t Result, const char *Function, int Line,
+                  const char *File) {
+  if (Result == AMD_COMGR_STATUS_SUCCESS) {
+    return;
+  }
+
+  // Setting either variable is enough to silence the report.
+  if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr &&
+      std::getenv("UR_SUPPRESS_ERROR_MESSAGE") == nullptr) {
+    // Start from printable defaults: the switch does not cover every status
+    // and streaming a null `const char *` is undefined behaviour.
+    const char *ErrorString = "Unknown error";
+    const char *ErrorName = "Unknown";
+    switch (Result) {
+    case AMD_COMGR_STATUS_ERROR:
+      ErrorName = "AMD_COMGR_STATUS_ERROR";
+      ErrorString = "Generic error";
+      break;
+    case AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT:
+      ErrorName = "AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT";
+      ErrorString =
+          "One of the actual arguments does not meet a precondition stated in "
+          "the documentation of the corresponding formal argument.";
+      break;
+    case AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES:
+      ErrorName = "AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES";
+      ErrorString = "Failed to allocate the necessary resources";
+      break;
+    default:
+      break;
+    }
+    std::cerr << "\nUR HIP ERROR:"
+              << "\n\tValue: " << Result
+              << "\n\tName: " << ErrorName
+              << "\n\tDescription: " << ErrorString
+              << "\n\tFunction: " << Function
+              << "\n\tSource Location: " << File << ":" << Line << "\n\n";
+  }
+
+  if (std::getenv("PI_HIP_ABORT") != nullptr ||
+      std::getenv("UR_HIP_ABORT") != nullptr) {
+    std::abort();
+  }
+
+  throw mapErrorUR(Result);
+}
+#endif
+
 void checkErrorUR(hipError_t Result, const char *Function, int Line,
                   const char *File) {
   if (Result == hipSuccess) {
diff --git a/source/adapters/hip/common.hpp b/source/adapters/hip/common.hpp
index edf5867c01..2649657f47 100644
--- a/source/adapters/hip/common.hpp
+++ b/source/adapters/hip/common.hpp
@@ -9,6 +9,9 @@
 //===----------------------------------------------------------------------===//
 #pragma once
 
+#ifdef SYCL_ENABLE_KERNEL_FUSION
+#include <amd_comgr/amd_comgr.h>
+#endif
 #include <hip/hip_runtime.h>
 #include <ur/ur.hpp>
 
@@ -69,6 +72,11 @@ typedef hipArray *hipCUarray;
 
 ur_result_t mapErrorUR(hipError_t Result);
 
+#ifdef SYCL_ENABLE_KERNEL_FUSION
+/// Reports a failed comgr call and throws the mapped ur_result_t.
+void checkErrorUR(amd_comgr_status_t Result, const char *Function, int Line,
+                  const char *File);
+#endif
 void checkErrorUR(hipError_t Result, const char *Function, int Line,
                   const char *File);
 void checkErrorUR(ur_result_t Result, const char *Function, int Line,
diff --git a/source/adapters/hip/program.cpp b/source/adapters/hip/program.cpp
index aa208b82cd..10d3080007 100644
--- a/source/adapters/hip/program.cpp
+++ b/source/adapters/hip/program.cpp
@@ -10,6 +10,95 @@
 
 #include "program.hpp"
 
+#ifdef SYCL_ENABLE_KERNEL_FUSION
+#include <amd_comgr/amd_comgr.h>
+#include <cstring>
+
+namespace {
+/// Scope guard that hands a comgr handle to \p Release when it goes out of
+/// scope, so early returns and exceptions cannot leak the handle.
+template <typename T, amd_comgr_status_t (*Release)(T)>
+struct COMgrObjCleanUp {
+  explicit COMgrObjCleanUp(T Obj) : Obj{Obj} {}
+  ~COMgrObjCleanUp() { Release(Obj); }
+
+  // Copying the guard would release the same handle twice.
+  COMgrObjCleanUp(const COMgrObjCleanUp &) = delete;
+  COMgrObjCleanUp &operator=(const COMgrObjCleanUp &) = delete;
+
+  T Obj;
+};
+
+using COMgrDataTCleanUp =
+    COMgrObjCleanUp<amd_comgr_data_t, amd_comgr_release_data>;
+using COMgrDataSetTCleanUp =
+    COMgrObjCleanUp<amd_comgr_data_set_t, amd_comgr_destroy_data_set>;
+using COMgrActionInfoCleanUp =
+    COMgrObjCleanUp<amd_comgr_action_info_t, amd_comgr_destroy_action_info>;
+
+/// Copies the log attached to \p BuildDataSet into \p BuildLog.
+///
+/// At most \p MaxLogSize bytes are written and the result is always
+/// null-terminated; longer logs are truncated. When no log can be extracted a
+/// short diagnostic is stored instead.
+void getCoMgrBuildLog(const amd_comgr_data_set_t BuildDataSet, char *BuildLog,
+                      size_t MaxLogSize) {
+  if (BuildLog == nullptr || MaxLogSize == 0) {
+    return;
+  }
+
+  // Bounded, always-terminating replacement for strcpy into the log buffer.
+  const auto SetLog = [BuildLog, MaxLogSize](const char *Message) {
+    std::strncpy(BuildLog, Message, MaxLogSize - 1);
+    BuildLog[MaxLogSize - 1] = '\0';
+  };
+
+  size_t Count = 0;
+  const amd_comgr_status_t Status = amd_comgr_action_data_count(
+      BuildDataSet, AMD_COMGR_DATA_KIND_LOG, &Count);
+
+  if (Status != AMD_COMGR_STATUS_SUCCESS || Count == 0) {
+    SetLog("extracting build log failed (no log).");
+    return;
+  }
+
+  amd_comgr_data_t LogBinaryData;
+
+  if (amd_comgr_action_data_get_data(BuildDataSet, AMD_COMGR_DATA_KIND_LOG, 0,
+                                     &LogBinaryData) !=
+      AMD_COMGR_STATUS_SUCCESS) {
+    SetLog("extracting build log failed (no data).");
+    return;
+  }
+  COMgrDataTCleanUp LogDataCleanup{LogBinaryData};
+
+  // A null destination makes comgr report the size of the log.
+  size_t LogSize = 0;
+  if (amd_comgr_get_data(LogBinaryData, &LogSize, nullptr) !=
+      AMD_COMGR_STATUS_SUCCESS) {
+    SetLog("extracting build log failed (no log size).");
+    return;
+  }
+
+  if (LogSize == 0) {
+    SetLog("no log.");
+    return;
+  }
+
+  // Keep one byte for the terminator: the log is copied as raw bytes.
+  const size_t CopySize = LogSize < MaxLogSize ? LogSize : MaxLogSize - 1;
+  size_t BufSize = CopySize;
+
+  if (amd_comgr_get_data(LogBinaryData, &BufSize, BuildLog) !=
+      AMD_COMGR_STATUS_SUCCESS) {
+    SetLog("extracting build log failed (cannot copy log).");
+    return;
+  }
+  BuildLog[CopySize] = '\0';
+}
+} // namespace
+#endif
+
 ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt)
     : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, Context{
                                                                         Ctxt} {
@@ -18,6 +107,26 @@ ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt)
 
 ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); }
 
+/// Scans \p Length entries of \p Metadata and records the ones this adapter
+/// understands; currently only the "needs finalization" tag, which marks the
+/// binary as relocatable.
+ur_result_t
+ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
+                                  size_t Length) {
+  for (size_t i = 0; i < Length; ++i) {
+    const ur_program_metadata_t &MetadataElement = Metadata[i];
+    const std::string MetadataElementName{MetadataElement.pName};
+
+    if (MetadataElementName ==
+        __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION) {
+      assert(MetadataElement.type == UR_PROGRAM_METADATA_TYPE_UINT32);
+      // Any non-zero value requests finalization.
+      IsRelocatable = MetadataElement.value.data32 != 0;
+    }
+  }
+  return UR_RESULT_SUCCESS;
+}
+
 ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) {
   // Do not re-set program binary data which has already been set as that will
   // delete the old binary data.
@@ -28,7 +137,92 @@ ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) {
   return UR_RESULT_SUCCESS;
 }
 
+/// Links the relocatable object held in Binary into an executable with comgr
+/// and points Binary/BinarySizeInBytes at the result, which is stored in
+/// ExecutableCache.
+///
+/// Returns UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE, with the comgr log copied to
+/// ErrorLog, when the link action fails. Every other comgr or HIP call is
+/// checked with UR_CHECK_ERROR.
+ur_result_t ur_program_handle_t_::finalizeRelocatable() {
+#ifndef SYCL_ENABLE_KERNEL_FUSION
+  assert(false && "Relocation only available with fusion");
+  return UR_RESULT_ERROR_UNKNOWN;
+#else
+  assert(IsRelocatable && "Not a relocatable input");
+
+  // Each comgr handle gets its clean-up guard right after creation so that it
+  // is released on every exit path.
+  amd_comgr_data_set_t RelocatableData;
+  UR_CHECK_ERROR(amd_comgr_create_data_set(&RelocatableData));
+  COMgrDataSetTCleanUp RelocatableDataCleanup{RelocatableData};
+
+  amd_comgr_data_t ComgrData;
+  UR_CHECK_ERROR(
+      amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &ComgrData));
+  COMgrDataTCleanUp DataCleanup{ComgrData};
+  UR_CHECK_ERROR(amd_comgr_set_data(ComgrData, BinarySizeInBytes, Binary));
+  UR_CHECK_ERROR(amd_comgr_set_data_name(ComgrData, "jit_obj.o"));
+
+  UR_CHECK_ERROR(amd_comgr_data_set_add(RelocatableData, ComgrData));
+
+  amd_comgr_action_info_t Action;
+  UR_CHECK_ERROR(amd_comgr_create_action_info(&Action));
+  COMgrActionInfoCleanUp ActionCleanUp{Action};
+
+  // The ISA name is the fixed prefix below followed by the device's
+  // gcnArchName.
+  std::string ISA = "amdgcn-amd-amdhsa--";
+  hipDeviceProp_t Props;
+  UR_CHECK_ERROR(hipGetDeviceProperties(&Props, Context->getDevice()->get()));
+  ISA += Props.gcnArchName;
+  UR_CHECK_ERROR(amd_comgr_action_info_set_isa_name(Action, ISA.c_str()));
+
+  UR_CHECK_ERROR(amd_comgr_action_info_set_logging(Action, true));
+
+  amd_comgr_data_set_t Output;
+  UR_CHECK_ERROR(amd_comgr_create_data_set(&Output));
+  COMgrDataSetTCleanUp OutputDataCleanup{Output};
+
+  if (amd_comgr_do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE,
+                          Action, RelocatableData,
+                          Output) != AMD_COMGR_STATUS_SUCCESS) {
+    // Logging was enabled above; store the resulting log in ErrorLog.
+    getCoMgrBuildLog(Output, ErrorLog, MAX_LOG_SIZE);
+    return UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE;
+  }
+
+  amd_comgr_data_t BinaryData;
+  UR_CHECK_ERROR(amd_comgr_action_data_get_data(
+      Output, AMD_COMGR_DATA_KIND_EXECUTABLE, 0, &BinaryData));
+  COMgrDataTCleanUp BinaryDataCleanUp{BinaryData};
+
+  // Query the size first, then copy the executable into the cache.
+  size_t BinarySize = 0;
+  UR_CHECK_ERROR(amd_comgr_get_data(BinaryData, &BinarySize, nullptr));
+
+  ExecutableCache.resize(BinarySize);
+  UR_CHECK_ERROR(
+      amd_comgr_get_data(BinaryData, &BinarySize, ExecutableCache.data()));
+
+  // From here on the program refers to the linked executable.
+  Binary = ExecutableCache.data();
+  BinarySizeInBytes = ExecutableCache.size();
+  return UR_RESULT_SUCCESS;
+#endif
+}
+
 ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) {
+  // Finalize a relocatable input once; the flag is cleared afterwards so a
+  // repeated build does not link again.
+  if (IsRelocatable) {
+    if (finalizeRelocatable() != UR_RESULT_SUCCESS) {
+      BuildStatus = UR_PROGRAM_BUILD_STATUS_ERROR;
+      return UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE;
+    }
+    IsRelocatable = false;
+  }
+
   if (BuildOptions) {
     this->BuildOptions = BuildOptions;
   }
@@ -246,7 +440,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle(
 /// Note: Only supports one device
 UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
     ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size,
-    const uint8_t *pBinary, const ur_program_properties_t *,
+    const uint8_t *pBinary, const ur_program_properties_t *pProperties,
     ur_program_handle_t *phProgram) {
   UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY);
   UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
@@ -259,6 +453,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
 
   // TODO: Set metadata here and use reqd_work_group_size information.
   // See urProgramCreateWithBinary in CUDA adapter.
+  // Reject inconsistent metadata descriptions before reading the entries.
+  if (pProperties) {
+    if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
+      return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    }
+    if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
+      return UR_RESULT_ERROR_INVALID_SIZE;
+    }
+    Result =
+        RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count);
+  }
+  UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);
 
   auto pBinary_string = reinterpret_cast<const char *>(pBinary);
   if (size == 0) {
diff --git a/source/adapters/hip/program.hpp b/source/adapters/hip/program.hpp
index 50064381d9..ff9b68fc92 100644
--- a/source/adapters/hip/program.hpp
+++ b/source/adapters/hip/program.hpp
@@ -23,6 +23,14 @@ struct ur_program_handle_t_ {
   size_t BinarySizeInBytes;
   std::atomic_uint32_t RefCount;
   ur_context_handle_t Context;
+  // Storage for the executable produced by finalizeRelocatable(); Binary
+  // points into it afterwards.
+  std::string ExecutableCache;
+
+  // Metadata
+  // True while the binary is a relocatable object that buildProgram() still
+  // has to finalize.
+  bool IsRelocatable = false;
 
   constexpr static size_t MAX_LOG_SIZE = 8192u;
 
@@ -33,9 +41,14 @@ struct ur_program_handle_t_ {
   ur_program_handle_t_(ur_context_handle_t Ctxt);
   ~ur_program_handle_t_();
 
+  // Records the metadata entries the adapter understands.
+  ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length);
+
   ur_result_t setBinary(const char *Binary, size_t BinarySizeInBytes);
 
   ur_result_t buildProgram(const char *BuildOptions);
+  // Links a relocatable binary into an executable (requires comgr support).
+  ur_result_t finalizeRelocatable();
   ur_context_handle_t getContext() const { return Context; };
 
   native_type get() const noexcept { return Module; };
diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp
index 0f7d589ef3..0437d719ba 100644
--- a/source/ur/ur.hpp
+++ b/source/ur/ur.hpp
@@ -49,6 +49,7 @@ const ur_command_t UR_EXT_COMMAND_TYPE_USER =
 #define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE                    \
   "@reqd_work_group_size"
 #define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping"
+#define __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization"
 
 // Terminates the process with a catastrophic error message.
 [[noreturn]] inline void die(const char *Message) {