Skip to content

Commit

Permalink
[HIP] Enable kernel finalization using comgr
Browse files Browse the repository at this point in the history
For kernel fusion support for hip, we need to finalize the kernels using comgr.
The patch finalizes tagged binaries during buildProgram
before handing it over to the hip runtime.

Signed-off-by: Victor Lomuller <victor@codeplay.com>
  • Loading branch information
Naghasan committed Oct 10, 2023
1 parent b38855e commit 2fd9dea
Show file tree
Hide file tree
Showing 7 changed files with 256 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ List of options provided by CMake:
| UR_BUILD_ADAPTER_CUDA | Fetch and use cuda adapter from SYCL | ON/OFF | OFF |
| UR_BUILD_ADAPTER_HIP | Fetch and use hip adapter from SYCL | ON/OFF | OFF |
| UR_HIP_PLATFORM | Build hip adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD |
| UR_ENABLE_COMGR | Enable comgr lib usage (kernel finalization in the hip adapter) | ON/OFF | OFF |

### Additional make targets

Expand Down
14 changes: 13 additions & 1 deletion source/adapters/hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ set(UR_HIP_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/include")
set(UR_HIP_HSA_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/hsa/include")

# Set HIP lib dir
set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/hip/lib")
set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/lib")

# Check if HIP library path exists (AMD platform only)
if("${UR_HIP_PLATFORM}" STREQUAL "AMD")
Expand Down Expand Up @@ -99,6 +99,18 @@ if("${UR_HIP_PLATFORM}" STREQUAL "AMD")
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
)

if(UR_ENABLE_COMGR)
    # Import the prebuilt comgr shared library from the HIP library directory.
    add_library(amd_comgr SHARED IMPORTED GLOBAL)
    set_target_properties(
        amd_comgr PROPERTIES
        IMPORTED_LOCATION                    "${UR_HIP_LIB_DIR}/libamd_comgr.so"
        INTERFACE_INCLUDE_DIRECTORIES        "${HIP_HEADERS}"
        INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
    )
    # Attach comgr and the feature macro to the adapter target itself (the
    # same target configured just below), so that the sources guarded by
    # SYCL_ENABLE_KERNEL_FUSION are actually compiled and linked with comgr.
    target_link_libraries(${TARGET_NAME} PUBLIC amd_comgr)
    target_compile_definitions(${TARGET_NAME} PRIVATE SYCL_ENABLE_KERNEL_FUSION)
endif(UR_ENABLE_COMGR)

target_link_libraries(${TARGET_NAME} PRIVATE
${PROJECT_NAME}::headers
${PROJECT_NAME}::common
Expand Down
63 changes: 63 additions & 0 deletions source/adapters/hip/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,23 @@

#include <sstream>

#ifdef SYCL_ENABLE_KERNEL_FUSION
/// Translates a comgr status code into the matching UR result code.
///
/// \param Result Status returned by an amd_comgr_* call.
/// \return UR_RESULT_SUCCESS for AMD_COMGR_STATUS_SUCCESS, the equivalent UR
///         error for invalid-argument and out-of-resources, and
///         UR_RESULT_ERROR_UNKNOWN for everything else.
ur_result_t mapErrorUR(amd_comgr_status_t Result) {
  if (Result == AMD_COMGR_STATUS_SUCCESS) {
    return UR_RESULT_SUCCESS;
  }
  if (Result == AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT) {
    return UR_RESULT_ERROR_INVALID_ARGUMENT;
  }
  if (Result == AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES) {
    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
  }
  // AMD_COMGR_STATUS_ERROR and any status not listed above.
  return UR_RESULT_ERROR_UNKNOWN;
}
#endif

ur_result_t mapErrorUR(hipError_t Result) {
switch (Result) {
case hipSuccess:
Expand All @@ -30,6 +47,52 @@ ur_result_t mapErrorUR(hipError_t Result) {
}
}

#ifdef SYCL_ENABLE_KERNEL_FUSION
/// Reports a failed comgr call and raises it as a UR error.
///
/// Does nothing when \p Result is AMD_COMGR_STATUS_SUCCESS. Otherwise a
/// diagnostic is written to std::cerr, unless SYCL_PI_SUPPRESS_ERROR_MESSAGE
/// or UR_SUPPRESS_ERROR_MESSAGE is set in the environment. The process is
/// aborted if PI_HIP_ABORT or UR_HIP_ABORT is set; otherwise the mapped
/// ur_result_t is thrown.
///
/// \param Result   Status returned by an amd_comgr_* call.
/// \param Function Name of the function in which the failing call was made.
/// \param Line     Source line of the failing call.
/// \param File     Source file of the failing call.
/// \throws ur_result_t The value of mapErrorUR(Result) for any failure.
void checkErrorUR(amd_comgr_status_t Result, const char *Function, int Line,
                  const char *File) {
  if (Result == AMD_COMGR_STATUS_SUCCESS) {
    return;
  }

  // Either variable on its own is enough to silence the diagnostic, so the
  // message is printed only when neither of them is set.
  if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr &&
      std::getenv("UR_SUPPRESS_ERROR_MESSAGE") == nullptr) {
    // Non-null defaults: streaming a null `const char *` is undefined
    // behaviour, and the switch below does not cover every status value.
    const char *ErrorString = "Unknown comgr status";
    const char *ErrorName = "Unknown";
    switch (Result) {
    case AMD_COMGR_STATUS_ERROR:
      ErrorName = "AMD_COMGR_STATUS_ERROR";
      ErrorString = "Generic error";
      break;
    case AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT:
      ErrorName = "AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT";
      ErrorString =
          "One of the actual arguments does not meet a precondition stated in "
          "the documentation of the corresponding formal argument.";
      break;
    case AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES:
      ErrorName = "AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES";
      ErrorString = "Failed to allocate the necessary resources";
      break;
    default:
      break;
    }
    std::cerr << "\nUR HIP ERROR:"
              << "\n\tValue:           " << Result
              << "\n\tName:            " << ErrorName
              << "\n\tDescription:     " << ErrorString
              << "\n\tFunction:        " << Function
              << "\n\tSource Location: " << File << ":" << Line << "\n\n";
  }

  if (std::getenv("PI_HIP_ABORT") != nullptr ||
      std::getenv("UR_HIP_ABORT") != nullptr) {
    std::abort();
  }

  throw mapErrorUR(Result);
}
#endif

void checkErrorUR(hipError_t Result, const char *Function, int Line,
const char *File) {
if (Result == hipSuccess) {
Expand Down
7 changes: 7 additions & 0 deletions source/adapters/hip/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
//===----------------------------------------------------------------------===//
#pragma once

#ifdef SYCL_ENABLE_KERNEL_FUSION
#include <amd_comgr/amd_comgr.h>
#endif
#include <hip/hip_runtime.h>
#include <ur/ur.hpp>

Expand Down Expand Up @@ -69,6 +72,10 @@ typedef hipArray *hipCUarray;

ur_result_t mapErrorUR(hipError_t Result);

#ifdef SYCL_ENABLE_KERNEL_FUSION
/// Checks the status of a comgr call: returns silently on
/// AMD_COMGR_STATUS_SUCCESS, otherwise reports the failure (with the given
/// function name, line and file) and throws the mapped ur_result_t.
/// Only declared when SYCL_ENABLE_KERNEL_FUSION is defined.
void checkErrorUR(amd_comgr_status_t Result, const char *Function, int Line,
const char *File);
#endif
void checkErrorUR(hipError_t Result, const char *Function, int Line,
const char *File);
void checkErrorUR(ur_result_t Result, const char *Function, int Line,
Expand Down
165 changes: 164 additions & 1 deletion source/adapters/hip/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,70 @@

#include "program.hpp"

#ifdef SYCL_ENABLE_KERNEL_FUSION
#include <amd_comgr/amd_comgr.h>
namespace {
/// Scope guard that hands \c Obj to \c Release when it is destroyed.
///
/// \tparam ReleaseType Type of the release function (typically obtained with
///                     decltype(&function)).
/// \tparam Release     Function invoked as Release(Obj) in the destructor;
///                     its return value is ignored.
/// \tparam T           Type of the guarded handle.
///
/// The guard is neither copyable nor movable: a copy would call \c Release a
/// second time on the same handle.
template <typename ReleaseType, ReleaseType Release, typename T>
struct COMgrObjCleanUp {
  explicit COMgrObjCleanUp(T Obj) : Obj{Obj} {}
  ~COMgrObjCleanUp() { Release(Obj); }

  COMgrObjCleanUp(const COMgrObjCleanUp &) = delete;
  COMgrObjCleanUp &operator=(const COMgrObjCleanUp &) = delete;

  T Obj;
};

// Guard that calls amd_comgr_release_data on an amd_comgr_data_t.
using COMgrDataTCleanUp =
COMgrObjCleanUp<decltype(&amd_comgr_release_data), &amd_comgr_release_data,
amd_comgr_data_t>;
// Guard that calls amd_comgr_destroy_data_set on an amd_comgr_data_set_t.
using COMgrDataSetTCleanUp =
COMgrObjCleanUp<decltype(&amd_comgr_destroy_data_set),
&amd_comgr_destroy_data_set, amd_comgr_data_set_t>;
// Guard that calls amd_comgr_destroy_action_info on an
// amd_comgr_action_info_t.
using COMgrActionInfoCleanUp =
COMgrObjCleanUp<decltype(&amd_comgr_destroy_action_info),
&amd_comgr_destroy_action_info, amd_comgr_action_info_t>;

void getCoMgrBuildLog(const amd_comgr_data_set_t BuildDataSet, char *BuildLog,
size_t MaxLogSize) {
size_t count = 0;
amd_comgr_status_t status = amd_comgr_action_data_count(
BuildDataSet, AMD_COMGR_DATA_KIND_LOG, &count);

if (status != AMD_COMGR_STATUS_SUCCESS || count == 0) {
std::strcpy(BuildLog, "extracting build log failed (no log).");
return;
}

amd_comgr_data_t LogBinaryData;

if (amd_comgr_action_data_get_data(BuildDataSet, AMD_COMGR_DATA_KIND_LOG, 0,
&LogBinaryData) !=
AMD_COMGR_STATUS_SUCCESS) {
std::strcpy(BuildLog, "extracting build log failed (no data).");
return;
}
COMgrDataTCleanUp LogDataCleanup{LogBinaryData};

size_t binarySize = 0;
if (amd_comgr_get_data(LogBinaryData, &binarySize, NULL) !=
AMD_COMGR_STATUS_SUCCESS) {
std::strcpy(BuildLog, "extracting build log failed (no log size).");
return;
}

if (binarySize == 0) {
std::strcpy(BuildLog, "no log.");
return;
}

size_t bufSize = binarySize < MaxLogSize ? binarySize : MaxLogSize;

if (amd_comgr_get_data(LogBinaryData, &bufSize, BuildLog) !=
AMD_COMGR_STATUS_SUCCESS) {
std::strcpy(BuildLog, "extracting build log failed (cannot copy log).");
return;
}
}
} // namespace
#endif

ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt)
: Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, Context{
Ctxt} {
Expand All @@ -18,6 +82,22 @@ ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt)

/// Releases the context handle stored in this program.
ur_program_handle_t_::~ur_program_handle_t_() {
  urContextRelease(this->Context);
}

/// Scans the program metadata for entries this adapter understands.
///
/// Currently only the "needs finalization" tag is recognised: its 32-bit
/// value decides whether the binary is treated as relocatable and must be
/// finalized before it is built. Entries without a name are ignored.
///
/// \param Metadata Array of \p Length metadata entries.
/// \param Length   Number of entries in \p Metadata.
/// \return UR_RESULT_ERROR_INVALID_NULL_POINTER if \p Metadata is null while
///         \p Length is non-zero, UR_RESULT_SUCCESS otherwise.
ur_result_t
ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
                                  size_t Length) {
  if (Length > 0 && Metadata == nullptr) {
    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
  }

  // Built once instead of constructing a string per entry.
  const std::string NeedFinalizationTag{
      __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION};

  for (size_t i = 0; i < Length; ++i) {
    const ur_program_metadata_t &MetadataElement = Metadata[i];
    // Comparing against a null name would be undefined behaviour.
    if (MetadataElement.pName == nullptr) {
      continue;
    }

    if (NeedFinalizationTag == MetadataElement.pName) {
      assert(MetadataElement.type == UR_PROGRAM_METADATA_TYPE_UINT32);
      IsRelocatable = MetadataElement.value.data32 != 0;
    }
  }
  return UR_RESULT_SUCCESS;
}

ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) {
// Do not re-set program binary data which has already been set as that will
// delete the old binary data.
Expand All @@ -28,7 +108,80 @@ ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) {
return UR_RESULT_SUCCESS;
}

/// Links the relocatable object currently referenced by Binary into an
/// executable through comgr and makes Binary / BinarySizeInBytes refer to the
/// result, which is stored in ExecutableCache.
///
/// \return UR_RESULT_SUCCESS when the executable was produced;
///         UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE when the comgr link action
///         fails (the comgr log is copied into ErrorLog first);
///         UR_RESULT_ERROR_UNKNOWN when built without
///         SYCL_ENABLE_KERNEL_FUSION.
///
/// NOTE(review): the other comgr calls go through UR_CHECK_ERROR, which
/// presumably forwards to checkErrorUR and therefore throws instead of
/// returning on failure - confirm against the macro definition.
ur_result_t ur_program_handle_t_::finalizeRelocatable() {
#ifndef SYCL_ENABLE_KERNEL_FUSION
// Finalization needs comgr, which is only available in fusion builds.
assert(false && "Relocation only available with fusion");
return UR_RESULT_ERROR_UNKNOWN;
#else
assert(IsRelocatable && "Not a relocatable input");
amd_comgr_data_t ComgrData;
amd_comgr_data_set_t RelocatableData;
// Input data set for the link action.
UR_CHECK_ERROR(amd_comgr_create_data_set(&RelocatableData));
COMgrDataSetTCleanUp RelocatableDataCleanup{RelocatableData};

// Wrap the program binary in a comgr data object of kind "relocatable".
UR_CHECK_ERROR(
amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &ComgrData));
// RAII for auto clean-up
COMgrDataTCleanUp DataCleanup{ComgrData};
UR_CHECK_ERROR(amd_comgr_set_data(ComgrData, BinarySizeInBytes, Binary));
UR_CHECK_ERROR(amd_comgr_set_data_name(ComgrData, "jit_obj.o"));

UR_CHECK_ERROR(amd_comgr_data_set_add(RelocatableData, ComgrData));

amd_comgr_action_info_t Action;

UR_CHECK_ERROR(amd_comgr_create_action_info(&Action));
COMgrActionInfoCleanUp ActionCleanUp{Action};

// ISA name = fixed "amdgcn-amd-amdhsa--" prefix followed by the
// gcnArchName reported for the context's device.
std::string ISA = "amdgcn-amd-amdhsa--";
hipDeviceProp_t Props;
detail::ur::assertion(hipGetDeviceProperties(
&Props, Context->getDevice()->get()) == hipSuccess);
ISA += Props.gcnArchName;
UR_CHECK_ERROR(amd_comgr_action_info_set_isa_name(Action, ISA.data()));

// Logging is enabled so that a failed link leaves a log to report.
UR_CHECK_ERROR(amd_comgr_action_info_set_logging(Action, true));

amd_comgr_data_set_t Output;
UR_CHECK_ERROR(amd_comgr_create_data_set(&Output));
COMgrDataSetTCleanUp OutputDataCleanup{Output};

// The link step is checked by hand (not via UR_CHECK_ERROR) so that the
// build log can be copied into ErrorLog before returning the failure.
if (amd_comgr_do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE,
Action, RelocatableData,
Output) != AMD_COMGR_STATUS_SUCCESS) {
getCoMgrBuildLog(Output, ErrorLog, MAX_LOG_SIZE);
return UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE;
}
amd_comgr_data_t binaryData;

// Fetch the first executable object from the output data set.
UR_CHECK_ERROR(amd_comgr_action_data_get_data(
Output, AMD_COMGR_DATA_KIND_EXECUTABLE, 0, &binaryData));
{
// Inner scope: binaryData is released as soon as its bytes are copied.
COMgrDataTCleanUp binaryDataCleanUp{binaryData};

// First call with a null buffer only queries the size ...
size_t binarySize = 0;
UR_CHECK_ERROR(amd_comgr_get_data(binaryData, &binarySize, NULL));

ExecutableCache.resize(binarySize);

// ... second call copies the executable into ExecutableCache.
UR_CHECK_ERROR(
amd_comgr_get_data(binaryData, &binarySize, ExecutableCache.data()));
}
// From here on the program refers to the finalized executable.
Binary = ExecutableCache.data();
BinarySizeInBytes = ExecutableCache.size();
return UR_RESULT_SUCCESS;
#endif
}

ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) {
if (IsRelocatable) {
if (finalizeRelocatable() != UR_RESULT_SUCCESS) {
BuildStatus = UR_PROGRAM_BUILD_STATUS_ERROR;
return UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE;
}
IsRelocatable = false;
}

if (BuildOptions) {
this->BuildOptions = BuildOptions;
}
Expand Down Expand Up @@ -246,7 +399,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle(
/// Note: Only supports one device
UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size,
const uint8_t *pBinary, const ur_program_properties_t *,
const uint8_t *pBinary, const ur_program_properties_t *pProperties,
ur_program_handle_t *phProgram) {
UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY);
UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
Expand All @@ -259,6 +412,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(

// TODO: Set metadata here and use reqd_work_group_size information.
// See urProgramCreateWithBinary in CUDA adapter.
if (pProperties) {
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
} else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
return UR_RESULT_ERROR_INVALID_SIZE;
}
Result =
RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count);
}
UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);

auto pBinary_string = reinterpret_cast<const char *>(pBinary);
if (size == 0) {
Expand Down
7 changes: 7 additions & 0 deletions source/adapters/hip/program.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ struct ur_program_handle_t_ {
size_t BinarySizeInBytes;
std::atomic_uint32_t RefCount;
ur_context_handle_t Context;
std::string ExecutableCache;

// Metadata
bool IsRelocatable = false;

constexpr static size_t MAX_LOG_SIZE = 8192u;

Expand All @@ -33,9 +37,12 @@ struct ur_program_handle_t_ {
ur_program_handle_t_(ur_context_handle_t Ctxt);
~ur_program_handle_t_();

ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length);

ur_result_t setBinary(const char *Binary, size_t BinarySizeInBytes);

ur_result_t buildProgram(const char *BuildOptions);
ur_result_t finalizeRelocatable();
ur_context_handle_t getContext() const { return Context; };

native_type get() const noexcept { return Module; };
Expand Down
1 change: 1 addition & 0 deletions source/ur/ur.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ const ur_command_t UR_EXT_COMMAND_TYPE_USER =
#define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \
"@reqd_work_group_size"
#define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping"
#define __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization"

// Terminates the process with a catastrophic error message.
[[noreturn]] inline void die(const char *Message) {
Expand Down

0 comments on commit 2fd9dea

Please sign in to comment.