Skip to content

Commit

Permalink
Merge pull request #940 from Naghasan/victor/kernel-fusion-amd
Browse files Browse the repository at this point in the history
[UR][HIP] Enable kernel finalization using comgr
  • Loading branch information
kbenzie committed Oct 25, 2023
2 parents 3a3aae3 + 2fd9dea commit cf26de2
Show file tree
Hide file tree
Showing 7 changed files with 256 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ List of options provided by CMake:
| UR_BUILD_ADAPTER_CUDA | Fetch and use cuda adapter from SYCL | ON/OFF | OFF |
| UR_BUILD_ADAPTER_HIP | Fetch and use hip adapter from SYCL | ON/OFF | OFF |
| UR_HIP_PLATFORM | Build hip adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD |
| UR_ENABLE_COMGR | Enable comgr lib usage | ON/OFF | OFF |

### Additional make targets

Expand Down
14 changes: 13 additions & 1 deletion source/adapters/hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ set(UR_HIP_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/include")
set(UR_HIP_HSA_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/hsa/include")

# Set HIP lib dir
set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/hip/lib")
set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/lib")

# Check if HIP library path exists (AMD platform only)
if("${UR_HIP_PLATFORM}" STREQUAL "AMD")
Expand Down Expand Up @@ -99,6 +99,18 @@ if("${UR_HIP_PLATFORM}" STREQUAL "AMD")
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
)

if(UR_ENABLE_COMGR)
  # Import the prebuilt comgr shared library from the ROCm library directory
  # and expose the HIP headers through it.
  add_library(amd_comgr SHARED IMPORTED GLOBAL)
  set_target_properties(
    amd_comgr PROPERTIES
    IMPORTED_LOCATION "${UR_HIP_LIB_DIR}/libamd_comgr.so"
    INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
    INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
  )
  # Attach comgr and the feature macro to the adapter target that the rest of
  # this file configures (${TARGET_NAME}) instead of a hard-coded target name,
  # so the block keeps working whatever the adapter target is called.
  target_link_libraries(${TARGET_NAME} PUBLIC amd_comgr)
  target_compile_definitions(${TARGET_NAME} PRIVATE SYCL_ENABLE_KERNEL_FUSION)
endif(UR_ENABLE_COMGR)

target_link_libraries(${TARGET_NAME} PRIVATE
${PROJECT_NAME}::headers
${PROJECT_NAME}::common
Expand Down
63 changes: 63 additions & 0 deletions source/adapters/hip/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,23 @@

#include <sstream>

#ifdef SYCL_ENABLE_KERNEL_FUSION
/// Translates a comgr status code into the matching UR result code.
///
/// \param Result Status returned by an amd_comgr_* call.
/// \return UR_RESULT_SUCCESS for AMD_COMGR_STATUS_SUCCESS, the equivalent UR
///         error for invalid-argument and out-of-resources, and
///         UR_RESULT_ERROR_UNKNOWN for everything else.
ur_result_t mapErrorUR(amd_comgr_status_t Result) {
  if (Result == AMD_COMGR_STATUS_SUCCESS) {
    return UR_RESULT_SUCCESS;
  }
  if (Result == AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT) {
    return UR_RESULT_ERROR_INVALID_ARGUMENT;
  }
  if (Result == AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES) {
    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
  }
  // AMD_COMGR_STATUS_ERROR and any status not listed above have no more
  // specific UR counterpart.
  return UR_RESULT_ERROR_UNKNOWN;
}
#endif

ur_result_t mapErrorUR(hipError_t Result) {
switch (Result) {
case hipSuccess:
Expand All @@ -30,6 +47,52 @@ ur_result_t mapErrorUR(hipError_t Result) {
}
}

#ifdef SYCL_ENABLE_KERNEL_FUSION
/// Reports a failed comgr call and turns it into a UR error.
///
/// Does nothing when \p Result is AMD_COMGR_STATUS_SUCCESS. Otherwise it
/// prints a diagnostic to std::cerr (unless SYCL_PI_SUPPRESS_ERROR_MESSAGE or
/// UR_SUPPRESS_ERROR_MESSAGE is set), aborts the process when PI_HIP_ABORT or
/// UR_HIP_ABORT is set, and finally throws the mapped ur_result_t.
///
/// \param Result   Status returned by the amd_comgr_* call.
/// \param Function Text identifying the failing call.
/// \param Line     Source line of the call site.
/// \param File     Source file of the call site.
/// \throws ur_result_t The value of mapErrorUR(Result) for any failure.
void checkErrorUR(amd_comgr_status_t Result, const char *Function, int Line,
                  const char *File) {
  if (Result == AMD_COMGR_STATUS_SUCCESS) {
    return;
  }

  // Either variable on its own is enough to silence the diagnostic, matching
  // how the abort variables below are treated.
  const bool SuppressMessage =
      std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") != nullptr ||
      std::getenv("UR_SUPPRESS_ERROR_MESSAGE") != nullptr;

  if (!SuppressMessage) {
    // Start from printable defaults: streaming a null `const char *` is
    // undefined behaviour and would also swallow the rest of the message.
    const char *ErrorString = "Unknown comgr status";
    const char *ErrorName = "Unknown";
    switch (Result) {
    case AMD_COMGR_STATUS_ERROR:
      ErrorName = "AMD_COMGR_STATUS_ERROR";
      ErrorString = "Generic error";
      break;
    case AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT:
      ErrorName = "AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT";
      ErrorString =
          "One of the actual arguments does not meet a precondition stated in "
          "the documentation of the corresponding formal argument.";
      break;
    case AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES:
      ErrorName = "AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES";
      ErrorString = "Failed to allocate the necessary resources";
      break;
    default:
      break;
    }
    std::cerr << "\nUR HIP ERROR:"
              << "\n\tValue:           " << Result
              << "\n\tName:            " << ErrorName
              << "\n\tDescription:     " << ErrorString
              << "\n\tFunction:        " << Function
              << "\n\tSource Location: " << File << ":" << Line << "\n\n";
  }

  if (std::getenv("PI_HIP_ABORT") != nullptr ||
      std::getenv("UR_HIP_ABORT") != nullptr) {
    std::abort();
  }

  throw mapErrorUR(Result);
}
#endif

void checkErrorUR(hipError_t Result, const char *Function, int Line,
const char *File) {
if (Result == hipSuccess) {
Expand Down
7 changes: 7 additions & 0 deletions source/adapters/hip/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
//===----------------------------------------------------------------------===//
#pragma once

#ifdef SYCL_ENABLE_KERNEL_FUSION
#include <amd_comgr/amd_comgr.h>
#endif
#include <hip/hip_runtime.h>
#include <ur/ur.hpp>

Expand Down Expand Up @@ -69,6 +72,10 @@ typedef hipArray *hipCUarray;

ur_result_t mapErrorUR(hipError_t Result);

#ifdef SYCL_ENABLE_KERNEL_FUSION
/// Checks the status of an amd_comgr_* call (only available when
/// SYCL_ENABLE_KERNEL_FUSION is defined).
///
/// Returns normally for AMD_COMGR_STATUS_SUCCESS. For any other status it may
/// print a diagnostic to std::cerr (controlled by the
/// SYCL_PI_SUPPRESS_ERROR_MESSAGE / UR_SUPPRESS_ERROR_MESSAGE environment
/// variables), aborts if PI_HIP_ABORT or UR_HIP_ABORT is set, and otherwise
/// throws the status mapped to a ur_result_t.
///
/// \param Result   Status returned by the comgr call.
/// \param Function Text identifying the failing call.
/// \param Line     Source line of the call site.
/// \param File     Source file of the call site.
void checkErrorUR(amd_comgr_status_t Result, const char *Function, int Line,
const char *File);
#endif
void checkErrorUR(hipError_t Result, const char *Function, int Line,
const char *File);
void checkErrorUR(ur_result_t Result, const char *Function, int Line,
Expand Down
165 changes: 164 additions & 1 deletion source/adapters/hip/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,70 @@

#include "program.hpp"

#ifdef SYCL_ENABLE_KERNEL_FUSION
#include <amd_comgr/amd_comgr.h>
namespace {
/// Scope guard that passes the wrapped object to \p Release when it goes out
/// of scope.
///
/// \tparam ReleaseType Type of the release function (e.g. a function pointer).
/// \tparam Release     Function invoked with the stored object on destruction.
/// \tparam T           Type of the guarded object/handle.
///
/// The guard is the sole owner of the release obligation, so it is neither
/// copyable nor movable: a copy would call \p Release twice on the same
/// object.
template <typename ReleaseType, ReleaseType Release, typename T>
struct COMgrObjCleanUp {
  explicit COMgrObjCleanUp(T Obj) : Obj{Obj} {}
  ~COMgrObjCleanUp() { Release(Obj); }

  // Declaring the copy operations as deleted also suppresses the implicit
  // move operations.
  COMgrObjCleanUp(const COMgrObjCleanUp &) = delete;
  COMgrObjCleanUp &operator=(const COMgrObjCleanUp &) = delete;

  T Obj;
};

// Guard for a single comgr data object; calls amd_comgr_release_data on
// destruction.
using COMgrDataTCleanUp =
COMgrObjCleanUp<decltype(&amd_comgr_release_data), &amd_comgr_release_data,
amd_comgr_data_t>;
// Guard for a comgr data set; calls amd_comgr_destroy_data_set on destruction.
using COMgrDataSetTCleanUp =
COMgrObjCleanUp<decltype(&amd_comgr_destroy_data_set),
&amd_comgr_destroy_data_set, amd_comgr_data_set_t>;
// Guard for a comgr action info object; calls amd_comgr_destroy_action_info on
// destruction.
using COMgrActionInfoCleanUp =
COMgrObjCleanUp<decltype(&amd_comgr_destroy_action_info),
&amd_comgr_destroy_action_info, amd_comgr_action_info_t>;

void getCoMgrBuildLog(const amd_comgr_data_set_t BuildDataSet, char *BuildLog,
size_t MaxLogSize) {
size_t count = 0;
amd_comgr_status_t status = amd_comgr_action_data_count(
BuildDataSet, AMD_COMGR_DATA_KIND_LOG, &count);

if (status != AMD_COMGR_STATUS_SUCCESS || count == 0) {
std::strcpy(BuildLog, "extracting build log failed (no log).");
return;
}

amd_comgr_data_t LogBinaryData;

if (amd_comgr_action_data_get_data(BuildDataSet, AMD_COMGR_DATA_KIND_LOG, 0,
&LogBinaryData) !=
AMD_COMGR_STATUS_SUCCESS) {
std::strcpy(BuildLog, "extracting build log failed (no data).");
return;
}
COMgrDataTCleanUp LogDataCleanup{LogBinaryData};

size_t binarySize = 0;
if (amd_comgr_get_data(LogBinaryData, &binarySize, NULL) !=
AMD_COMGR_STATUS_SUCCESS) {
std::strcpy(BuildLog, "extracting build log failed (no log size).");
return;
}

if (binarySize == 0) {
std::strcpy(BuildLog, "no log.");
return;
}

size_t bufSize = binarySize < MaxLogSize ? binarySize : MaxLogSize;

if (amd_comgr_get_data(LogBinaryData, &bufSize, BuildLog) !=
AMD_COMGR_STATUS_SUCCESS) {
std::strcpy(BuildLog, "extracting build log failed (cannot copy log).");
return;
}
}
} // namespace
#endif

ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt)
: Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, Context{
Ctxt} {
Expand All @@ -18,6 +82,22 @@ ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt)

/// Releases the program's context handle via urContextRelease.
ur_program_handle_t_::~ur_program_handle_t_() {
  urContextRelease(Context);
}

/// Scans program metadata for entries this adapter understands.
///
/// Currently only the "needs finalization" tag is recognised; its 32-bit value
/// is stored in IsRelocatable. All other entries, and entries without a name,
/// are ignored.
///
/// \param Metadata Array of \p Length metadata entries.
/// \param Length   Number of entries in \p Metadata.
/// \return UR_RESULT_ERROR_INVALID_NULL_POINTER if \p Metadata is null while
///         \p Length is non-zero, UR_RESULT_SUCCESS otherwise.
ur_result_t
ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
                                  size_t Length) {
  if (Length > 0 && Metadata == nullptr) {
    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
  }

  for (size_t i = 0; i < Length; ++i) {
    // Bind by reference: there is no need to copy each element.
    const ur_program_metadata_t &MetadataElement = Metadata[i];

    // Constructing a std::string from a null pointer is undefined behaviour,
    // so unnamed entries are skipped.
    if (MetadataElement.pName == nullptr) {
      continue;
    }
    const std::string MetadataElementName{MetadataElement.pName};

    if (MetadataElementName ==
        __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION) {
      assert(MetadataElement.type == UR_PROGRAM_METADATA_TYPE_UINT32);
      IsRelocatable = MetadataElement.value.data32 != 0;
    }
  }
  return UR_RESULT_SUCCESS;
}

ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) {
// Do not re-set program binary data which has already been set as that will
// delete the old binary data.
Expand All @@ -28,7 +108,80 @@ ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) {
return UR_RESULT_SUCCESS;
}

/// Links the relocatable object held in Binary into an executable with comgr
/// and makes Binary/BinarySizeInBytes refer to the result.
///
/// Without SYCL_ENABLE_KERNEL_FUSION this asserts and returns
/// UR_RESULT_ERROR_UNKNOWN.
///
/// \return UR_RESULT_SUCCESS once the executable has been stored in
///         ExecutableCache, or UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE when the
///         link action fails (the comgr log is then copied into ErrorLog).
///
/// NOTE(review): the other comgr calls are wrapped in UR_CHECK_ERROR, which is
/// defined outside this view; it presumably routes through checkErrorUR and
/// therefore reports failures by throwing rather than by return value --
/// confirm against the macro definition.
ur_result_t ur_program_handle_t_::finalizeRelocatable() {
#ifndef SYCL_ENABLE_KERNEL_FUSION
assert(false && "Relocation only available with fusion");
return UR_RESULT_ERROR_UNKNOWN;
#else
assert(IsRelocatable && "Not a relocatable input");
amd_comgr_data_t ComgrData;
amd_comgr_data_set_t RelocatableData;
// Input data set for the link action; destroyed by the guard on scope exit.
UR_CHECK_ERROR(amd_comgr_create_data_set(&RelocatableData));
COMgrDataSetTCleanUp RelocatableDataCleanup{RelocatableData};

UR_CHECK_ERROR(
amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &ComgrData));
// RAII for auto clean-up
COMgrDataTCleanUp DataCleanup{ComgrData};
// Hand the current program binary to comgr as a relocatable named
// "jit_obj.o" and add it to the input set.
UR_CHECK_ERROR(amd_comgr_set_data(ComgrData, BinarySizeInBytes, Binary));
UR_CHECK_ERROR(amd_comgr_set_data_name(ComgrData, "jit_obj.o"));

UR_CHECK_ERROR(amd_comgr_data_set_add(RelocatableData, ComgrData));

amd_comgr_action_info_t Action;

UR_CHECK_ERROR(amd_comgr_create_action_info(&Action));
COMgrActionInfoCleanUp ActionCleanUp{Action};

// Build the ISA name from the fixed "amdgcn-amd-amdhsa--" prefix and the
// gcnArchName reported by HIP for the context's device.
std::string ISA = "amdgcn-amd-amdhsa--";
hipDeviceProp_t Props;
detail::ur::assertion(hipGetDeviceProperties(
&Props, Context->getDevice()->get()) == hipSuccess);
ISA += Props.gcnArchName;
UR_CHECK_ERROR(amd_comgr_action_info_set_isa_name(Action, ISA.data()));

// Logging is enabled so that a failing link leaves a log to report.
UR_CHECK_ERROR(amd_comgr_action_info_set_logging(Action, true));

amd_comgr_data_set_t Output;
UR_CHECK_ERROR(amd_comgr_create_data_set(&Output));
COMgrDataSetTCleanUp OutputDataCleanup{Output};

// The link step is checked by hand (not via UR_CHECK_ERROR) so that the build
// log can be copied into ErrorLog before reporting the failure.
if (amd_comgr_do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE,
Action, RelocatableData,
Output) != AMD_COMGR_STATUS_SUCCESS) {
getCoMgrBuildLog(Output, ErrorLog, MAX_LOG_SIZE);
return UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE;
}
amd_comgr_data_t binaryData;

// Fetch the first (index 0) executable produced by the action.
UR_CHECK_ERROR(amd_comgr_action_data_get_data(
Output, AMD_COMGR_DATA_KIND_EXECUTABLE, 0, &binaryData));
{
COMgrDataTCleanUp binaryDataCleanUp{binaryData};

// First call with a null destination queries the size, second call copies
// the bytes into ExecutableCache.
size_t binarySize = 0;
UR_CHECK_ERROR(amd_comgr_get_data(binaryData, &binarySize, NULL));

ExecutableCache.resize(binarySize);

UR_CHECK_ERROR(
amd_comgr_get_data(binaryData, &binarySize, ExecutableCache.data()));
}
// From here on Binary points into ExecutableCache, which is a member of this
// object.
Binary = ExecutableCache.data();
BinarySizeInBytes = ExecutableCache.size();
return UR_RESULT_SUCCESS;
#endif
}

ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) {
if (IsRelocatable) {
if (finalizeRelocatable() != UR_RESULT_SUCCESS) {
BuildStatus = UR_PROGRAM_BUILD_STATUS_ERROR;
return UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE;
}
IsRelocatable = false;
}

if (BuildOptions) {
this->BuildOptions = BuildOptions;
}
Expand Down Expand Up @@ -246,7 +399,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle(
/// Note: Only supports one device
UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size,
const uint8_t *pBinary, const ur_program_properties_t *,
const uint8_t *pBinary, const ur_program_properties_t *pProperties,
ur_program_handle_t *phProgram) {
UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY);
UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
Expand All @@ -259,6 +412,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(

// TODO: Set metadata here and use reqd_work_group_size information.
// See urProgramCreateWithBinary in CUDA adapter.
if (pProperties) {
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
} else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
return UR_RESULT_ERROR_INVALID_SIZE;
}
Result =
RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count);
}
UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);

auto pBinary_string = reinterpret_cast<const char *>(pBinary);
if (size == 0) {
Expand Down
7 changes: 7 additions & 0 deletions source/adapters/hip/program.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ struct ur_program_handle_t_ {
size_t BinarySizeInBytes;
std::atomic_uint32_t RefCount;
ur_context_handle_t Context;
std::string ExecutableCache;

// Metadata
bool IsRelocatable = false;

constexpr static size_t MAX_LOG_SIZE = 8192u;

Expand All @@ -33,9 +37,12 @@ struct ur_program_handle_t_ {
ur_program_handle_t_(ur_context_handle_t Ctxt);
~ur_program_handle_t_();

ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length);

ur_result_t setBinary(const char *Binary, size_t BinarySizeInBytes);

ur_result_t buildProgram(const char *BuildOptions);
ur_result_t finalizeRelocatable();
ur_context_handle_t getContext() const { return Context; };

native_type get() const noexcept { return Module; };
Expand Down
1 change: 1 addition & 0 deletions source/ur/ur.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ const ur_command_t UR_EXT_COMMAND_TYPE_USER =
#define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \
"@reqd_work_group_size"
#define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping"
#define __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization"

// Terminates the process with a catastrophic error message.
[[noreturn]] inline void die(const char *Message) {
Expand Down

0 comments on commit cf26de2

Please sign in to comment.