[CUDA] Add support for binary type query #990

Closed
85 changes: 50 additions & 35 deletions source/adapters/cuda/program.cpp
@@ -165,6 +165,42 @@ ur_result_t getKernelNames(ur_program_handle_t) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

/// Loads images from a list of PTX or CUBIN binaries.
/// Note: No calls to CUDA driver API in this function, only store binaries
/// for later.
///
/// Note: Only supports one device
Contributor:
@hdelan is that something we need to think about with multi device context (also the comment at the bottom of the diff)?

Contributor:
This should be OK. The entry point has a Device param, so multi dev ctx just changes constructor of program to take the device as well

///
ur_result_t createProgram(ur_context_handle_t hContext,
ur_device_handle_t hDevice, size_t size,
const uint8_t *pBinary,
const ur_program_properties_t *pProperties,
ur_program_handle_t *phProgram) {
UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
UR_RESULT_ERROR_INVALID_CONTEXT);
UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);

std::unique_ptr<ur_program_handle_t_> RetProgram{
new ur_program_handle_t_{hContext}};

if (pProperties) {
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
} else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
return UR_RESULT_ERROR_INVALID_SIZE;
}
UR_CHECK_ERROR(
RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count));
}

auto pBinary_string = reinterpret_cast<const char *>(pBinary);

UR_CHECK_ERROR(RetProgram->setBinary(pBinary_string, size));
*phProgram = RetProgram.release();

return UR_RESULT_SUCCESS;
}

/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object.
/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in
/// terms of CUDA adapter. See \ref urProgramCreateWithBinary.
@@ -175,8 +211,8 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
ur_device_handle_t hDevice = hContext->getDevice();
auto pBinary = reinterpret_cast<const uint8_t *>(pIL);

return urProgramCreateWithBinary(hContext, hDevice, length, pBinary,
pProperties, phProgram);
return createProgram(hContext, hDevice, length, pBinary, pProperties,
phProgram);
}

/// CUDA will handle the PTX/CUBIN binaries internally through a call to
@@ -185,7 +221,9 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
UR_APIEXPORT ur_result_t UR_APICALL
urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram,
const char *pOptions) {
return urProgramBuild(hContext, hProgram, pOptions);
UR_CHECK_ERROR(urProgramBuild(hContext, hProgram, pOptions));
hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
return UR_RESULT_SUCCESS;
}

/// Loads the images from a UR program into a CUmodule that can be
@@ -202,6 +240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext,
ScopedContext Active(hProgram->getContext());

hProgram->buildProgram(pOptions);
hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;

} catch (ur_result_t Err) {
Result = Err;
@@ -241,6 +280,7 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count,
RetProgram->setBinary(static_cast<const char *>(CuBin), CuBinSize);

Result = RetProgram->buildProgram(pOptions);
RetProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;
} catch (...) {
// Upon error attempt cleanup
UR_CHECK_ERROR(cuLinkDestroy(State));
@@ -287,6 +327,9 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice,
return ReturnValue(hProgram->BuildOptions.c_str());
case UR_PROGRAM_BUILD_INFO_LOG:
return ReturnValue(hProgram->InfoLog, hProgram->MaxLogSize);
case UR_PROGRAM_BUILD_INFO_BINARY_TYPE: {
return ReturnValue(hProgram->BinaryType);
}
default:
break;
}
@@ -384,44 +427,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle(
return UR_RESULT_SUCCESS;
}

/// Loads images from a list of PTX or CUBIN binaries.
/// Note: No calls to CUDA driver API in this function, only store binaries
/// for later.
///
/// Note: Only supports one device
///
UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size,
const uint8_t *pBinary, const ur_program_properties_t *pProperties,
ur_program_handle_t *phProgram) {
UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
UR_RESULT_ERROR_INVALID_CONTEXT);
UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);

ur_result_t Result = UR_RESULT_SUCCESS;
UR_CHECK_ERROR(
createProgram(hContext, hDevice, size, pBinary, pProperties, phProgram));
(*phProgram)->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;

std::unique_ptr<ur_program_handle_t_> RetProgram{
new ur_program_handle_t_{hContext}};

if (pProperties) {
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
} else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
return UR_RESULT_ERROR_INVALID_SIZE;
}
Result =
RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count);
}
UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);

auto pBinary_string = reinterpret_cast<const char *>(pBinary);

Result = RetProgram->setBinary(pBinary_string, size);
UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);

*phProgram = RetProgram.release();

return Result;
return UR_RESULT_SUCCESS;
}

// This entry point is only used for native specialization constants (SPIR-V),
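For reviewers who want to try this out: below is a minimal caller-side sketch of the query this PR enables. It is not part of the diff; it assumes the public ur_api.h header, that hContext and hDevice come from the usual UR platform/device enumeration, that ptx/ptxSize hold a valid PTX or CUBIN image, and it omits error checking. The helper name queryBinaryType is made up for illustration.

#include <ur_api.h>

// Sketch only: create a program from a device binary, build it, then query the
// binary type that this change reports through urProgramGetBuildInfo.
ur_program_binary_type_t queryBinaryType(ur_context_handle_t hContext,
                                         ur_device_handle_t hDevice,
                                         const uint8_t *ptx, size_t ptxSize) {
  ur_program_handle_t hProgram = nullptr;

  // Per this diff, the program is marked COMPILED_OBJECT on creation from a
  // binary...
  urProgramCreateWithBinary(hContext, hDevice, ptxSize, ptx,
                            /*pProperties=*/nullptr, &hProgram);

  // ...and promoted to EXECUTABLE once it has been built.
  urProgramBuild(hContext, hProgram, /*pOptions=*/nullptr);

  ur_program_binary_type_t binaryType = UR_PROGRAM_BINARY_TYPE_NONE;
  urProgramGetBuildInfo(hProgram, hDevice, UR_PROGRAM_BUILD_INFO_BINARY_TYPE,
                        sizeof(binaryType), &binaryType,
                        /*pPropSizeRet=*/nullptr);

  urProgramRelease(hProgram);
  return binaryType; // expected: UR_PROGRAM_BINARY_TYPE_EXECUTABLE
}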
6 changes: 6 additions & 0 deletions source/adapters/cuda/program.hpp
@@ -25,6 +25,12 @@ struct ur_program_handle_t_ {
std::atomic_uint32_t RefCount;
ur_context_handle_t Context;

/* The ur_program_binary_type_t property is defined individually for every
* device in a program. However, since the CUDA adapter only has 1 device per
* context / program, there is no need to keep track of its value for each
* device. */
ur_program_binary_type_t BinaryType = UR_PROGRAM_BINARY_TYPE_NONE;

// Metadata
std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
KernelReqdWorkGroupSizeMD;
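Complementing the sketch after program.cpp, a second illustrative fragment (again not part of the diff, same assumptions and header, error checks omitted): the member's NONE default and the urProgramCompile change are also observable from the caller side, so a program created from IL reports NONE until it is compiled, COMPILED_OBJECT afterwards, and EXECUTABLE once built or linked as shown above. The helper name checkCompileTransitions is made up for illustration.

// Sketch only: query the binary type before and after urProgramCompile.
void checkCompileTransitions(ur_context_handle_t hContext,
                             ur_device_handle_t hDevice,
                             const void *ptx, size_t ptxSize) {
  ur_program_handle_t hProg = nullptr;
  ur_program_binary_type_t type = UR_PROGRAM_BINARY_TYPE_NONE;

  urProgramCreateWithIL(hContext, ptx, ptxSize, /*pProperties=*/nullptr, &hProg);
  urProgramGetBuildInfo(hProg, hDevice, UR_PROGRAM_BUILD_INFO_BINARY_TYPE,
                        sizeof(type), &type, /*pPropSizeRet=*/nullptr);
  // type == UR_PROGRAM_BINARY_TYPE_NONE: nothing compiled or built yet.

  urProgramCompile(hContext, hProg, /*pOptions=*/nullptr);
  urProgramGetBuildInfo(hProg, hDevice, UR_PROGRAM_BUILD_INFO_BINARY_TYPE,
                        sizeof(type), &type, /*pPropSizeRet=*/nullptr);
  // type == UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT: for the CUDA adapter,
  // compile and build do the same work, but the reported type differs.

  urProgramRelease(hProg);
}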