
Update cudnn convolution kernel #10440

Open · wants to merge 13 commits into master
3 changes: 3 additions & 0 deletions cmake/third_party.cmake
@@ -145,6 +145,7 @@ if(BUILD_CUDA)
include(nccl)
include(cutlass)
include(trt_flash_attention)
include(cudnn-frontend)

list(APPEND oneflow_third_party_libs ${NCCL_LIBRARIES})
list(APPEND oneflow_third_party_libs ${CUDNN_LIBRARIES})
@@ -164,6 +165,8 @@ if(BUILD_CUDA)
list(APPEND oneflow_third_party_dependencies trt_flash_attention)
list(APPEND oneflow_third_party_libs ${TRT_FLASH_ATTENTION_LIBRARIES})
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${TRT_FLASH_ATTENTION_INCLUDE_DIR})
list(APPEND oneflow_third_party_dependencies cudnn_frontend_copy_headers_to_destination)
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${CUDNN_FRONTEND_INCLUDE_DIR})
endif()

if(BUILD_RDMA)
29 changes: 29 additions & 0 deletions cmake/third_party/cudnn-frontend.cmake
@@ -0,0 +1,29 @@
include(ExternalProject)

set(CUDNN_FRONTEND_URL https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.1.2.zip)
set(CUDNN_FRONTEND_MD5 7e16cc2dcaddefa7fd0f3d82b9cf5d73)
use_mirror(VARIABLE CUDNN_FRONTEND_URL URL ${CUDNN_FRONTEND_URL})

set(CUDNN_FRONTEND_INCLUDE_DIR ${THIRD_PARTY_DIR}/cudnn-frontend/include)
set(CUDNN_FRONTEND_BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cudnn-frontend/src/cudnn-frontend)

if(THIRD_PARTY)
ExternalProject_Add(
cudnn-frontend
PREFIX cudnn-frontend
URL ${CUDNN_FRONTEND_URL}
URL_MD5 ${CUDNN_FRONTEND_MD5}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND "")

add_copy_headers_target(
NAME
cudnn_frontend
SRC
${CUDNN_FRONTEND_BASE_DIR}/include/
DST
${CUDNN_FRONTEND_INCLUDE_DIR}
DEPS
cudnn-frontend)
endif(THIRD_PARTY)
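
Downstream C++ sources then pick the copied headers up through ONEFLOW_THIRD_PARTY_INCLUDE_DIRS; cudnn-frontend is header-only, so a single include suffices (illustrative):

#include <cudnn_frontend.h>  // umbrella header shipped by NVIDIA/cudnn-frontend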
258 changes: 258 additions & 0 deletions oneflow/core/device/cudnn_conv_util.cpp
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_CUDA
#include "oneflow/core/framework/infer_util.h"
#include "oneflow/core/device/cudnn_conv_util.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/common/cached_caller.h"
@@ -22,6 +23,7 @@ limitations under the License.
#include "oneflow/core/job/global_for.h"
#include "oneflow/core/job/global_for.h"
#include "oneflow/core/framework/op_kernel.h"
#include "oneflow/core/job/lazy_mode.h"

namespace oneflow {

@@ -82,6 +84,7 @@ perf_t GetBestAlgorithm(const CudnnConvArgs& args, CudnnConvResource* res,
FOR_RANGE(size_t, i, 0, perf_vec.size()) {
// Note: Shouldn't all returned results be successful?
CHECK_EQ(perf_vec[i].status, CUDNN_STATUS_SUCCESS);
// TODO: the workspace size limit can cause the chosen algorithm to mismatch PyTorch's result for large tensors
if (perf_vec[i].memory > args.params.max_ws_size) { continue; }
if (args.deterministic && perf_vec[i].determinism == CUDNN_NON_DETERMINISTIC) { continue; }
found_algo_idx = i;
@@ -332,6 +335,22 @@ CudnnConvArgs::CudnnConvArgs(const user_op::KernelComputeContext& ctx, DataType
params.max_ws_size = max_workspace_size;
}

CudnnConvArgsV8::CudnnConvArgsV8(const user_op::InferContext& ctx, const user_op::TensorDesc& x,
const user_op::TensorDesc& y, const user_op::TensorDesc& w)
: xdesc(GetTensorDescriptor(x, 'x')),
ydesc(GetTensorDescriptor(y, 'y')),
wdesc(GetTensorDescriptor(w, 'w')),
cdesc(GetConvDescriptor(ctx, GetCudnnDataType(y.data_type()))),
beta(0.0f) {}

CudnnConvArgsV8::CudnnConvArgsV8(const user_op::KernelComputeContext& ctx, const user_op::Tensor* x,
const user_op::Tensor* y, const user_op::Tensor* w)
: xdesc(GetTensorDescriptor(x, 'x')),
ydesc(GetTensorDescriptor(y, 'y')),
wdesc(GetTensorDescriptor(w, 'w')),
cdesc(GetConvDescriptor(ctx, GetCudnnDataType(y->data_type()))),
beta(0.0f) {}

ManagedCudnnConvResource::ManagedCudnnConvResource(const CudnnConvArgs& args)
: handle_(nullptr), x_dptr_(nullptr), w_dptr_(nullptr), y_dptr_(nullptr), ws_dptr_(nullptr) {
x_byte_size_ = ByteSize4Tensor(args.params.x_dims, args.params.x_ndim, args.params.x_data_type);
@@ -424,6 +443,245 @@ cudnnStatus_t GetCudnnConvWorkspaceSize(const CudnnConvArgs& args, CudnnConvReso
args.wdesc.Get(), algo, sz);
}

void RunSingleConv(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc,
                   user_op::Tensor* x, user_op::Tensor* y, user_op::Tensor* w,
                   user_op::Tensor* buf, const CudnnConvArgsV8& args) {
std::string tag;
auto configs =
GetConfigs(handle, desc, args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag);
  TryConfigs(handle, x, y, w, buf, configs, tag);
}
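
For orientation, a minimal sketch of how a conv kernel's Compute() might drive this entry point; the function name, the tensor arg names ("in", "weight", "out", "tmp_buffer"), and the stream access are assumptions, not taken from this diff:

// Hypothetical call site for a forward conv; names are illustrative.
void ComputeFwdConv(user_op::KernelComputeContext* ctx) {
  user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0);
  user_op::Tensor* w = ctx->Tensor4ArgNameAndIndex("weight", 0);
  user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("out", 0);
  user_op::Tensor* buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
  CudnnConvArgsV8 args(*ctx, x, y, w);
  RunSingleConv(ctx->stream()->As<ep::CudaStream>()->cudnn_handle(),
                CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, x, y, w, buf, args);
}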

cudnn_frontend::EngineConfigList GetConfigs(const cudnnHandle_t handle,
const cudnnBackendDescriptorType_t desc,
const cudnn_frontend::Tensor& xdesc,
const cudnn_frontend::Tensor& ydesc,
const cudnn_frontend::Tensor& wdesc,
const cudnn_frontend::ConvDesc& cdesc, float beta,
std::string& tag) {
auto op_graph = BuildConvOpGraph(handle, desc, xdesc, ydesc, wdesc, cdesc, beta);
tag = op_graph.getTag();
auto sources = GetGeneratorSources(desc);
cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data());
auto configs = generator.generate_engine_config(op_graph);
return configs;
}

cudnn_frontend::OperationGraph BuildConvOpGraph(const cudnnHandle_t handle,
const cudnnBackendDescriptorType_t desc,
const cudnn_frontend::Tensor& xdesc,
const cudnn_frontend::Tensor& ydesc,
const cudnn_frontend::Tensor& wdesc,
const cudnn_frontend::ConvDesc& cdesc, float beta) {
auto conv_op = cudnn_frontend::OperationBuilder(desc)
.setxDesc(xdesc)
.setyDesc(ydesc)
.setwDesc(wdesc)
.setcDesc(cdesc)
.setBeta(beta)
.build();
std::array<cudnn_frontend::Operation const*, 1> ops = {&conv_op};
auto op_graph = cudnn_frontend::OperationGraphBuilder()
.setHandle(handle)
.setOperationGraph(ops.size(), ops.data())
.build();
return op_graph;
}
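
The same graph construction serves all three convolution flavors; only desc changes. A hedged mapping (the helper is illustrative; the enum values are the cuDNN backend ones):

// Illustrative: which backend descriptor type a kernel would pass as `desc`.
cudnnBackendDescriptorType_t DescFor(bool forward, bool data_grad) {
  if (forward) { return CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR; }
  if (data_grad) { return CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR; }
  return CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR;
}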

cudnn_frontend::Tensor GetTensorDescriptor(const user_op::Tensor* t, const int64_t id) {
auto dim = t->shape_view();
auto stride = t->stride();
return cudnn_frontend::TensorBuilder()
.setDim(dim.size(), dim.data())
.setStride(stride.size(), stride.data())
.setId(id)
.setAlignment(32)
.setDataType(GetCudnnDataType(t->data_type()))
.build();
}

cudnn_frontend::Tensor GetTensorDescriptor(const user_op::TensorDesc& t, const int64_t id) {
auto dim = t.shape();
auto stride = t.stride();
return cudnn_frontend::TensorBuilder()
.setDim(dim.size(), dim.data())
.setStride(stride.size(), stride.data())
.setId(id)
.setAlignment(32)
.setDataType(GetCudnnDataType(t.data_type()))
.build();
}
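
As a concrete illustration (shape and values assumed, not from this diff), a contiguous 8x64x56x56 NCHW float tensor would be described as:

// Illustrative: what GetTensorDescriptor produces for a contiguous NCHW float tensor.
int64_t dim[] = {8, 64, 56, 56};
int64_t stride[] = {64 * 56 * 56, 56 * 56, 56, 1};  // contiguous NCHW strides
auto xdesc = cudnn_frontend::TensorBuilder()
                 .setDim(4, dim)
                 .setStride(4, stride)
                 .setId('x')        // must match the uid used in the variant pack below
                 .setAlignment(32)  // bytes; assumes 32-byte-aligned device pointers
                 .setDataType(CUDNN_DATA_FLOAT)
                 .build();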

cudnn_frontend::ConvDesc GetConvDescriptor(const user_op::InferContext& ctx,
cudnnDataType_t data_type) {
if (data_type == CUDNN_DATA_HALF || data_type == CUDNN_DATA_BFLOAT16) {
data_type = CUDNN_DATA_FLOAT;
}

  std::vector<int64_t> padding;
  const auto& padding_before = ctx.Attr<std::vector<int32_t>>("padding_before");
  std::copy(padding_before.begin(), padding_before.end(), std::back_inserter(padding));

  std::vector<int64_t> stride;
  const auto& strides = ctx.Attr<std::vector<int32_t>>("strides");
  std::copy(strides.begin(), strides.end(), std::back_inserter(stride));

  std::vector<int64_t> dilation;
  const auto& dilation_rate = ctx.Attr<std::vector<int32_t>>("dilation_rate");
  std::copy(dilation_rate.begin(), dilation_rate.end(), std::back_inserter(dilation));

uint64_t ndim = stride.size();
return cudnn_frontend::ConvDescBuilder()
.setDataType(data_type)
.setMathMode(CUDNN_CROSS_CORRELATION)
.setNDims(ndim)
.setStrides(ndim, stride.data())
.setPrePadding(ndim, padding.data())
.setPostPadding(ndim, padding.data())
.setDilation(ndim, dilation.data())
.build();
}

cudnn_frontend::ConvDesc GetConvDescriptor(const user_op::KernelComputeContext& ctx,
cudnnDataType_t data_type) {
if (data_type == CUDNN_DATA_HALF || data_type == CUDNN_DATA_BFLOAT16) {
data_type = CUDNN_DATA_FLOAT;
}

  std::vector<int64_t> padding;
  const auto& padding_before = ctx.Attr<std::vector<int32_t>>("padding_before");
  std::copy(padding_before.begin(), padding_before.end(), std::back_inserter(padding));

  std::vector<int64_t> stride;
  const auto& strides = ctx.Attr<std::vector<int32_t>>("strides");
  std::copy(strides.begin(), strides.end(), std::back_inserter(stride));

  std::vector<int64_t> dilation;
  const auto& dilation_rate = ctx.Attr<std::vector<int32_t>>("dilation_rate");
  std::copy(dilation_rate.begin(), dilation_rate.end(), std::back_inserter(dilation));

uint64_t ndim = stride.size();
return cudnn_frontend::ConvDescBuilder()
.setDataType(data_type)
.setMathMode(CUDNN_CROSS_CORRELATION)
.setNDims(ndim)
.setStrides(ndim, stride.data())
.setPrePadding(ndim, padding.data())
.setPostPadding(ndim, padding.data())
.setDilation(ndim, dilation.data())
.build();
}
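
For example (attribute values assumed), strides=[1,1], padding_before=[1,1], and dilation_rate=[1,1] produce:

// Illustrative 2-D descriptor for stride 1, padding 1, dilation 1.
int64_t pad[] = {1, 1}, stride[] = {1, 1}, dilation[] = {1, 1};
auto cdesc = cudnn_frontend::ConvDescBuilder()
                 .setDataType(CUDNN_DATA_FLOAT)  // half/bfloat16 accumulate in float, per above
                 .setMathMode(CUDNN_CROSS_CORRELATION)
                 .setNDims(2)
                 .setStrides(2, stride)
                 .setPrePadding(2, pad)
                 .setPostPadding(2, pad)
                 .setDilation(2, dilation)
                 .build();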

std::vector<cudnn_frontend::GeneratorSource> GetGeneratorSources(
const cudnnBackendDescriptorType_t desc) {
bool deterministic = Singleton<ResourceDesc, ForSession>::Get()
->resource()
.cudnn_conf()
.cudnn_conv_use_deterministic_algo_only();
bool heuristic = ParseBooleanFromEnv("ONEFLOW_CUDNN_USE_HEURISTIC_MODE_B", false);
auto heur_mode = heuristic ? CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_A;
// Method for engine config generator based on heuristics
const auto heurgen_method =
[deterministic,
heur_mode](cudnn_frontend::OperationGraph& opGraph) -> cudnn_frontend::EngineConfigList {
auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
.setOperationGraph(opGraph)
.setHeurMode(heur_mode)
.build();
auto& engine_configs = heuristics.getEngineConfig(heuristics.getEngineConfigCount());
cudnn_frontend::EngineConfigList filtered_configs;
FilterEngineConfigs(engine_configs, filtered_configs, deterministic);
return filtered_configs;
};
// Method for engine config generator based on fallback list
const auto fallback_method =
[desc,
deterministic](cudnn_frontend::OperationGraph& opGraph) -> cudnn_frontend::EngineConfigList {
auto fallback = cudnn_frontend::EngineFallbackListBuilder()
.setOperationGraph(opGraph)
.setOperation(desc)
.build();
auto& fallback_list = fallback.getFallbackList();
cudnn_frontend::EngineConfigList filtered_configs;
FilterEngineConfigs(fallback_list, filtered_configs, deterministic);
return filtered_configs;
};
std::vector<cudnn_frontend::GeneratorSource> sources = {heurgen_method, fallback_method};
return sources;
}

void FilterEngineConfigs(cudnn_frontend::EngineConfigList& from,
cudnn_frontend::EngineConfigList& to, bool deterministic) {
auto filter = [=](cudnnBackendDescriptor_t c) {
if (deterministic) {
if (cudnn_frontend::hasNumericalNote<CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC>(c)) {
return true;
}
}
if (cudnn_frontend::hasNumericalNote<CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS>(c)) {
return true;
}
return false;
};
cudnn_frontend::filter(from, to, filter);
}

void TryConfigs(const cudnnHandle_t handle, user_op::Tensor* x, user_op::Tensor* y,
user_op::Tensor* w, user_op::Tensor* buf, cudnn_frontend::EngineConfigList& configs,
const std::string& tag) {
for (auto& config : configs) {
try {
auto plan = cudnn_frontend::ExecutionPlanBuilder()
.setHandle(handle)
.setEngineConfig(config, tag)
.build();
if (PlanErrataException(handle, plan.getTag())) { continue; }
RunConvPlan(handle, x, y, w, buf, plan);
return;
    } catch (cudnn_frontend::cudnnException& e) {
      // Building or executing this plan failed; fall through to the next config.
    }
}
}

size_t GetCudnnConvWorkspaceSizeV8(const cudnnHandle_t handle,
cudnn_frontend::EngineConfigList& configs,
const std::string& tag) {
for (auto& config : configs) {
try {
auto plan = cudnn_frontend::ExecutionPlanBuilder()
.setHandle(handle)
.setEngineConfig(config, tag)
.build();
if (PlanErrataException(handle, plan.getTag())) { continue; }
if (plan.getWorkspaceSize() > 0L) { return plan.getWorkspaceSize(); }
    } catch (cudnn_frontend::cudnnException& e) {
      // Building this plan failed; try the next config.
    }
  }
  // No candidate reported a positive workspace size; return 1 byte so that
  // downstream buffer allocation still succeeds.
  return 1L;
}
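
Taken together with GetConfigs, workspace inference reuses the same plan enumeration that execution uses; a sketch built only from functions in this diff (the wrapper function is an assumption):

// Illustrative: query the workspace bytes a forward conv plan will need.
size_t QueryFwdWorkspace(cudnnHandle_t handle, const CudnnConvArgsV8& args) {
  std::string tag;
  auto configs = GetConfigs(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR,
                            args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag);
  return GetCudnnConvWorkspaceSizeV8(handle, configs, tag);
}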

bool PlanErrataException(const cudnnHandle_t handle, const std::string& executionPlanTag) {
  static nlohmann::json errata_json_handle;
  // With an empty path, cudnn_frontend falls back to the CUDNN_ERRATA_JSON_FILE
  // environment variable; when neither is set, no errata filtering is applied.
  static bool has_json = cudnn_frontend::load_from_config(errata_json_handle, "");
if (!has_json) {
return false;
} else {
return cudnn_frontend::check_errata(errata_json_handle, executionPlanTag, handle,
[]() { return true; });
}
}

void RunConvPlan(const cudnnHandle_t handle, user_op::Tensor* x, user_op::Tensor* y,
user_op::Tensor* w, user_op::Tensor* buf,
const cudnn_frontend::ExecutionPlan& plan) {
  // The uids must match the ids assigned by GetTensorDescriptor ('x', 'y', 'w').
  void* data[] = {x->mut_dptr(), y->mut_dptr(), w->mut_dptr()};
  int64_t ids[] = {'x', 'y', 'w'};
auto variantPack = cudnn_frontend::VariantPackBuilder()
.setWorkspacePointer(buf->mut_dptr())
.setDataPointers(3, data)
.setUids(3, ids)
.build();
OF_CUDNN_CHECK(cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()));
}

template<>
struct CudnnConvAlgorithmSearch<cudnnConvolutionFwdAlgoPerf_t> {
using perf_t = cudnnConvolutionFwdAlgoPerf_t;