From 35ff41cee0f63d39e663f6bc5777541bced0bc93 Mon Sep 17 00:00:00 2001 From: linzs148 <1483100349@qq.com> Date: Wed, 6 Mar 2024 07:01:57 +0000 Subject: [PATCH 01/12] add cudnn-frontend dependency --- cmake/third_party.cmake | 3 +++ cmake/third_party/cudnn-frontend.cmake | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 cmake/third_party/cudnn-frontend.cmake diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index c7ac2893e6e..99cb9577894 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -145,6 +145,7 @@ if(BUILD_CUDA) include(nccl) include(cutlass) include(trt_flash_attention) + include(cudnn-frontend) list(APPEND oneflow_third_party_libs ${NCCL_LIBRARIES}) list(APPEND oneflow_third_party_libs ${CUDNN_LIBRARIES}) @@ -164,6 +165,8 @@ if(BUILD_CUDA) list(APPEND oneflow_third_party_dependencies trt_flash_attention) list(APPEND oneflow_third_party_libs ${TRT_FLASH_ATTENTION_LIBRARIES}) list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${TRT_FLASH_ATTENTION_INCLUDE_DIR}) + list(APPEND oneflow_third_party_dependencies cudnn_frontend_copy_headers_to_destination) + list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${CUDNN_FRONTEND_INCLUDE_DIR}) endif() if(BUILD_RDMA) diff --git a/cmake/third_party/cudnn-frontend.cmake b/cmake/third_party/cudnn-frontend.cmake new file mode 100644 index 00000000000..f640c86d9a5 --- /dev/null +++ b/cmake/third_party/cudnn-frontend.cmake @@ -0,0 +1,25 @@ +include(ExternalProject) + +set(CUDNN_FRONTEND_URL https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.1.2.zip) +set(CUDNN_FRONTEND_MD5 7e16cc2dcaddefa7fd0f3d82b9cf5d73) +use_mirror(VARIABLE CUDNN_FRONTEND_URL URL ${CUDNN_FRONTEND_URL}) + +set(CUDNN_FRONTEND_INCLUDE_DIR ${THIRD_PARTY_DIR}/cudnn-frontend/include) +set(CUDNN_FRONTEND_BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cudnn-frontend/src/cudnn-frontend) + +if(THIRD_PARTY) + ExternalProject_Add( + cudnn-frontend + PREFIX cudnn-frontend + URL ${CUDNN_FRONTEND_URL} + URL_MD5 ${CUDNN_FRONTEND_MD5} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "") + + add_copy_headers_target( + NAME cudnn_frontend + SRC ${CUDNN_FRONTEND_BASE_DIR}/include/ + DST ${CUDNN_FRONTEND_INCLUDE_DIR} + DEPS cudnn-frontend) +endif(THIRD_PARTY) From dc8a96e89189d55b38591ffcd8da9dc1881e8227 Mon Sep 17 00:00:00 2001 From: oneflow-ci-bot Date: Wed, 6 Mar 2024 07:24:57 +0000 Subject: [PATCH 02/12] auto format by CI --- cmake/third_party/cudnn-frontend.cmake | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cmake/third_party/cudnn-frontend.cmake b/cmake/third_party/cudnn-frontend.cmake index f640c86d9a5..e43a5196125 100644 --- a/cmake/third_party/cudnn-frontend.cmake +++ b/cmake/third_party/cudnn-frontend.cmake @@ -18,8 +18,12 @@ if(THIRD_PARTY) INSTALL_COMMAND "") add_copy_headers_target( - NAME cudnn_frontend - SRC ${CUDNN_FRONTEND_BASE_DIR}/include/ - DST ${CUDNN_FRONTEND_INCLUDE_DIR} - DEPS cudnn-frontend) + NAME + cudnn_frontend + SRC + ${CUDNN_FRONTEND_BASE_DIR}/include/ + DST + ${CUDNN_FRONTEND_INCLUDE_DIR} + DEPS + cudnn-frontend) endif(THIRD_PARTY) From 6e01476f12b4b065307737a8b60663ca76c451a2 Mon Sep 17 00:00:00 2001 From: linzs148 <1483100349@qq.com> Date: Sun, 17 Mar 2024 08:16:56 +0000 Subject: [PATCH 03/12] add cudnn v8 conv forward kernel --- oneflow/core/device/cudnn_conv_util.cpp | 171 ++++++++++++++++++++ oneflow/core/device/cudnn_conv_util.h | 47 ++++++ oneflow/core/device/cudnn_util.h | 3 + oneflow/user/kernels/conv_cudnn_kernels.cpp | 100 ++++++++++++ 4 files changed, 321 
insertions(+) diff --git a/oneflow/core/device/cudnn_conv_util.cpp b/oneflow/core/device/cudnn_conv_util.cpp index 849551b686f..260b33a402a 100644 --- a/oneflow/core/device/cudnn_conv_util.cpp +++ b/oneflow/core/device/cudnn_conv_util.cpp @@ -22,6 +22,7 @@ limitations under the License. #include "oneflow/core/job/global_for.h" #include "oneflow/core/job/global_for.h" #include "oneflow/core/framework/op_kernel.h" +#include "oneflow/core/job/lazy_mode.h" namespace oneflow { @@ -424,6 +425,176 @@ cudnnStatus_t GetCudnnConvWorkspaceSize(const CudnnConvArgs& args, CudnnConvReso args.wdesc.Get(), algo, sz); } +void RunSingleConv(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, + const user_op::Tensor* x, user_op::Tensor* y, const user_op::Tensor* w, + user_op::Tensor* buf, const std::vector& padding, + const std::vector& stride, const std::vector& dilation, + float beta) { + std::string tag; + cudnn_frontend::EngineConfigList configs = + GetConfigs(handle, desc, x, y, w, padding, stride, dilation, tag, beta); + TryConfigs(handle, x, y, w, buf, configs, tag); +} + +cudnn_frontend::EngineConfigList GetConfigs( + const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const user_op::Tensor* x, + const user_op::Tensor* y, const user_op::Tensor* w, const std::vector& padding, + const std::vector& stride, const std::vector& dilation, std::string& tag, + float beta) { + auto op_graph = BuildConvOpGraph(handle, desc, x, y, w, padding, stride, dilation, beta); + tag = op_graph.getTag(); + auto sources = GetGeneratorSources(desc); + cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + auto configs = generator.generate_engine_config(op_graph); + return configs; +} + +cudnn_frontend::OperationGraph BuildConvOpGraph( + const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const user_op::Tensor* x, + const user_op::Tensor* y, const user_op::Tensor* w, const std::vector& padding, + const std::vector& stride, const std::vector& dilation, float beta) { + auto data_type = GetCudnnDataType(x->data_type()); + auto conv_op = cudnn_frontend::OperationBuilder(desc) + .setxDesc(GetTensorDescriptor(x, 'x')) + .setyDesc(GetTensorDescriptor(y, 'y')) + .setwDesc(GetTensorDescriptor(w, 'w')) + .setcDesc(GetConvDescriptor(data_type, padding, stride, dilation)) + .setBeta(beta) + .build(); + std::array ops = {&conv_op}; + auto op_graph = cudnn_frontend::OperationGraphBuilder() + .setHandle(handle) + .setOperationGraph(ops.size(), ops.data()) + .build(); + return op_graph; +} + +cudnn_frontend::Tensor GetTensorDescriptor(const user_op::Tensor* t, const int64_t id) { + auto dim = t->shape_view(); + auto stride = t->stride(); + auto data_type = GetCudnnDataType(t->data_type()); + return cudnn_frontend::TensorBuilder() + .setDim(dim.size(), dim.data()) + .setStride(stride.size(), stride.data()) + .setId(id) + .setAlignment(16) + .setDataType(data_type) + .build(); +} + +cudnn_frontend::ConvDesc GetConvDescriptor(cudnnDataType_t data_type, + const std::vector& padding, + const std::vector& stride, + const std::vector& dilation) { + uint64_t ndim = stride.size(); + return cudnn_frontend::ConvDescBuilder() + .setDataType(data_type) + .setMathMode(CUDNN_CROSS_CORRELATION) + .setNDims(ndim) + .setStrides(ndim, stride.data()) + .setPrePadding(ndim, padding.data()) + .setPostPadding(ndim, padding.data()) + .setDilation(ndim, dilation.data()) + .build(); +} + +std::vector GetGeneratorSources( + const cudnnBackendDescriptorType_t desc) { + bool deterministic = 
Singleton::Get() + ->resource() + .cudnn_conf() + .cudnn_conv_use_deterministic_algo_only(); + // Method for engine config generator based on heuristics + const auto heurgen_method = + [deterministic](cudnn_frontend::OperationGraph& opGraph) -> cudnn_frontend::EngineConfigList { + bool heuristic = Singleton::Get() + ->resource() + .cudnn_conf() + .cudnn_conv_heuristic_search_algo() + || (!LazyMode::is_enabled()); + auto heur_mode = heuristic ? CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_A; + auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() + .setOperationGraph(opGraph) + .setHeurMode(heur_mode) + .build(); + auto& engine_configs = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); + cudnn_frontend::EngineConfigList filtered_configs; + FilterEngineConfigs(engine_configs, filtered_configs, deterministic); + return filtered_configs; + }; + // Method for engine config generator based on fallback list + const auto fallback_method = + [&desc, + deterministic](cudnn_frontend::OperationGraph& opGraph) -> cudnn_frontend::EngineConfigList { + auto fallback = cudnn_frontend::EngineFallbackListBuilder() + .setOperationGraph(opGraph) + .setOperation(desc) + .build(); + auto& fallback_list = fallback.getFallbackList(); + cudnn_frontend::EngineConfigList filtered_configs; + FilterEngineConfigs(fallback_list, filtered_configs, deterministic); + return filtered_configs; + }; + std::vector sources = {heurgen_method, fallback_method}; + return sources; +} + +void FilterEngineConfigs(cudnn_frontend::EngineConfigList& from, + cudnn_frontend::EngineConfigList& to, bool deterministic) { + auto filter = [=](cudnnBackendDescriptor_t c) { + if (deterministic) { + if (cudnn_frontend::hasNumericalNote(c)) { + return true; + } + } + if (cudnn_frontend::hasNumericalNote(c)) { + return true; + } + return false; + }; + cudnn_frontend::filter(from, to, filter); +} + +void TryConfigs(const cudnnHandle_t handle, const user_op::Tensor* x, user_op::Tensor* y, + const user_op::Tensor* w, user_op::Tensor* buf, + cudnn_frontend::EngineConfigList& configs, const std::string& tag) { + for (auto& config : configs) { + try { + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(config, tag) + .build(); + if (PlanErrataException(handle, tag)) { continue; } + RunConvPlan(handle, x, y, w, buf, plan); + return; + } catch (cudnn_frontend::cudnnException& e) { continue; } + } +} + +bool PlanErrataException(const cudnnHandle_t handle, const std::string& executionPlanTag) { + static nlohmann::json errata_json_handle; + static bool has_json = cudnn_frontend::load_from_config(errata_json_handle, ""); + if (!has_json) { + return false; + } else { + return cudnn_frontend::check_errata(errata_json_handle, executionPlanTag, handle, + []() { return true; }); + } +} + +void RunConvPlan(const cudnnHandle_t handle, const user_op::Tensor* x, user_op::Tensor* y, + const user_op::Tensor* w, user_op::Tensor* buf, + const cudnn_frontend::ExecutionPlan& plan) { + void* data[] = {const_cast(x->dptr()), y->mut_dptr(), const_cast(w->dptr())}; + int64_t ids[] = {'x', 'y', 'w'}; + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(buf->mut_dptr()) + .setDataPointers(3, data) + .setUids(3, ids) + .build(); + OF_CUDNN_CHECK(cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc())); +} + template<> struct CudnnConvAlgorithmSearch { using perf_t = cudnnConvolutionFwdAlgoPerf_t; diff --git a/oneflow/core/device/cudnn_conv_util.h 
b/oneflow/core/device/cudnn_conv_util.h index e917572580b..1ada22f107e 100644 --- a/oneflow/core/device/cudnn_conv_util.h +++ b/oneflow/core/device/cudnn_conv_util.h @@ -20,6 +20,7 @@ limitations under the License. #include "oneflow/core/device/cudnn_util.h" #include "oneflow/core/common/protobuf.h" +#include "oneflow/core/framework/user_op_tensor.h" namespace oneflow { @@ -168,6 +169,52 @@ cudnnStatus_t GetCudnnConvWorkspaceSize(const CudnnConvArgs& args, CudnnConvReso cudnnStatus_t GetCudnnConvWorkspaceSize(const CudnnConvArgs& args, CudnnConvResource* res, cudnnConvolutionBwdFilterAlgo_t algo, size_t* sz); +void RunSingleConv(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, + const user_op::Tensor* x, user_op::Tensor* y, const user_op::Tensor* w, + user_op::Tensor* buf, const std::vector& padding, + const std::vector& stride, const std::vector& dilation, + float beta); + +cudnn_frontend::EngineConfigList GetConfigs( + const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const user_op::Tensor* x, + const user_op::Tensor* y, const user_op::Tensor* w, const std::vector& padding, + const std::vector& stride, const std::vector& dilation, std::string& tag, + float beta); + +cudnn_frontend::OperationGraph BuildConvOpGraph( + const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const user_op::Tensor* x, + const user_op::Tensor* y, const user_op::Tensor* w, const std::vector& padding, + const std::vector& stride, const std::vector& dilation, float beta); + +cudnn_frontend::OperationGraph BuildConvBiasGraph( + const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const user_op::Tensor* x, + const user_op::Tensor* w, const user_op::Tensor* b, user_op::Tensor* y, + const std::vector& padding, const std::vector& stride, + const std::vector& dilation); + +cudnn_frontend::Tensor GetTensorDescriptor(const user_op::Tensor* t, const int64_t id); + +cudnn_frontend::ConvDesc GetConvDescriptor(cudnnDataType_t data_type, + const std::vector& padding, + const std::vector& stride, + const std::vector& dilation); + +std::vector GetGeneratorSources( + const cudnnBackendDescriptorType_t desc); + +void FilterEngineConfigs(cudnn_frontend::EngineConfigList& from, + cudnn_frontend::EngineConfigList& to, bool deterministic); + +void TryConfigs(const cudnnHandle_t handle, const user_op::Tensor* x, user_op::Tensor* y, + const user_op::Tensor* w, user_op::Tensor* buf, + cudnn_frontend::EngineConfigList& configs, const std::string& tag); + +bool PlanErrataException(const cudnnHandle_t handle, const std::string& executionPlanTag); + +void RunConvPlan(const cudnnHandle_t handle, const user_op::Tensor* x, user_op::Tensor* y, + const user_op::Tensor* w, user_op::Tensor* buf, + const cudnn_frontend::ExecutionPlan& plan); + template perf_t FindCudnnConvAlgorithm(CudnnConvArgs* args); diff --git a/oneflow/core/device/cudnn_util.h b/oneflow/core/device/cudnn_util.h index 92c3d50e950..e05c5726fcb 100644 --- a/oneflow/core/device/cudnn_util.h +++ b/oneflow/core/device/cudnn_util.h @@ -22,6 +22,9 @@ limitations under the License. 
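// Taken together, the declarations above form one pipeline. A minimal sketch of the
// intended call order for a forward convolution (illustrative only: "handle", "x", "y",
// "w", "buf" and the attribute vectors are assumed to be set up by the caller):
//
//   std::string tag;
//   cudnn_frontend::EngineConfigList configs = GetConfigs(
//       handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR,
//       x, y, w, padding, stride, dilation, tag, /*beta=*/0.0f);
//   TryConfigs(handle, x, y, w, buf, configs, tag);
//
// GetConfigs builds the operation graph (BuildConvOpGraph) and runs the generator
// sources (GetGeneratorSources); TryConfigs then turns each engine config into an
// ExecutionPlan, skips plans rejected by the errata check (PlanErrataException), and
// RunConvPlan finally binds the x/y/w pointers plus the workspace into a VariantPack
// for cudnnBackendExecute. RunSingleConv is the convenience wrapper doing both steps.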
#ifdef WITH_CUDA #include "cudnn.h" +#include "cudnn_frontend.h" +#include "cudnn_frontend_find_plan.h" +#include "cudnn_frontend_get_plan.h" namespace oneflow { diff --git a/oneflow/user/kernels/conv_cudnn_kernels.cpp b/oneflow/user/kernels/conv_cudnn_kernels.cpp index b98c57afcb8..15d2ec7e91e 100644 --- a/oneflow/user/kernels/conv_cudnn_kernels.cpp +++ b/oneflow/user/kernels/conv_cudnn_kernels.cpp @@ -252,6 +252,106 @@ REGISTER_CONV_KERNEL(conv1d, 1); REGISTER_CONV_KERNEL(conv2d, 2); REGISTER_CONV_KERNEL(conv3d, 3); +template +class ConvGpuKernelV8 final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + ConvGpuKernelV8() = default; + ~ConvGpuKernelV8() = default; + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + std::shared_ptr CreateConvCudnnOpKernelCache( + user_op::KernelCacheContext* ctx) const { + const auto& data_format = ctx->Attr("data_format"); + int32_t filters = ctx->Attr("filters"); + + std::shared_ptr state(new ConvCudnnOpKernelCache()); + + const user_op::TensorDesc* bias = ctx->TensorDesc4ArgNameAndIndex("bias", 0); + if (bias != nullptr) { + state->bias_desc.reset( + GetBiasCudnnTensorDesc(data_format, filters, bias->data_type())); + } + + return state; + } + + std::shared_ptr InitOpKernelCache( + user_op::KernelCacheContext* ctx) const override { + return CreateConvCudnnOpKernelCache(ctx); + } + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + // process context data + const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("out", 0); + const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); + user_op::Tensor* buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + std::vector padding; + const auto& padding_before = ctx->Attr>("padding_before"); + copy(padding_before.begin(), padding_before.end(), back_inserter(padding)); + + std::vector stride; + const auto& strides = ctx->Attr>("strides"); + copy(strides.begin(), strides.end(), back_inserter(stride)); + + std::vector dilation; + const auto& dilation_rate = ctx->Attr>("dilation_rate"); + copy(dilation_rate.begin(), dilation_rate.end(), back_inserter(dilation)); + + // process add_to_output + float beta = 0.0f; + if (ctx->has_input("_add_to_output", 0)) { + const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); + Memcpy( + ctx->stream(), output->mut_dptr(), add_to_output->dptr(), + add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); + beta = 1.0f; + } + + // trigger conv compute + cudnnHandle_t handle = ctx->stream()->As()->cudnn_handle(); + RunSingleConv(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, input, output, + weight, buffer, padding, stride, dilation, beta); + + // process bias + const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); + if (bias != nullptr) { + const auto* conv_cache = dynamic_cast(cache); + CHECK_NOTNULL(conv_cache); + const std::string& data_format = ctx->Attr("data_format"); + CudnnTensorDesc output_desc(output->data_type(), output->shape_view(), data_format); + OF_CUDNN_CHECK(cudnnAddTensor(ctx->stream()->As()->cudnn_handle(), + CudnnSPOnePtr(input->data_type()), conv_cache->bias_desc->Get(), + bias->dptr(), CudnnSPOnePtr(input->data_type()), + output_desc.Get(), output->mut_dptr())); + } + } + + bool IsCudaGraphSupported(user_op::KernelInitContext* ctx, 
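// A guess at the rationale (not stated in the patch): exhaustive algorithm search
// launches real benchmark kernels, which is unsafe while a CUDA graph is being
// captured, so graph support is only advertised when the heuristic-only search
// path is enabled.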
+ user_op::OpKernelState* state) const override { + return Singleton::Get() + ->resource() + .cudnn_conf() + .cudnn_conv_heuristic_search_algo(); + } +}; + +#define REGISTER_CONV_KERNEL_V8(op_name, ndims) \ + REGISTER_USER_KERNEL(#op_name) \ + .SetCreateFn>() \ + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA \ + && user_op::HobEnvBool("ONEFLOW_KERNEL_ENABLE_CUDNN_V8", false)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 128 * 1024 * 1024; }) \ + .SetPriority(user_op::kKernelPriorityOptimized); + +REGISTER_CONV_KERNEL_V8(conv1d, 1); +REGISTER_CONV_KERNEL_V8(conv2d, 2); +REGISTER_CONV_KERNEL_V8(conv3d, 3); + class ConvDataGradGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { public: OF_DISALLOW_COPY_AND_MOVE(ConvDataGradGpuKernel); From 49e2c1862c23ede896e7e71b91e01c23f5fc9355 Mon Sep 17 00:00:00 2001 From: linzs148 <1483100349@qq.com> Date: Wed, 27 Mar 2024 05:09:45 +0000 Subject: [PATCH 04/12] add cudnn v8 conv backward kernel --- oneflow/core/device/cudnn_conv_util.cpp | 188 ++++++++++++++------ oneflow/core/device/cudnn_conv_util.h | 79 ++++---- oneflow/user/kernels/conv_cudnn_kernels.cpp | 180 ++++++++++++++++--- 3 files changed, 328 insertions(+), 119 deletions(-) diff --git a/oneflow/core/device/cudnn_conv_util.cpp b/oneflow/core/device/cudnn_conv_util.cpp index 260b33a402a..ce5d90b56a3 100644 --- a/oneflow/core/device/cudnn_conv_util.cpp +++ b/oneflow/core/device/cudnn_conv_util.cpp @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef WITH_CUDA +#include "oneflow/core/framework/infer_util.h" #include "oneflow/core/device/cudnn_conv_util.h" #include "oneflow/core/device/cuda_util.h" #include "oneflow/core/common/cached_caller.h" @@ -333,6 +334,22 @@ CudnnConvArgs::CudnnConvArgs(const user_op::KernelComputeContext& ctx, DataType params.max_ws_size = max_workspace_size; } +CudnnConvArgsV8::CudnnConvArgsV8(const user_op::InferContext& ctx, const user_op::TensorDesc& x, + const user_op::TensorDesc& y, const user_op::TensorDesc& w) + : xdesc(GetTensorDescriptor(x, 'x')), + ydesc(GetTensorDescriptor(y, 'y')), + wdesc(GetTensorDescriptor(w, 'w')), + cdesc(GetConvDescriptor(ctx, GetCudnnDataType(y.data_type()))), + beta(0.0f) {} + +CudnnConvArgsV8::CudnnConvArgsV8(const user_op::KernelComputeContext& ctx, const user_op::Tensor* x, + const user_op::Tensor* y, const user_op::Tensor* w) + : xdesc(GetTensorDescriptor(x, 'x')), + ydesc(GetTensorDescriptor(y, 'y')), + wdesc(GetTensorDescriptor(w, 'w')), + cdesc(GetConvDescriptor(ctx, GetCudnnDataType(y->data_type()))), + beta(0.0f) {} + ManagedCudnnConvResource::ManagedCudnnConvResource(const CudnnConvArgs& args) : handle_(nullptr), x_dptr_(nullptr), w_dptr_(nullptr), y_dptr_(nullptr), ws_dptr_(nullptr) { x_byte_size_ = ByteSize4Tensor(args.params.x_dims, args.params.x_ndim, args.params.x_data_type); @@ -426,22 +443,22 @@ cudnnStatus_t GetCudnnConvWorkspaceSize(const CudnnConvArgs& args, CudnnConvReso } void RunSingleConv(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, - const user_op::Tensor* x, user_op::Tensor* y, const user_op::Tensor* w, - user_op::Tensor* buf, const std::vector& padding, - const std::vector& stride, const std::vector& dilation, - float beta) { + user_op::Tensor* x, user_op::Tensor* y, user_op::Tensor* w, user_op::Tensor* b, + const CudnnConvArgsV8& args) { std::string tag; - cudnn_frontend::EngineConfigList configs = - GetConfigs(handle, desc, x, y, w, padding, 
stride, dilation, tag, beta); - TryConfigs(handle, x, y, w, buf, configs, tag); -} - -cudnn_frontend::EngineConfigList GetConfigs( - const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const user_op::Tensor* x, - const user_op::Tensor* y, const user_op::Tensor* w, const std::vector& padding, - const std::vector& stride, const std::vector& dilation, std::string& tag, - float beta) { - auto op_graph = BuildConvOpGraph(handle, desc, x, y, w, padding, stride, dilation, beta); + auto configs = + GetConfigs(handle, desc, args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag); + TryConfigs(handle, x, y, w, b, configs, tag); +} + +cudnn_frontend::EngineConfigList GetConfigs(const cudnnHandle_t handle, + const cudnnBackendDescriptorType_t desc, + const cudnn_frontend::Tensor& xdesc, + const cudnn_frontend::Tensor& ydesc, + const cudnn_frontend::Tensor& wdesc, + const cudnn_frontend::ConvDesc& cdesc, float beta, + std::string& tag) { + auto op_graph = BuildConvOpGraph(handle, desc, xdesc, ydesc, wdesc, cdesc, beta); tag = op_graph.getTag(); auto sources = GetGeneratorSources(desc); cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); @@ -449,16 +466,17 @@ cudnn_frontend::EngineConfigList GetConfigs( return configs; } -cudnn_frontend::OperationGraph BuildConvOpGraph( - const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const user_op::Tensor* x, - const user_op::Tensor* y, const user_op::Tensor* w, const std::vector& padding, - const std::vector& stride, const std::vector& dilation, float beta) { - auto data_type = GetCudnnDataType(x->data_type()); +cudnn_frontend::OperationGraph BuildConvOpGraph(const cudnnHandle_t handle, + const cudnnBackendDescriptorType_t desc, + const cudnn_frontend::Tensor& xdesc, + const cudnn_frontend::Tensor& ydesc, + const cudnn_frontend::Tensor& wdesc, + const cudnn_frontend::ConvDesc& cdesc, float beta) { auto conv_op = cudnn_frontend::OperationBuilder(desc) - .setxDesc(GetTensorDescriptor(x, 'x')) - .setyDesc(GetTensorDescriptor(y, 'y')) - .setwDesc(GetTensorDescriptor(w, 'w')) - .setcDesc(GetConvDescriptor(data_type, padding, stride, dilation)) + .setxDesc(xdesc) + .setyDesc(ydesc) + .setwDesc(wdesc) + .setcDesc(cdesc) .setBeta(beta) .build(); std::array ops = {&conv_op}; @@ -472,20 +490,67 @@ cudnn_frontend::OperationGraph BuildConvOpGraph( cudnn_frontend::Tensor GetTensorDescriptor(const user_op::Tensor* t, const int64_t id) { auto dim = t->shape_view(); auto stride = t->stride(); - auto data_type = GetCudnnDataType(t->data_type()); return cudnn_frontend::TensorBuilder() .setDim(dim.size(), dim.data()) .setStride(stride.size(), stride.data()) .setId(id) - .setAlignment(16) + .setAlignment(32) + .setDataType(GetCudnnDataType(t->data_type())) + .build(); +} + +cudnn_frontend::Tensor GetTensorDescriptor(const user_op::TensorDesc& t, const int64_t id) { + auto dim = t.shape(); + auto stride = t.stride(); + return cudnn_frontend::TensorBuilder() + .setDim(dim.size(), dim.data()) + .setStride(stride.size(), stride.data()) + .setId(id) + .setAlignment(32) + .setDataType(GetCudnnDataType(t.data_type())) + .build(); +} + +cudnn_frontend::ConvDesc GetConvDescriptor(const user_op::InferContext& ctx, + cudnnDataType_t data_type) { + std::vector padding; + const auto& padding_before = ctx.Attr>("padding_before"); + copy(padding_before.begin(), padding_before.end(), back_inserter(padding)); + + std::vector stride; + const auto& strides = ctx.Attr>("strides"); + copy(strides.begin(), strides.end(), 
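// The attribute vectors are copied element-wise rather than used directly because
// cudnn_frontend::ConvDescBuilder consumes 64-bit arrays
// (setStrides/setPrePadding/setPostPadding/setDilation take int64_t pointers),
// while the op attributes are stored as 32-bit integers.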
back_inserter(stride)); + + std::vector dilation; + const auto& dilation_rate = ctx.Attr>("dilation_rate"); + copy(dilation_rate.begin(), dilation_rate.end(), back_inserter(dilation)); + + uint64_t ndim = stride.size(); + return cudnn_frontend::ConvDescBuilder() .setDataType(data_type) + .setMathMode(CUDNN_CROSS_CORRELATION) + .setNDims(ndim) + .setStrides(ndim, stride.data()) + .setPrePadding(ndim, padding.data()) + .setPostPadding(ndim, padding.data()) + .setDilation(ndim, dilation.data()) .build(); } -cudnn_frontend::ConvDesc GetConvDescriptor(cudnnDataType_t data_type, - const std::vector& padding, - const std::vector& stride, - const std::vector& dilation) { +cudnn_frontend::ConvDesc GetConvDescriptor(const user_op::KernelComputeContext& ctx, + cudnnDataType_t data_type) { + std::vector padding; + const auto& padding_before = ctx.Attr>("padding_before"); + copy(padding_before.begin(), padding_before.end(), back_inserter(padding)); + + std::vector stride; + const auto& strides = ctx.Attr>("strides"); + copy(strides.begin(), strides.end(), back_inserter(stride)); + + std::vector dilation; + const auto& dilation_rate = ctx.Attr>("dilation_rate"); + copy(dilation_rate.begin(), dilation_rate.end(), back_inserter(dilation)); + uint64_t ndim = stride.size(); return cudnn_frontend::ConvDescBuilder() .setDataType(data_type) @@ -504,15 +569,16 @@ std::vector GetGeneratorSources( ->resource() .cudnn_conf() .cudnn_conv_use_deterministic_algo_only(); + bool heuristic = Singleton::Get() + ->resource() + .cudnn_conf() + .cudnn_conv_heuristic_search_algo() + || (!LazyMode::is_enabled()); + auto heur_mode = heuristic ? CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_A; // Method for engine config generator based on heuristics const auto heurgen_method = - [deterministic](cudnn_frontend::OperationGraph& opGraph) -> cudnn_frontend::EngineConfigList { - bool heuristic = Singleton::Get() - ->resource() - .cudnn_conf() - .cudnn_conv_heuristic_search_algo() - || (!LazyMode::is_enabled()); - auto heur_mode = heuristic ? 
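// Per the cuDNN documentation, CUDNN_HEUR_MODE_A is the fast decision-tree
// heuristic, while CUDNN_HEUR_MODE_B is slower to evaluate but generally more
// accurate; eager execution (LazyMode disabled) therefore opts into mode B below,
// trading a little setup time for better engine choices.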
CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_A; + [deterministic, + heur_mode](cudnn_frontend::OperationGraph& opGraph) -> cudnn_frontend::EngineConfigList { auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() .setOperationGraph(opGraph) .setHeurMode(heur_mode) @@ -522,20 +588,7 @@ std::vector GetGeneratorSources( FilterEngineConfigs(engine_configs, filtered_configs, deterministic); return filtered_configs; }; - // Method for engine config generator based on fallback list - const auto fallback_method = - [&desc, - deterministic](cudnn_frontend::OperationGraph& opGraph) -> cudnn_frontend::EngineConfigList { - auto fallback = cudnn_frontend::EngineFallbackListBuilder() - .setOperationGraph(opGraph) - .setOperation(desc) - .build(); - auto& fallback_list = fallback.getFallbackList(); - cudnn_frontend::EngineConfigList filtered_configs; - FilterEngineConfigs(fallback_list, filtered_configs, deterministic); - return filtered_configs; - }; - std::vector sources = {heurgen_method, fallback_method}; + std::vector sources = {heurgen_method}; return sources; } @@ -555,20 +608,37 @@ void FilterEngineConfigs(cudnn_frontend::EngineConfigList& from, cudnn_frontend::filter(from, to, filter); } -void TryConfigs(const cudnnHandle_t handle, const user_op::Tensor* x, user_op::Tensor* y, - const user_op::Tensor* w, user_op::Tensor* buf, - cudnn_frontend::EngineConfigList& configs, const std::string& tag) { +void TryConfigs(const cudnnHandle_t handle, user_op::Tensor* x, user_op::Tensor* y, + user_op::Tensor* w, user_op::Tensor* buf, cudnn_frontend::EngineConfigList& configs, + const std::string& tag) { for (auto& config : configs) { try { auto plan = cudnn_frontend::ExecutionPlanBuilder() .setHandle(handle) .setEngineConfig(config, tag) .build(); - if (PlanErrataException(handle, tag)) { continue; } + if (PlanErrataException(handle, plan.getTag())) { continue; } RunConvPlan(handle, x, y, w, buf, plan); return; - } catch (cudnn_frontend::cudnnException& e) { continue; } + } catch (cudnn_frontend::cudnnException& e) {} + } +} + +size_t GetCudnnConvWorkspaceSizeV8(const cudnnHandle_t handle, + cudnn_frontend::EngineConfigList& configs, + const std::string& tag) { + size_t workspace_size = 0; + for (auto& config : configs) { + try { + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(config, tag) + .build(); + if (PlanErrataException(handle, plan.getTag())) { continue; } + if (plan.getWorkspaceSize() > workspace_size) { workspace_size = plan.getWorkspaceSize(); } + } catch (cudnn_frontend::cudnnException& e) {} } + return workspace_size; } bool PlanErrataException(const cudnnHandle_t handle, const std::string& executionPlanTag) { @@ -582,10 +652,10 @@ bool PlanErrataException(const cudnnHandle_t handle, const std::string& executio } } -void RunConvPlan(const cudnnHandle_t handle, const user_op::Tensor* x, user_op::Tensor* y, - const user_op::Tensor* w, user_op::Tensor* buf, +void RunConvPlan(const cudnnHandle_t handle, user_op::Tensor* x, user_op::Tensor* y, + user_op::Tensor* w, user_op::Tensor* buf, const cudnn_frontend::ExecutionPlan& plan) { - void* data[] = {const_cast(x->dptr()), y->mut_dptr(), const_cast(w->dptr())}; + void* data[] = {x->mut_dptr(), y->mut_dptr(), w->mut_dptr()}; int64_t ids[] = {'x', 'y', 'w'}; auto variantPack = cudnn_frontend::VariantPackBuilder() .setWorkspacePointer(buf->mut_dptr()) diff --git a/oneflow/core/device/cudnn_conv_util.h b/oneflow/core/device/cudnn_conv_util.h index 1ada22f107e..c5b9b642721 100644 --- 
a/oneflow/core/device/cudnn_conv_util.h +++ b/oneflow/core/device/cudnn_conv_util.h @@ -17,7 +17,7 @@ limitations under the License. #define ONEFLOW_CORE_DEVICE_CUDNN_CONV_UTIL_H_ #ifdef WITH_CUDA - +#include "oneflow/core/common/tensor_desc.h" #include "oneflow/core/device/cudnn_util.h" #include "oneflow/core/common/protobuf.h" #include "oneflow/core/framework/user_op_tensor.h" @@ -94,6 +94,20 @@ struct CudnnConvArgs final { bool enable_pseudo_half); }; +struct CudnnConvArgsV8 final { + cudnn_frontend::Tensor xdesc; + cudnn_frontend::Tensor ydesc; + cudnn_frontend::Tensor wdesc; + cudnn_frontend::ConvDesc cdesc; + float beta; + + OF_DISALLOW_COPY_AND_MOVE(CudnnConvArgsV8); + explicit CudnnConvArgsV8(const user_op::InferContext& ctx, const user_op::TensorDesc& x, + const user_op::TensorDesc& y, const user_op::TensorDesc& w); + explicit CudnnConvArgsV8(const user_op::KernelComputeContext& ctx, const user_op::Tensor* x, + const user_op::Tensor* y, const user_op::Tensor* w); +}; + class CudnnConvResource { public: CudnnConvResource() = default; @@ -170,34 +184,33 @@ cudnnStatus_t GetCudnnConvWorkspaceSize(const CudnnConvArgs& args, CudnnConvReso cudnnConvolutionBwdFilterAlgo_t algo, size_t* sz); void RunSingleConv(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, - const user_op::Tensor* x, user_op::Tensor* y, const user_op::Tensor* w, - user_op::Tensor* buf, const std::vector& padding, - const std::vector& stride, const std::vector& dilation, - float beta); - -cudnn_frontend::EngineConfigList GetConfigs( - const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const user_op::Tensor* x, - const user_op::Tensor* y, const user_op::Tensor* w, const std::vector& padding, - const std::vector& stride, const std::vector& dilation, std::string& tag, - float beta); - -cudnn_frontend::OperationGraph BuildConvOpGraph( - const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const user_op::Tensor* x, - const user_op::Tensor* y, const user_op::Tensor* w, const std::vector& padding, - const std::vector& stride, const std::vector& dilation, float beta); - -cudnn_frontend::OperationGraph BuildConvBiasGraph( - const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const user_op::Tensor* x, - const user_op::Tensor* w, const user_op::Tensor* b, user_op::Tensor* y, - const std::vector& padding, const std::vector& stride, - const std::vector& dilation); + user_op::Tensor* x, user_op::Tensor* y, user_op::Tensor* w, user_op::Tensor* b, + const CudnnConvArgsV8& args); + +cudnn_frontend::EngineConfigList GetConfigs(const cudnnHandle_t handle, + const cudnnBackendDescriptorType_t desc, + const cudnn_frontend::Tensor& xdesc, + const cudnn_frontend::Tensor& ydesc, + const cudnn_frontend::Tensor& wdesc, + const cudnn_frontend::ConvDesc& cdesc, float beta, + std::string& tag); + +cudnn_frontend::OperationGraph BuildConvOpGraph(const cudnnHandle_t handle, + const cudnnBackendDescriptorType_t desc, + const cudnn_frontend::Tensor& xdesc, + const cudnn_frontend::Tensor& ydesc, + const cudnn_frontend::Tensor& wdesc, + const cudnn_frontend::ConvDesc& cdesc, float beta); cudnn_frontend::Tensor GetTensorDescriptor(const user_op::Tensor* t, const int64_t id); -cudnn_frontend::ConvDesc GetConvDescriptor(cudnnDataType_t data_type, - const std::vector& padding, - const std::vector& stride, - const std::vector& dilation); +cudnn_frontend::Tensor GetTensorDescriptor(const user_op::TensorDesc& t, const int64_t id); + +cudnn_frontend::ConvDesc GetConvDescriptor(const 
user_op::InferContext& ctx, + cudnnDataType_t data_type); + +cudnn_frontend::ConvDesc GetConvDescriptor(const user_op::KernelComputeContext& ctx, + cudnnDataType_t data_type); std::vector GetGeneratorSources( const cudnnBackendDescriptorType_t desc); @@ -205,14 +218,18 @@ std::vector GetGeneratorSources( void FilterEngineConfigs(cudnn_frontend::EngineConfigList& from, cudnn_frontend::EngineConfigList& to, bool deterministic); -void TryConfigs(const cudnnHandle_t handle, const user_op::Tensor* x, user_op::Tensor* y, - const user_op::Tensor* w, user_op::Tensor* buf, - cudnn_frontend::EngineConfigList& configs, const std::string& tag); +void TryConfigs(const cudnnHandle_t handle, user_op::Tensor* x, user_op::Tensor* y, + user_op::Tensor* w, user_op::Tensor* buf, cudnn_frontend::EngineConfigList& configs, + const std::string& tag); + +size_t GetCudnnConvWorkspaceSizeV8(const cudnnHandle_t handle, + cudnn_frontend::EngineConfigList& configs, + const std::string& tag); bool PlanErrataException(const cudnnHandle_t handle, const std::string& executionPlanTag); -void RunConvPlan(const cudnnHandle_t handle, const user_op::Tensor* x, user_op::Tensor* y, - const user_op::Tensor* w, user_op::Tensor* buf, +void RunConvPlan(const cudnnHandle_t handle, user_op::Tensor* x, user_op::Tensor* y, + user_op::Tensor* w, user_op::Tensor* buf, const cudnn_frontend::ExecutionPlan& plan); template diff --git a/oneflow/user/kernels/conv_cudnn_kernels.cpp b/oneflow/user/kernels/conv_cudnn_kernels.cpp index 15d2ec7e91e..148b3cd518e 100644 --- a/oneflow/user/kernels/conv_cudnn_kernels.cpp +++ b/oneflow/user/kernels/conv_cudnn_kernels.cpp @@ -285,44 +285,34 @@ class ConvGpuKernelV8 final : public user_op::OpKernel, public user_op::CudaGrap void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, const user_op::OpKernelCache* cache) const override { // process context data - const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("out", 0); - const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - user_op::Tensor* buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - std::vector padding; - const auto& padding_before = ctx->Attr>("padding_before"); - copy(padding_before.begin(), padding_before.end(), back_inserter(padding)); - - std::vector stride; - const auto& strides = ctx->Attr>("strides"); - copy(strides.begin(), strides.end(), back_inserter(stride)); + auto input = ctx->Tensor4ArgNameAndIndex("in", 0); + auto output = ctx->Tensor4ArgNameAndIndex("out", 0); + auto weight = ctx->Tensor4ArgNameAndIndex("weight", 0); + auto buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - std::vector dilation; - const auto& dilation_rate = ctx->Attr>("dilation_rate"); - copy(dilation_rate.begin(), dilation_rate.end(), back_inserter(dilation)); + if (input->shape_view().elem_cnt() == 0) return; + CudnnConvArgsV8 args(*ctx, input, output, weight); // process add_to_output - float beta = 0.0f; if (ctx->has_input("_add_to_output", 0)) { - const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); + auto add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); Memcpy( ctx->stream(), output->mut_dptr(), add_to_output->dptr(), add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); - beta = 1.0f; + args.beta = 1.0f; } // trigger conv compute - cudnnHandle_t handle = ctx->stream()->As()->cudnn_handle(); + auto handle = ctx->stream()->As()->cudnn_handle(); 
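// With beta set to 1.0f the backend computes y = conv(x, w) + beta * y, so the
// Memcpy above seeds y with _add_to_output and the addition is fused into the
// convolution launched below.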
RunSingleConv(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, input, output, - weight, buffer, padding, stride, dilation, beta); + weight, buffer, args); // process bias - const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); + auto bias = ctx->Tensor4ArgNameAndIndex("bias", 0); if (bias != nullptr) { - const auto* conv_cache = dynamic_cast(cache); + auto conv_cache = dynamic_cast(cache); CHECK_NOTNULL(conv_cache); - const std::string& data_format = ctx->Attr("data_format"); + const auto& data_format = ctx->Attr("data_format"); CudnnTensorDesc output_desc(output->data_type(), output->shape_view(), data_format); OF_CUDNN_CHECK(cudnnAddTensor(ctx->stream()->As()->cudnn_handle(), CudnnSPOnePtr(input->data_type()), conv_cache->bias_desc->Get(), @@ -340,12 +330,22 @@ class ConvGpuKernelV8 final : public user_op::OpKernel, public user_op::CudaGrap } }; -#define REGISTER_CONV_KERNEL_V8(op_name, ndims) \ - REGISTER_USER_KERNEL(#op_name) \ - .SetCreateFn>() \ - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA \ - && user_op::HobEnvBool("ONEFLOW_KERNEL_ENABLE_CUDNN_V8", false)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 128 * 1024 * 1024; }) \ +#define REGISTER_CONV_KERNEL_V8(op_name, ndims) \ + REGISTER_USER_KERNEL(#op_name) \ + .SetCreateFn>() \ + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA \ + && user_op::HobEnvBool("ONEFLOW_KERNEL_ENABLE_CUDNN_V8", false)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ + auto& input = ctx->InputTensorDesc("in", 0); \ + auto& output = ctx->InputTensorDesc("out", 0); \ + auto& weight = ctx->InputTensorDesc("weight", 0); \ + CudnnConvArgsV8 args(*ctx, input, output, weight); \ + auto handle = Singleton::Get()->Get(); \ + std::string tag; \ + auto configs = GetConfigs(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, \ + args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag); \ + return GetCudnnConvWorkspaceSizeV8(handle, configs, tag); \ + }) \ .SetPriority(user_op::kKernelPriorityOptimized); REGISTER_CONV_KERNEL_V8(conv1d, 1); @@ -425,6 +425,72 @@ REGISTER_USER_KERNEL("conv_data_grad") return Maybe::Ok(); }); +class ConvDataGradGpuKernelV8 final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + OF_DISALLOW_COPY_AND_MOVE(ConvDataGradGpuKernelV8); + ConvDataGradGpuKernelV8() = default; + ~ConvDataGradGpuKernelV8() = default; + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + private: + void Compute(user_op::KernelComputeContext* ctx) const override { + auto input_diff = ctx->Tensor4ArgNameAndIndex("dx", 0); + auto output_diff = ctx->Tensor4ArgNameAndIndex("dy", 0); + auto weight = ctx->Tensor4ArgNameAndIndex("filter", 0); + auto buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + if (output_diff->shape_view().elem_cnt() == 0) return; + + CudnnConvArgsV8 args(*ctx, input_diff, output_diff, weight); + // process add_to_output + if (ctx->has_input("_add_to_output", 0)) { + auto add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); + Memcpy( + ctx->stream(), input_diff->mut_dptr(), add_to_output->dptr(), + add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); + args.beta = 1.0f; + } + + // trigger conv compute + auto handle = ctx->stream()->As()->cudnn_handle(); + RunSingleConv(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, input_diff, + output_diff, weight, buffer, args); + } + + bool 
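// Note the argument roles here: the UIDs 'x'/'y'/'w' keep their forward-pass
// meaning, so for backward-data the tensor bound to 'x' is dx (the tensor that
// gets written) while 'y' carries dy. This is also why the v8 helper signatures
// dropped const above: RunConvPlan binds all three pointers via mut_dptr(), since
// which tensor is the output depends on the backend descriptor type.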
IsCudaGraphSupported(user_op::KernelInitContext* ctx, + user_op::OpKernelState* state) const override { + return Singleton::Get() + ->resource() + .cudnn_conf() + .cudnn_conv_heuristic_search_algo(); + } +}; + +REGISTER_USER_KERNEL("conv_data_grad") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA + && user_op::HobEnvBool("ONEFLOW_KERNEL_ENABLE_CUDNN_V8", false)) + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { + auto& input_diff = ctx->InputTensorDesc("dx", 0); + auto& output_diff = ctx->InputTensorDesc("dy", 0); + auto& weight = ctx->InputTensorDesc("filter", 0); + CudnnConvArgsV8 args(*ctx, input_diff, output_diff, weight); + auto handle = Singleton::Get()->Get(); + std::string tag; + auto configs = GetConfigs(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, + args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag); + return GetCudnnConvWorkspaceSizeV8(handle, configs, tag); + }) + .SetInplaceProposalFn([](const user_op::InferContext& ctx, + const user_op::AddInplaceArgPair& AddInplaceArgPairFn) -> Maybe { + if (ctx.has_input("_add_to_output", 0)) { + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("dx", 0, "_add_to_output", 0, true)); + } + return Maybe::Ok(); + }) + .SetPriority(user_op::kKernelPriorityOptimized); + class ConvFilterGradGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { public: OF_DISALLOW_COPY_AND_MOVE(ConvFilterGradGpuKernel); @@ -484,6 +550,62 @@ REGISTER_USER_KERNEL("conv_filter_grad") cudnn_conf.cudnn_conv_force_bwd_filter_algo()); }); +class ConvFilterGradGpuKernelV8 final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + OF_DISALLOW_COPY_AND_MOVE(ConvFilterGradGpuKernelV8); + ConvFilterGradGpuKernelV8() = default; + ~ConvFilterGradGpuKernelV8() = default; + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + private: + void Compute(user_op::KernelComputeContext* ctx) const override { + auto input = ctx->Tensor4ArgNameAndIndex("x", 0); + auto output_diff = ctx->Tensor4ArgNameAndIndex("dy", 0); + auto weight_diff = ctx->Tensor4ArgNameAndIndex("filter_diff", 0); + auto buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + if (input->shape_view().elem_cnt() == 0) { + Memset( + ctx->stream(), weight_diff->mut_dptr(), 0, + weight_diff->shape_view().elem_cnt() * GetSizeOfDataType(weight_diff->data_type())); + return; + } + + CudnnConvArgsV8 args(*ctx, input, output_diff, weight_diff); + + // trigger conv compute + auto handle = ctx->stream()->As()->cudnn_handle(); + RunSingleConv(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, input, + output_diff, weight_diff, buffer, args); + } + + bool IsCudaGraphSupported(user_op::KernelInitContext* ctx, + user_op::OpKernelState* state) const override { + return Singleton::Get() + ->resource() + .cudnn_conf() + .cudnn_conv_heuristic_search_algo(); + } +}; + +REGISTER_USER_KERNEL("conv_filter_grad") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA + && user_op::HobEnvBool("ONEFLOW_KERNEL_ENABLE_CUDNN_V8", false)) + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { + auto& input = ctx->InputTensorDesc("x", 0); + auto& output_diff = ctx->InputTensorDesc("dy", 0); + auto& weight_diff = ctx->InputTensorDesc("filter_diff", 0); + CudnnConvArgsV8 args(*ctx, input, output_diff, weight_diff); + auto handle = Singleton::Get()->Get(); + std::string tag; + auto configs = GetConfigs(handle, 
CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, + args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag); + return GetCudnnConvWorkspaceSizeV8(handle, configs, tag); + }) + .SetPriority(user_op::kKernelPriorityOptimized); + struct ConvBiasGradState final : public user_op::OpKernelState { std::unique_ptr bias_diff_desc; }; From e19d08a32e59ce20d12a4e7035f9ec64437e7b15 Mon Sep 17 00:00:00 2001 From: linzs148 <1483100349@qq.com> Date: Thu, 28 Mar 2024 08:25:18 +0000 Subject: [PATCH 05/12] transform output type to fp32 for input type fp16 --- oneflow/core/device/cudnn_conv_util.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/oneflow/core/device/cudnn_conv_util.cpp b/oneflow/core/device/cudnn_conv_util.cpp index ce5d90b56a3..834ae2afe16 100644 --- a/oneflow/core/device/cudnn_conv_util.cpp +++ b/oneflow/core/device/cudnn_conv_util.cpp @@ -513,6 +513,10 @@ cudnn_frontend::Tensor GetTensorDescriptor(const user_op::TensorDesc& t, const i cudnn_frontend::ConvDesc GetConvDescriptor(const user_op::InferContext& ctx, cudnnDataType_t data_type) { + if (data_type == CUDNN_DATA_HALF || data_type == CUDNN_DATA_BFLOAT16) { + data_type = CUDNN_DATA_FLOAT; + } + std::vector padding; const auto& padding_before = ctx.Attr>("padding_before"); copy(padding_before.begin(), padding_before.end(), back_inserter(padding)); @@ -539,6 +543,10 @@ cudnn_frontend::ConvDesc GetConvDescriptor(const user_op::InferContext& ctx, cudnn_frontend::ConvDesc GetConvDescriptor(const user_op::KernelComputeContext& ctx, cudnnDataType_t data_type) { + if (data_type == CUDNN_DATA_HALF || data_type == CUDNN_DATA_BFLOAT16) { + data_type = CUDNN_DATA_FLOAT; + } + std::vector padding; const auto& padding_before = ctx.Attr>("padding_before"); copy(padding_before.begin(), padding_before.end(), back_inserter(padding)); From cd38272c5811306abce8a443a3ec183e2ae8aa3c Mon Sep 17 00:00:00 2001 From: linzs148 <1483100349@qq.com> Date: Wed, 10 Apr 2024 11:15:48 +0000 Subject: [PATCH 06/12] refine workspace assign for conv v8 --- oneflow/core/device/cudnn_conv_util.cpp | 22 +++++++++++++++----- oneflow/user/kernels/conv_cudnn_kernels.cpp | 23 ++++++++++++++------- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/oneflow/core/device/cudnn_conv_util.cpp b/oneflow/core/device/cudnn_conv_util.cpp index 834ae2afe16..69aabc75c77 100644 --- a/oneflow/core/device/cudnn_conv_util.cpp +++ b/oneflow/core/device/cudnn_conv_util.cpp @@ -84,7 +84,7 @@ perf_t GetBestAlgorithm(const CudnnConvArgs& args, CudnnConvResource* res, FOR_RANGE(size_t, i, 0, perf_vec.size()) { // Note: Shouldn't all returned results be successful? 
CHECK_EQ(perf_vec[i].status, CUDNN_STATUS_SUCCESS); - if (perf_vec[i].memory > args.params.max_ws_size) { continue; } + // if (perf_vec[i].memory > args.params.max_ws_size) { continue; } if (args.deterministic && perf_vec[i].determinism == CUDNN_NON_DETERMINISTIC) { continue; } found_algo_idx = i; break; @@ -596,7 +596,20 @@ std::vector GetGeneratorSources( FilterEngineConfigs(engine_configs, filtered_configs, deterministic); return filtered_configs; }; - std::vector sources = {heurgen_method}; + // Method for engine config generator based on fallback list + const auto fallback_method = + [desc, + deterministic](cudnn_frontend::OperationGraph& opGraph) -> cudnn_frontend::EngineConfigList { + auto fallback = cudnn_frontend::EngineFallbackListBuilder() + .setOperationGraph(opGraph) + .setOperation(desc) + .build(); + auto& fallback_list = fallback.getFallbackList(); + cudnn_frontend::EngineConfigList filtered_configs; + FilterEngineConfigs(fallback_list, filtered_configs, deterministic); + return filtered_configs; + }; + std::vector sources = {heurgen_method, fallback_method}; return sources; } @@ -635,7 +648,6 @@ void TryConfigs(const cudnnHandle_t handle, user_op::Tensor* x, user_op::Tensor* size_t GetCudnnConvWorkspaceSizeV8(const cudnnHandle_t handle, cudnn_frontend::EngineConfigList& configs, const std::string& tag) { - size_t workspace_size = 0; for (auto& config : configs) { try { auto plan = cudnn_frontend::ExecutionPlanBuilder() @@ -643,10 +655,10 @@ size_t GetCudnnConvWorkspaceSizeV8(const cudnnHandle_t handle, .setEngineConfig(config, tag) .build(); if (PlanErrataException(handle, plan.getTag())) { continue; } - if (plan.getWorkspaceSize() > workspace_size) { workspace_size = plan.getWorkspaceSize(); } + if (plan.getWorkspaceSize() > 0L) { return plan.getWorkspaceSize(); } } catch (cudnn_frontend::cudnnException& e) {} } - return workspace_size; + return 1L; } bool PlanErrataException(const cudnnHandle_t handle, const std::string& executionPlanTag) { diff --git a/oneflow/user/kernels/conv_cudnn_kernels.cpp b/oneflow/user/kernels/conv_cudnn_kernels.cpp index 148b3cd518e..db116a103a5 100644 --- a/oneflow/user/kernels/conv_cudnn_kernels.cpp +++ b/oneflow/user/kernels/conv_cudnn_kernels.cpp @@ -103,6 +103,7 @@ size_t InferTmpSizeWithCudnn(const user_op::TensorDesc* x, const user_op::Tensor CHECK_EQ(algo_perf.status, CUDNN_STATUS_SUCCESS) << "op (" << ctx.op_name() << ") find algorithm perference failed. 
algo: " << algo_perf.algo; + // TODO workspace size limit will lead to dismatch result with pytorch for large tensor CHECK_LE(algo_perf.memory, workspace_size) << "op (" << ctx.op_name() << ") find algorithm " << algo_perf.algo << ", need memory " << algo_perf.memory << ", but cudnn_buf_limit_byte is " << workspace_size; @@ -344,7 +345,9 @@ class ConvGpuKernelV8 final : public user_op::OpKernel, public user_op::CudaGrap std::string tag; \ auto configs = GetConfigs(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, \ args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag); \ - return GetCudnnConvWorkspaceSizeV8(handle, configs, tag); \ + size_t workspace_size = GetCudnnConvWorkspaceSizeV8(handle, configs, tag); \ + Singleton::Get()->Put(handle); \ + return workspace_size; \ }) \ .SetPriority(user_op::kKernelPriorityOptimized); @@ -478,9 +481,12 @@ REGISTER_USER_KERNEL("conv_data_grad") CudnnConvArgsV8 args(*ctx, input_diff, output_diff, weight); auto handle = Singleton::Get()->Get(); std::string tag; - auto configs = GetConfigs(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, - args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag); - return GetCudnnConvWorkspaceSizeV8(handle, configs, tag); + auto configs = + GetConfigs(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, + args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag); + size_t workspace_size = GetCudnnConvWorkspaceSizeV8(handle, configs, tag); + Singleton::Get()->Put(handle); + return workspace_size; }) .SetInplaceProposalFn([](const user_op::InferContext& ctx, const user_op::AddInplaceArgPair& AddInplaceArgPairFn) -> Maybe { @@ -600,9 +606,12 @@ REGISTER_USER_KERNEL("conv_filter_grad") CudnnConvArgsV8 args(*ctx, input, output_diff, weight_diff); auto handle = Singleton::Get()->Get(); std::string tag; - auto configs = GetConfigs(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, - args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag); - return GetCudnnConvWorkspaceSizeV8(handle, configs, tag); + auto configs = + GetConfigs(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, + args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag); + size_t workspace_size = GetCudnnConvWorkspaceSizeV8(handle, configs, tag); + Singleton::Get()->Put(handle); + return workspace_size; }) .SetPriority(user_op::kKernelPriorityOptimized); From 19f46f290854de92cdd7e472afeb35334344a9a8 Mon Sep 17 00:00:00 2001 From: linzs148 <1483100349@qq.com> Date: Wed, 10 Apr 2024 11:18:26 +0000 Subject: [PATCH 07/12] fix comment --- oneflow/core/device/cudnn_conv_util.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/oneflow/core/device/cudnn_conv_util.cpp b/oneflow/core/device/cudnn_conv_util.cpp index 69aabc75c77..69d8b308455 100644 --- a/oneflow/core/device/cudnn_conv_util.cpp +++ b/oneflow/core/device/cudnn_conv_util.cpp @@ -84,7 +84,8 @@ perf_t GetBestAlgorithm(const CudnnConvArgs& args, CudnnConvResource* res, FOR_RANGE(size_t, i, 0, perf_vec.size()) { // Note: Shouldn't all returned results be successful? 
CHECK_EQ(perf_vec[i].status, CUDNN_STATUS_SUCCESS); - // if (perf_vec[i].memory > args.params.max_ws_size) { continue; } + // TODO: the workspace size limit can lead to mismatched results with PyTorch for large tensors + if (perf_vec[i].memory > args.params.max_ws_size) { continue; } if (args.deterministic && perf_vec[i].determinism == CUDNN_NON_DETERMINISTIC) { continue; } found_algo_idx = i; break; From 142295401b65f635fe85b3403e84cf11e5598d67 Mon Sep 17 00:00:00 2001 From: linzs148 <1483100349@qq.com> Date: Wed, 10 Apr 2024 11:31:31 +0000 Subject: [PATCH 08/12] change heuristic value --- oneflow/core/device/cudnn_conv_util.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/oneflow/core/device/cudnn_conv_util.cpp b/oneflow/core/device/cudnn_conv_util.cpp index 69d8b308455..fbdcebf2ec8 100644 --- a/oneflow/core/device/cudnn_conv_util.cpp +++ b/oneflow/core/device/cudnn_conv_util.cpp @@ -578,11 +578,7 @@ std::vector GetGeneratorSources( ->resource() .cudnn_conf() .cudnn_conv_use_deterministic_algo_only(); - bool heuristic = Singleton::Get() - ->resource() - .cudnn_conf() - .cudnn_conv_heuristic_search_algo() - || (!LazyMode::is_enabled()); + bool heuristic = ParseBooleanFromEnv("ONEFLOW_CUDNN_USE_HEURISTIC_MODE_B", false); auto heur_mode = heuristic ? CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_A; // Method for engine config generator based on heuristics const auto heurgen_method = From ddeb9070d1a15d372fad6a61e97cfac1cd56cad7 Mon Sep 17 00:00:00 2001 From: linzs148 <1483100349@qq.com> Date: Wed, 24 Apr 2024 06:55:35 +0000 Subject: [PATCH 09/12] move cmake files to external --- cmake/third_party.cmake | 3 --- cmake/third_party/cudnn-frontend.cmake | 29 -------------------------- external/CMakeLists.txt | 10 +++++++++ external/cudnn-frontend/CMakeLists.txt | 7 +++++++ 4 files changed, 17 insertions(+), 32 deletions(-) delete mode 100644 cmake/third_party/cudnn-frontend.cmake create mode 100644 external/cudnn-frontend/CMakeLists.txt diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 99cb9577894..c7ac2893e6e 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -145,7 +145,6 @@ if(BUILD_CUDA) include(nccl) include(cutlass) include(trt_flash_attention) - include(cudnn-frontend) list(APPEND oneflow_third_party_libs ${NCCL_LIBRARIES}) list(APPEND oneflow_third_party_libs ${CUDNN_LIBRARIES}) @@ -165,8 +164,6 @@ if(BUILD_CUDA) list(APPEND oneflow_third_party_dependencies trt_flash_attention) list(APPEND oneflow_third_party_libs ${TRT_FLASH_ATTENTION_LIBRARIES}) list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${TRT_FLASH_ATTENTION_INCLUDE_DIR}) - list(APPEND oneflow_third_party_dependencies cudnn_frontend_copy_headers_to_destination) - list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${CUDNN_FRONTEND_INCLUDE_DIR}) endif() if(BUILD_RDMA) diff --git a/cmake/third_party/cudnn-frontend.cmake b/cmake/third_party/cudnn-frontend.cmake deleted file mode 100644 index e43a5196125..00000000000 --- a/cmake/third_party/cudnn-frontend.cmake +++ /dev/null @@ -1,29 +0,0 @@ -include(ExternalProject) - -set(CUDNN_FRONTEND_URL https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.1.2.zip) -set(CUDNN_FRONTEND_MD5 7e16cc2dcaddefa7fd0f3d82b9cf5d73) -use_mirror(VARIABLE CUDNN_FRONTEND_URL URL ${CUDNN_FRONTEND_URL}) - -set(CUDNN_FRONTEND_INCLUDE_DIR ${THIRD_PARTY_DIR}/cudnn-frontend/include) -set(CUDNN_FRONTEND_BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cudnn-frontend/src/cudnn-frontend) - -if(THIRD_PARTY) - ExternalProject_Add( - cudnn-frontend - PREFIX cudnn-frontend - URL
${CUDNN_FRONTEND_URL} - URL_MD5 ${CUDNN_FRONTEND_MD5} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "") - - add_copy_headers_target( - NAME - cudnn_frontend - SRC - ${CUDNN_FRONTEND_BASE_DIR}/include/ - DST - ${CUDNN_FRONTEND_INCLUDE_DIR} - DEPS - cudnn-frontend) -endif(THIRD_PARTY) diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt index 495f6754b7e..a72548e5b58 100644 --- a/external/CMakeLists.txt +++ b/external/CMakeLists.txt @@ -17,6 +17,11 @@ set(KINETO_URL use_mirror(VARIABLE KINETO_URL URL ${KINETO_URL}) set(KINETO_MD5 f9b550591b3899fb267270c19484933f) +set(CUDNN_FRONTEND_URL + https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.1.2.zip) +use_mirror(VARIABLE CUDNN_FRONTEND_URL URL ${CUDNN_FRONTEND_URL}) +set(CUDNN_FRONTEND_MD5 7e16cc2dcaddefa7fd0f3d82b9cf5d73) + set(EXTERNAL_TARGETS) if(WITH_TBB) # set(WITH_${threading_runtime_item} ON) in threading.cmake @@ -33,6 +38,11 @@ list(APPEND EXTERNAL_TARGETS fmt) add_subdirectory(kineto) list(APPEND EXTERNAL_TARGETS kineto) +if(BUILD_CUDA) + add_subdirectory(cudnn-frontend) + list(APPEND EXTERNAL_TARGETS cudnn_frontend) +endif() + mark_targets_as_system(${EXTERNAL_TARGETS}) set_property(GLOBAL PROPERTY EXTERNAL_TARGETS ${EXTERNAL_TARGETS}) diff --git a/external/cudnn-frontend/CMakeLists.txt b/external/cudnn-frontend/CMakeLists.txt new file mode 100644 index 00000000000..350d9e8e14e --- /dev/null +++ b/external/cudnn-frontend/CMakeLists.txt @@ -0,0 +1,7 @@ +include(FetchContent) +FetchContent_Declare( + cudnn-frontend + URL ${CUDNN_FRONTEND_URL} + URL_HASH MD5=${CUDNN_FRONTEND_MD5} +) +FetchContent_MakeAvailable(cudnn-frontend) From 18625d9a1ca9df07bdcb9d618478800837828b7c Mon Sep 17 00:00:00 2001 From: linzs148 <1483100349@qq.com> Date: Tue, 30 Apr 2024 09:06:24 +0000 Subject: [PATCH 10/12] fix cmake bug --- external/CMakeLists.txt | 6 +++--- external/{cudnn-frontend => cudnn_frontend}/CMakeLists.txt | 5 +++-- oneflow/core/device/cudnn_conv_util.h | 3 +++ oneflow/core/device/cudnn_util.h | 3 --- 4 files changed, 9 insertions(+), 8 deletions(-) rename external/{cudnn-frontend => cudnn_frontend}/CMakeLists.txt (54%) diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt index a72548e5b58..b112432f95e 100644 --- a/external/CMakeLists.txt +++ b/external/CMakeLists.txt @@ -18,9 +18,9 @@ use_mirror(VARIABLE KINETO_URL URL ${KINETO_URL}) set(KINETO_MD5 f9b550591b3899fb267270c19484933f) set(CUDNN_FRONTEND_URL - https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.1.2.zip) + https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v0.9.1.zip) use_mirror(VARIABLE CUDNN_FRONTEND_URL URL ${CUDNN_FRONTEND_URL}) -set(CUDNN_FRONTEND_MD5 7e16cc2dcaddefa7fd0f3d82b9cf5d73) +set(CUDNN_FRONTEND_MD5 0d28ff6aaa984dac4f7d16acfc48de72) set(EXTERNAL_TARGETS) @@ -39,7 +39,7 @@ add_subdirectory(kineto) list(APPEND EXTERNAL_TARGETS kineto) if(BUILD_CUDA) - add_subdirectory(cudnn-frontend) + add_subdirectory(cudnn_frontend) list(APPEND EXTERNAL_TARGETS cudnn_frontend) endif() diff --git a/external/cudnn-frontend/CMakeLists.txt b/external/cudnn_frontend/CMakeLists.txt similarity index 54% rename from external/cudnn-frontend/CMakeLists.txt rename to external/cudnn_frontend/CMakeLists.txt index 350d9e8e14e..eff9a226b03 100644 --- a/external/cudnn-frontend/CMakeLists.txt +++ b/external/cudnn_frontend/CMakeLists.txt @@ -1,7 +1,8 @@ include(FetchContent) FetchContent_Declare( - cudnn-frontend + cudnn_frontend URL ${CUDNN_FRONTEND_URL} URL_HASH MD5=${CUDNN_FRONTEND_MD5} ) -FetchContent_MakeAvailable(cudnn-frontend) 
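# Note on the ordering below: CUDNN_FRONTEND_BUILD_SAMPLES is an option() of the
# fetched project, so the plain set() has to run before FetchContent_MakeAvailable()
# for the OFF value to take effect (CMake policy CMP0077).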
+set(CUDNN_FRONTEND_BUILD_SAMPLES OFF)
+FetchContent_MakeAvailable(cudnn_frontend)
diff --git a/oneflow/core/device/cudnn_conv_util.h b/oneflow/core/device/cudnn_conv_util.h
index c5b9b642721..b804f76b884 100644
--- a/oneflow/core/device/cudnn_conv_util.h
+++ b/oneflow/core/device/cudnn_conv_util.h
@@ -17,6 +17,9 @@ limitations under the License.
 #define ONEFLOW_CORE_DEVICE_CUDNN_CONV_UTIL_H_
 
 #ifdef WITH_CUDA
+
+#include "cudnn_frontend.h"
+#include "cudnn_frontend_EngineConfigGenerator.h"
 #include "oneflow/core/common/tensor_desc.h"
 #include "oneflow/core/device/cudnn_util.h"
 #include "oneflow/core/common/protobuf.h"
diff --git a/oneflow/core/device/cudnn_util.h b/oneflow/core/device/cudnn_util.h
index e05c5726fcb..92c3d50e950 100644
--- a/oneflow/core/device/cudnn_util.h
+++ b/oneflow/core/device/cudnn_util.h
@@ -22,9 +22,6 @@ limitations under the License.
 #ifdef WITH_CUDA
 
 #include "cudnn.h"
-#include "cudnn_frontend.h"
-#include "cudnn_frontend_find_plan.h"
-#include "cudnn_frontend_get_plan.h"
 
 namespace oneflow {
 

From a8fd7a3b6cbc658c60e25714d8ff10c8f7450d74 Mon Sep 17 00:00:00 2001
From: linzs148 <1483100349@qq.com>
Date: Tue, 30 Apr 2024 11:27:02 +0000
Subject: [PATCH 11/12] install cudnn_frontend

---
 external/cudnn_frontend/CMakeLists.txt | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/external/cudnn_frontend/CMakeLists.txt b/external/cudnn_frontend/CMakeLists.txt
index eff9a226b03..7375f373f57 100644
--- a/external/cudnn_frontend/CMakeLists.txt
+++ b/external/cudnn_frontend/CMakeLists.txt
@@ -6,3 +6,11 @@ FetchContent_Declare(
 )
 set(CUDNN_FRONTEND_BUILD_SAMPLES OFF)
 FetchContent_MakeAvailable(cudnn_frontend)
+
+set(CUDNN_FRONTEND_INSTALL_DIR ${THIRD_PARTY_DIR}/cudnn_frontend)
+install(
+  TARGETS cudnn_frontend
+  EXPORT oneflow
+  LIBRARY DESTINATION ${CUDNN_FRONTEND_INSTALL_DIR}/lib
+  ARCHIVE DESTINATION ${CUDNN_FRONTEND_INSTALL_DIR}/lib
+)

From bea5080a9e2fc0cfb7fbae130e8ce5b0d25d02ac Mon Sep 17 00:00:00 2001
From: linzs148 <1483100349@qq.com>
Date: Tue, 30 Apr 2024 14:55:40 +0000
Subject: [PATCH 12/12] refine functions

---
 oneflow/core/device/cudnn_conv_util.cpp     | 288 ++++++++++----------
 oneflow/core/device/cudnn_conv_util.h       |  55 +---
 oneflow/user/kernels/conv_cudnn_kernels.cpp |  61 +++--
 3 files changed, 186 insertions(+), 218 deletions(-)

diff --git a/oneflow/core/device/cudnn_conv_util.cpp b/oneflow/core/device/cudnn_conv_util.cpp
index fbdcebf2ec8..39f8e7e83a4 100644
--- a/oneflow/core/device/cudnn_conv_util.cpp
+++ b/oneflow/core/device/cudnn_conv_util.cpp
@@ -335,6 +335,90 @@ CudnnConvArgs::CudnnConvArgs(const user_op::KernelComputeContext& ctx, DataType
   params.max_ws_size = max_workspace_size;
 }
 
+cudnn_frontend::Tensor GetTensorDescriptor(const user_op::Tensor* t, const int64_t id) {
+  auto dim = t->shape_view();
+  auto stride = t->stride();
+  return cudnn_frontend::TensorBuilder()
+      .setDim(dim.size(), dim.data())
+      .setStride(stride.size(), stride.data())
+      .setId(id)
+      .setAlignment(32)
+      .setDataType(GetCudnnDataType(t->data_type()))
+      .build();
+}
+
+cudnn_frontend::Tensor GetTensorDescriptor(const user_op::TensorDesc& t, const int64_t id) {
+  auto dim = t.shape();
+  auto stride = t.stride();
+  return cudnn_frontend::TensorBuilder()
+      .setDim(dim.size(), dim.data())
+      .setStride(stride.size(), stride.data())
+      .setId(id)
+      .setAlignment(32)
+      .setDataType(GetCudnnDataType(t.data_type()))
+      .build();
+}
+
+cudnn_frontend::ConvDesc GetConvDescriptor(const user_op::InferContext& ctx,
+                                           cudnnDataType_t data_type) {
+  if (data_type == CUDNN_DATA_HALF || data_type == CUDNN_DATA_BFLOAT16) {
+    data_type = CUDNN_DATA_FLOAT;
+  }
+
+  std::vector<int64_t> padding;
+  const auto& padding_before = ctx.Attr<std::vector<int32_t>>("padding_before");
+  copy(padding_before.begin(), padding_before.end(), back_inserter(padding));
+
+  std::vector<int64_t> stride;
+  const auto& strides = ctx.Attr<std::vector<int32_t>>("strides");
+  copy(strides.begin(), strides.end(), back_inserter(stride));
+
+  std::vector<int64_t> dilation;
+  const auto& dilation_rate = ctx.Attr<std::vector<int32_t>>("dilation_rate");
+  copy(dilation_rate.begin(), dilation_rate.end(), back_inserter(dilation));
+
+  uint64_t ndim = stride.size();
+  return cudnn_frontend::ConvDescBuilder()
+      .setDataType(data_type)
+      .setMathMode(CUDNN_CROSS_CORRELATION)
+      .setNDims(ndim)
+      .setStrides(ndim, stride.data())
+      .setPrePadding(ndim, padding.data())
+      .setPostPadding(ndim, padding.data())
+      .setDilation(ndim, dilation.data())
+      .build();
+}
+
+cudnn_frontend::ConvDesc GetConvDescriptor(const user_op::KernelComputeContext& ctx,
+                                           cudnnDataType_t data_type) {
+  if (data_type == CUDNN_DATA_HALF || data_type == CUDNN_DATA_BFLOAT16) {
+    data_type = CUDNN_DATA_FLOAT;
+  }
+
+  std::vector<int64_t> padding;
+  const auto& padding_before = ctx.Attr<std::vector<int32_t>>("padding_before");
+  copy(padding_before.begin(), padding_before.end(), back_inserter(padding));
+
+  std::vector<int64_t> stride;
+  const auto& strides = ctx.Attr<std::vector<int32_t>>("strides");
+  copy(strides.begin(), strides.end(), back_inserter(stride));
+
+  std::vector<int64_t> dilation;
+  const auto& dilation_rate = ctx.Attr<std::vector<int32_t>>("dilation_rate");
+  copy(dilation_rate.begin(), dilation_rate.end(), back_inserter(dilation));
+
+  uint64_t ndim = stride.size();
+  return cudnn_frontend::ConvDescBuilder()
+      .setDataType(data_type)
+      .setMathMode(CUDNN_CROSS_CORRELATION)
+      .setNDims(ndim)
+      .setStrides(ndim, stride.data())
+      .setPrePadding(ndim, padding.data())
+      .setPostPadding(ndim, padding.data())
+      .setDilation(ndim, dilation.data())
+      .build();
+}
+
 CudnnConvArgsV8::CudnnConvArgsV8(const user_op::InferContext& ctx, const user_op::TensorDesc& x,
                                  const user_op::TensorDesc& y, const user_op::TensorDesc& w)
     : xdesc(GetTensorDescriptor(x, 'x')),
@@ -443,30 +527,6 @@ cudnnStatus_t GetCudnnConvWorkspaceSize(const CudnnConvArgs& args, CudnnConvReso
                                         args.wdesc.Get(), algo, sz);
 }
 
-void RunSingleConv(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc,
-                   user_op::Tensor* x, user_op::Tensor* y, user_op::Tensor* w, user_op::Tensor* b,
-                   const CudnnConvArgsV8& args) {
-  std::string tag;
-  auto configs =
-      GetConfigs(handle, desc, args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag);
-  TryConfigs(handle, x, y, w, b, configs, tag);
-}
-
-cudnn_frontend::EngineConfigList GetConfigs(const cudnnHandle_t handle,
-                                            const cudnnBackendDescriptorType_t desc,
-                                            const cudnn_frontend::Tensor& xdesc,
-                                            const cudnn_frontend::Tensor& ydesc,
-                                            const cudnn_frontend::Tensor& wdesc,
-                                            const cudnn_frontend::ConvDesc& cdesc, float beta,
-                                            std::string& tag) {
-  auto op_graph = BuildConvOpGraph(handle, desc, xdesc, ydesc, wdesc, cdesc, beta);
-  tag = op_graph.getTag();
-  auto sources = GetGeneratorSources(desc);
-  cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data());
-  auto configs = generator.generate_engine_config(op_graph);
-  return configs;
-}
-
 cudnn_frontend::OperationGraph BuildConvOpGraph(const cudnnHandle_t handle,
                                                 const cudnnBackendDescriptorType_t desc,
                                                 const cudnn_frontend::Tensor& xdesc,
@@ -488,88 +548,20 @@ cudnn_frontend::OperationGraph BuildConvOpGraph(const cudnnHandle_t handle,
   return op_graph;
 }
 
-cudnn_frontend::Tensor GetTensorDescriptor(const user_op::Tensor* t, const int64_t id) {
-  auto dim = t->shape_view();
-  auto stride = t->stride();
-  return cudnn_frontend::TensorBuilder()
-      .setDim(dim.size(), dim.data())
-      .setStride(stride.size(), stride.data())
-      .setId(id)
-      .setAlignment(32)
-      .setDataType(GetCudnnDataType(t->data_type()))
-      .build();
-}
-
-cudnn_frontend::Tensor GetTensorDescriptor(const user_op::TensorDesc& t, const int64_t id) {
-  auto dim = t.shape();
-  auto stride = t.stride();
-  return cudnn_frontend::TensorBuilder()
-      .setDim(dim.size(), dim.data())
-      .setStride(stride.size(), stride.data())
-      .setId(id)
-      .setAlignment(32)
-      .setDataType(GetCudnnDataType(t.data_type()))
-      .build();
-}
-
-cudnn_frontend::ConvDesc GetConvDescriptor(const user_op::InferContext& ctx,
-                                           cudnnDataType_t data_type) {
-  if (data_type == CUDNN_DATA_HALF || data_type == CUDNN_DATA_BFLOAT16) {
-    data_type = CUDNN_DATA_FLOAT;
-  }
-
-  std::vector<int64_t> padding;
-  const auto& padding_before = ctx.Attr<std::vector<int32_t>>("padding_before");
-  copy(padding_before.begin(), padding_before.end(), back_inserter(padding));
-
-  std::vector<int64_t> stride;
-  const auto& strides = ctx.Attr<std::vector<int32_t>>("strides");
-  copy(strides.begin(), strides.end(), back_inserter(stride));
-
-  std::vector<int64_t> dilation;
-  const auto& dilation_rate = ctx.Attr<std::vector<int32_t>>("dilation_rate");
-  copy(dilation_rate.begin(), dilation_rate.end(), back_inserter(dilation));
-
-  uint64_t ndim = stride.size();
-  return cudnn_frontend::ConvDescBuilder()
-      .setDataType(data_type)
-      .setMathMode(CUDNN_CROSS_CORRELATION)
-      .setNDims(ndim)
-      .setStrides(ndim, stride.data())
-      .setPrePadding(ndim, padding.data())
-      .setPostPadding(ndim, padding.data())
-      .setDilation(ndim, dilation.data())
-      .build();
-}
-
-cudnn_frontend::ConvDesc GetConvDescriptor(const user_op::KernelComputeContext& ctx,
-                                           cudnnDataType_t data_type) {
-  if (data_type == CUDNN_DATA_HALF || data_type == CUDNN_DATA_BFLOAT16) {
-    data_type = CUDNN_DATA_FLOAT;
-  }
-
-  std::vector<int64_t> padding;
-  const auto& padding_before = ctx.Attr<std::vector<int32_t>>("padding_before");
-  copy(padding_before.begin(), padding_before.end(), back_inserter(padding));
-
-  std::vector<int64_t> stride;
-  const auto& strides = ctx.Attr<std::vector<int32_t>>("strides");
-  copy(strides.begin(), strides.end(), back_inserter(stride));
-
-  std::vector<int64_t> dilation;
-  const auto& dilation_rate = ctx.Attr<std::vector<int32_t>>("dilation_rate");
-  copy(dilation_rate.begin(), dilation_rate.end(), back_inserter(dilation));
-
-  uint64_t ndim = stride.size();
-  return cudnn_frontend::ConvDescBuilder()
-      .setDataType(data_type)
-      .setMathMode(CUDNN_CROSS_CORRELATION)
-      .setNDims(ndim)
-      .setStrides(ndim, stride.data())
-      .setPrePadding(ndim, padding.data())
-      .setPostPadding(ndim, padding.data())
-      .setDilation(ndim, dilation.data())
-      .build();
+void FilterEngineConfigs(cudnn_frontend::EngineConfigList& from,
+                         cudnn_frontend::EngineConfigList& to, bool deterministic) {
+  auto filter = [=](cudnnBackendDescriptor_t c) {
+    if (deterministic) {
+      if (cudnn_frontend::hasNumericalNote<CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC>(c)) {
+        return true;
+      }
+    }
+    if (cudnn_frontend::hasNumericalNote<CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS>(c)) {
+      return true;
+    }
+    return false;
+  };
+  cudnn_frontend::filter(from, to, filter);
 }
 
 std::vector<cudnn_frontend::GeneratorSource> GetGeneratorSources(
@@ -579,7 +571,7 @@ std::vector<cudnn_frontend::GeneratorSource> GetGeneratorSources(
                            .cudnn_conf()
                            .cudnn_conv_use_deterministic_algo_only();
   bool heuristic = ParseBooleanFromEnv("ONEFLOW_CUDNN_USE_HEURISTIC_MODE_B", false);
-  auto heur_mode = heuristic ? CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_A;
+  auto heur_mode = heuristic ? CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_INSTANT;
   // Method for engine config generator based on heuristics
   const auto heurgen_method =
       [deterministic,
@@ -610,20 +602,43 @@ std::vector<cudnn_frontend::GeneratorSource> GetGeneratorSources(
   return sources;
 }
 
-void FilterEngineConfigs(cudnn_frontend::EngineConfigList& from,
-                         cudnn_frontend::EngineConfigList& to, bool deterministic) {
-  auto filter = [=](cudnnBackendDescriptor_t c) {
-    if (deterministic) {
-      if (cudnn_frontend::hasNumericalNote<CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC>(c)) {
-        return true;
-      }
-    }
-    if (cudnn_frontend::hasNumericalNote<CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS>(c)) {
-      return true;
-    }
+cudnn_frontend::EngineConfigList CudnnFrontendGetConfigs(const cudnnHandle_t handle,
+                                                         const cudnnBackendDescriptorType_t desc,
+                                                         const cudnn_frontend::Tensor& xdesc,
+                                                         const cudnn_frontend::Tensor& ydesc,
+                                                         const cudnn_frontend::Tensor& wdesc,
+                                                         const cudnn_frontend::ConvDesc& cdesc,
+                                                         float beta, std::string& tag) {
+  auto op_graph = BuildConvOpGraph(handle, desc, xdesc, ydesc, wdesc, cdesc, beta);
+  tag = op_graph.getTag();
+  auto sources = GetGeneratorSources(desc);
+  cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data());
+  auto configs = generator.generate_engine_config(op_graph);
+  return configs;
+}
+
+bool PlanErrataException(const cudnnHandle_t handle, const std::string& executionPlanTag) {
+  static nlohmann::json errata_json_handle;
+  static bool has_json = cudnn_frontend::load_from_config(errata_json_handle, "");
+  if (!has_json) {
     return false;
-  };
-  cudnn_frontend::filter(from, to, filter);
+  } else {
+    return cudnn_frontend::check_errata(errata_json_handle, executionPlanTag, handle,
+                                        []() { return true; });
+  }
+}
+
+void RunConvPlan(const cudnnHandle_t handle, user_op::Tensor* x, user_op::Tensor* y,
+                 user_op::Tensor* w, user_op::Tensor* buf,
+                 const cudnn_frontend::ExecutionPlan& plan) {
+  void* data[] = {x->mut_dptr(), y->mut_dptr(), w->mut_dptr()};
+  int64_t ids[] = {'x', 'y', 'w'};
+  auto variantPack = cudnn_frontend::VariantPackBuilder()
+                         .setWorkspacePointer(buf->mut_dptr())
+                         .setDataPointers(3, data)
+                         .setUids(3, ids)
+                         .build();
+  OF_CUDNN_CHECK(cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()));
 }
 
 void TryConfigs(const cudnnHandle_t handle, user_op::Tensor* x, user_op::Tensor* y,
@@ -642,6 +657,15 @@ void TryConfigs(const cudnnHandle_t handle, user_op::Tensor* x, user_op::Tensor*
   }
 }
 
+void CudnnFrontendRunConv(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc,
+                          user_op::Tensor* x, user_op::Tensor* y, user_op::Tensor* w,
+                          user_op::Tensor* b, const CudnnConvArgsV8& args) {
+  std::string tag;
+  auto configs = CudnnFrontendGetConfigs(handle, desc, args.xdesc, args.ydesc, args.wdesc,
+                                         args.cdesc, args.beta, tag);
+  TryConfigs(handle, x, y, w, b, configs, tag);
+}
+
 size_t GetCudnnConvWorkspaceSizeV8(const cudnnHandle_t handle,
                                    cudnn_frontend::EngineConfigList& configs,
                                    const std::string& tag) {
@@ -658,30 +682,6 @@ size_t GetCudnnConvWorkspaceSizeV8(const cudnnHandle_t handle,
   return 1L;
 }
 
-bool PlanErrataException(const cudnnHandle_t handle, const std::string& executionPlanTag) {
-  static nlohmann::json errata_json_handle;
-  static bool has_json = cudnn_frontend::load_from_config(errata_json_handle, "");
-  if (!has_json) {
-    return false;
-  } else {
-    return cudnn_frontend::check_errata(errata_json_handle, executionPlanTag, handle,
-                                        []() { return true; });
-  }
-}
-
-void RunConvPlan(const cudnnHandle_t handle, user_op::Tensor* x, user_op::Tensor* y,
-                 user_op::Tensor* w, user_op::Tensor* buf,
-                 const cudnn_frontend::ExecutionPlan& plan) {
-  void* data[] = {x->mut_dptr(), y->mut_dptr(), w->mut_dptr()};
-  int64_t ids[] = {'x', 'y', 'w'};
-  auto variantPack = cudnn_frontend::VariantPackBuilder()
-                         .setWorkspacePointer(buf->mut_dptr())
-                         .setDataPointers(3, data)
-                         .setUids(3, ids)
-                         .build();
-  OF_CUDNN_CHECK(cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()));
-}
-
 template<>
 struct CudnnConvAlgorithmSearch<cudnnConvolutionFwdAlgoPerf_t> {
   using perf_t = cudnnConvolutionFwdAlgoPerf_t;
diff --git a/oneflow/core/device/cudnn_conv_util.h b/oneflow/core/device/cudnn_conv_util.h
index b804f76b884..90946a61a9e 100644
--- a/oneflow/core/device/cudnn_conv_util.h
+++ b/oneflow/core/device/cudnn_conv_util.h
@@ -186,55 +186,22 @@ cudnnStatus_t GetCudnnConvWorkspaceSize(const CudnnConvArgs& args, CudnnConvReso
 cudnnStatus_t GetCudnnConvWorkspaceSize(const CudnnConvArgs& args, CudnnConvResource* res,
                                         cudnnConvolutionBwdFilterAlgo_t algo, size_t* sz);
 
-void RunSingleConv(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc,
-                   user_op::Tensor* x, user_op::Tensor* y, user_op::Tensor* w, user_op::Tensor* b,
-                   const CudnnConvArgsV8& args);
-
-cudnn_frontend::EngineConfigList GetConfigs(const cudnnHandle_t handle,
-                                            const cudnnBackendDescriptorType_t desc,
-                                            const cudnn_frontend::Tensor& xdesc,
-                                            const cudnn_frontend::Tensor& ydesc,
-                                            const cudnn_frontend::Tensor& wdesc,
-                                            const cudnn_frontend::ConvDesc& cdesc, float beta,
-                                            std::string& tag);
-
-cudnn_frontend::OperationGraph BuildConvOpGraph(const cudnnHandle_t handle,
-                                                const cudnnBackendDescriptorType_t desc,
-                                                const cudnn_frontend::Tensor& xdesc,
-                                                const cudnn_frontend::Tensor& ydesc,
-                                                const cudnn_frontend::Tensor& wdesc,
-                                                const cudnn_frontend::ConvDesc& cdesc, float beta);
-
-cudnn_frontend::Tensor GetTensorDescriptor(const user_op::Tensor* t, const int64_t id);
-
-cudnn_frontend::Tensor GetTensorDescriptor(const user_op::TensorDesc& t, const int64_t id);
-
-cudnn_frontend::ConvDesc GetConvDescriptor(const user_op::InferContext& ctx,
-                                           cudnnDataType_t data_type);
-
-cudnn_frontend::ConvDesc GetConvDescriptor(const user_op::KernelComputeContext& ctx,
-                                           cudnnDataType_t data_type);
-
-std::vector<cudnn_frontend::GeneratorSource> GetGeneratorSources(
-    const cudnnBackendDescriptorType_t desc);
-
-void FilterEngineConfigs(cudnn_frontend::EngineConfigList& from,
-                         cudnn_frontend::EngineConfigList& to, bool deterministic);
-
-void TryConfigs(const cudnnHandle_t handle, user_op::Tensor* x, user_op::Tensor* y,
-                user_op::Tensor* w, user_op::Tensor* buf, cudnn_frontend::EngineConfigList& configs,
-                const std::string& tag);
+cudnn_frontend::EngineConfigList CudnnFrontendGetConfigs(const cudnnHandle_t handle,
+                                                         const cudnnBackendDescriptorType_t desc,
+                                                         const cudnn_frontend::Tensor& xdesc,
+                                                         const cudnn_frontend::Tensor& ydesc,
+                                                         const cudnn_frontend::Tensor& wdesc,
+                                                         const cudnn_frontend::ConvDesc& cdesc,
+                                                         float beta, std::string& tag);
+
+void CudnnFrontendRunConv(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc,
+                          user_op::Tensor* x, user_op::Tensor* y, user_op::Tensor* w,
+                          user_op::Tensor* b, const CudnnConvArgsV8& args);
 
 size_t GetCudnnConvWorkspaceSizeV8(const cudnnHandle_t handle,
                                    cudnn_frontend::EngineConfigList& configs,
                                    const std::string& tag);
 
-bool PlanErrataException(const cudnnHandle_t handle, const std::string& executionPlanTag);
-
-void RunConvPlan(const cudnnHandle_t handle, user_op::Tensor* x, user_op::Tensor* y,
-                 user_op::Tensor* w, user_op::Tensor* buf,
-                 const cudnn_frontend::ExecutionPlan& plan);
-
 template<typename perf_t>
 perf_t FindCudnnConvAlgorithm(CudnnConvArgs* args);
 
diff --git a/oneflow/user/kernels/conv_cudnn_kernels.cpp b/oneflow/user/kernels/conv_cudnn_kernels.cpp
index db116a103a5..f8475c30202 100644
--- a/oneflow/user/kernels/conv_cudnn_kernels.cpp
+++ b/oneflow/user/kernels/conv_cudnn_kernels.cpp
@@ -305,8 +305,8 @@ class ConvGpuKernelV8 final : public user_op::OpKernel, public user_op::CudaGrap
 
     // trigger conv compute
     auto handle = ctx->stream()->As<ep::CudaStream>()->cudnn_handle();
-    RunSingleConv(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, input, output,
-                  weight, buffer, args);
+    CudnnFrontendRunConv(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, input,
+                         output, weight, buffer, args);
 
     // process bias
     auto bias = ctx->Tensor4ArgNameAndIndex("bias", 0);
@@ -331,24 +331,25 @@ class ConvGpuKernelV8 final : public user_op::OpKernel, public user_op::CudaGrap
   }
 };
 
-#define REGISTER_CONV_KERNEL_V8(op_name, ndims)                                                   \
-  REGISTER_USER_KERNEL(#op_name)                                                                  \
-      .SetCreateFn<ConvGpuKernelV8<ndims>>()                                                      \
-      .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA                              \
-                       && user_op::HobEnvBool("ONEFLOW_KERNEL_ENABLE_CUDNN_V8", false))           \
-      .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                               \
-        auto& input = ctx->InputTensorDesc("in", 0);                                              \
-        auto& output = ctx->InputTensorDesc("out", 0);                                            \
-        auto& weight = ctx->InputTensorDesc("weight", 0);                                         \
-        CudnnConvArgsV8 args(*ctx, input, output, weight);                                        \
-        auto handle = Singleton::Get()->Get();                                                    \
-        std::string tag;                                                                          \
-        auto configs = GetConfigs(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, \
-                                  args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag); \
-        size_t workspace_size = GetCudnnConvWorkspaceSizeV8(handle, configs, tag);                \
-        Singleton::Get()->Put(handle);                                                            \
-        return workspace_size;                                                                    \
-      })                                                                                          \
+#define REGISTER_CONV_KERNEL_V8(op_name, ndims)                                         \
+  REGISTER_USER_KERNEL(#op_name)                                                        \
+      .SetCreateFn<ConvGpuKernelV8<ndims>>()                                            \
+      .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA                    \
+                       && user_op::HobEnvBool("ONEFLOW_KERNEL_ENABLE_CUDNN_V8", false)) \
+      .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                     \
+        auto& input = ctx->InputTensorDesc("in", 0);                                    \
+        auto& output = ctx->InputTensorDesc("out", 0);                                  \
+        auto& weight = ctx->InputTensorDesc("weight", 0);                               \
+        CudnnConvArgsV8 args(*ctx, input, output, weight);                              \
+        auto handle = Singleton::Get()->Get();                                          \
+        std::string tag;                                                                \
+        auto configs = CudnnFrontendGetConfigs(                                         \
+            handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, args.xdesc, \
+            args.ydesc, args.wdesc, args.cdesc, args.beta, tag);                        \
+        size_t workspace_size = GetCudnnConvWorkspaceSizeV8(handle, configs, tag);      \
+        Singleton::Get()->Put(handle);                                                  \
+        return workspace_size;                                                          \
+      })                                                                                \
       .SetPriority(user_op::kKernelPriorityOptimized);
 
 REGISTER_CONV_KERNEL_V8(conv1d, 1);
@@ -457,8 +458,8 @@ class ConvDataGradGpuKernelV8 final : public user_op::OpKernel, public user_op::
 
     // trigger conv compute
     auto handle = ctx->stream()->As<ep::CudaStream>()->cudnn_handle();
-    RunSingleConv(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, input_diff,
-                  output_diff, weight, buffer, args);
+    CudnnFrontendRunConv(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR,
+                         input_diff, output_diff, weight, buffer, args);
   }
 
   bool IsCudaGraphSupported(user_op::KernelInitContext* ctx,
@@ -481,9 +482,9 @@ REGISTER_USER_KERNEL("conv_data_grad")
       CudnnConvArgsV8 args(*ctx, input_diff, output_diff, weight);
       auto handle = Singleton::Get()->Get();
       std::string tag;
-      auto configs =
-          GetConfigs(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR,
-                     args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag);
+      auto configs = CudnnFrontendGetConfigs(
+          handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, args.xdesc,
+          args.ydesc, args.wdesc, args.cdesc, args.beta, tag);
       size_t workspace_size = GetCudnnConvWorkspaceSizeV8(handle, configs, tag);
       Singleton::Get()->Put(handle);
       return workspace_size;
@@ -582,8 +583,8 @@ class ConvFilterGradGpuKernelV8 final : public user_op::OpKernel, public user_op
 
     // trigger conv compute
     auto handle = ctx->stream()->As<ep::CudaStream>()->cudnn_handle();
-    RunSingleConv(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, input,
-                  output_diff, weight_diff, buffer, args);
+    CudnnFrontendRunConv(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR,
+                         input, output_diff, weight_diff, buffer, args);
   }
 
   bool IsCudaGraphSupported(user_op::KernelInitContext* ctx,
@@ -606,9 +607,9 @@ REGISTER_USER_KERNEL("conv_filter_grad")
       CudnnConvArgsV8 args(*ctx, input, output_diff, weight_diff);
       auto handle = Singleton::Get()->Get();
       std::string tag;
-      auto configs =
-          GetConfigs(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR,
-                     args.xdesc, args.ydesc, args.wdesc, args.cdesc, args.beta, tag);
+      auto configs = CudnnFrontendGetConfigs(
+          handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, args.xdesc,
+          args.ydesc, args.wdesc, args.cdesc, args.beta, tag);
       size_t workspace_size = GetCudnnConvWorkspaceSizeV8(handle, configs, tag);
       Singleton::Get()->Put(handle);
       return workspace_size;
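
Reviewer sketch (appended after the series; not one of the patches): the v8 path above separates
planning from execution, and all three kernels funnel through the same two helpers from PATCH 12.
The walkthrough below strings the pieces together for the forward case. The identifiers are taken
from the diffs; the harness itself (a ready cudnnHandle_t, the tensors, and the pooled-handle type
that the diffs render only as "Singleton::Get()") is assumed, so treat this as a minimal
illustration rather than the PR's literal code.

// Hypothetical end-to-end use of the v8 helpers, under the assumptions stated above.
void SketchForwardConvV8(cudnnHandle_t handle, user_op::Tensor* input, user_op::Tensor* output,
                         user_op::Tensor* weight, user_op::Tensor* buffer,
                         const CudnnConvArgsV8& args) {
  // 1. Planning: BuildConvOpGraph assembles the single-op (x, w) -> y graph; the generator
  //    sources (heuristic mode B or INSTANT, plus the fallback generator) emit candidate
  //    engine configs, which FilterEngineConfigs prunes for determinism/numerics.
  std::string tag;
  cudnn_frontend::EngineConfigList configs = CudnnFrontendGetConfigs(
      handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, args.xdesc, args.ydesc,
      args.wdesc, args.cdesc, args.beta, tag);
  // 2. Workspace: the registered InferTmpSizeFn calls this with the same configs, so `buffer`
  //    is sized for the first plan that builds and clears the errata check
  //    (with a 1-byte fallback when no plan qualifies).
  size_t workspace_size = GetCudnnConvWorkspaceSizeV8(handle, configs, tag);
  (void)workspace_size;  // in the kernels, the framework allocates `buffer` from this value
  // 3. Execution: TryConfigs walks the configs, builds an ExecutionPlan per config, and
  //    RunConvPlan executes it via cudnnBackendExecute with the {x, y, w} variant pack and
  //    `buffer` as workspace.
  CudnnFrontendRunConv(handle, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, input,
                       output, weight, buffer, args);
}

Both switches that gate this path are visible in the diffs: ONEFLOW_KERNEL_ENABLE_CUDNN_V8=1
enables the v8 kernel registrations (SetIsMatchedHob), and ONEFLOW_CUDNN_USE_HEURISTIC_MODE_B=1
flips the heuristic generator from CUDNN_HEUR_MODE_INSTANT to CUDNN_HEUR_MODE_B.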