// NOTE(review): the text below is a unified git diff touching six flang
// files; it is not a C++ translation unit. The hunk line breaks have been
// flattened, and text that sat between angle brackets appears to be missing
// (e.g. `mlir::isa(srcTy)`, `reinterpret_cast(alloc.Element(allocAt))`,
// `!fir.ref>`), so it presumably cannot be applied as a patch in this form --
// TODO confirm against the upstream commit.
//
// What the visible hunks change:
//  * flang/include/flang/Runtime/CUDA/memory.h: declares
//    RTDECL(CUFDataTransferCstDesc)(Descriptor *dst, Descriptor *src,
//    unsigned mode, const char *sourceFile = nullptr, int sourceLine = 0),
//    documented as a transfer from a scalar descriptor to a descriptor.
//  * flang/runtime/CUDA/memory.cpp: defines it. `mode` selects
//    MemmoveHostToDevice, MemmoveDeviceToHost or MemmoveDeviceToDevice; any
//    other mode calls terminator.Crash("host to host copy not supported").
//    The chosen function is then passed to
//    Fortran::runtime::DoFromSourceAssign(*dstDesc, *srcDesc, terminator, ...).
//  * flang/runtime/assign-impl.h, flang/runtime/assign.cpp: DoFromSourceAssign
//    gains a trailing `MemmoveFct memmoveFct` parameter. Its default is
//    &Fortran::runtime::memmove, or &MemmoveWrapper (which forwards to
//    Fortran::runtime::memmove) when RT_DEVICE_COMPILATION is defined. The
//    function forwards memmoveFct to Assign and calls it in place of
//    Fortran::runtime::memmove in the intrinsic-type element loop.
//  * flang/lib/Optimizer/Transforms/CUFOpConversion.cpp: two getRuntimeFunc
//    call sites change; the second one re-binds `func` when
//    fir::isa_trivial(srcTy) holds for a source that was just emboxed. The
//    runtime key they name is not visible here; the updated CHECK lines
//    suggest CUFDataTransferCstDesc -- TODO confirm.
//  * flang/test/Fir/CUDA/cuda-data-transfer.fir: CHECK lines that expected
//    _FortranACUFDataTransferDescDesc or ...DescDescNoRealloc now expect
//    _FortranACUFDataTransferCstDesc.
diff --git a/flang/include/flang/Runtime/CUDA/memory.h b/flang/include/flang/Runtime/CUDA/memory.h index 713bdf536aaf90..2bb083b0dd75cb 100644 --- a/flang/include/flang/Runtime/CUDA/memory.h +++ b/flang/include/flang/Runtime/CUDA/memory.h @@ -44,6 +44,10 @@ void RTDECL(CUFDataTransferPtrDesc)(void *dst, Descriptor *src, void RTDECL(CUFDataTransferDescDesc)(Descriptor *dst, Descriptor *src, unsigned mode, const char *sourceFile = nullptr, int sourceLine = 0); +/// Data transfer from a scalar descriptor to a descriptor. +void RTDECL(CUFDataTransferCstDesc)(Descriptor *dst, Descriptor *src, + unsigned mode, const char *sourceFile = nullptr, int sourceLine = 0); + /// Data transfer from a descriptor to a descriptor. void RTDECL(CUFDataTransferDescDescNoRealloc)(Descriptor *dst, Descriptor *src, unsigned mode, const char *sourceFile = nullptr, int sourceLine = 0); diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index ec7f67dff763b4..9de20f0f0d45e1 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -563,8 +563,9 @@ struct CUFDataTransferOpConversion // until we have more infrastructure. 
mlir::Value src = emboxSrc(rewriter, op, symtab); mlir::Value dst = emboxDst(rewriter, op, symtab); - mlir::func::FuncOp func = fir::runtime::getRuntimeFunc(loc, builder); + mlir::func::FuncOp func = + fir::runtime::getRuntimeFunc( + loc, builder); auto fTy = func.getFunctionType(); mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); mlir::Value sourceLine = @@ -648,6 +649,9 @@ struct CUFDataTransferOpConversion mlir::Value src = op.getSrc(); if (!mlir::isa(srcTy)) { src = emboxSrc(rewriter, op, symtab); + if (fir::isa_trivial(srcTy)) + func = fir::runtime::getRuntimeFunc( + loc, builder); } auto materializeBoxIfNeeded = [&](mlir::Value val) -> mlir::Value { if (mlir::isa(val.getDefiningOp())) { diff --git a/flang/runtime/CUDA/memory.cpp b/flang/runtime/CUDA/memory.cpp index 7b40b837e7666e..68963c4d7738ac 100644 --- a/flang/runtime/CUDA/memory.cpp +++ b/flang/runtime/CUDA/memory.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "flang/Runtime/CUDA/memory.h" +#include "../assign-impl.h" #include "../terminator.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/CUDA/descriptor.h" @@ -120,6 +121,24 @@ void RTDECL(CUFDataTransferDescDesc)(Descriptor *dstDesc, Descriptor *srcDesc, *dstDesc, *srcDesc, terminator, MaybeReallocate, memmoveFct); } +void RTDECL(CUFDataTransferCstDesc)(Descriptor *dstDesc, Descriptor *srcDesc, + unsigned mode, const char *sourceFile, int sourceLine) { + MemmoveFct memmoveFct; + Terminator terminator{sourceFile, sourceLine}; + if (mode == kHostToDevice) { + memmoveFct = &MemmoveHostToDevice; + } else if (mode == kDeviceToHost) { + memmoveFct = &MemmoveDeviceToHost; + } else if (mode == kDeviceToDevice) { + memmoveFct = &MemmoveDeviceToDevice; + } else { + terminator.Crash("host to host copy not supported"); + } + + Fortran::runtime::DoFromSourceAssign( + *dstDesc, *srcDesc, terminator, memmoveFct); +} + void 
RTDECL(CUFDataTransferDescDescNoRealloc)(Descriptor *dstDesc, Descriptor *srcDesc, unsigned mode, const char *sourceFile, int sourceLine) { diff --git a/flang/runtime/assign-impl.h b/flang/runtime/assign-impl.h index f07a501d1d1263..5db0bc81510bff 100644 --- a/flang/runtime/assign-impl.h +++ b/flang/runtime/assign-impl.h @@ -9,16 +9,29 @@ #ifndef FORTRAN_RUNTIME_ASSIGN_IMPL_H_ #define FORTRAN_RUNTIME_ASSIGN_IMPL_H_ +#include "flang/Runtime/freestanding-tools.h" + namespace Fortran::runtime { class Descriptor; class Terminator; +using MemmoveFct = void *(*)(void *, const void *, std::size_t); + // Assign one object to another via allocate statement from source specifier. // Note that if allocate object and source expression have the same rank, the // value of the allocate object becomes the value provided; otherwise the value // of each element of allocate object becomes the value provided (9.7.1.2(7)). -RT_API_ATTRS void DoFromSourceAssign( - Descriptor &, const Descriptor &, Terminator &); +#ifdef RT_DEVICE_COMPILATION +static RT_API_ATTRS void *MemmoveWrapper( + void *dest, const void *src, std::size_t count) { + return Fortran::runtime::memmove(dest, src, count); +} +RT_API_ATTRS void DoFromSourceAssign(Descriptor &, const Descriptor &, + Terminator &, MemmoveFct memmoveFct = &MemmoveWrapper); +#else +RT_API_ATTRS void DoFromSourceAssign(Descriptor &, const Descriptor &, + Terminator &, MemmoveFct memmoveFct = &Fortran::runtime::memmove); +#endif } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_ASSIGN_IMPL_H_ diff --git a/flang/runtime/assign.cpp b/flang/runtime/assign.cpp index 83c0b9c70ed0d1..8f0efaa376c198 100644 --- a/flang/runtime/assign.cpp +++ b/flang/runtime/assign.cpp @@ -509,8 +509,8 @@ RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from, RT_OFFLOAD_API_GROUP_BEGIN -RT_API_ATTRS void DoFromSourceAssign( - Descriptor &alloc, const Descriptor &source, Terminator &terminator) { +RT_API_ATTRS void DoFromSourceAssign(Descriptor &alloc, 
+ const Descriptor &source, Terminator &terminator, MemmoveFct memmoveFct) { if (alloc.rank() > 0 && source.rank() == 0) { // The value of each element of allocate object becomes the value of source. DescriptorAddendum *allocAddendum{alloc.Addendum()}; @@ -523,17 +523,17 @@ RT_API_ATTRS void DoFromSourceAssign( alloc.IncrementSubscripts(allocAt)) { Descriptor allocElement{*Descriptor::Create(*allocDerived, reinterpret_cast(alloc.Element(allocAt)), 0)}; - Assign(allocElement, source, terminator, NoAssignFlags); + Assign(allocElement, source, terminator, NoAssignFlags, memmoveFct); } } else { // intrinsic type for (std::size_t n{alloc.Elements()}; n-- > 0; alloc.IncrementSubscripts(allocAt)) { - Fortran::runtime::memmove(alloc.Element(allocAt), - source.raw().base_addr, alloc.ElementBytes()); + memmoveFct(alloc.Element(allocAt), source.raw().base_addr, + alloc.ElementBytes()); } } } else { - Assign(alloc, source, terminator, NoAssignFlags); + Assign(alloc, source, terminator, NoAssignFlags, memmoveFct); } } diff --git a/flang/test/Fir/CUDA/cuda-data-transfer.fir b/flang/test/Fir/CUDA/cuda-data-transfer.fir index 3209197e118d19..1ee44f3c6d97c9 100644 --- a/flang/test/Fir/CUDA/cuda-data-transfer.fir +++ b/flang/test/Fir/CUDA/cuda-data-transfer.fir @@ -38,7 +38,7 @@ func.func @_QPsub2() { // CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX]] : !fir.ref> // CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[TEMP_CONV:.*]] = fir.convert %[[TEMP_BOX]] : (!fir.ref>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[ADEV_BOX]], %[[TEMP_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferCstDesc(%[[ADEV_BOX]], %[[TEMP_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none func.func @_QPsub3() { %0 = cuf.alloc !fir.box>> {bindc_name = "adev", data_attr = #cuf.cuda, uniq_name = "_QFsub3Eadev"} -> !fir.ref>>> 
@@ -58,7 +58,7 @@ func.func @_QPsub3() { // CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX]] : !fir.ref> // CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[V_CONV:.*]] = fir.convert %[[TEMP_BOX]] : (!fir.ref>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[ADEV_BOX]], %[[V_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferCstDesc(%[[ADEV_BOX]], %[[V_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none func.func @_QPsub4() { %0 = cuf.alloc !fir.box>> {bindc_name = "adev", data_attr = #cuf.cuda, uniq_name = "_QFsub4Eadev"} -> !fir.ref>>> @@ -297,7 +297,7 @@ func.func @_QPscalar_to_array() { } // CHECK-LABEL: func.func @_QPscalar_to_array() -// CHECK: _FortranACUFDataTransferDescDescNoRealloc +// CHECK: _FortranACUFDataTransferCstDesc func.func @_QPtest_type() { %0 = cuf.alloc !fir.type<_QMbarTcmplx{id:i32,c:complex}> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFtest_typeEa"} -> !fir.ref}>> @@ -344,7 +344,7 @@ func.func @_QPshape_shift() { } // CHECK-LABEL: func.func @_QPshape_shift() -// CHECK: fir.call @_FortranACUFDataTransferDescDescNoRealloc +// CHECK: fir.call @_FortranACUFDataTransferCstDesc func.func @_QPshape_shift2() { %c11 = arith.constant 11 : index @@ -383,7 +383,7 @@ func.func @_QPdevice_addr_conv() { // CHECK: %[[DEV_ADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr // CHECK: %[[DEV_ADDR_CONV:.*]] = fir.convert %[[DEV_ADDR]] : (!fir.llvm_ptr) -> !fir.ref> // CHECK: fir.embox %[[DEV_ADDR_CONV]](%{{.*}}) : (!fir.ref>, !fir.shape<1>) -> !fir.box> -// CHECK: fir.call @_FortranACUFDataTransferDescDescNoRealloc +// CHECK: fir.call @_FortranACUFDataTransferCstDesc func.func @_QQchar_transfer() attributes {fir.bindc_name = "char_transfer"} { %c1 = arith.constant 1 : index @@ -464,6 +464,6 
@@ func.func @_QPlogical_cst() { // CHECK: %[[EMBOX:.*]] = fir.embox %[[CONST]] : (!fir.ref>) -> !fir.box> // CHECK: fir.store %[[EMBOX]] to %[[DESC]] : !fir.ref>> // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DESC]] : (!fir.ref>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferCstDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none } // end of module