Format CUDA documentation examples
msimberg committed Nov 25, 2024
1 parent 0f61125 commit e7c908a
Showing 3 changed files with 46 additions and 33 deletions.
18 changes: 10 additions & 8 deletions examples/documentation/cuda_overview_documentation.cu
@@ -28,25 +28,27 @@ int main(int argc, char* argv[])
     pika::start(argc, argv);
     ex::thread_pool_scheduler cpu_sched{};

-    // Create a pool of CUDA streams and cuBLAS/SOLVER handles, and a scheduler that uses the pool.
+    // Create a pool of CUDA streams and cuBLAS/SOLVER handles, and a scheduler
+    // that uses the pool.
     cu::cuda_pool pool{};
     cu::cuda_scheduler cuda_sched{pool};

     {
-        // Enable polling of CUDA events on the default pool. This is required to allow the adaptors
-        // below to signal completion of kernels.
+        // Enable polling of CUDA events on the default pool. This is required
+        // to allow the adaptors below to signal completion of kernels.
         cu::enable_user_polling p{};

-        // The work created by the adaptors below will all be scheduled on the same stream from the
-        // pool since the work is sequential.
+        // The work created by the adaptors below will all be scheduled on the
+        // same stream from the pool since the work is sequential.
         //
         // Note that error checking is omitted below.
         auto s = ex::just(42) | ex::continues_on(cuda_sched) |
             // CUDA kernel through a lambda.
             ex::then([](int x) { printf("Hello from the GPU! x: %d\n", x); }) |
-            // Explicitly launch a CUDA kernel with a stream (see https://github.com/eth-cscs/whip
-            // for details about whip)
-            cu::then_with_stream([](whip::stream_t stream) { kernel<<<1, 32, 0, stream>>>(); });
+            // Explicitly launch a CUDA kernel with a stream (see
+            // https://github.com/eth-cscs/whip for details about whip)
+            cu::then_with_stream(
+                [](whip::stream_t stream) { kernel<<<1, 32, 0, stream>>>(); });
         tt::sync_wait(std::move(s));
     }

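The hunk above shows only the middle of main(). For reference, a minimal, self-contained sketch of the same pattern (stream pool, scheduler, polling scope, then_with_stream, sync_wait) is given below; the header names, namespace aliases, kernel, and the finalize/stop shutdown calls are assumptions based on typical pika examples rather than contents of this commit.

// Minimal sketch, assuming the includes and shutdown calls below; not part of the commit.
#include <pika/cuda.hpp>
#include <pika/execution.hpp>
#include <pika/init.hpp>

#include <whip.hpp>

#include <cstdio>
#include <utility>

namespace cu = pika::cuda::experimental;
namespace ex = pika::execution::experimental;
namespace tt = pika::this_thread::experimental;

__global__ void hello() { printf("Hello from a CUDA thread\n"); }

int main(int argc, char* argv[])
{
    // Start the pika runtime (as in the example above).
    pika::start(argc, argv);

    // One pool of streams/handles, one scheduler that draws from it.
    cu::cuda_pool pool{};
    cu::cuda_scheduler cuda_sched{pool};

    {
        // Event polling must be enabled while GPU work is in flight.
        cu::enable_user_polling p{};

        // Launch a kernel on a stream from the pool and wait for it.
        auto s = ex::just() | ex::continues_on(cuda_sched) |
            cu::then_with_stream(
                [](whip::stream_t stream) { hello<<<1, 1, 0, stream>>>(); });
        tt::sync_wait(std::move(s));
    }

    // Shutdown calls assumed from typical pika usage.
    pika::finalize();
    pika::stop();
    return 0;
}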
31 changes: 18 additions & 13 deletions examples/documentation/then_with_cublas_documentation.cu
@@ -37,9 +37,9 @@ class gpu_data
     std::size_t n{0};

 public:
-    // Note that blocking functions such as cudaMalloc will block the underlying operating system
-    // thread instead of yielding the pika task. Consider using e.g. a pool of GPU memory to avoid
-    // blocking the thread for too long.
+    // Note that blocking functions such as cudaMalloc will block the underlying
+    // operating system thread instead of yielding the pika task. Consider using
+    // e.g. a pool of GPU memory to avoid blocking the thread for too long.
     gpu_data(std::size_t n)
       : n(n)
     {
@@ -96,19 +96,24 @@ int main(int argc, char* argv[])
     double alpha = 1.0;
     double beta = 1.0;

-    auto s = ex::just(std::move(a), std::move(b), std::move(c)) | ex::continues_on(cuda_sched) |
-        cu::then_with_stream([](auto& a, auto& b, auto& c, whip::stream_t stream) {
-            init<<<n * n / 256, 256, 0, stream>>>(a.get(), b.get(), c.get(), n * n);
-            return std::make_tuple(std::move(a), std::move(b), std::move(c));
-        }) |
+    auto s = ex::just(std::move(a), std::move(b), std::move(c)) |
+        ex::continues_on(cuda_sched) |
+        cu::then_with_stream(
+            [](auto& a, auto& b, auto& c, whip::stream_t stream) {
+                init<<<n * n / 256, 256, 0, stream>>>(
+                    a.get(), b.get(), c.get(), n * n);
+                return std::make_tuple(
+                    std::move(a), std::move(b), std::move(c));
+            }) |
         ex::unpack() |
-        // a, b, and c will be kept alive by the then_with_cublas operation state at least until
-        // the GPU kernels complete. Values sent by the predecessor sender are passed as the
-        // last arguments after the handle.
+        // a, b, and c will be kept alive by the then_with_cublas operation
+        // state at least until the GPU kernels complete. Values sent by
+        // the predecessor sender are passed as the last arguments after the
+        // handle.
         cu::then_with_cublas(
             [&](blas_handle_t handle, auto& a, auto& b, auto& c) {
-                blas_gemm(handle, blas_op_n, blas_op_n, n, n, n, &alpha, a.get(), n, b.get(), n,
-                    &beta, c.get(), n);
+                blas_gemm(handle, blas_op_n, blas_op_n, n, n, n, &alpha,
+                    a.get(), n, b.get(), n, &beta, c.get(), n);
             },
             blas_pointer_mode);
     tt::sync_wait(std::move(s));
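The first hunk shows only a fragment of the gpu_data helper that the gemm chain above relies on (a.get(), std::move(a)). A hedged sketch of what such a helper plausibly looks like follows, assuming a move-only RAII owner of n doubles of device memory; it is illustrative only, not part of the commit, and omits error checking as the example does.

// Hedged sketch of the assumed gpu_data helper; element type double is
// inferred from the gemm call, everything else is an assumption.
#include <cstddef>
#include <utility>

class gpu_data
{
    double* p{nullptr};
    std::size_t n{0};

public:
    // Allocate n doubles of device memory; as noted in the diff above,
    // cudaMalloc blocks the OS thread rather than yielding the pika task.
    gpu_data(std::size_t n) : n(n) { cudaMalloc(&p, n * sizeof(double)); }

    // Move-only so ownership can travel through senders with std::move.
    gpu_data(gpu_data&& other) noexcept
      : p(std::exchange(other.p, nullptr))
      , n(std::exchange(other.n, 0))
    {
    }
    gpu_data& operator=(gpu_data&& other) noexcept
    {
        std::swap(p, other.p);
        std::swap(n, other.n);
        return *this;
    }
    gpu_data(gpu_data const&) = delete;
    gpu_data& operator=(gpu_data const&) = delete;

    ~gpu_data() { cudaFree(p); }

    double* get() const { return p; }
};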
30 changes: 18 additions & 12 deletions examples/documentation/then_with_stream_documentation.cu
@@ -38,20 +38,26 @@ int main(int argc, char* argv[])
     constexpr std::size_t n = 32;
     int* a = nullptr;

-    // whip::malloc_async wraps cudaMallocAsync/hipMallocAsync. Using the sender adaptors the
-    // allocation, work, and deallocation can all be scheduled onto the same stream.
+    // whip::malloc_async wraps cudaMallocAsync/hipMallocAsync. Using the
+    // sender adaptors the allocation, work, and deallocation can all be
+    // scheduled onto the same stream.
     auto s = ex::just(&a, n * sizeof(int)) | ex::continues_on(cuda_sched) |
         cu::then_with_stream(whip::malloc_async) |
-        // The then_with_stream callable accepts values sent by the predecessor. They will be
-        // passed by reference before the stream. This allows e.g. whip::malloc_async to be
-        // used above with values sent by the just sender. The values are passed by reference
-        // and will be kept alive until the work done on the stream is done.
-        cu::then_with_stream([&a](/* other values by reference here */ whip::stream_t stream) {
-            kernel<<<1, n, 0, stream>>>(a, 17);
-            // Even though the function returns here, the sync_wait below will wait for the
-            // kernel to finish. Values returned are passed on to continuations.
-            return a;
-        }) |
+        // The then_with_stream callable accepts values sent by the
+        // predecessor. They will be passed by reference before the stream.
+        // This allows e.g. whip::malloc_async to be used above with values
+        // sent by the just sender. The values are passed by reference and
+        // will be kept alive until the work done on the stream is done.
+        cu::then_with_stream(
+            [&a](
+                /* other values by reference here */ whip::stream_t
+                    stream) {
+                kernel<<<1, n, 0, stream>>>(a, 17);
+                // Even though the function returns here, the sync_wait below
+                // will wait for the kernel to finish. Values returned are
+                // passed on to continuations.
+                return a;
+            }) |
         cu::then_with_stream(whip::free_async);

     tt::sync_wait(std::move(s));
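The kernel launched above is not shown in the hunk. Given the launch kernel<<<1, n, 0, stream>>>(a, 17) with n == 32, a matching kernel is assumed to look roughly like the sketch below (one block of n threads, each writing the value into its element of the freshly allocated array); this is an illustration, not part of the commit.

// Assumed shape of the kernel used in this example: one thread per element
// of the n-element int array, each thread storing the given value.
__global__ void kernel(int* a, int x) { a[threadIdx.x] = x; }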
