Format CUDA documentation examples
msimberg committed Nov 25, 2024
1 parent 0f61125 commit e7c908a
Showing 3 changed files with 46 additions and 33 deletions.
18 changes: 10 additions & 8 deletions examples/documentation/cuda_overview_documentation.cu
@@ -28,25 +28,27 @@ int main(int argc, char* argv[])
     pika::start(argc, argv);
     ex::thread_pool_scheduler cpu_sched{};

-    // Create a pool of CUDA streams and cuBLAS/SOLVER handles, and a scheduler that uses the pool.
+    // Create a pool of CUDA streams and cuBLAS/SOLVER handles, and a scheduler
+    // that uses the pool.
     cu::cuda_pool pool{};
     cu::cuda_scheduler cuda_sched{pool};

     {
-        // Enable polling of CUDA events on the default pool. This is required to allow the adaptors
-        // below to signal completion of kernels.
+        // Enable polling of CUDA events on the default pool. This is required
+        // to allow the adaptors below to signal completion of kernels.
         cu::enable_user_polling p{};

-        // The work created by the adaptors below will all be scheduled on the same stream from the
-        // pool since the work is sequential.
+        // The work created by the adaptors below will all be scheduled on the
+        // same stream from the pool since the work is sequential.
         //
         // Note that error checking is omitted below.
         auto s = ex::just(42) | ex::continues_on(cuda_sched) |
             // CUDA kernel through a lambda.
             ex::then([](int x) { printf("Hello from the GPU! x: %d\n", x); }) |
-            // Explicitly launch a CUDA kernel with a stream (see https://github.com/eth-cscs/whip
-            // for details about whip)
-            cu::then_with_stream([](whip::stream_t stream) { kernel<<<1, 32, 0, stream>>>(); });
+            // Explicitly launch a CUDA kernel with a stream (see
+            // https://github.com/eth-cscs/whip for details about whip)
+            cu::then_with_stream(
+                [](whip::stream_t stream) { kernel<<<1, 32, 0, stream>>>(); });
         tt::sync_wait(std::move(s));
     }

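The hunk above shows only the middle of main(). For reference, a minimal, self-contained sketch of the same pattern (stream pool, scheduler, polling scope, then_with_stream, sync_wait) is given below; the header names, namespace aliases, kernel, and the finalize/stop shutdown calls are assumptions based on typical pika examples rather than contents of this commit.

// Minimal sketch, assuming the includes and shutdown calls below; not part of the commit.
#include <pika/cuda.hpp>
#include <pika/execution.hpp>
#include <pika/init.hpp>

#include <whip.hpp>

#include <cstdio>
#include <utility>

namespace cu = pika::cuda::experimental;
namespace ex = pika::execution::experimental;
namespace tt = pika::this_thread::experimental;

__global__ void hello() { printf("Hello from a CUDA thread\n"); }

int main(int argc, char* argv[])
{
    // Start the pika runtime (as in the example above).
    pika::start(argc, argv);

    // One pool of streams/handles, one scheduler that draws from it.
    cu::cuda_pool pool{};
    cu::cuda_scheduler cuda_sched{pool};

    {
        // Event polling must be enabled while GPU work is in flight.
        cu::enable_user_polling p{};

        // Launch a kernel on a stream from the pool and wait for it.
        auto s = ex::just() | ex::continues_on(cuda_sched) |
            cu::then_with_stream(
                [](whip::stream_t stream) { hello<<<1, 1, 0, stream>>>(); });
        tt::sync_wait(std::move(s));
    }

    // Shutdown calls assumed from typical pika usage.
    pika::finalize();
    pika::stop();
    return 0;
}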
31 changes: 18 additions & 13 deletions examples/documentation/then_with_cublas_documentation.cu
@@ -37,9 +37,9 @@ class gpu_data
     std::size_t n{0};

 public:
-    // Note that blocking functions such as cudaMalloc will block the underlying operating system
-    // thread instead of yielding the pika task. Consider using e.g. a pool of GPU memory to avoid
-    // blocking the thread for too long.
+    // Note that blocking functions such as cudaMalloc will block the underlying
+    // operating system thread instead of yielding the pika task. Consider using
+    // e.g. a pool of GPU memory to avoid blocking the thread for too long.
     gpu_data(std::size_t n)
       : n(n)
     {
@@ -96,19 +96,24 @@ int main(int argc, char* argv[])
     double alpha = 1.0;
     double beta = 1.0;

-    auto s = ex::just(std::move(a), std::move(b), std::move(c)) | ex::continues_on(cuda_sched) |
-        cu::then_with_stream([](auto& a, auto& b, auto& c, whip::stream_t stream) {
-            init<<<n * n / 256, 256, 0, stream>>>(a.get(), b.get(), c.get(), n * n);
-            return std::make_tuple(std::move(a), std::move(b), std::move(c));
-        }) |
+    auto s = ex::just(std::move(a), std::move(b), std::move(c)) |
+        ex::continues_on(cuda_sched) |
+        cu::then_with_stream(
+            [](auto& a, auto& b, auto& c, whip::stream_t stream) {
+                init<<<n * n / 256, 256, 0, stream>>>(
+                    a.get(), b.get(), c.get(), n * n);
+                return std::make_tuple(
+                    std::move(a), std::move(b), std::move(c));
+            }) |
         ex::unpack() |
-        // a, b, and c will be kept alive by the then_with_cublas operation state at least until
-        // the GPU kernels complete. Values sent by the predecessor sender are passed as the
-        // last arguments after the handle.
+        // a, b, and c will be kept alive by the then_with_cublas operation
+        // state at least until the GPU kernels complete. Values sent by
+        // the predecessor sender are passed as the last arguments after the
+        // handle.
         cu::then_with_cublas(
             [&](blas_handle_t handle, auto& a, auto& b, auto& c) {
-                blas_gemm(handle, blas_op_n, blas_op_n, n, n, n, &alpha, a.get(), n, b.get(), n,
-                    &beta, c.get(), n);
+                blas_gemm(handle, blas_op_n, blas_op_n, n, n, n, &alpha,
+                    a.get(), n, b.get(), n, &beta, c.get(), n);
             },
             blas_pointer_mode);
     tt::sync_wait(std::move(s));
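The first hunk shows only a fragment of the gpu_data helper that the gemm chain above relies on (a.get(), std::move(a)). A hedged sketch of what such a helper plausibly looks like follows, assuming a move-only RAII owner of n doubles of device memory; it is illustrative only, not part of the commit, and omits error checking as the example does.

// Hedged sketch of the assumed gpu_data helper; element type double is
// inferred from the gemm call, everything else is an assumption.
#include <cstddef>
#include <utility>

class gpu_data
{
    double* p{nullptr};
    std::size_t n{0};

public:
    // Allocate n doubles of device memory; as noted in the diff above,
    // cudaMalloc blocks the OS thread rather than yielding the pika task.
    gpu_data(std::size_t n) : n(n) { cudaMalloc(&p, n * sizeof(double)); }

    // Move-only so ownership can travel through senders with std::move.
    gpu_data(gpu_data&& other) noexcept
      : p(std::exchange(other.p, nullptr))
      , n(std::exchange(other.n, 0))
    {
    }
    gpu_data& operator=(gpu_data&& other) noexcept
    {
        std::swap(p, other.p);
        std::swap(n, other.n);
        return *this;
    }
    gpu_data(gpu_data const&) = delete;
    gpu_data& operator=(gpu_data const&) = delete;

    ~gpu_data() { cudaFree(p); }

    double* get() const { return p; }
};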
30 changes: 18 additions & 12 deletions examples/documentation/then_with_stream_documentation.cu
@@ -38,20 +38,26 @@ int main(int argc, char* argv[])
     constexpr std::size_t n = 32;
     int* a = nullptr;

-    // whip::malloc_async wraps cudaMallocAsync/hipMallocAsync. Using the sender adaptors the
-    // allocation, work, and deallocation can all be scheduled onto the same stream.
+    // whip::malloc_async wraps cudaMallocAsync/hipMallocAsync. Using the
+    // sender adaptors the allocation, work, and deallocation can all be
+    // scheduled onto the same stream.
     auto s = ex::just(&a, n * sizeof(int)) | ex::continues_on(cuda_sched) |
         cu::then_with_stream(whip::malloc_async) |
-        // The then_with_stream callable accepts values sent by the predecessor. They will be
-        // passed by reference before the stream. This allows e.g. whip::malloc_async to be
-        // used above with values sent by the just sender. The values are passed by reference
-        // and will be kept alive until the work done on the stream is done.
-        cu::then_with_stream([&a](/* other values by reference here */ whip::stream_t stream) {
-            kernel<<<1, n, 0, stream>>>(a, 17);
-            // Even though the function returns here, the sync_wait below will wait for the
-            // kernel to finish. Values returned are passed on to continuations.
-            return a;
-        }) |
+        // The then_with_stream callable accepts values sent by the
+        // predecessor. They will be passed by reference before the stream.
+        // This allows e.g. whip::malloc_async to be used above with values
+        // sent by the just sender. The values are passed by reference and
+        // will be kept alive until the work done on the stream is done.
+        cu::then_with_stream(
+            [&a](
+                /* other values by reference here */ whip::stream_t
+                    stream) {
+                kernel<<<1, n, 0, stream>>>(a, 17);
+                // Even though the function returns here, the sync_wait below
+                // will wait for the kernel to finish. Values returned are
+                // passed on to continuations.
+                return a;
+            }) |
         cu::then_with_stream(whip::free_async);

     tt::sync_wait(std::move(s));
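The kernel launched above is not shown in the hunk. Given the launch kernel<<<1, n, 0, stream>>>(a, 17) with n == 32, a matching kernel is assumed to look roughly like the sketch below (one block of n threads, each writing the value into its element of the freshly allocated array); this is an illustration, not part of the commit.

// Assumed shape of the kernel used in this example: one thread per element
// of the n-element int array, each thread storing the given value.
__global__ void kernel(int* a, int x) { a[threadIdx.x] = x; }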
