Cuda round robin stream scheduling
ikbuibui committed Apr 16, 2024
1 parent 7e0dcd6 commit 238e6f6
Showing 3 changed files with 59 additions and 43 deletions.
80 changes: 38 additions & 42 deletions examples/cuda_mandelbrot.cu
@@ -77,7 +77,7 @@ int main()

auto rg = redGrapes::init<redGrapes::dispatch::cuda::CudaTaskProperties>(
redGrapes::SchedulerDescription(
std::make_shared<redGrapes::scheduler::CudaThreadScheduler<RGTask>>(),
std::make_shared<redGrapes::scheduler::CudaThreadScheduler<RGTask>>(2),
CudaTag{}),
redGrapes::SchedulerDescription(
std::make_shared<redGrapes::scheduler::PoolScheduler<redGrapes::dispatch::thread::DefaultWorker<RGTask>>>(
@@ -128,48 +128,46 @@ int main()
* calculate picture
*/
rg.emplace_task<CudaTag>(
[width, height, area, i, mid_x, mid_y, w, &cudaSched](auto device_buffer)
{
double begin_x = mid_x - w;
double end_x = mid_x + w;
double begin_y = mid_y - w;
double end_y = mid_y + w;

dim3 threadsPerBlock(8, 8);
dim3 numBlocks(width / threadsPerBlock.x, height / threadsPerBlock.y);

auto current_stream = cudaSched.getCudaStream(0);
mandelbrot<<<numBlocks, threadsPerBlock, 0, cudaSched.getCudaStream(0)>>>(
begin_x,
end_x,
begin_y,
end_y,
width,
height,
*device_buffer);
std::cout << "launched kernel to stream " << current_stream << std::endl;
},
device_buffer.write())
.cuda_stream_index(0u);
[width, height, area, i, mid_x, mid_y, w, &cudaSched](auto device_buffer)
{
double begin_x = mid_x - w;
double end_x = mid_x + w;
double begin_y = mid_y - w;
double end_y = mid_y + w;

dim3 threadsPerBlock(8, 8);
dim3 numBlocks(width / threadsPerBlock.x, height / threadsPerBlock.y);

auto current_stream = cudaSched.getCudaStream();
mandelbrot<<<numBlocks, threadsPerBlock, 0, current_stream>>>(
begin_x,
end_x,
begin_y,
end_y,
width,
height,
*device_buffer);
std::cout << "launched kernel to stream " << current_stream << std::endl;
},
device_buffer.write());

/*
* copy data
*/
rg.emplace_task<CudaTag>(
[area, &cudaSched](auto host_buffer, auto device_buffer)
{
auto current_stream = cudaSched.getCudaStream(0);
cudaMemcpyAsync(
*host_buffer,
*device_buffer,
area * sizeof(Color),
cudaMemcpyDeviceToHost,
current_stream);
std::cout << "launched memcpy to stream " << current_stream << std::endl;
},
host_buffer.write(),
device_buffer.read())
.cuda_stream_index(0u);
[area, &cudaSched](auto host_buffer, auto device_buffer)
{
auto current_stream = cudaSched.getCudaStream();
cudaMemcpyAsync(
*host_buffer,
*device_buffer,
area * sizeof(Color),
cudaMemcpyDeviceToHost,
current_stream);
std::cout << "launched memcpy to stream " << current_stream << std::endl;
},
host_buffer.write(),
device_buffer.read());
;

/*
@@ -209,9 +207,7 @@ int main()
/*
* cleanup
*/
rg.emplace_task<CudaTag>([](auto host_buffer) { cudaFreeHost(*host_buffer); }, host_buffer.write())
.cuda_stream_index(0u);
rg.emplace_task<CudaTag>([](auto host_buffer) { cudaFreeHost(*host_buffer); }, host_buffer.write());

rg.emplace_task<CudaTag>([](auto device_buffer) { cudaFree(*device_buffer); }, device_buffer.write())
.cuda_stream_index(0u);
rg.emplace_task<CudaTag>([](auto device_buffer) { cudaFree(*device_buffer); }, device_buffer.write());
}
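
In short, the example's tasks no longer pin themselves to stream 0. Below is a condensed before/after sketch of the calling pattern, using the names from the surrounding diff (kernel arguments and grid setup are elided):

```cpp
// Before: the task fetched stream 0 explicitly and had to mirror that choice
// through the cuda_stream_index() task property.
rg.emplace_task<CudaTag>(
    [&cudaSched](auto device_buffer)
    {
        auto stream = cudaSched.getCudaStream(0);
        mandelbrot<<<numBlocks, threadsPerBlock, 0, stream>>>(/* ... */ *device_buffer);
    },
    device_buffer.write())
    .cuda_stream_index(0u);

// After: the scheduler hands out streams round-robin (over the 2 streams it was
// constructed with) and records the chosen index on the task, so the property
// call is no longer needed.
rg.emplace_task<CudaTag>(
    [&cudaSched](auto device_buffer)
    {
        auto stream = cudaSched.getCudaStream();
        mandelbrot<<<numBlocks, threadsPerBlock, 0, stream>>>(/* ... */ *device_buffer);
    },
    device_buffer.write());
```
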
1 change: 1 addition & 0 deletions redGrapes/dispatch/cuda/cuda_worker.hpp
@@ -119,6 +119,7 @@ namespace redGrapes::dispatch::cuda
auto event = task();

cudaEvent_t cuda_event = event_pool.alloc();
// works even if the m_cuda_stream_idx optional is nullopt, because it gets cast to 0
cudaEventRecord(cuda_event, streams[*(task->m_cuda_stream_idx)].cuda_stream);
auto my_event = TaskCtx<TTask>::create_event();
events.push(std::make_pair(cuda_event, *my_event));
21 changes: 20 additions & 1 deletion redGrapes/scheduler/cuda_thread_scheduler.hpp
@@ -7,10 +7,12 @@

#pragma once

#include "redGrapes/TaskCtx.hpp"
#include "redGrapes/TaskFreeCtx.hpp"
#include "redGrapes/dispatch/cuda/cuda_worker.hpp"
#include "redGrapes/scheduler/thread_scheduler.hpp"

#include <atomic>
#include <cassert>

namespace redGrapes
@@ -63,11 +65,28 @@
;
}

cudaStream_t getCudaStream(unsigned idx) const
/**
 * Only to be used if the user wants to manage streams directly.
 * If this method is used, the user must also set the cuda_stream_index() property on the task.
*/
cudaStream_t getCudaStreamIdx(unsigned idx) const
{
assert(idx < num_streams);
return this->m_worker_thread->worker->streams[idx].cuda_stream;
}

/**
 * Returns the cuda stream for the user to use in their cuda kernel.
 * Also sets the stream index on the task that calls this method.
 * Requires that current_task is not nullptr.
*/
cudaStream_t getCudaStream() const
{
static std::atomic_uint stream_idx = 0;
auto task_stream_idx = stream_idx.fetch_add(1) % num_streams;
TaskCtx<TTask>::current_task->m_cuda_stream_idx = task_stream_idx;
return this->m_worker_thread->worker->streams[task_stream_idx].cuda_stream;
}
};
} // namespace scheduler
} // namespace redGrapes
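
The new getCudaStream() does the round-robin itself: one atomic counter shared across calls, taken modulo the number of streams the scheduler was constructed with, so successive tasks alternate between the available streams. The standalone snippet below illustrates just that index sequence (plain C++, no redGrapes or CUDA needed; names are illustrative):

```cpp
#include <atomic>
#include <cstdio>

// Same selection logic as getCudaStream(): a shared atomic counter, modulo the stream count.
unsigned next_stream_index(unsigned num_streams)
{
    static std::atomic_uint stream_idx = 0;
    return stream_idx.fetch_add(1) % num_streams;
}

int main()
{
    constexpr unsigned num_streams = 2; // matches CudaThreadScheduler<RGTask>(2) in the example
    for (int task = 0; task < 6; ++task)
        std::printf("task %d -> stream %u\n", task, next_stream_index(num_streams));
    // prints: task 0 -> stream 0, task 1 -> stream 1, task 2 -> stream 0, ...
}
```

In the scheduler, the chosen index is additionally stored in the current task's m_cuda_stream_idx, which cuda_worker.hpp later uses to record the task's CUDA event on the matching stream.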
