diff --git a/examples/cuda_mandelbrot.cu b/examples/cuda_mandelbrot.cu index 8ae9cb68..fbd7bd6a 100644 --- a/examples/cuda_mandelbrot.cu +++ b/examples/cuda_mandelbrot.cu @@ -77,7 +77,7 @@ int main() auto rg = redGrapes::init( redGrapes::SchedulerDescription( - std::make_shared>(), + std::make_shared>(2), CudaTag{}), redGrapes::SchedulerDescription( std::make_shared>>( @@ -128,48 +128,46 @@ int main() * calculate picture */ rg.emplace_task( - [width, height, area, i, mid_x, mid_y, w, &cudaSched](auto device_buffer) - { - double begin_x = mid_x - w; - double end_x = mid_x + w; - double begin_y = mid_y - w; - double end_y = mid_y + w; - - dim3 threadsPerBlock(8, 8); - dim3 numBlocks(width / threadsPerBlock.x, height / threadsPerBlock.y); - - auto current_stream = cudaSched.getCudaStream(0); - mandelbrot<<>>( - begin_x, - end_x, - begin_y, - end_y, - width, - height, - *device_buffer); - std::cout << "launched kernel to stream " << current_stream << std::endl; - }, - device_buffer.write()) - .cuda_stream_index(0u); + [width, height, area, i, mid_x, mid_y, w, &cudaSched](auto device_buffer) + { + double begin_x = mid_x - w; + double end_x = mid_x + w; + double begin_y = mid_y - w; + double end_y = mid_y + w; + + dim3 threadsPerBlock(8, 8); + dim3 numBlocks(width / threadsPerBlock.x, height / threadsPerBlock.y); + + auto current_stream = cudaSched.getCudaStream(); + mandelbrot<<>>( + begin_x, + end_x, + begin_y, + end_y, + width, + height, + *device_buffer); + std::cout << "launched kernel to stream " << current_stream << std::endl; + }, + device_buffer.write()); /* * copy data */ rg.emplace_task( - [area, &cudaSched](auto host_buffer, auto device_buffer) - { - auto current_stream = cudaSched.getCudaStream(0); - cudaMemcpyAsync( - *host_buffer, - *device_buffer, - area * sizeof(Color), - cudaMemcpyDeviceToHost, - current_stream); - std::cout << "launched memcpy to stream " << current_stream << std::endl; - }, - host_buffer.write(), - device_buffer.read()) - .cuda_stream_index(0u); + [area, &cudaSched](auto host_buffer, auto device_buffer) + { + auto current_stream = cudaSched.getCudaStream(); + cudaMemcpyAsync( + *host_buffer, + *device_buffer, + area * sizeof(Color), + cudaMemcpyDeviceToHost, + current_stream); + std::cout << "launched memcpy to stream " << current_stream << std::endl; + }, + host_buffer.write(), + device_buffer.read()); ; /* @@ -209,9 +207,7 @@ int main() /* * cleanup */ - rg.emplace_task([](auto host_buffer) { cudaFreeHost(*host_buffer); }, host_buffer.write()) - .cuda_stream_index(0u); + rg.emplace_task([](auto host_buffer) { cudaFreeHost(*host_buffer); }, host_buffer.write()); - rg.emplace_task([](auto device_buffer) { cudaFree(*device_buffer); }, device_buffer.write()) - .cuda_stream_index(0u); + rg.emplace_task([](auto device_buffer) { cudaFree(*device_buffer); }, device_buffer.write()); } diff --git a/redGrapes/dispatch/cuda/cuda_worker.hpp b/redGrapes/dispatch/cuda/cuda_worker.hpp index ee861848..4edb9799 100644 --- a/redGrapes/dispatch/cuda/cuda_worker.hpp +++ b/redGrapes/dispatch/cuda/cuda_worker.hpp @@ -119,6 +119,7 @@ namespace redGrapes::dispatch::cuda auto event = task(); cudaEvent_t cuda_event = event_pool.alloc(); + // works even if the m_cuda_stream index optional is nullopt, because it gets casted to 0 cudaEventRecord(cuda_event, streams[*(task->m_cuda_stream_idx)].cuda_stream); auto my_event = TaskCtx::create_event(); events.push(std::make_pair(cuda_event, *my_event)); diff --git a/redGrapes/scheduler/cuda_thread_scheduler.hpp b/redGrapes/scheduler/cuda_thread_scheduler.hpp index 46bc552f..6d08e9a8 100644 --- a/redGrapes/scheduler/cuda_thread_scheduler.hpp +++ b/redGrapes/scheduler/cuda_thread_scheduler.hpp @@ -7,10 +7,12 @@ #pragma once +#include "redGrapes/TaskCtx.hpp" #include "redGrapes/TaskFreeCtx.hpp" #include "redGrapes/dispatch/cuda/cuda_worker.hpp" #include "redGrapes/scheduler/thread_scheduler.hpp" +#include #include namespace redGrapes @@ -63,11 +65,28 @@ namespace redGrapes ; } - cudaStream_t getCudaStream(unsigned idx) const + /** + * Only to be used if the user wants to manage streams directly + * The user must ensure that if this method is used, they must set the cuda_stream_index() property + */ + cudaStream_t getCudaStreamIdx(unsigned idx) const { assert(idx < num_streams); return this->m_worker_thread->worker->streams[idx].cuda_stream; } + + /** + * Returns the cuda stream to the user to use in their cuda kernel + * Also sets the stream index on the task which calls this method + * requires current_task is not nullptr + */ + cudaStream_t getCudaStream() const + { + static std::atomic_uint stream_idx = 0; + auto task_stream_idx = stream_idx.fetch_add(1) % num_streams; + TaskCtx::current_task->m_cuda_stream_idx = task_stream_idx; + return this->m_worker_thread->worker->streams[task_stream_idx].cuda_stream; + } }; } // namespace scheduler } // namespace redGrapes