Cuda round robin stream scheduling
ikbuibui committed Apr 16, 2024
1 parent 7e0dcd6 commit 238e6f6
Showing 3 changed files with 59 additions and 43 deletions.
80 changes: 38 additions & 42 deletions examples/cuda_mandelbrot.cu
@@ -77,7 +77,7 @@ int main()

auto rg = redGrapes::init<redGrapes::dispatch::cuda::CudaTaskProperties>(
redGrapes::SchedulerDescription(
std::make_shared<redGrapes::scheduler::CudaThreadScheduler<RGTask>>(),
std::make_shared<redGrapes::scheduler::CudaThreadScheduler<RGTask>>(2),
CudaTag{}),
redGrapes::SchedulerDescription(
std::make_shared<redGrapes::scheduler::PoolScheduler<redGrapes::dispatch::thread::DefaultWorker<RGTask>>>(
@@ -128,48 +128,46 @@ int main()
* calculate picture
*/
rg.emplace_task<CudaTag>(
[width, height, area, i, mid_x, mid_y, w, &cudaSched](auto device_buffer)
{
double begin_x = mid_x - w;
double end_x = mid_x + w;
double begin_y = mid_y - w;
double end_y = mid_y + w;

dim3 threadsPerBlock(8, 8);
dim3 numBlocks(width / threadsPerBlock.x, height / threadsPerBlock.y);

auto current_stream = cudaSched.getCudaStream(0);
mandelbrot<<<numBlocks, threadsPerBlock, 0, cudaSched.getCudaStream(0)>>>(
begin_x,
end_x,
begin_y,
end_y,
width,
height,
*device_buffer);
std::cout << "launched kernel to stream " << current_stream << std::endl;
},
device_buffer.write())
.cuda_stream_index(0u);
[width, height, area, i, mid_x, mid_y, w, &cudaSched](auto device_buffer)
{
double begin_x = mid_x - w;
double end_x = mid_x + w;
double begin_y = mid_y - w;
double end_y = mid_y + w;

dim3 threadsPerBlock(8, 8);
dim3 numBlocks(width / threadsPerBlock.x, height / threadsPerBlock.y);

auto current_stream = cudaSched.getCudaStream();
mandelbrot<<<numBlocks, threadsPerBlock, 0, current_stream>>>(
begin_x,
end_x,
begin_y,
end_y,
width,
height,
*device_buffer);
std::cout << "launched kernel to stream " << current_stream << std::endl;
},
device_buffer.write());

/*
* copy data
*/
rg.emplace_task<CudaTag>(
[area, &cudaSched](auto host_buffer, auto device_buffer)
{
auto current_stream = cudaSched.getCudaStream(0);
cudaMemcpyAsync(
*host_buffer,
*device_buffer,
area * sizeof(Color),
cudaMemcpyDeviceToHost,
current_stream);
std::cout << "launched memcpy to stream " << current_stream << std::endl;
},
host_buffer.write(),
device_buffer.read())
.cuda_stream_index(0u);
[area, &cudaSched](auto host_buffer, auto device_buffer)
{
auto current_stream = cudaSched.getCudaStream();
cudaMemcpyAsync(
*host_buffer,
*device_buffer,
area * sizeof(Color),
cudaMemcpyDeviceToHost,
current_stream);
std::cout << "launched memcpy to stream " << current_stream << std::endl;
},
host_buffer.write(),
device_buffer.read());
;

/*
@@ -209,9 +207,7 @@ int main()
/*
* cleanup
*/
rg.emplace_task<CudaTag>([](auto host_buffer) { cudaFreeHost(*host_buffer); }, host_buffer.write())
.cuda_stream_index(0u);
rg.emplace_task<CudaTag>([](auto host_buffer) { cudaFreeHost(*host_buffer); }, host_buffer.write());

rg.emplace_task<CudaTag>([](auto device_buffer) { cudaFree(*device_buffer); }, device_buffer.write())
.cuda_stream_index(0u);
rg.emplace_task<CudaTag>([](auto device_buffer) { cudaFree(*device_buffer); }, device_buffer.write());
}
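
In short, the example's tasks no longer pin themselves to stream 0. Below is a condensed before/after sketch of the calling pattern, using the names from the surrounding diff (kernel arguments and grid setup are elided):

```cpp
// Before: the task fetched stream 0 explicitly and had to mirror that choice
// through the cuda_stream_index() task property.
rg.emplace_task<CudaTag>(
    [&cudaSched](auto device_buffer)
    {
        auto stream = cudaSched.getCudaStream(0);
        mandelbrot<<<numBlocks, threadsPerBlock, 0, stream>>>(/* ... */ *device_buffer);
    },
    device_buffer.write())
    .cuda_stream_index(0u);

// After: the scheduler hands out streams round-robin (over the 2 streams it was
// constructed with) and records the chosen index on the task, so the property
// call is no longer needed.
rg.emplace_task<CudaTag>(
    [&cudaSched](auto device_buffer)
    {
        auto stream = cudaSched.getCudaStream();
        mandelbrot<<<numBlocks, threadsPerBlock, 0, stream>>>(/* ... */ *device_buffer);
    },
    device_buffer.write());
```
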
1 change: 1 addition & 0 deletions redGrapes/dispatch/cuda/cuda_worker.hpp
@@ -119,6 +119,7 @@ namespace redGrapes::dispatch::cuda
auto event = task();

cudaEvent_t cuda_event = event_pool.alloc();
// works even if the m_cuda_stream_idx optional is nullopt, because it gets cast to 0
cudaEventRecord(cuda_event, streams[*(task->m_cuda_stream_idx)].cuda_stream);
auto my_event = TaskCtx<TTask>::create_event();
events.push(std::make_pair(cuda_event, *my_event));
21 changes: 20 additions & 1 deletion redGrapes/scheduler/cuda_thread_scheduler.hpp
@@ -7,10 +7,12 @@

#pragma once

#include "redGrapes/TaskCtx.hpp"
#include "redGrapes/TaskFreeCtx.hpp"
#include "redGrapes/dispatch/cuda/cuda_worker.hpp"
#include "redGrapes/scheduler/thread_scheduler.hpp"

#include <atomic>
#include <cassert>

namespace redGrapes
@@ -63,11 +65,28 @@
;
}

cudaStream_t getCudaStream(unsigned idx) const
/**
 * Only to be used if the user wants to manage streams directly.
 * If this method is used, the user must also set the cuda_stream_index() property on the task.
*/
cudaStream_t getCudaStreamIdx(unsigned idx) const
{
assert(idx < num_streams);
return this->m_worker_thread->worker->streams[idx].cuda_stream;
}

/**
 * Returns the cuda stream for the user to use in their cuda kernel.
 * Also sets the stream index on the task that calls this method.
 * Requires that current_task is not nullptr.
*/
cudaStream_t getCudaStream() const
{
static std::atomic_uint stream_idx = 0;
auto task_stream_idx = stream_idx.fetch_add(1) % num_streams;
TaskCtx<TTask>::current_task->m_cuda_stream_idx = task_stream_idx;
return this->m_worker_thread->worker->streams[task_stream_idx].cuda_stream;
}
};
} // namespace scheduler
} // namespace redGrapes
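
The new getCudaStream() does the round-robin itself: one atomic counter shared across calls, taken modulo the number of streams the scheduler was constructed with, so successive tasks alternate between the available streams. The standalone snippet below illustrates just that index sequence (plain C++, no redGrapes or CUDA needed; names are illustrative):

```cpp
#include <atomic>
#include <cstdio>

// Same selection logic as getCudaStream(): a shared atomic counter, modulo the stream count.
unsigned next_stream_index(unsigned num_streams)
{
    static std::atomic_uint stream_idx = 0;
    return stream_idx.fetch_add(1) % num_streams;
}

int main()
{
    constexpr unsigned num_streams = 2; // matches CudaThreadScheduler<RGTask>(2) in the example
    for (int task = 0; task < 6; ++task)
        std::printf("task %d -> stream %u\n", task, next_stream_index(num_streams));
    // prints: task 0 -> stream 0, task 1 -> stream 1, task 2 -> stream 0, ...
}
```

In the scheduler, the chosen index is additionally stored in the current task's m_cuda_stream_idx, which cuda_worker.hpp later uses to record the task's CUDA event on the matching stream.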
