From 169cd22ef913816d4070917098dc471dfdcb522d Mon Sep 17 00:00:00 2001
From: Tapish
Date: Fri, 5 Apr 2024 15:32:26 +0200
Subject: [PATCH] cuda fix and fix waker_id bug

---
 examples/cuda_mandelbrot.cu                   | 152 +++++-----
 redGrapes/TaskCtx.hpp                         |   2 +-
 ...roperties.hpp => cuda_task_properties.hpp} |  12 +-
 redGrapes/dispatch/cuda/cuda_worker.hpp       | 267 ++++++++++++++++++
 redGrapes/dispatch/cuda/event_pool.hpp        |  10 +-
 redGrapes/dispatch/cuda/scheduler.hpp         | 194 -------------
 redGrapes/resource/access/io.hpp              |   3 +-
 redGrapes/scheduler/cuda_thread_scheduler.hpp |  73 +++++
 redGrapes/scheduler/pool_scheduler.tpp        |   7 +-
 redGrapesConfig.cmake                         |   2 +-
 10 files changed, 440 insertions(+), 282 deletions(-)
 rename redGrapes/dispatch/cuda/{task_properties.hpp => cuda_task_properties.hpp} (76%)
 create mode 100644 redGrapes/dispatch/cuda/cuda_worker.hpp
 delete mode 100644 redGrapes/dispatch/cuda/scheduler.hpp
 create mode 100644 redGrapes/scheduler/cuda_thread_scheduler.hpp

diff --git a/examples/cuda_mandelbrot.cu b/examples/cuda_mandelbrot.cu
index e4f78065..8ae9cb68 100644
--- a/examples/cuda_mandelbrot.cu
+++ b/examples/cuda_mandelbrot.cu
@@ -5,7 +5,18 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
 #include
+#include
 #include
 #include
@@ -13,23 +24,6 @@
 #include
 #include
 
-enum SchedulerTag
-{
-    SCHED_CUDA
-};
-
-#define REDGRAPES_TASK_PROPERTIES dispatch::cuda::CudaTaskProperties, scheduler::SchedulingTagProperties
-
-#include "redGrapes/dispatch/cuda/scheduler.hpp"
-#include
-#include "redGrapes/resource/fieldresource.hpp"
-#include "redGrapes/resource/ioresource.hpp"
-#include "redGrapes/scheduler/pool_scheduler.hpp"
-#include "redGrapes/scheduler/tag_match.hpp"
-#include "redGrapes/task/property/resource.hpp"
-
-namespace rg = redGrapes;
-
 struct Color
 {
     float r, g, b;
@@ -70,18 +64,27 @@ __global__ void mandelbrot(
     out[index] = Color{cosf(float(i) / 7.0), cosf(2.0 + float(i) / 11.0), cosf(4.0 + float(i) / 13.0)};
 }
 
-int main()
+struct CudaTag
 {
-    auto pool_scheduler = std::make_shared<rg::scheduler::PoolScheduler>(4 /* number of CPU workers */);
+};
+
+using RGTask = redGrapes::Task;
 
-    auto cuda_scheduler = std::make_shared<rg::dispatch::cuda::CudaScheduler>(
-        [](rg::Task const& t) { return t.required_scheduler_tags.test(SCHED_CUDA); },
-        4 /* number of cuda streams */
-    );
+int main()
+{
+    spdlog::set_level(spdlog::level::trace);
+    spdlog::set_pattern("[thread %t] %^[%l]%$ %v");
 
-    rg::idle = [cuda_scheduler] { cuda_scheduler->poll(); };
+    auto rg = redGrapes::init(
+        redGrapes::SchedulerDescription(
+            std::make_shared<redGrapes::scheduler::CudaThreadScheduler<RGTask>>(),
+            CudaTag{}),
+        redGrapes::SchedulerDescription(
+            std::make_shared<redGrapes::scheduler::PoolScheduler<redGrapes::dispatch::thread::DefaultWorker<RGTask>>>(
+                4),
+            redGrapes::DefaultTag{}));
 
-    rg::init(rg::scheduler::make_tag_match_scheduler().add({}, pool_scheduler).add({SCHED_CUDA}, cuda_scheduler));
+    auto& cudaSched = rg.getScheduler<CudaTag>();
 
     double mid_x = 0.41820187155955555;
     double mid_y = 0.32743154895555555;
@@ -90,10 +93,10 @@ int main()
     size_t height = 4096;
     size_t area = width * height;
 
-    rg::IOResource<Color*> host_buffer;
-    rg::IOResource<Color*> device_buffer;
+    redGrapes::IOResource<Color*> host_buffer;
+    redGrapes::IOResource<Color*> device_buffer;
 
-    rg::emplace_task(
+    rg.emplace_task(
         [area](auto host_buffer)
         {
             void* ptr;
@@ -102,7 +105,7 @@ int main()
         },
         host_buffer.write());
 
-    rg::emplace_task(
+    rg.emplace_task(
         [area](auto device_buffer)
         {
             void* ptr;
@@ -124,52 +127,55 @@ int main()
         /*
         * calculate picture
         */
-        rg::emplace_task(
-            [width, height, area, i, mid_x, mid_y, w](auto device_buffer)
-            {
-                double begin_x = mid_x - w;
-                double end_x = mid_x + w;
-                double begin_y = mid_y - w;
-                double end_y = mid_y + w;
-
-                dim3 threadsPerBlock(8, 8);
-                dim3 numBlocks(width / threadsPerBlock.x, height / threadsPerBlock.y);
-
-                mandelbrot<<<numBlocks, threadsPerBlock, 0, rg::dispatch::cuda::current_stream>>>(
-                    begin_x,
-                    end_x,
-                    begin_y,
-                    end_y,
-                    width,
-                    height,
-                    *device_buffer);
-                std::cout << "launched kernel to stream " << rg::dispatch::cuda::current_stream << std::endl;
-            },
-            rg::TaskProperties::Builder().scheduling_tags({SCHED_CUDA}),
-            device_buffer.write());
+        rg.emplace_task(
+            [width, height, area, i, mid_x, mid_y, w, &cudaSched](auto device_buffer)
+            {
+                double begin_x = mid_x - w;
+                double end_x = mid_x + w;
+                double begin_y = mid_y - w;
+                double end_y = mid_y + w;
+
+                dim3 threadsPerBlock(8, 8);
+                dim3 numBlocks(width / threadsPerBlock.x, height / threadsPerBlock.y);
+
+                auto current_stream = cudaSched.getCudaStream(0);
+                mandelbrot<<<numBlocks, threadsPerBlock, 0, current_stream>>>(
+                    begin_x,
+                    end_x,
+                    begin_y,
+                    end_y,
+                    width,
+                    height,
+                    *device_buffer);
+                std::cout << "launched kernel to stream " << current_stream << std::endl;
+            },
+            device_buffer.write())
+            .cuda_stream_index(0u);
 
         /*
         * copy data
         */
-        rg::emplace_task(
-            [area](auto host_buffer, auto device_buffer)
-            {
-                cudaMemcpyAsync(
-                    *host_buffer,
-                    *device_buffer,
-                    area * sizeof(Color),
-                    cudaMemcpyDeviceToHost,
-                    rg::dispatch::cuda::current_stream);
-                std::cout << "launched memcpy to stream " << rg::dispatch::cuda::current_stream << std::endl;
-            },
-            rg::TaskProperties::Builder().scheduling_tags({SCHED_CUDA}),
-            host_buffer.write(),
-            device_buffer.read());
+        rg.emplace_task(
+            [area, &cudaSched](auto host_buffer, auto device_buffer)
+            {
+                auto current_stream = cudaSched.getCudaStream(0);
+                cudaMemcpyAsync(
+                    *host_buffer,
+                    *device_buffer,
+                    area * sizeof(Color),
+                    cudaMemcpyDeviceToHost,
+                    current_stream);
+                std::cout << "launched memcpy to stream " << current_stream << std::endl;
+            },
+            host_buffer.write(),
+            device_buffer.read())
+            .cuda_stream_index(0u);
 
         /*
         * write png
         */
-        rg::emplace_task(
+        rg.emplace_task(
            [width, height, i](auto host_buffer)
            {
                std::stringstream step;
@@ -179,9 +185,9 @@ int main()
                pngwriter png(width, height, 0, filename.c_str());
                png.setcompressionlevel(9);
 
-                for(int y = 0; y < height; ++y)
+                for(size_t y = 0; y < height; ++y)
                {
-                    for(int x = 0; x < width; ++x)
+                    for(size_t x = 0; x < width; ++x)
                    {
                        auto& color = (*host_buffer)[x + y * width];
                        png.plot(x + 1, height - y, color.r, color.g, color.b);
@@ -194,7 +200,7 @@ int main()
            host_buffer.read());
    }
 
-    rg::emplace_task([](auto b) {}, host_buffer.write()).get();
+    rg.emplace_task([]([[maybe_unused]] auto b) {}, host_buffer.write()).get();
 
     auto t2 = std::chrono::high_resolution_clock::now();
     std::cout << "runtime: " << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << " μs"
@@ -203,7 +209,9 @@ int main()
 
     /*
     * cleanup
     */
-    rg::emplace_task([](auto host_buffer) { cudaFreeHost(*host_buffer); }, host_buffer.write());
+    rg.emplace_task([](auto host_buffer) { cudaFreeHost(*host_buffer); }, host_buffer.write())
+        .cuda_stream_index(0u);
 
-    rg::emplace_task([](auto device_buffer) { cudaFree(*device_buffer); }, device_buffer.write());
+    rg.emplace_task([](auto device_buffer) { cudaFree(*device_buffer); }, device_buffer.write())
+        .cuda_stream_index(0u);
 }
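The example above pins all CUDA work to stream 0. The same API admits spreading the per-frame kernels over several streams; a hypothetical variation of the kernel task, assuming the CudaThreadScheduler is constructed with num_streams > 1 (its constructor argument defaults to 1):

    unsigned const num_streams = 4; // must match the CudaThreadScheduler constructor argument
    unsigned const stream_idx = i % num_streams;
    rg.emplace_task(
        [width, height, mid_x, mid_y, w, stream_idx, &cudaSched](auto device_buffer)
        {
            dim3 threadsPerBlock(8, 8);
            dim3 numBlocks(width / threadsPerBlock.x, height / threadsPerBlock.y);
            auto stream = cudaSched.getCudaStream(stream_idx);
            mandelbrot<<<numBlocks, threadsPerBlock, 0, stream>>>(
                mid_x - w, mid_x + w, mid_y - w, mid_y + w, width, height, *device_buffer);
        },
        device_buffer.write())
        .cuda_stream_index(stream_idx); // tells the CudaWorker which stream the task submits to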
diff --git a/redGrapes/TaskCtx.hpp b/redGrapes/TaskCtx.hpp
index 8c41ca97..42e5a34c 100644
--- a/redGrapes/TaskCtx.hpp
+++ b/redGrapes/TaskCtx.hpp
@@ -32,7 +32,7 @@ namespace redGrapes
             }
             else
             {
-                event->waker_id = event.task->scheduler_p->getNextWorkerID();
+                event->waker_id = event.task->scheduler_p->getNextWorkerID() + 1;
                 while(!event->is_reached())
                     TaskFreeCtx::idle();
             }
diff --git a/redGrapes/dispatch/cuda/task_properties.hpp b/redGrapes/dispatch/cuda/cuda_task_properties.hpp
similarity index 76%
rename from redGrapes/dispatch/cuda/task_properties.hpp
rename to redGrapes/dispatch/cuda/cuda_task_properties.hpp
index e8532fc0..f20634e6 100644
--- a/redGrapes/dispatch/cuda/task_properties.hpp
+++ b/redGrapes/dispatch/cuda/cuda_task_properties.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2020 Michael Sippel
+/* Copyright 2024 Tapish Narwal
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -7,6 +7,8 @@
 
 #pragma once
 
+#include
+
 namespace redGrapes
 {
     namespace dispatch
@@ -16,7 +18,7 @@ namespace redGrapes
 
         struct CudaTaskProperties
         {
-            std::optional<cudaEvent_t> cuda_event;
+            std::optional<unsigned> m_cuda_stream_idx;
 
             CudaTaskProperties()
             {
@@ -30,6 +32,12 @@ namespace redGrapes
                 Builder(PropertiesBuilder& b) : builder(b)
                 {
                 }
+
+                PropertiesBuilder& cuda_stream_index(unsigned cuda_stream_idx)
+                {
+                    builder.task->m_cuda_stream_idx = cuda_stream_idx;
+                    return builder;
+                }
             };
 
             struct Patch
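The builder assigns to m_cuda_stream_idx instead of writing through operator*: the optional is disengaged until the property is set, and dereferencing an empty std::optional is undefined behavior. A minimal illustration, independent of redGrapes:

    #include <optional>

    int main()
    {
        std::optional<unsigned> idx; // disengaged, holds no value yet
        // *idx = 3;                 // undefined behavior: dereferences an empty optional
        idx = 3;                     // ok: engages the optional and stores 3
        return idx.has_value() ? 0 : 1;
    }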
diff --git a/redGrapes/dispatch/cuda/cuda_worker.hpp b/redGrapes/dispatch/cuda/cuda_worker.hpp
new file mode 100644
index 00000000..ee861848
--- /dev/null
+++ b/redGrapes/dispatch/cuda/cuda_worker.hpp
@@ -0,0 +1,267 @@
+/* Copyright 2024 Tapish Narwal
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include "redGrapes/TaskCtx.hpp"
+#include "redGrapes/dispatch/cuda/cuda_task_properties.hpp"
+#include "redGrapes/dispatch/cuda/event_pool.hpp"
+#include "redGrapes/scheduler/event.hpp"
+#include "redGrapes/sync/cv.hpp"
+#include "redGrapes/task/queue.hpp"
+
+#include
+#include
+#include
+
+#include
+
+namespace redGrapes::dispatch::cuda
+{
+    struct CudaStreamWrapper
+    {
+        cudaStream_t cuda_stream;
+
+        CudaStreamWrapper()
+        {
+            cudaStreamCreate(&cuda_stream);
+        }
+
+        CudaStreamWrapper(CudaStreamWrapper const&)
+        {
+            spdlog::warn("CudaStreamWrapper copy constructor called!");
+        }
+
+        ~CudaStreamWrapper()
+        {
+            cudaStreamDestroy(cuda_stream);
+        }
+    };
+
+    // this class is not thread safe
+    // stream dispatcher
+    template<typename TTask>
+    struct CudaWorker
+    {
+        using task_type = TTask;
+
+        WorkerId id;
+        std::vector<CudaStreamWrapper> streams;
+        EventPool event_pool;
+
+        /*! if true, the thread shall stop
+         * instead of waiting when it is out of jobs
+         */
+        std::atomic_bool m_stop{false};
+        std::atomic<unsigned> task_count{0};
+
+        std::queue<std::pair<cudaEvent_t, scheduler::EventPtr<TTask>>> events;
+        std::recursive_mutex mutex;
+
+        //! condition variable for waiting if the queue is empty
+        CondVar cv;
+
+        static constexpr size_t queue_capacity = 128;
+        task::Queue<TTask> emplacement_queue{queue_capacity};
+        task::Queue<TTask> ready_queue{queue_capacity};
+
+        CudaWorker(WorkerId worker_id) : id(worker_id)
+        {
+        }
+
+        CudaWorker(WorkerId worker_id, unsigned num_streams) : id{worker_id}, streams{num_streams}
+        {
+        }
+
+        inline scheduler::WakerId get_waker_id()
+        {
+            return id + 1;
+        }
+
+        inline bool wake()
+        {
+            return cv.notify();
+        }
+
+        void stop()
+        {
+            SPDLOG_TRACE("Worker::stop()");
+            m_stop.store(true, std::memory_order_release);
+            wake();
+        }
+
+        /* adds a new task to the emplacement queue
+         * and wakes up the thread to kickstart execution
+         */
+        inline void dispatch_task(TTask& task)
+        {
+            emplacement_queue.push(&task);
+            wake();
+        }
+
+        inline void execute_task(TTask& task)
+        {
+            TRACE_EVENT("Worker", "dispatch task");
+
+            SPDLOG_DEBUG("cuda thread dispatch: execute task {}", task.task_id);
+            assert(task.is_ready());
+            std::lock_guard lock(mutex);
+
+            TaskCtx<TTask>::current_task = &task;
+
+            // run the code that calls the CUDA API and submits work to streams[*task.m_cuda_stream_idx]
+            auto event = task();
+
+            cudaEvent_t cuda_event = event_pool.alloc();
+            cudaEventRecord(cuda_event, streams[*(task.m_cuda_stream_idx)].cuda_stream);
+            auto my_event = TaskCtx<TTask>::create_event();
+            events.push(std::make_pair(cuda_event, *my_event));
+            SPDLOG_TRACE(
+                "CudaStreamDispatcher {}: recorded event {}",
+                streams[*(task.m_cuda_stream_idx)].cuda_stream,
+                cuda_event);
+
+            // TODO figure out the correct position for this
+            task.get_pre_event().notify();
+
+            if(event)
+            {
+                event->get_event().waker_id = get_waker_id();
+                task.sg_pause(*event);
+
+                task.pre_event.up();
+                task.get_pre_event().notify();
+            }
+            else
+                task.get_post_event().notify();
+
+            TaskCtx<TTask>::current_task = nullptr;
+        }
+
+        /* repeatedly try to find and execute tasks
+         * until the stop flag is set by stop()
+         */
+        void work_loop()
+        {
+            SPDLOG_TRACE("Worker {} start work_loop()", this->id);
+            while(!this->m_stop.load(std::memory_order_consume))
+            {
+                // this->cv.wait(); // TODO fix this by fixing event_ptr notify to wake
+
+                while(TTask* task = this->gather_task())
+                {
+                    execute_task(*task);
+                    poll(); // TODO fix where to poll
+                }
+                poll();
+            }
+            SPDLOG_TRACE("Worker {} end work_loop()", this->id);
+        }
+
+        /* find a task that shall be executed next
+         */
+        TTask* gather_task()
+        {
+            TRACE_EVENT("Worker", "gather_task()");
+            TTask* task = nullptr;
+
+            /* STAGE 1:
+             *
+             * first, execute all tasks in the ready queue
+             */
+            SPDLOG_TRACE("Worker {}: consume ready queue", id);
+            if((task = ready_queue.pop()))
+                return task;
+
+            /* STAGE 2:
+             *
+             * after the ready queue is fully consumed,
+             * try initializing new tasks until one
+             * of them is found to be ready
+             */
+            SPDLOG_TRACE("Worker {}: try init new tasks", id);
+            while(this->init_dependencies(task, true))
+                if(task)
+                    return task;
+
+            return task;
+        }
+        /*! take a task from the emplacement queue and initialize it
+         * @param t set to the task if the new task is ready,
+         *          set to nullptr if the new task is blocked
+         * @param claimed if true, the new task will not be activated here;
+         *        if false, activate_task will be called by notify_event
+         *
+         * @return false if the emplacement queue is empty
+         */
+        bool init_dependencies(TTask*& t, bool claimed = true)
+        {
+            TRACE_EVENT("Worker", "init_dependencies()");
+            if(TTask* task = emplacement_queue.pop())
+            {
+                SPDLOG_DEBUG("init task {}", task->task_id);
+
+                task->pre_event.up();
+                task->init_graph();
+
+                if(task->get_pre_event().notify(claimed))
+                    t = task;
+                else
+                    t = nullptr;
+
+                return true;
+            }
+            else
+                return false;
+        }
+
+        //! checks if some cuda calls finished and notifies the redGrapes manager
+        void poll()
+        {
+            std::lock_guard lock(mutex);
+            if(!events.empty())
+            {
+                auto& cuda_event = events.front().first;
+                auto& event = events.front().second;
+
+                if(cudaEventQuery(cuda_event) == cudaSuccess)
+                {
+                    SPDLOG_TRACE("cuda event {} ready", cuda_event);
+                    event_pool.free(cuda_event);
+                    event.notify();
+
+                    events.pop();
+                }
+            }
+        }
+    };
+
+} // namespace redGrapes::dispatch::cuda
+
+template<>
+struct fmt::formatter<redGrapes::dispatch::cuda::CudaTaskProperties>
+{
+    constexpr auto parse(format_parse_context& ctx)
+    {
+        return ctx.begin();
+    }
+
+    template<typename FormatContext>
+    auto format(redGrapes::dispatch::cuda::CudaTaskProperties const& prop, FormatContext& ctx)
+    {
+        if(auto idx = prop.m_cuda_stream_idx)
+            return fmt::format_to(ctx.out(), "\"cuda_stream_idx\" : {}", *idx);
+        else
+            return fmt::format_to(ctx.out(), "\"cuda_stream_idx\" : null");
+    }
+};
diff --git a/redGrapes/dispatch/cuda/event_pool.hpp b/redGrapes/dispatch/cuda/event_pool.hpp
index a8123117..87aabec1 100644
--- a/redGrapes/dispatch/cuda/event_pool.hpp
+++ b/redGrapes/dispatch/cuda/event_pool.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2020 Michael Sippel
+/* Copyright 2020-2024 Michael Sippel, Tapish Narwal
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -7,6 +7,8 @@
 
 #pragma once
 
+#include
+
 #include
 #include
 
@@ -28,12 +30,6 @@ namespace redGrapes
         {
         }
 
-        static EventPool& get()
-        {
-            static EventPool singleton;
-            return singleton;
-        }
-
         ~EventPool()
         {
             std::lock_guard lock(mutex);
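CudaWorker never blocks on the CUDA runtime: execute_task records a cudaEvent_t after the task has submitted its asynchronous work, and poll later notifies the task's post-event once cudaEventQuery reports completion. The same record-and-poll pattern reduced to plain CUDA runtime calls (a sketch: a callback stands in for the redGrapes event, and events are created on the fly instead of recycled through EventPool):

    #include <cuda_runtime.h>

    #include <functional>
    #include <queue>
    #include <utility>

    std::queue<std::pair<cudaEvent_t, std::function<void()>>> pending;

    // called right after a task has submitted its asynchronous work to `stream`
    void record(cudaStream_t stream, std::function<void()> notify_post_event)
    {
        cudaEvent_t ev;
        cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
        cudaEventRecord(ev, stream); // completes once all prior work on `stream` is done
        pending.emplace(ev, std::move(notify_post_event));
    }

    // called periodically from the worker loop, like CudaWorker::poll above
    void poll()
    {
        if(!pending.empty() && cudaEventQuery(pending.front().first) == cudaSuccess)
        {
            pending.front().second(); // corresponds to event.notify()
            cudaEventDestroy(pending.front().first);
            pending.pop();
        }
    }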
diff --git a/redGrapes/dispatch/cuda/scheduler.hpp b/redGrapes/dispatch/cuda/scheduler.hpp
deleted file mode 100644
index b3031fa2..00000000
--- a/redGrapes/dispatch/cuda/scheduler.hpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/* Copyright 2020-2024 Michael Sippel, Tapish Narwal
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include "redGrapes/dispatch/cuda/event_pool.hpp"
-#include "redGrapes/dispatch/cuda/task_properties.hpp"
-#include "redGrapes/scheduler/event.hpp"
-#include "redGrapes/scheduler/scheduler.hpp"
-#include "redGrapes/task/property/graph.hpp"
-
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-
-namespace redGrapes
-{
-    namespace dispatch
-    {
-        namespace cuda
-        {
-
-            thread_local cudaStream_t current_stream;
-
-            // this class is not thread safe
-            template<typename Task>
-            struct CudaStreamDispatcher
-            {
-                cudaStream_t cuda_stream;
-                std::recursive_mutex mutex;
-                std::queue<std::pair<cudaEvent_t, scheduler::EventPtr<Task>>> events;
-
-                CudaStreamDispatcher()
-                {
-                    cudaStreamCreate(&cuda_stream);
-                }
-
-                CudaStreamDispatcher(CudaStreamDispatcher const& other)
-                {
-                    spdlog::warn("CudaStreamDispatcher copy constructor called!");
-                }
-
-                ~CudaStreamDispatcher()
-                {
-                    cudaStreamDestroy(cuda_stream);
-                }
-
-                void poll()
-                {
-                    std::lock_guard lock(mutex);
-                    if(!events.empty())
-                    {
-                        auto& cuda_event = events.front().first;
-                        auto& event = events.front().second;
-
-                        if(cudaEventQuery(cuda_event) == cudaSuccess)
-                        {
-                            SPDLOG_TRACE("cuda event {} ready", cuda_event);
-                            EventPool::get().free(cuda_event);
-                            event.notify();
-
-                            events.pop();
-                        }
-                    }
-                }
-
-                void dispatch_task(Task& task)
-                {
-                    std::lock_guard lock(mutex);
-
-                    for(auto predecessor : task.in_edges)
-                    {
-                        SPDLOG_TRACE("cudaDispatcher: consider predecessor \"{}\"", predecessor->label);
-
-                        if(auto cuda_event = predecessor->cuda_event)
-                        {
-                            SPDLOG_TRACE(
-                                "cudaDispatcher: task {} \"{}\" wait for {}",
-                                task.task_id,
-                                task.label,
-                                *cuda_event);
-
-                            cudaStreamWaitEvent(cuda_stream, *cuda_event, 0);
-                        }
-                    }
-
-                    SPDLOG_TRACE("CudaScheduler: start {}", task_id);
-
-                    current_stream = cuda_stream;
-
-                    // run the code that calls the CUDA API and submits work to current_stream
-                    task->run();
-
-                    cudaEvent_t cuda_event = EventPool::get().alloc();
-                    cudaEventRecord(cuda_event, cuda_stream);
-                    task->cuda_event = cuda_event;
-
-                    task->get_pre_event().notify();
-
-                    SPDLOG_TRACE("CudaStreamDispatcher {}: recorded event {}", cuda_stream, cuda_event);
-                    events.push(std::make_pair(cuda_event, task->get_post_event()));
-                }
-            };
-
-            template<typename TTask>
-            struct CudaScheduler : redGrapes::scheduler::IScheduler<TTask>
-            {
-            private:
-                bool recording;
-                bool cuda_graph_enabled;
-
-                std::recursive_mutex mutex;
-                unsigned int current_stream;
-                std::vector<CudaStreamDispatcher<TTask>> streams;
-
-                std::function<bool(TTask const&)> is_cuda_task;
-
-            public:
-                CudaScheduler(
-                    std::function<bool(TTask const&)> is_cuda_task,
-                    size_t stream_count = 1,
-                    bool cuda_graph_enabled = false)
-                    : is_cuda_task(is_cuda_task)
-                    , current_stream(0)
-                    , cuda_graph_enabled(cuda_graph_enabled)
-                {
-                    // reserve to avoid copy constructor of CudaStreamDispatcher
-                    streams.reserve(stream_count);
-
-                    for(size_t i = 0; i < stream_count; ++i)
-                        streams.emplace_back();
-
-                    SPDLOG_TRACE("CudaScheduler: use {} streams", streams.size());
-                }
-
-                //! submits the call to the cuda runtime
-                void activate_task(TTask& task)
-                {
-                    unsigned int stream_id = current_stream;
-                    current_stream = (current_stream + 1) % streams.size();
-
-                    SPDLOG_TRACE("Dispatch Cuda task {} \"{}\" on stream {}", task.task_id, task.label, stream_id);
-                    streams[stream_id].dispatch_task(task);
-                }
-
-                //! checks if some cuda calls finished and notify the redGrapes manager
-                void poll()
-                {
-                    for(size_t stream_id = 0; stream_id < streams.size(); ++stream_id)
-                        streams[stream_id].poll();
-                }
-
-                /*! whats the task dependency type for the edge a -> b (task a precedes task b)
-                 * @return true if task b depends on the pre event of task a, false if task b depends on the post event
-                 *         of task b.
-                 */
-                bool task_dependency_type(TTask const& a, TTask const& b)
-                {
-                    assert(is_cuda_task(b));
-                    return is_cuda_task(a);
-                }
-            };
-
-        } // namespace cuda
-
-    } // namespace dispatch
-
-} // namespace redGrapes
-
-template<>
-struct fmt::formatter<redGrapes::dispatch::cuda::CudaTaskProperties>
-{
-    constexpr auto parse(format_parse_context& ctx)
-    {
-        return ctx.begin();
-    }
-
-    template<typename FormatContext>
-    auto format(redGrapes::dispatch::cuda::CudaTaskProperties const& prop, FormatContext& ctx)
-    {
-        if(auto e = prop.cuda_event)
-            return fmt::format_to(ctx.out(), "\"cuda_event\" : {}", *e);
-        else
-            return fmt::format_to(ctx.out(), "\"cuda_event\" : null");
-    }
-};
diff --git a/redGrapes/resource/access/io.hpp b/redGrapes/resource/access/io.hpp
index 8dc5ec4d..ea07dfba 100644
--- a/redGrapes/resource/access/io.hpp
+++ b/redGrapes/resource/access/io.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2019 Michael Sippel
+/* Copyright 2019-2024 Michael Sippel, Tapish Narwal
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,7 +11,6 @@
 
 #pragma once
 
-#include
 #include
 
 namespace redGrapes
diff --git a/redGrapes/scheduler/cuda_thread_scheduler.hpp b/redGrapes/scheduler/cuda_thread_scheduler.hpp
new file mode 100644
index 00000000..46bc552f
--- /dev/null
+++ b/redGrapes/scheduler/cuda_thread_scheduler.hpp
@@ -0,0 +1,73 @@
+/* Copyright 2024 Tapish Narwal
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include "redGrapes/TaskFreeCtx.hpp"
+#include "redGrapes/dispatch/cuda/cuda_worker.hpp"
+#include "redGrapes/scheduler/thread_scheduler.hpp"
+
+#include
+
+namespace redGrapes
+{
+    namespace scheduler
+    {
+        template<typename TTask>
+        class CudaThreadScheduler : public ThreadScheduler<dispatch::cuda::CudaWorker<TTask>>
+        {
+        private:
+            unsigned num_streams;
+
+        public:
+            CudaThreadScheduler(unsigned num_streams = 1) : num_streams{num_streams}
+            {
+            }
+
+            void init(WorkerId base_id) override
+            {
+                this->m_base_id = base_id;
+                // TODO check if it was already initialized
+                if(!this->m_worker_thread)
+                {
+                    unsigned pu_id = base_id % TaskFreeCtx::n_pus;
+                    // create a new arena for this worker, bound to the processing unit it runs on
+                    hwloc_obj_t obj = hwloc_get_obj_by_type(TaskFreeCtx::hwloc_ctx.topology, HWLOC_OBJ_PU, pu_id);
+                    TaskFreeCtx::worker_alloc_pool->allocs.emplace_back(
+                        memory::HwlocAlloc(TaskFreeCtx::hwloc_ctx, obj),
+                        REDGRAPES_ALLOC_CHUNKSIZE);
+
+                    this->m_worker_thread
+                        = memory::alloc_shared_bind<dispatch::thread::WorkerThread<dispatch::cuda::CudaWorker<TTask>>>(
+                            this->m_base_id,
+                            obj,
+                            this->m_base_id,
+                            num_streams);
+                }
+            }
+
+            /*! what's the task dependency type for the edge a -> b (task a precedes task b)
+             * @return true if task b depends on the pre event of task a, false if task b depends on the post event
+             *         of task a.
+             */
+ */ + bool task_dependency_type(TTask const& a, TTask const& b) + { + if(a.m_cuda_stream_idx) + return true; + else + return false; + ; + } + + cudaStream_t getCudaStream(unsigned idx) const + { + assert(idx < num_streams); + return this->m_worker_thread->worker->streams[idx].cuda_stream; + } + }; + } // namespace scheduler +} // namespace redGrapes diff --git a/redGrapes/scheduler/pool_scheduler.tpp b/redGrapes/scheduler/pool_scheduler.tpp index ad20c4b6..fb89fd8e 100644 --- a/redGrapes/scheduler/pool_scheduler.tpp +++ b/redGrapes/scheduler/pool_scheduler.tpp @@ -116,11 +116,12 @@ namespace redGrapes template bool PoolScheduler::wake(WakerId id) { - if(id == 0) + auto local_waker_id = id - m_base_id; + if(local_waker_id == 0) return cv.notify(); // TODO analyse and optimize - else if(id > 0 && id - m_base_id <= n_workers) - return m_worker_pool->get_worker_thread(id - m_base_id - 1).worker->wake(); + else if(local_waker_id > 0 && local_waker_id <= n_workers) + return m_worker_pool->get_worker_thread(local_waker_id - 1).worker->wake(); else return false; } diff --git a/redGrapesConfig.cmake b/redGrapesConfig.cmake index d79be696..56eed262 100644 --- a/redGrapesConfig.cmake +++ b/redGrapesConfig.cmake @@ -53,7 +53,7 @@ if( NOT TARGET redGrapes ) if(MSVC) target_compile_options(redGrapes INTERFACE /W4 /WX) else() - target_compile_options(redGrapes INTERFACE -Wall -Wextra -Wpedantic) + target_compile_options(redGrapes INTERFACE -Wall -Wextra) endif() endif()