Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DRAFT, don't review yet] Rewrite the IO scheduling algorithm to ensure cross-shard fairness of tokens #2596

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/seastar/core/byteorder.hh
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,11 @@ read_le(const char* p) noexcept {

// Serializes `datum` at `p` in little-endian byte order.
// Returns the position one past the written value (p + sizeof(T)), so
// consecutive writes can be chained:
//   p = write_le<uint32_t>(p, a);
//   p = write_le<uint64_t>(p, b);
// The caller must ensure at least sizeof(T) writable bytes at `p`.
template <typename T>
inline
char*
write_le(char* p, T datum) noexcept {
datum = cpu_to_le(datum);
std::copy_n(reinterpret_cast<const char*>(&datum), sizeof(T), p);
return p + sizeof(T);
}

template <typename T>
Expand Down
7 changes: 6 additions & 1 deletion include/seastar/core/execution_stage.hh
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ class concrete_execution_stage final : public execution_stage {

struct work_item {
input_type _in;
task_id _task_id;
promise_type _ready;

work_item(typename internal::wrap_for_es<Args>::type... args) : _in(std::move(args)...) { }
Expand All @@ -244,7 +245,11 @@ private:
auto wi_in = std::move(wi._in);
auto wi_ready = std::move(wi._ready);
_queue.pop_front();
futurize<ReturnType>::apply(_function, unwrap(std::move(wi_in))).forward_to(std::move(wi_ready));
{
tracepoint_run_execution_stage_task(wi._task_id);
auto st = switch_task(wi._task_id);
futurize<ReturnType>::apply(_function, unwrap(std::move(wi_in))).forward_to(std::move(wi_ready));
}
_stats.function_calls_executed++;

if (internal::scheduler_need_preempt()) {
Expand Down
13 changes: 7 additions & 6 deletions include/seastar/core/fair_queue.hh
Original file line number Diff line number Diff line change
Expand Up @@ -254,10 +254,12 @@ public:
capacity_t per_tick_grab_threshold() const noexcept { return _per_tick_threshold; }
capacity_t grab_capacity(capacity_t cap) noexcept;
clock_type::time_point replenished_ts() const noexcept { return _token_bucket.replenished_ts(); }
void refund_tokens(capacity_t) noexcept;
void replenish_capacity(clock_type::time_point now) noexcept;
void maybe_replenish_capacity(clock_type::time_point& local_ts) noexcept;

capacity_t capacity_deficiency(capacity_t from) const noexcept;
capacity_t head() const noexcept;

std::chrono::duration<double> rate_limit_duration() const noexcept {
std::chrono::duration<double, rate_resolution> dur((double)_token_bucket.limit() / _token_bucket.rate());
Expand Down Expand Up @@ -343,13 +345,12 @@ private:
* in the middle of the waiting
*/
struct pending {
capacity_t head;
capacity_t cap;

pending(capacity_t t, capacity_t c) noexcept : head(t), cap(c) {}
capacity_t head = 0;
capacity_t cap = 0;
};

std::optional<pending> _pending;
pending _pending;
capacity_t _queued_cap = 0;

void push_priority_class(priority_class_data& pc) noexcept;
void push_priority_class_from_idle(priority_class_data& pc) noexcept;
Expand All @@ -359,7 +360,7 @@ private:

enum class grab_result { grabbed, cant_preempt, pending };
grab_result grab_capacity(const fair_queue_entry& ent) noexcept;
grab_result grab_pending_capacity(const fair_queue_entry& ent) noexcept;
capacity_t reap_pending_capacity(bool& contact) noexcept;
public:
/// Constructs a fair queue with configuration parameters \c cfg.
///
Expand Down
3 changes: 3 additions & 0 deletions include/seastar/core/reactor.hh
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
#include <cstring>
#include <memory>
#include <string_view>
#include <map>
#include <unordered_map>
#include <vector>
#include <unistd.h>
Expand Down Expand Up @@ -281,6 +282,8 @@ private:
};

boost::container::static_vector<std::unique_ptr<task_queue>, max_scheduling_groups()> _task_queues;
public:
std::vector<std::tuple<int, std::reference_wrapper<const sstring>, float>> list_groups();
internal::scheduling_group_specific_thread_local_data _scheduling_group_specific_data;
int64_t _last_vruntime = 0;
task_queue_list _active_task_queues;
Expand Down
23 changes: 23 additions & 0 deletions include/seastar/core/task.hh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include <seastar/core/scheduling.hh>
#include <seastar/util/backtrace.hh>
#include <seastar/util/tracer.hh>

#ifndef SEASTAR_MODULE
#include <utility>
Expand All @@ -31,7 +32,29 @@
namespace seastar {

SEASTAR_MODULE_EXPORT

extern thread_local uint64_t fresh_task_id;
extern thread_local uint64_t current_task_id;

// Lightweight identifier of a task, used by the tracing machinery.
// Default-constructs to the id of the task currently running on this
// shard (captured from the thread-local `current_task_id`).
struct task_id {
    uint64_t _value;
    task_id(uint64_t id = current_task_id) : _value(id) {}
    // const-qualified: reading the raw id does not mutate the wrapper
    // (the original overload was non-const, preventing conversion from
    // const task_id values).
    operator uint64_t() const { return _value; }
};

// RAII guard that makes `id` the current task for the enclosing scope and
// restores the previous id on destruction ([[nodiscard]] so a discarded
// temporary does not silently restore immediately).
// NOTE: `_prev` is default-initialized via task_id's default constructor
// *before* the constructor body runs, so it captures the old value of
// `current_task_id` just before the assignment below overwrites it.
// NOTE(review): the guard is copyable; a copy would restore the previous
// id twice — consider deleting copy operations. TODO confirm intent.
struct [[nodiscard]] switch_task {
task_id _prev;
switch_task(uint64_t id) {
current_task_id = id;
}
~switch_task() {
// Implicitly converts _prev back to uint64_t via task_id::operator uint64_t.
current_task_id = _prev;
}
};

class task {
public:
task_id _id;
protected:
scheduling_group _sg;
private:
Expand Down
7 changes: 6 additions & 1 deletion include/seastar/util/shared_token_bucket.hh
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,9 @@ class shared_token_bucket {
rovers_t _rovers;

T tail() const noexcept { return _rovers.tail.load(std::memory_order_relaxed); }
public:
T head() const noexcept { return _rovers.head.load(std::memory_order_relaxed); }

private:
/*
* Need to make sure that the multiplication in accumulated_in() doesn't
* overflow. Not to introduce an extra branch there, define that the
Expand Down Expand Up @@ -159,6 +160,10 @@ public:
_rovers.release(tokens);
}

void refund(T tokens) noexcept {
Copy link
Contributor Author

@michoecho michoecho Dec 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reposting a comment by @xemul which he wrote in michoecho@b0ec97d#r150664671. (I created this PR just to anchor his comment to something. Comments attached to commits are hard to find).

We had some experience with returning tokens to the bucket. It didn't work well: mixing time-based replenishment with token-based replenishment had a weird effect. Would you mind reading #1766 — specifically the #1766 (comment) and #1766 (comment) comments — for details? If we're going to go with this fix, we need some justification for why we won't step on the same problem again.

fetch_add(_rovers.head, tokens);
}

void replenish(typename Clock::time_point now) noexcept {
auto ts = _replenished.load(std::memory_order_relaxed);

Expand Down
185 changes: 185 additions & 0 deletions include/seastar/util/tracer.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
#pragma once

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <deque>
#include <vector>

#include <seastar/core/byteorder.hh>

namespace seastar {

// Per-thread binary trace sink: an append-only byte stream kept in a ring
// of fixed-size buffers. Writers reserve space with write(); full buffers
// are retired into `_old` and the oldest buffer is recycled, so memory use
// is bounded while the most recent history is retained.
struct tracer {
    // Size of each trace buffer.
    static constexpr size_t buffer_size = (128 * 1024);
    // Retired buffers, oldest first; storage is recycled by rotate().
    std::deque<std::vector<std::byte>> _old;
    // Buffer currently being appended to (always buffer_size bytes).
    std::vector<std::byte> _current;
    // Write offset into _current.
    size_t _cur_pos = 0;

    tracer() {
        // Pre-populate the ring so rotate() recycles storage instead of
        // allocating; ~480 * 128 KiB ≈ 60 MiB of history once warmed up.
        for (int i = 0; i < 480; ++i) {
            _old.push_back(std::vector<std::byte>());
        }
        _current.resize(buffer_size);
    }

    // Retire the current buffer (trimmed to its used length) to the back of
    // the ring and recycle the oldest ring entry as the new current buffer.
    void rotate() {
        _current.resize(_cur_pos);
        _old.push_back(std::move(_current));
        _current = std::move(_old.front());
        _old.pop_front();
        _current.resize(buffer_size);
        _cur_pos = 0;
    }

    // Reserve n contiguous bytes in the trace stream and return a pointer
    // to them; rotates to a fresh buffer when the current one is too full.
    std::byte* write(size_t n) {
        if (_current.size() - _cur_pos < n) [[unlikely]] {
            rotate();
            // A single record must fit in an empty buffer; anything larger
            // would previously have written past the end of _current.
            assert(n <= buffer_size);
        }
        auto result = &_current[_cur_pos];
        _cur_pos += n;
        return result;
    }

    // Copy of the retained history: all retired buffers plus the (trimmed)
    // current one. Copies up to ~60 MiB — intended for offline dumps only.
    std::deque<std::vector<std::byte>> snapshot() const {
        auto result = _old;
        auto cur = _current;
        cur.resize(_cur_pos);
        result.push_back(cur);
        return result;
    }

    // Raw time-stamp counter read; cheap monotonic-ish clock (x86-only).
    uint64_t rdtsc() const {
        uint64_t rax, rdx;
        asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) );
        return (uint64_t)(( rdx << 32 ) + rax);
    }
};
extern thread_local tracer g_tracer;


// Event identifiers for the binary trace stream. Each value is serialized
// as the uint32_t record header by the tracepoint_* helpers below, so the
// numeric values here are part of the trace format: append new events
// before COUNT and never reorder or remove existing entries.
enum class trace_events {
POLL,
SLEEP,
WAKEUP,
RUN_TASK_QUEUE,
RUN_TASK_QUEUE_END,
RUN_TASK,
RUN_EXECUTION_STAGE_TASK,
GRAB_CAPACITY,
GRAB_CAPACITY_PENDING,
DISPATCH_REQUESTS,
DISPATCH_QUEUE,
REPLENISH,
IO_QUEUE,
IO_DISPATCH,
IO_COMPLETE,
IO_CANCEL,
MONITORING_SCRAPE,
COUNT, // number of events; not a valid event id
};

// Emit a payload-less record: 4-byte event id followed by an 8-byte TSC
// timestamp (12 bytes total), both little-endian.
[[gnu::always_inline]]
inline void tracepoint_nullary(trace_events header) {
    char* out = reinterpret_cast<char*>(g_tracer.write(12));
    out = seastar::write_le<uint32_t>(out, static_cast<uint32_t>(header));
    seastar::write_le<uint64_t>(out, g_tracer.rdtsc());
}

// Emit a record carrying one POD payload of type T after the standard
// 12-byte header (4-byte event id + 8-byte TSC timestamp).
template <typename T>
[[gnu::always_inline]]
inline void tracepoint_unary(uint32_t header, T arg) {
    char* out = reinterpret_cast<char*>(g_tracer.write(12 + sizeof(T)));
    out = seastar::write_le<uint32_t>(out, header);
    out = seastar::write_le<uint64_t>(out, g_tracer.rdtsc());
    seastar::write_le<T>(out, arg);
}

// Convenience overload: forwards a typed event id as the raw uint32 header.
template <typename T>
[[gnu::always_inline]]
inline void tracepoint_unary(trace_events header, T arg) {
tracepoint_unary(static_cast<uint32_t>(header), arg);
}

// Reactor idle/wakeup transitions (no payload).
inline void tracepoint_poll() {
tracepoint_nullary(trace_events::POLL);
}

inline void tracepoint_sleep() {
tracepoint_nullary(trace_events::SLEEP);
}

inline void tracepoint_wakeup() {
tracepoint_nullary(trace_events::WAKEUP);
}

// Start of a task-queue run; `sg` is the scheduling-group id (fits a byte).
inline void tracepoint_run_task_queue(uint8_t sg) {
tracepoint_unary<uint8_t>(trace_events::RUN_TASK_QUEUE, sg);
}

inline void tracepoint_run_task_queue_end() {
tracepoint_nullary(trace_events::RUN_TASK_QUEUE_END);
}

// Task execution events. Note: the id arrives as int64_t but is encoded
// as uint64_t on the wire.
inline void tracepoint_run_task(int64_t task_id) {
tracepoint_unary<uint64_t>(trace_events::RUN_TASK, task_id);
}

inline void tracepoint_run_execution_stage_task(int64_t task_id) {
tracepoint_unary<uint64_t>(trace_events::RUN_EXECUTION_STAGE_TASK, task_id);
}

// Record an I/O request entering the queue.
// Payload layout (17 bytes after the 12-byte header):
//   u8 direction, u64 token cost, u64 request id — all little-endian.
inline void tracepoint_io_queue(uint8_t direction, uint64_t tokens, uint64_t io_id) {
auto p = reinterpret_cast<char*>(g_tracer.write(12 + 17));
p = seastar::write_le<uint32_t>(p, static_cast<uint32_t>(trace_events::IO_QUEUE));
p = seastar::write_le<uint64_t>(p, g_tracer.rdtsc());
p = seastar::write_le<uint8_t>(p, direction);
p = seastar::write_le<uint64_t>(p, tokens);
p = seastar::write_le<uint64_t>(p, io_id);
}

// I/O request lifecycle events; `io_id` matches the id recorded by
// tracepoint_io_queue, allowing per-request latency reconstruction.
inline void tracepoint_io_dispatch(uint64_t io_id) {
tracepoint_unary<uint64_t>(trace_events::IO_DISPATCH, io_id);
}

inline void tracepoint_io_complete(uint64_t io_id) {
tracepoint_unary<uint64_t>(trace_events::IO_COMPLETE, io_id);
}

inline void tracepoint_io_cancel(uint64_t io_id) {
tracepoint_unary<uint64_t>(trace_events::IO_CANCEL, io_id);
}

// Record a token-bucket capacity grab.
// Payload (24 bytes after the header): u64 capacity requested,
// u64 bucket head wanted, u64 current bucket head.
inline void tracepoint_grab_capacity(uint64_t cap, uint64_t want_head, uint64_t head) {
auto p = reinterpret_cast<char*>(g_tracer.write(12 + 24));
p = seastar::write_le<uint32_t>(p, static_cast<uint32_t>(trace_events::GRAB_CAPACITY));
p = seastar::write_le<uint64_t>(p, g_tracer.rdtsc());
p = seastar::write_le<uint64_t>(p, cap);
p = seastar::write_le<uint64_t>(p, want_head);
p = seastar::write_le<uint64_t>(p, head);
}

// Record that a capacity grab had to go pending.
// Payload (16 bytes after the header): u64 capacity, u64 bucket head.
inline void tracepoint_grab_capacity_pending(uint64_t cap, uint64_t head) {
auto p = reinterpret_cast<char*>(g_tracer.write(12 + 16));
p = seastar::write_le<uint32_t>(p, static_cast<uint32_t>(trace_events::GRAB_CAPACITY_PENDING));
p = seastar::write_le<uint64_t>(p, g_tracer.rdtsc());
p = seastar::write_le<uint64_t>(p, cap);
p = seastar::write_le<uint64_t>(p, head);
}

// Token-bucket replenishment: records the new head rover value.
inline void tracepoint_replenish(uint64_t new_head) {
tracepoint_unary<uint64_t>(trace_events::REPLENISH, new_head);
}

// Fair-queue dispatch events.
inline void tracepoint_dispatch_queue(uint8_t id) {
tracepoint_unary<uint8_t>(trace_events::DISPATCH_QUEUE, id);
}

inline void tracepoint_dispatch_requests(uint64_t queued) {
tracepoint_unary<uint64_t>(trace_events::DISPATCH_REQUESTS, queued);
}

// Marks a metrics scrape, to correlate monitoring overhead with the trace.
inline void tracepoint_monitoring_scrape() {
tracepoint_nullary(trace_events::MONITORING_SCRAPE);
}


} // namespace seastar
Loading
Loading