From aef4ee3d9c82860261cc02bf09a71b48f13a714d Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Wed, 31 Jul 2024 23:59:49 +0300 Subject: [PATCH] debug: debug_stream_thread_info: Thread info records over debug stream Implementation for collecting Thread stack and CPU usage statistic from Zephyr core and send the info and debug_stream records. The records encoding is defined and documented in debug_stream_thread_info.c source file. Signed-off-by: Jyri Sarha --- src/debug/debug_stream/CMakeLists.txt | 2 + src/debug/debug_stream/Kconfig | 23 +- .../debug_stream/debug_stream_thread_info.c | 378 ++++++++++++++++++ src/include/user/debug_stream_thread_info.h | 33 ++ 4 files changed, 435 insertions(+), 1 deletion(-) create mode 100644 src/debug/debug_stream/debug_stream_thread_info.c create mode 100644 src/include/user/debug_stream_thread_info.h diff --git a/src/debug/debug_stream/CMakeLists.txt b/src/debug/debug_stream/CMakeLists.txt index 3c8e6c00d5c3..4d353128a819 100644 --- a/src/debug/debug_stream/CMakeLists.txt +++ b/src/debug/debug_stream/CMakeLists.txt @@ -1,3 +1,5 @@ # SPDX-License-Identifier: BSD-3-Clause add_local_sources_ifdef(CONFIG_SOF_DEBUG_STREAM_SLOT sof debug_stream_slot.c) + +add_local_sources_ifdef(CONFIG_SOF_DEBUG_STREAM_THREAD_INFO sof debug_stream_thread_info.c) diff --git a/src/debug/debug_stream/Kconfig b/src/debug/debug_stream/Kconfig index 40f222d019fd..3a5f3d1f49cf 100644 --- a/src/debug/debug_stream/Kconfig +++ b/src/debug/debug_stream/Kconfig @@ -22,5 +22,26 @@ config SOF_DEBUG_STREAM_SLOT_NUMBER used for descriptors, so for slot 3 to be mapped, the WIN_2_SIZE must be (1 + 3) * 4k = 16k or greater. -endif +config SOF_DEBUG_STREAM_THREAD_INFO + bool "Enable Zephyr thread info reporting through Debug-Stream" + select INIT_STACKS + select THREAD_MONITOR + select THREAD_STACK_INFO + select THREAD_RUNTIME_STATS + help + This activates Zephyr thread info reporting through + Debug-Stream. Thread info reports some basic live data from + the Zephyr threads, like stack usage high-water-mark and CPU + usage. Please select THREAD_NAME=y for the thread names more + than just hex numbers. +config SOF_DEBUG_STREAM_THREAD_INFO_INTERVAL + int "Thread information collection interval in seconds" + depends on SOF_DEBUG_STREAM_THREAD_INFO + default 2 + range 1 10 + help + Decides how often thread info runs and checks execution cycle + statistics and stack usage. + +endif diff --git a/src/debug/debug_stream/debug_stream_thread_info.c b/src/debug/debug_stream/debug_stream_thread_info.c new file mode 100644 index 000000000000..e83296828c65 --- /dev/null +++ b/src/debug/debug_stream/debug_stream_thread_info.c @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2024 Intel Corporation. + +#include +#include +#include +#include +#include +#include +#include + +#include + +LOG_MODULE_REGISTER(thread_info); + +#define THREAD_INFO_MAX_THREADS 64 +/* Must be bigger than sizeof(struct thread_info_record_hdr) */ +#define THREAD_INFO_INITIAL_RECORD_BUFFER_SIZE 256 + +#ifdef CONFIG_THREAD_RUNTIME_STATS +/* Data structure to store the cycle counter values from the previous + * round. The numbers are used to calculate what the load was on this + * round. + */ +static struct previous_counters { /* Cached data from previous round */ + uint64_t active; /* All execution cycles */ + uint64_t all; /* All cycles including idle */ + struct thread_counters { + void *tid; /* thread ID (the thread struct ptr) */ + uint64_t cycles; /* cycle counter value */ + } threads[THREAD_INFO_MAX_THREADS]; /* The max amount of threads we follow */ +} __aligned(CONFIG_DCACHE_LINE_SIZE) previous[CONFIG_MP_MAX_NUM_CPUS]; +#endif + +/* + * Buffer structure for building the Debug Stream record before + * sending it forward. The buffer is persistent and each record is + * build in the same buffer. Its reallocated as double size if it + * becomes too small. + */ +struct record_buf { + size_t size; + size_t w_ptr; + uint8_t *buf; +}; + +/* + * Data structure to be passed down to thread_info_cb() by + * k_thread_foreach_current_cpu(). + */ +struct user_data { + int core; + struct record_buf *bufd; + int thread_count; +#ifdef CONFIG_THREAD_RUNTIME_STATS + bool stats_valid; + uint32_t all_cycles; + void *active_threads[THREAD_INFO_MAX_THREADS]; + struct previous_counters *previous; +#endif +}; + +#ifdef CONFIG_THREAD_RUNTIME_STATS + +/* Finds and/or updates cached cycle counter value for 'tid'-thread + * and calculates used execution cycles since previous round and + * returns it. The tid is actually a k_tread pointer, but since its + * used just as an id and never accessed, I pass it as a void pointer. + */ + +static uint32_t thread_info_get_cycles(void *tid, k_thread_runtime_stats_t *thread_stats, + struct user_data *ud, const char *name) + +{ + int i; + + if (ud->thread_count >= ARRAY_SIZE(ud->active_threads)) { + LOG_WRN("Thread could exceeds tha max threads %u >= %u", + ud->thread_count, ARRAY_SIZE(ud->active_threads)); + return 0; + } + + /* Mark the thread as active */ + ud->active_threads[ud->thread_count] = tid; + /* look for cached value from previous round for 'tid'-thread */ + for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) { + if (ud->previous->threads[i].tid == tid) { + /* Calculate number cycles since previous round */ + uint32_t cycles = (uint32_t) (thread_stats->execution_cycles - + ud->previous->threads[i].cycles); + + LOG_DBG("%p found at %d (%s %llu)", tid, i, + name, thread_stats->execution_cycles); + /* update cached value */ + ud->previous->threads[i].cycles = thread_stats->execution_cycles; + return cycles; + } + } + + /* If no cached value was found, look for an empty slot to store the recent value */ + for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) { + if (ud->previous->threads[i].tid == NULL) { + ud->previous->threads[i].tid = tid; + ud->previous->threads[i].cycles = thread_stats->execution_cycles; + LOG_DBG("%p placed at %d (%s %llu)", tid, i, + name, ud->previous->threads[i].cycles); + break; + } + } + + /* If there is more than THREAD_INFO_MAX_THREADS threads in this core */ + if (i == ARRAY_SIZE(ud->previous->threads)) + LOG_WRN("No place found for %s %p", name, tid); + + /* If there was no previous counter value to compare, return 0 cycles. */ + return 0; +} + +static uint8_t thread_info_cpu_utilization(struct k_thread *thread, + struct user_data *ud, const char *name) +{ + k_thread_runtime_stats_t thread_stats; + uint32_t cycles; + int ret; + + if (!ud->stats_valid) + return 0; + + if (k_thread_runtime_stats_get(thread, &thread_stats) != 0) + return 0; + + cycles = thread_info_get_cycles(thread, &thread_stats, ud, name); + + LOG_DBG("thread %s %u / %u", name, cycles, ud->all_cycles); + + return (uint8_t) ((255llu * cycles) / ud->all_cycles); +} +#else +static uint8_t thread_info_cpu_utilization(struct k_thread *thread, + struct user_data *ud, const char *name) +{ + return 0; +} +#endif + +#ifdef CONFIG_THREAD_STACK_INFO +static uint8_t thread_info_stack_level(struct k_thread *thread, const char *name) +{ + size_t stack_size, stack_unused; + int ret; + + stack_size = thread->stack_info.size; + ret = k_thread_stack_space_get(thread, &stack_unused); + if (ret) { + LOG_ERR(" %-20s: unable to get stack space (%d)", + name, ret); + stack_unused = 0; + } + return (UINT8_MAX * (stack_size - stack_unused)) / stack_size; +} +#else +static uint8_t thread_info_stack_level(struct k_thread *thread, const char *name) +{ + return 0; +} +#endif + +static int thread_info_buf_realloc(struct record_buf *bufd, size_t req_size) +{ + size_t size = bufd->size; + + while (size < bufd->w_ptr + req_size) + size *= 2; + + if (size != bufd->size) { + uint8_t *buf = rmalloc(SOF_MEM_ZONE_RUNTIME, 0, SOF_MEM_CAPS_RAM, size); + + if (!buf) + return -ENOMEM; + + memcpy_s(buf, size, bufd->buf, bufd->w_ptr); + rfree(bufd->buf); + bufd->size = size; + bufd->buf = buf; + } + + return 0; +} + +static void thread_info_cb(const struct k_thread *cthread, void *user_data) +{ + struct k_thread *thread = (struct k_thread *)cthread; + struct user_data *ud = user_data; + struct thread_info *tinfo; + const char *name; + + name = k_thread_name_get((k_tid_t)thread); + if (!name || name[0] == '\0') { + size_t ptr_str_len = 11; /* length of "0x12345678\0" */ + + if (thread_info_buf_realloc(ud->bufd, sizeof(*tinfo) + ptr_str_len)) + return; + + tinfo = (struct thread_info *) &ud->bufd->buf[ud->bufd->w_ptr]; + snprintk(tinfo->name, ptr_str_len, "%p", (void *)thread); + /* Drop the terminating '\0', that is why -1 is here. */ + tinfo->name_len = ptr_str_len - 1; + ud->bufd->w_ptr += sizeof(*tinfo) + ptr_str_len - 1; + } else { + size_t len = strlen(name); + + if (thread_info_buf_realloc(ud->bufd, sizeof(*tinfo) + len + 1)) + return; + + tinfo = (struct thread_info *) &ud->bufd->buf[ud->bufd->w_ptr]; + strncpy(tinfo->name, name, len + 1); + tinfo->name_len = len; + ud->bufd->w_ptr += sizeof(*tinfo) + len; + } + + tinfo->stack_usage = thread_info_stack_level(thread, name); + tinfo->cpu_usage = thread_info_cpu_utilization(thread, ud, name); + + LOG_DBG("core %u %s stack %u%% cpu %u%%", ud->core, + tinfo->name, tinfo->stack_usage * 100U / 255, + tinfo->cpu_usage * 100U / 255); + + ud->thread_count++; +} + +#ifdef CONFIG_THREAD_RUNTIME_STATS + +/* Marks cached thread cycle counter entries of removed threads + * free. This also happens to threads pinned to other cpu than the + * primary. In the beginning they are listed on primary cpu, until the + * pinned cpu is started up and the thread is executed for the fist + * time and it moves to the cpu its pinned on. + */ +static void cleanup_old_thread_cycles(struct user_data *ud) +{ + int i, j; + + for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) { + bool found = false; + + /* This entry is already free, continue */ + if (ud->previous->threads[i].tid == NULL) + continue; + + /* Check if the thread was seen on previous round */ + for (j = 0; j < ud->thread_count; j++) { + if (ud->active_threads[j] == ud->previous->threads[i].tid) { + found = true; + break; + } + } + /* If the thead is not any more active, mark the entry free */ + if (!found) { + ud->previous->threads[i].tid = NULL; + ud->previous->threads[i].cycles = 0; + } + } +} +#else +static void cleanup_old_thread_cycles(struct user_data *ud) { } +#endif + +static void thread_info_get(int core, struct record_buf *bufd) +{ + k_thread_runtime_stats_t core_stats; + struct user_data ud = { + .core = core, + .bufd = bufd, + .thread_count = 0, +#ifdef CONFIG_THREAD_RUNTIME_STATS + .active_threads = { NULL }, + .previous = &previous[core], +#endif + }; + struct thread_info_record_hdr *hdr; + uint8_t load = 0; +#ifdef CONFIG_THREAD_RUNTIME_STATS + int ret = k_thread_runtime_stats_cpu_get(core, &core_stats); + + if (ret == 0) { + uint32_t active_cycles = (uint32_t) (core_stats.total_cycles - + ud.previous->active); + uint32_t all_cycles = (uint32_t) (core_stats.execution_cycles - + ud.previous->all); + + ud.stats_valid = true; + load = (uint8_t) ((255LLU * active_cycles) / all_cycles); + LOG_DBG("Core %u load %u / %u total %llu / %llu", core, + active_cycles, all_cycles, + core_stats.total_cycles, core_stats.execution_cycles); + ud.previous->active = core_stats.total_cycles; + ud.previous->all = core_stats.execution_cycles; + ud.all_cycles = all_cycles; + } +#endif + + hdr = (struct thread_info_record_hdr *) bufd->buf; + bufd->w_ptr = sizeof(*hdr); + hdr->hdr.id = DEBUG_STREAM_RECORD_ID_THREAD_INFO; + hdr->load = load; + /* This is best effort debug tool. Unlocked version should be fine. */ + k_thread_foreach_unlocked_filter_by_cpu(core, thread_info_cb, &ud); + + cleanup_old_thread_cycles(&ud); + + hdr->thread_count = ud.thread_count; + hdr->hdr.size_words = SOF_DIV_ROUND_UP(bufd->w_ptr, sizeof(hdr->hdr.data[0])); + debug_stream_slot_send_record(&hdr->hdr); +} + +static void thread_info_run(void *cnum, void *a, void *b) +{ + int core = (int) cnum; + struct record_buf bufd = { + .size = THREAD_INFO_INITIAL_RECORD_BUFFER_SIZE, + .w_ptr = 0, + }; + + bufd.buf = rmalloc(SOF_MEM_ZONE_RUNTIME, 0, SOF_MEM_CAPS_RAM, bufd.size); + if (!bufd.buf) { + LOG_ERR("malloc failed"); + return; + } + + for (;;) { + thread_info_get(core, &bufd); + k_sleep(K_SECONDS(CONFIG_SOF_DEBUG_STREAM_THREAD_INFO_INTERVAL)); + } +} + +#define THREAD_STACK_SIZE (2048) +static K_THREAD_STACK_ARRAY_DEFINE(info_thread_stacks, CONFIG_MP_MAX_NUM_CPUS, + THREAD_STACK_SIZE); +static struct k_thread info_thread[CONFIG_MP_MAX_NUM_CPUS]; + +static int thread_info_start(void) +{ + uint32_t i; + + for (i = 0; i < ARRAY_SIZE(info_thread); i++) { + char name[24]; + k_tid_t tid; + int ret; + + tid = k_thread_create(&info_thread[i], info_thread_stacks[i], + THREAD_STACK_SIZE, thread_info_run, + (void *) i, NULL, NULL, + K_LOWEST_APPLICATION_THREAD_PRIO, 0, + K_FOREVER); + if (!tid) { + LOG_ERR("k_thread_create() failed for core %u", i); + continue; + } + ret = k_thread_cpu_pin(tid, i); + if (ret < 0) { + LOG_ERR("Pinning thread to code core %u", i); + k_thread_abort(tid); + continue; + } + snprintf(name, sizeof(name), "%u thread info", i); + ret = k_thread_name_set(tid, name); + if (ret < 0) + LOG_INF("k_thread_name_set failed: %d for %u", ret, i); + + k_thread_start(tid); + LOG_DBG("Thread %p for core %u started", tid, i); + } + + return 0; +} + +SYS_INIT(thread_info_start, APPLICATION, CONFIG_KERNEL_INIT_PRIORITY_DEFAULT); diff --git a/src/include/user/debug_stream_thread_info.h b/src/include/user/debug_stream_thread_info.h new file mode 100644 index 000000000000..02990e9a1c94 --- /dev/null +++ b/src/include/user/debug_stream_thread_info.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Copyright(c) 2024 Intel Corporation. + */ + +#ifndef __SOC_DEBUG_STREAM_THREAD_INFO_H__ +#define __SOC_DEBUG_STREAM_THREAD_INFO_H__ + +#include + +/* + * Debug Stream Thread Info record header, including the cpu load for the + * core, and followed by thread_count. Immediately after the header + * follows the thread fields. The thread_count indicates the number of + * thread fields. + */ +struct thread_info_record_hdr { + struct debug_stream_record hdr; + uint8_t load; /* Core's load U(0,8) fixed point value */ + uint8_t thread_count; +} __packed; + +/* + * Debug Stream Thread Info field for a single thread. + */ +struct thread_info { + uint8_t stack_usage; /* Relative stack usage U(0,8) fixed point value */ + uint8_t cpu_usage; /* Relative cpu usage U(0,8) fixed point value */ + uint8_t name_len; /* Length of name string */ + char name[]; +} __packed; + +#endif /* __SOC_DEBUG_STREAM_THREAD_INFO_H__ */