diff --git a/src/Kconfig b/src/Kconfig index d9f66a94bfab..378cd533b0b7 100644 --- a/src/Kconfig +++ b/src/Kconfig @@ -23,3 +23,5 @@ rsource "math/Kconfig" rsource "library_manager/Kconfig" rsource "debug/telemetry/Kconfig" + +rsource "debug/debug_stream/Kconfig" diff --git a/src/debug/debug_stream/CMakeLists.txt b/src/debug/debug_stream/CMakeLists.txt new file mode 100644 index 000000000000..4d353128a819 --- /dev/null +++ b/src/debug/debug_stream/CMakeLists.txt @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: BSD-3-Clause + +add_local_sources_ifdef(CONFIG_SOF_DEBUG_STREAM_SLOT sof debug_stream_slot.c) + +add_local_sources_ifdef(CONFIG_SOF_DEBUG_STREAM_THREAD_INFO sof debug_stream_thread_info.c) diff --git a/src/debug/debug_stream/Kconfig b/src/debug/debug_stream/Kconfig new file mode 100644 index 000000000000..3a5f3d1f49cf --- /dev/null +++ b/src/debug/debug_stream/Kconfig @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: BSD-3-Clause + +config SOF_DEBUG_STREAM_SLOT + bool "Enable SOF debug stream debug window slot" + help + Debug stream is an abstract stream of records of debug + information from SOF system that are streamed from SOF DSP + to the host side for decoding and presentation. This option + enables transferring the records from DSP to host over a + debug window slot. + +if SOF_DEBUG_STREAM_SLOT + +config SOF_DEBUG_STREAM_SLOT_NUMBER + int "Debug window slot where to put debug stream slot" + default 3 + range 0 14 + help + Which debug slot to reserve for Debug Stream. Remember to map + the slot with MEMORY_WIN_2_SIZE in soc/intel/intel_adsp/Kconfig, + in Zephyr source tree. The slots are 4k in size and one slot is + used for descriptors, so for slot 3 to be mapped, the WIN_2_SIZE + must be (1 + 3) * 4k = 16k or greater. + +config SOF_DEBUG_STREAM_THREAD_INFO + bool "Enable Zephyr thread info reporting through Debug-Stream" + select INIT_STACKS + select THREAD_MONITOR + select THREAD_STACK_INFO + select THREAD_RUNTIME_STATS + help + This activates Zephyr thread info reporting through + Debug-Stream. Thread info reports some basic live data from + the Zephyr threads, like stack usage high-water-mark and CPU + usage. Please select THREAD_NAME=y for the thread names more + than just hex numbers. + +config SOF_DEBUG_STREAM_THREAD_INFO_INTERVAL + int "Thread information collection interval in seconds" + depends on SOF_DEBUG_STREAM_THREAD_INFO + default 2 + range 1 10 + help + Decides how often thread info runs and checks execution cycle + statistics and stack usage. + +endif diff --git a/src/debug/debug_stream/debug_stream_slot.c b/src/debug/debug_stream/debug_stream_slot.c new file mode 100644 index 000000000000..0b916455c642 --- /dev/null +++ b/src/debug/debug_stream/debug_stream_slot.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2024 Intel Corporation. + +#include +#include +#include +#include +#include +#include +#include +#include + +LOG_MODULE_REGISTER(debug_strem_slot); + +struct cpu_mutex { + struct k_mutex m; +} __aligned(CONFIG_DCACHE_LINE_SIZE); + +/* CPU specific mutexes for each circular buffer */ +static struct cpu_mutex cpu_mutex[CONFIG_MP_MAX_NUM_CPUS]; + +static const int debug_stream_slot = CONFIG_SOF_DEBUG_STREAM_SLOT_NUMBER; + +static struct debug_stream_slot_hdr *debug_stream_get_slot(void) +{ + return (struct debug_stream_slot_hdr *)ADSP_DW->slots[debug_stream_slot]; +} + +static +struct debug_stream_circular_buf * +debug_stream_get_circular_buffer(struct debug_stream_section_descriptor *desc, unsigned int core) +{ + struct debug_stream_slot_hdr *hdr = debug_stream_get_slot(); + void *ptr; + + if (hdr->hdr.magic != DEBUG_STREAM_IDENTIFIER) { + LOG_ERR("Debug stream slot not initialized."); + return NULL; + } + + *desc = hdr->section_desc[core]; + LOG_DBG("Section %u (desc %u %u %u)", core, desc->core_id, desc->buf_words, desc->offset); + + return (struct debug_stream_circular_buf *) (((uint8_t *)hdr) + desc->offset); +} + +int debug_stream_slot_send_record(struct debug_stream_record *rec) +{ + struct debug_stream_section_descriptor desc; + struct debug_stream_circular_buf *buf = + debug_stream_get_circular_buffer(&desc, arch_proc_id()); + uint32_t record_size = rec->size_words; + uint32_t record_start, buf_remain; + + LOG_DBG("Sending record %u id %u len %u\n", rec->seqno, rec->id, rec->size_words); + + if (!buf) + return -ENODEV; + + if (rec->size_words >= desc.buf_words) { + LOG_ERR("Record too big %u >= %u (desc %u %u %u)", rec->size_words, + desc.buf_words, desc.core_id, desc.buf_words, desc.offset); + return -ENOMEM; + } + k_mutex_lock(&cpu_mutex[arch_proc_id()].m, K_FOREVER); + + rec->seqno = buf->next_seqno++; + rec->size_words = record_size + 1; /* +1 for size at the end of record */ + record_start = buf->w_ptr; + buf->w_ptr = (record_start + record_size) % desc.buf_words; + buf_remain = desc.buf_words - record_start; + if (buf_remain < record_size) { + uint32_t rec_remain = record_size - buf_remain; + int ret; + + ret = memcpy_s(&buf->data[record_start], buf_remain * sizeof(uint32_t), + rec, buf_remain * sizeof(uint32_t)); + assert(!ret); + ret = memcpy_s(&buf->data[0], desc.buf_words * sizeof(uint32_t), + ((uint32_t *) rec) + buf_remain, rec_remain * sizeof(uint32_t)); + assert(!ret); + } else { + int ret; + + ret = memcpy_s(&buf->data[record_start], buf_remain * sizeof(uint32_t), + rec, record_size * sizeof(uint32_t)); + assert(!ret); + } + /* Write record size again after the record */ + buf->data[buf->w_ptr] = record_size + 1; + buf->w_ptr = (buf->w_ptr + 1) % desc.buf_words; + + k_mutex_unlock(&cpu_mutex[arch_proc_id()].m); + + LOG_DBG("Record %u id %u len %u sent\n", rec->seqno, rec->id, record_size); + return 0; +} + +static int debug_stream_slot_init(void) +{ + struct debug_stream_slot_hdr *hdr = debug_stream_get_slot(); + size_t hdr_size = offsetof(struct debug_stream_slot_hdr, + section_desc[CONFIG_MP_MAX_NUM_CPUS]); + size_t section_area_size = ADSP_DW_SLOT_SIZE - hdr_size; + size_t section_size = ALIGN_DOWN(section_area_size / + CONFIG_MP_MAX_NUM_CPUS, + CONFIG_DCACHE_LINE_SIZE); + size_t offset = hdr_size; + int i; + + LOG_INF("%u sections of %u bytes, hdr %u, secton area %u\n", + CONFIG_MP_MAX_NUM_CPUS, section_size, hdr_size, + section_area_size); + + if (ADSP_DW->descs[debug_stream_slot].type != 0) + LOG_WRN("Slot %d was not free: %u", debug_stream_slot, + ADSP_DW->descs[debug_stream_slot].type); + ADSP_DW->descs[debug_stream_slot].type = ADSP_DW_SLOT_DEBUG_STREAM; + + hdr->hdr.magic = DEBUG_STREAM_IDENTIFIER; + hdr->hdr.hdr_size = hdr_size; + hdr->total_size = hdr_size + CONFIG_MP_MAX_NUM_CPUS * section_size; + hdr->num_sections = CONFIG_MP_MAX_NUM_CPUS; + for (i = 0; i < CONFIG_MP_MAX_NUM_CPUS; i++) { + hdr->section_desc[i].core_id = i; + hdr->section_desc[i].buf_words = + (section_size - offsetof(struct debug_stream_circular_buf, data[0]))/ + sizeof(uint32_t); + hdr->section_desc[i].offset = offset; + LOG_INF("sections %u, size %u, offset %u\n", + i, section_size, offset); + offset += section_size; + } + for (i = 0; i < CONFIG_MP_MAX_NUM_CPUS; i++) { + struct debug_stream_section_descriptor desc; + struct debug_stream_circular_buf *buf = + debug_stream_get_circular_buffer(&desc, i); + + buf->next_seqno = 0; + buf->w_ptr = 0; + k_mutex_init(&cpu_mutex[i].m); + /* The core specific mutexes are now .dss which is uncached so the + * following line is commented out. However, since the mutexes are + * core specific there should be nothing preventing from having them + * in cached memory. + * + * sys_cache_data_flush_range(&cpu_mutex[i], sizeof(cpu_mutex[i])); + */ + } + LOG_INF("Debug stream slot initialized\n"); + + return 0; +} + +SYS_INIT(debug_stream_slot_init, APPLICATION, CONFIG_KERNEL_INIT_PRIORITY_DEFAULT); diff --git a/src/debug/debug_stream/debug_stream_thread_info.c b/src/debug/debug_stream/debug_stream_thread_info.c new file mode 100644 index 000000000000..e83296828c65 --- /dev/null +++ b/src/debug/debug_stream/debug_stream_thread_info.c @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2024 Intel Corporation. + +#include +#include +#include +#include +#include +#include +#include + +#include + +LOG_MODULE_REGISTER(thread_info); + +#define THREAD_INFO_MAX_THREADS 64 +/* Must be bigger than sizeof(struct thread_info_record_hdr) */ +#define THREAD_INFO_INITIAL_RECORD_BUFFER_SIZE 256 + +#ifdef CONFIG_THREAD_RUNTIME_STATS +/* Data structure to store the cycle counter values from the previous + * round. The numbers are used to calculate what the load was on this + * round. + */ +static struct previous_counters { /* Cached data from previous round */ + uint64_t active; /* All execution cycles */ + uint64_t all; /* All cycles including idle */ + struct thread_counters { + void *tid; /* thread ID (the thread struct ptr) */ + uint64_t cycles; /* cycle counter value */ + } threads[THREAD_INFO_MAX_THREADS]; /* The max amount of threads we follow */ +} __aligned(CONFIG_DCACHE_LINE_SIZE) previous[CONFIG_MP_MAX_NUM_CPUS]; +#endif + +/* + * Buffer structure for building the Debug Stream record before + * sending it forward. The buffer is persistent and each record is + * build in the same buffer. Its reallocated as double size if it + * becomes too small. + */ +struct record_buf { + size_t size; + size_t w_ptr; + uint8_t *buf; +}; + +/* + * Data structure to be passed down to thread_info_cb() by + * k_thread_foreach_current_cpu(). + */ +struct user_data { + int core; + struct record_buf *bufd; + int thread_count; +#ifdef CONFIG_THREAD_RUNTIME_STATS + bool stats_valid; + uint32_t all_cycles; + void *active_threads[THREAD_INFO_MAX_THREADS]; + struct previous_counters *previous; +#endif +}; + +#ifdef CONFIG_THREAD_RUNTIME_STATS + +/* Finds and/or updates cached cycle counter value for 'tid'-thread + * and calculates used execution cycles since previous round and + * returns it. The tid is actually a k_tread pointer, but since its + * used just as an id and never accessed, I pass it as a void pointer. + */ + +static uint32_t thread_info_get_cycles(void *tid, k_thread_runtime_stats_t *thread_stats, + struct user_data *ud, const char *name) + +{ + int i; + + if (ud->thread_count >= ARRAY_SIZE(ud->active_threads)) { + LOG_WRN("Thread could exceeds tha max threads %u >= %u", + ud->thread_count, ARRAY_SIZE(ud->active_threads)); + return 0; + } + + /* Mark the thread as active */ + ud->active_threads[ud->thread_count] = tid; + /* look for cached value from previous round for 'tid'-thread */ + for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) { + if (ud->previous->threads[i].tid == tid) { + /* Calculate number cycles since previous round */ + uint32_t cycles = (uint32_t) (thread_stats->execution_cycles - + ud->previous->threads[i].cycles); + + LOG_DBG("%p found at %d (%s %llu)", tid, i, + name, thread_stats->execution_cycles); + /* update cached value */ + ud->previous->threads[i].cycles = thread_stats->execution_cycles; + return cycles; + } + } + + /* If no cached value was found, look for an empty slot to store the recent value */ + for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) { + if (ud->previous->threads[i].tid == NULL) { + ud->previous->threads[i].tid = tid; + ud->previous->threads[i].cycles = thread_stats->execution_cycles; + LOG_DBG("%p placed at %d (%s %llu)", tid, i, + name, ud->previous->threads[i].cycles); + break; + } + } + + /* If there is more than THREAD_INFO_MAX_THREADS threads in this core */ + if (i == ARRAY_SIZE(ud->previous->threads)) + LOG_WRN("No place found for %s %p", name, tid); + + /* If there was no previous counter value to compare, return 0 cycles. */ + return 0; +} + +static uint8_t thread_info_cpu_utilization(struct k_thread *thread, + struct user_data *ud, const char *name) +{ + k_thread_runtime_stats_t thread_stats; + uint32_t cycles; + int ret; + + if (!ud->stats_valid) + return 0; + + if (k_thread_runtime_stats_get(thread, &thread_stats) != 0) + return 0; + + cycles = thread_info_get_cycles(thread, &thread_stats, ud, name); + + LOG_DBG("thread %s %u / %u", name, cycles, ud->all_cycles); + + return (uint8_t) ((255llu * cycles) / ud->all_cycles); +} +#else +static uint8_t thread_info_cpu_utilization(struct k_thread *thread, + struct user_data *ud, const char *name) +{ + return 0; +} +#endif + +#ifdef CONFIG_THREAD_STACK_INFO +static uint8_t thread_info_stack_level(struct k_thread *thread, const char *name) +{ + size_t stack_size, stack_unused; + int ret; + + stack_size = thread->stack_info.size; + ret = k_thread_stack_space_get(thread, &stack_unused); + if (ret) { + LOG_ERR(" %-20s: unable to get stack space (%d)", + name, ret); + stack_unused = 0; + } + return (UINT8_MAX * (stack_size - stack_unused)) / stack_size; +} +#else +static uint8_t thread_info_stack_level(struct k_thread *thread, const char *name) +{ + return 0; +} +#endif + +static int thread_info_buf_realloc(struct record_buf *bufd, size_t req_size) +{ + size_t size = bufd->size; + + while (size < bufd->w_ptr + req_size) + size *= 2; + + if (size != bufd->size) { + uint8_t *buf = rmalloc(SOF_MEM_ZONE_RUNTIME, 0, SOF_MEM_CAPS_RAM, size); + + if (!buf) + return -ENOMEM; + + memcpy_s(buf, size, bufd->buf, bufd->w_ptr); + rfree(bufd->buf); + bufd->size = size; + bufd->buf = buf; + } + + return 0; +} + +static void thread_info_cb(const struct k_thread *cthread, void *user_data) +{ + struct k_thread *thread = (struct k_thread *)cthread; + struct user_data *ud = user_data; + struct thread_info *tinfo; + const char *name; + + name = k_thread_name_get((k_tid_t)thread); + if (!name || name[0] == '\0') { + size_t ptr_str_len = 11; /* length of "0x12345678\0" */ + + if (thread_info_buf_realloc(ud->bufd, sizeof(*tinfo) + ptr_str_len)) + return; + + tinfo = (struct thread_info *) &ud->bufd->buf[ud->bufd->w_ptr]; + snprintk(tinfo->name, ptr_str_len, "%p", (void *)thread); + /* Drop the terminating '\0', that is why -1 is here. */ + tinfo->name_len = ptr_str_len - 1; + ud->bufd->w_ptr += sizeof(*tinfo) + ptr_str_len - 1; + } else { + size_t len = strlen(name); + + if (thread_info_buf_realloc(ud->bufd, sizeof(*tinfo) + len + 1)) + return; + + tinfo = (struct thread_info *) &ud->bufd->buf[ud->bufd->w_ptr]; + strncpy(tinfo->name, name, len + 1); + tinfo->name_len = len; + ud->bufd->w_ptr += sizeof(*tinfo) + len; + } + + tinfo->stack_usage = thread_info_stack_level(thread, name); + tinfo->cpu_usage = thread_info_cpu_utilization(thread, ud, name); + + LOG_DBG("core %u %s stack %u%% cpu %u%%", ud->core, + tinfo->name, tinfo->stack_usage * 100U / 255, + tinfo->cpu_usage * 100U / 255); + + ud->thread_count++; +} + +#ifdef CONFIG_THREAD_RUNTIME_STATS + +/* Marks cached thread cycle counter entries of removed threads + * free. This also happens to threads pinned to other cpu than the + * primary. In the beginning they are listed on primary cpu, until the + * pinned cpu is started up and the thread is executed for the fist + * time and it moves to the cpu its pinned on. + */ +static void cleanup_old_thread_cycles(struct user_data *ud) +{ + int i, j; + + for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) { + bool found = false; + + /* This entry is already free, continue */ + if (ud->previous->threads[i].tid == NULL) + continue; + + /* Check if the thread was seen on previous round */ + for (j = 0; j < ud->thread_count; j++) { + if (ud->active_threads[j] == ud->previous->threads[i].tid) { + found = true; + break; + } + } + /* If the thead is not any more active, mark the entry free */ + if (!found) { + ud->previous->threads[i].tid = NULL; + ud->previous->threads[i].cycles = 0; + } + } +} +#else +static void cleanup_old_thread_cycles(struct user_data *ud) { } +#endif + +static void thread_info_get(int core, struct record_buf *bufd) +{ + k_thread_runtime_stats_t core_stats; + struct user_data ud = { + .core = core, + .bufd = bufd, + .thread_count = 0, +#ifdef CONFIG_THREAD_RUNTIME_STATS + .active_threads = { NULL }, + .previous = &previous[core], +#endif + }; + struct thread_info_record_hdr *hdr; + uint8_t load = 0; +#ifdef CONFIG_THREAD_RUNTIME_STATS + int ret = k_thread_runtime_stats_cpu_get(core, &core_stats); + + if (ret == 0) { + uint32_t active_cycles = (uint32_t) (core_stats.total_cycles - + ud.previous->active); + uint32_t all_cycles = (uint32_t) (core_stats.execution_cycles - + ud.previous->all); + + ud.stats_valid = true; + load = (uint8_t) ((255LLU * active_cycles) / all_cycles); + LOG_DBG("Core %u load %u / %u total %llu / %llu", core, + active_cycles, all_cycles, + core_stats.total_cycles, core_stats.execution_cycles); + ud.previous->active = core_stats.total_cycles; + ud.previous->all = core_stats.execution_cycles; + ud.all_cycles = all_cycles; + } +#endif + + hdr = (struct thread_info_record_hdr *) bufd->buf; + bufd->w_ptr = sizeof(*hdr); + hdr->hdr.id = DEBUG_STREAM_RECORD_ID_THREAD_INFO; + hdr->load = load; + /* This is best effort debug tool. Unlocked version should be fine. */ + k_thread_foreach_unlocked_filter_by_cpu(core, thread_info_cb, &ud); + + cleanup_old_thread_cycles(&ud); + + hdr->thread_count = ud.thread_count; + hdr->hdr.size_words = SOF_DIV_ROUND_UP(bufd->w_ptr, sizeof(hdr->hdr.data[0])); + debug_stream_slot_send_record(&hdr->hdr); +} + +static void thread_info_run(void *cnum, void *a, void *b) +{ + int core = (int) cnum; + struct record_buf bufd = { + .size = THREAD_INFO_INITIAL_RECORD_BUFFER_SIZE, + .w_ptr = 0, + }; + + bufd.buf = rmalloc(SOF_MEM_ZONE_RUNTIME, 0, SOF_MEM_CAPS_RAM, bufd.size); + if (!bufd.buf) { + LOG_ERR("malloc failed"); + return; + } + + for (;;) { + thread_info_get(core, &bufd); + k_sleep(K_SECONDS(CONFIG_SOF_DEBUG_STREAM_THREAD_INFO_INTERVAL)); + } +} + +#define THREAD_STACK_SIZE (2048) +static K_THREAD_STACK_ARRAY_DEFINE(info_thread_stacks, CONFIG_MP_MAX_NUM_CPUS, + THREAD_STACK_SIZE); +static struct k_thread info_thread[CONFIG_MP_MAX_NUM_CPUS]; + +static int thread_info_start(void) +{ + uint32_t i; + + for (i = 0; i < ARRAY_SIZE(info_thread); i++) { + char name[24]; + k_tid_t tid; + int ret; + + tid = k_thread_create(&info_thread[i], info_thread_stacks[i], + THREAD_STACK_SIZE, thread_info_run, + (void *) i, NULL, NULL, + K_LOWEST_APPLICATION_THREAD_PRIO, 0, + K_FOREVER); + if (!tid) { + LOG_ERR("k_thread_create() failed for core %u", i); + continue; + } + ret = k_thread_cpu_pin(tid, i); + if (ret < 0) { + LOG_ERR("Pinning thread to code core %u", i); + k_thread_abort(tid); + continue; + } + snprintf(name, sizeof(name), "%u thread info", i); + ret = k_thread_name_set(tid, name); + if (ret < 0) + LOG_INF("k_thread_name_set failed: %d for %u", ret, i); + + k_thread_start(tid); + LOG_DBG("Thread %p for core %u started", tid, i); + } + + return 0; +} + +SYS_INIT(thread_info_start, APPLICATION, CONFIG_KERNEL_INIT_PRIORITY_DEFAULT); diff --git a/src/include/user/debug_stream.h b/src/include/user/debug_stream.h new file mode 100644 index 000000000000..60f731622478 --- /dev/null +++ b/src/include/user/debug_stream.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Copyright(c) 2024 Intel Corporation. + */ + +#ifndef __SOC_DEBUG_STREAM_H__ +#define __SOC_DEBUG_STREAM_H__ + +/* + * Debug Stream is a stream protocol for passing real-time debug + * information from SOF system. It provides the framework for + * passing pieces of abstract data objects from DSP side to host side + * debugging tools. + * + * The details of Debug Stream protocol varies depending on transfer + * method, but the stream should always start with a header that + * consists of DEBUG_STREAM_IDENTIFIER and header size. + */ + +#define DEBUG_STREAM_IDENTIFIER 0x1ED15EED /* value for 'magic' */ + +struct debug_stream_hdr { + uint32_t magic; /* Magic number to recognize stream start */ + uint32_t hdr_size; /* Header size */ +} __packed; + +/* + * After the header ('hdr_size' bytes from beginning of 'magic') a + * stream of Debug Stream records should follow. Each record will + * start with a record identifier and record size, after which the + * record payload will follow. + * + * The abstract data is application specific and is passed from DSP + * debug entity to user space debug tool for decoding and + * presentation. The data is recognized by the 'id' and the 'size_words' + * describes the amount of data. The 'seqno' is a running number of sent + * record, increased by one after each record. The protocol is + * agnostic about the contents of the records. + */ + +struct debug_stream_record { + uint32_t id; /* Record id of abstract data record */ + uint32_t seqno; /* Increments after each record */ + uint32_t size_words; /* Size of the whole record in words */ + uint32_t data[]; +} __packed; + +/* Debug Stream record identifiers */ +#define DEBUG_STREAM_RECORD_ID_UNINITIALIZED 0 /* invalid record marker */ +#define DEBUG_STREAM_RECORD_ID_THREAD_INFO 1 /* Thread info record */ + +#endif /* __SOC_DEBUG_STREAM_H__ */ diff --git a/src/include/user/debug_stream_slot.h b/src/include/user/debug_stream_slot.h new file mode 100644 index 000000000000..b31797cb3d20 --- /dev/null +++ b/src/include/user/debug_stream_slot.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Copyright(c) 2024 Intel Corporation. + */ + +#ifndef __SOC_DEBUG_WINDOW_SLOT_H__ +#define __SOC_DEBUG_WINDOW_SLOT_H__ + +#include + +/* + * This file describes how Debug Stream can be transported over debug + * window memory slot. The debug stream is an abstract media API for + * passing debug data records from SOF DSP to the host system for + * presentation. The debug stream records are described in debug_stream.h. + * + * This example describes how Debug Stream data is transferred from + * DSP to host side using a debug memory window slot. To learn more + * see soc/intel/intel_adsp/common/include/adsp_debug_window.h + * under Zephyr source tree. + * + * DEBUG_STREAM slot is reserved from SRAM window and a header is + * written in the beginning of the slot. The header is a static data + * structure that is initialized once at DSP boot time. All elements + * in bellow example are 32-bit unsigned integers: + * + * -------------------------------------------------- --- + * | id = DEBUG_STREAM_IDENTIFIER | | + * | total_size = 4096 | 64 bytes + * | num_sections = CONFIG_MP_MAX_NUM_CPUS * | | + * | | | + * -------------------------------------------------- --- + * | section_descriptor [] = { | | + * | { | | + * | core_id = 0 | | + * | size = 1344 | 64 bytes + * | offset = 64 | | + * | } | | + * | | | + * -------------------------------------------------- --- + * | { | | + * | core_id = 1 | | + * | size = 1344 | 64 bytes + * | offset = 1344+64 | | + * | } | | + * | | | + * -------------------------------------------------- --- + * | { | | + * | core_id = 2 | | + * | size = 1344 | 64 bytes + * | offset = 2*1344+64 | | + * | } | | + * | } | | + * | | | + * -------------------------------------------------- --- + * * CONFIG_MP_MAX_NUM_CPUS is 3 in this example + * + * The header contains generic information like identifier, total + * size, and number of sections. After the generic fields there is an + * array of section descriptors. Each array element is cacheline + * aligned. The array has 'num_sections' number of elements. Each + * element in the array describes a circular buffer, one for each DSP + * core. + * + * The remaining memory in the debug window slot is divided between + * those sections. The buffers are not necessarily of equal size, like + * in this example. The circular buffers are all cache line aligned, + * 64 in this example. One section looks like this: + * + * -------------------------------------------------- --- + * | next_seqno = | | + * | w_ptr = | 1344 bytes + * | buffer_data[1344/4-2] = { | | + * | | | + * | } | | + * -------------------------------------------------- --- + * + * The data records are described in debug_strem.h. In the case of + * debug window memory slot the next record should always be aligned + * to word (4-byte) boundary. + * + * The debug stream writes the records of abstract data to the + * circular buffer, and updates the w_ptr when the record is + * completely written. The host side receiver tries to keep up with the + * w_ptr and keeps track of its read position. The size of the record + * is written - again - after each record and before the next. This is + * to allow parsing the stream backwards in an overrun recovery + * situation. The w_ptr value is updated last, when the record is + * completely written. + */ + +#include + +/* Core specific section descriptor + * + * Section descriptor defines core ID, offset and size of the circular + * buffer in the debug window slot. + */ +struct debug_stream_section_descriptor { + uint32_t core_id; /* Core ID */ + uint32_t buf_words; /* Circular buffer size in 32-bit words */ + uint32_t offset; /* Core section offset */ +} __packed; + +/* Debug window slot header for Debug Stream. + * + * The header should be written in the beginning of the slot. + */ +struct debug_stream_slot_hdr { + struct debug_stream_hdr hdr; + uint32_t total_size; /* total size of payload including all sections */ + uint32_t num_sections; /* number of core specific sections */ + struct debug_stream_section_descriptor section_desc[]; +} __packed; + +struct debug_stream_circular_buf { + uint32_t next_seqno; + uint32_t w_ptr; + uint32_t data[]; +} __aligned(CONFIG_DCACHE_LINE_SIZE); + +struct debug_stream_record; +/** + * \brief Send debug stream records over debug window slot + * + * \param[in] rec the record to be written to circular buffer + * + * The debug window slot is initialized automatically at DSP boot + * time, and the core specific circular buffer is selected + * automatically. + * + * \return 0 on success + * -ENODEV if debug stream slot is not configured + * -ENOMEM if the record is too big + */ +int debug_stream_slot_send_record(struct debug_stream_record *rec); + +#endif /* __SOC_DEBUG_WINDOW_SLOT_H__ */ diff --git a/src/include/user/debug_stream_thread_info.h b/src/include/user/debug_stream_thread_info.h new file mode 100644 index 000000000000..02990e9a1c94 --- /dev/null +++ b/src/include/user/debug_stream_thread_info.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Copyright(c) 2024 Intel Corporation. + */ + +#ifndef __SOC_DEBUG_STREAM_THREAD_INFO_H__ +#define __SOC_DEBUG_STREAM_THREAD_INFO_H__ + +#include + +/* + * Debug Stream Thread Info record header, including the cpu load for the + * core, and followed by thread_count. Immediately after the header + * follows the thread fields. The thread_count indicates the number of + * thread fields. + */ +struct thread_info_record_hdr { + struct debug_stream_record hdr; + uint8_t load; /* Core's load U(0,8) fixed point value */ + uint8_t thread_count; +} __packed; + +/* + * Debug Stream Thread Info field for a single thread. + */ +struct thread_info { + uint8_t stack_usage; /* Relative stack usage U(0,8) fixed point value */ + uint8_t cpu_usage; /* Relative cpu usage U(0,8) fixed point value */ + uint8_t name_len; /* Length of name string */ + char name[]; +} __packed; + +#endif /* __SOC_DEBUG_STREAM_THREAD_INFO_H__ */ diff --git a/tools/debug_stream/debug_stream.py b/tools/debug_stream/debug_stream.py new file mode 100644 index 000000000000..717845dc49e5 --- /dev/null +++ b/tools/debug_stream/debug_stream.py @@ -0,0 +1,536 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: BSD-3-Clause +# +# Copyright (c) 2024, Intel Corporation. + +""" +For receiving, decoding and printing debug-stream records over debug window slot. +""" + +import argparse +import ctypes +import time + +import logging + +logging.basicConfig( + format="%(filename)s:%(lineno)s %(funcName)s: %(message)s", level=logging.WARNING +) + +DEBUG_STREAM_PAYLOAD_MAGIC = 0x1ED15EED + +# TODO: python construct would probably be cleaner than ctypes structs + +class DebugStreamHdr(ctypes.Structure): + """ + Generic Debug-stream header + """ + + _fields_ = [ + ("magic", ctypes.c_uint), + ("hdr_size", ctypes.c_uint), + ] + + +class DebugStreamRecord(ctypes.Structure): + """ + Debug Stream record for passing debug data + """ + + _fields_ = [ + ("id", ctypes.c_uint), + ("seqno", ctypes.c_uint), + ("size_words", ctypes.c_uint), + ] + + +class CPUInfo(ctypes.Structure): + """ + Thread Info record header + """ + + _pack_ = 1 + _fields_ = [ + ("hdr", DebugStreamRecord), + ("load", ctypes.c_ubyte), + ("thread_count", ctypes.c_ubyte), + ] + + +class ThreadInfo(ctypes.Structure): + """ + Thread specific data-record, the thread name string starts after name_len + """ + + _pack_ = 1 + _fields_ = [ + ("stack_usage", ctypes.c_ubyte), + ("cpu_load", ctypes.c_ubyte), + ("name_len", ctypes.c_ubyte), + ] + + +WSIZE = ctypes.sizeof(ctypes.c_uint) + + +class RecordPrinter: + """ + Debug Stream record decoder and printer class + """ + + RECORD_ID_UNINITIALIZED = 0 + RECORD_ID_THREAD_INFO = 1 + + def print_record(self, record, cpu): + """prints debug-stream record""" + recp = ctypes.cast(record, ctypes.POINTER(DebugStreamRecord)) + logging.debug( + "rec: %u %u %u", recp.contents.id, recp.contents.seqno, recp.contents.size_words + ) + if recp.contents.id == self.RECORD_ID_THREAD_INFO: + return self.print_thread_info(record, cpu) + logging.warning("cpu %u: Unsupported recodrd type %u", cpu, recp.contents.id) + return True + + def print_thread_info(self, record, cpu): + """prints thread-info record""" + remlen = len(record) - ctypes.sizeof(CPUInfo) + if remlen < 0: + logging.info("Buffer end reached, parsing failed") + return False + cpup = ctypes.cast(record, ctypes.POINTER(CPUInfo)) + print( + "CPU %u: Load: %02.1f%% %u threads (seqno %u)" + % ( + cpu, + cpup.contents.load / 2.55, + cpup.contents.thread_count, + cpup.contents.hdr.seqno, + ) + ) + remain = (ctypes.c_ubyte * (len(record) - ctypes.sizeof(CPUInfo))).from_address( + ctypes.addressof(record) + ctypes.sizeof(CPUInfo) + ) + for i in range(cpup.contents.thread_count): + remlen = remlen - ctypes.sizeof(ThreadInfo) + if remlen < 0: + logging.info("Buffer end reached, parsing failed on %u thread field", i) + return False + threadp = ctypes.cast(remain, ctypes.POINTER(ThreadInfo)) + remain = ( + ctypes.c_ubyte * (len(remain) - ctypes.sizeof(ThreadInfo)) + ).from_address(ctypes.addressof(remain) + ctypes.sizeof(ThreadInfo)) + remlen = remlen - threadp.contents.name_len + if remlen < 0: + logging.info("Buffer end reached, parsing failed on %u thread field", i) + return False + name = bytearray(remain[: threadp.contents.name_len]).decode("utf-8") + remain = ( + ctypes.c_ubyte * (len(remain) - threadp.contents.name_len) + ).from_address(ctypes.addressof(remain) + threadp.contents.name_len) + print( + " %-20s stack %02.1f%%\tload %02.1f%%" + % ( + name, + threadp.contents.stack_usage / 2.55, + threadp.contents.cpu_load / 2.55, + ) + ) + return True + + +class DebugStreamSectionDescriptor(ctypes.Structure): + """ + Describes CPU specific circular buffers + """ + + _fields_ = [ + ("core_id", ctypes.c_uint), + ("buf_words", ctypes.c_uint), + ("offset", ctypes.c_uint), + ] + + +class DebugStreamSlotHdr(ctypes.Structure): + """ + Debug Slot transport specific Debug Stream header, padded to cache line + """ + + _fields_ = [ + ("hdr", DebugStreamHdr), + ("total_size", ctypes.c_uint), + ("num_sections", ctypes.c_uint), + ] + + +class CircularBufHdr(ctypes.Structure): + """ + Live data header for CPU specific circular buffer + """ + + _fields_ = [ + ("next_seqno", ctypes.c_uint), + ("w_ptr", ctypes.c_uint), + ] + + +class CircularBufferDecoder: + """ + Class for extracting records from circular buffer + """ + + desc = None + boffset = None + buf_words = None + cpu = None + printer = None + prev_w_ptr = 0 + prev_seqno = None + error_count = 0 + + def __init__(self, desc, cpu, printer): + self.desc = desc + self.boffset = desc.offset + ctypes.sizeof(CircularBufHdr) + self.buf_words = desc.buf_words + self.cpu = cpu + self.printer = printer + logging.debug( + "boffset %u buf_words %u cpu %u", self.boffset, self.buf_words, self.cpu + ) + + def get_hdr(self, slot, pos): + """ + Get record header from position (handles circular buffer wrap) + """ + if pos >= self.buf_words: + logging.warning("Bad position %u", pos) + return None + hdr_size = ctypes.sizeof(DebugStreamRecord) + hdr_words = hdr_size // WSIZE + if pos + hdr_words > self.buf_words: + hdr = (ctypes.c_ubyte * hdr_size)() + size1 = (self.buf_words - pos) * WSIZE + size2 = hdr_size - size1 + pos1 = self.boffset + pos * WSIZE + pos2 = self.boffset + logging.debug( + "Wrapped header %u %u %u %u", pos, hdr_words, self.buf_words, size1 + ) + + hdr[0:size1] = slot[pos1 : pos1 + size1] + hdr[size1:hdr_size] = slot[pos2 : pos2 + size2] + header = ctypes.cast(hdr, ctypes.POINTER(DebugStreamRecord)).contents + else: + header = ctypes.cast( + slot[self.boffset + pos * WSIZE :], ctypes.POINTER(DebugStreamRecord) + ).contents + if header.id > 100 or header.size_words >= self.buf_words: + logging.warning( + "Broken record id %u seqno %u size %u", + header.id, + header.seqno, + header.size_words, + ) + return None + return header + + def get_record(self, slot, pos, seqno): + """ + Get record from position + """ + rec = self.get_hdr(slot, pos) + if rec is None or rec.size_words == 0: + return None + logging.debug( + "got header at pos %u rec %u %u %u", pos, rec.id, rec.seqno, rec.size_words + ) + if seqno is not None and rec.seqno != seqno: + logging.warning( + "Record seqno mismatch %u != %u, pos %u size %u", + rec.seqno, + seqno, + pos, + rec.size_words, + ) + self.error_count = self.error_count + 1 + return None + rwords = rec.size_words + rsize = rec.size_words * WSIZE + if pos + rwords > self.buf_words: + record = (ctypes.c_ubyte * rsize)() + size1 = (self.buf_words - pos) * WSIZE + size2 = rsize - size1 + pos1 = self.boffset + pos * WSIZE + pos2 = self.boffset + logging.debug( + "Wrapped record %u %u %u %u", pos, rsize, self.buf_words, size1 + ) + + record[0:size1] = slot[pos1 : pos1 + size1] + record[size1:rsize] = slot[pos2 : pos2 + size2] + else: + record = (ctypes.c_ubyte * rsize).from_buffer_copy( + slot, self.boffset + pos * WSIZE + ) + logging.info("got %u", rec.seqno) + self.error_count = 0 + return record + + def catch_up(self, slot): + """ + Search backwards from w_ptr for valid records + """ + circ = CircularBufHdr.from_buffer_copy(slot, self.desc.offset) + if circ.next_seqno == 0 or circ.w_ptr >= self.buf_words: + return + self.decode_past_records(slot, circ.w_ptr, circ.next_seqno) + self.prev_w_ptr = circ.w_ptr + self.prev_seqno = circ.next_seqno - 1 + logging.info("seqno %u w_ptr %u", self.prev_seqno, self.prev_w_ptr) + + def decode_past_records(self, slot, pos, seqno): + """ + Decode records backwards from pos. Should not be called directly, use catch_up() + """ + if self.prev_seqno is not None and self.prev_seqno >= seqno - 1: + return + if pos == 0: + spos = self.buf_words - 1 + else: + spos = pos - 1 + bsize = ctypes.cast( + slot[self.boffset + spos * 4 :], ctypes.POINTER(ctypes.c_uint) + ).contents.value + bpos = pos - bsize + if bpos < 0: + bpos = self.buf_words + pos - bsize + rec = self.get_hdr(slot, bpos) + if bsize != rec.size_words: + return + if seqno is not None: + if rec.seqno != seqno - 1: + return + else: + seqno = rec.seqno + 1 + + self.decode_past_records(slot, bpos, seqno - 1) + + record = self.get_record(slot, bpos, seqno - 1) + if record is not None: + if not self.printer.print_record(record, self.cpu): + logging.info("Parse failed on record %u", seqno - 1) + logging.info("Printing %u success", seqno - 1) + else: + logging.info("Broken record %u", seqno - 1) + + def get_next_record(self, slot): + """ + Get next record from the circular buffer and print it, returns True if a record + with expected seqno number was found and successfully decoded and printed. + """ + if self.prev_seqno is not None: + record = self.get_record(slot, self.prev_w_ptr, self.prev_seqno + 1) + else: + record = self.get_record(slot, self.prev_w_ptr, None) + if record is not None: + print_success = self.printer.print_record(record, self.cpu) + if print_success: + recp = ctypes.cast(record, ctypes.POINTER(DebugStreamRecord)) + self.prev_w_ptr = ( + self.prev_w_ptr + recp.contents.size_words + ) % self.buf_words + self.prev_seqno = recp.contents.seqno + else: + logging.info("Parse failed on record %u", self.prev_seqno + 1) + return print_success + self.error_count = self.error_count + 1 + logging.info("Record decoding failed %u", self.error_count) + return False + + def poll_buffer(self, slot): + """ + Poll for new records. If there were new records return True + """ + circ = CircularBufHdr.from_buffer_copy(slot, self.desc.offset) + if self.prev_w_ptr == circ.w_ptr: + return False + got_record = True + while self.prev_w_ptr != circ.w_ptr and got_record: + got_record = self.get_next_record(slot) + return True + + def check_error_count(self): + """ + Check if maximum error count was reached + """ + if self.error_count > 3: + return True + return False + + +class DebugStreamDecoder: + """ + Class for decoding debug-stream slot contents + """ + + file_size = 4096 # ADSP debug slot size + file = None + slot = None + descs = [] + circdec = [] + rec_printer = RecordPrinter() + + def set_file(self, file): + """ + Ser file to read the slot contents from + """ + self.file = file + + def update_slot(self): + """ + Update slot contents + """ + self.file.seek(0) + self.slot = self.file.read(self.file_size) + + def get_descriptors(self): + """ + Read the core specific descriptors and initialize core + specific circular buffer decoders. + + """ + if self.slot is None: + return False + hdr = ctypes.cast(self.slot, ctypes.POINTER(DebugStreamSlotHdr)) + if hdr.contents.hdr.magic != DEBUG_STREAM_PAYLOAD_MAGIC: + logging.warning("Debug Slot has bad magic 0x%08x", hdr.contents.hdr.magic) + return False + num_sections = hdr.contents.num_sections + if num_sections == len(self.descs): + return True + hsize = ctypes.sizeof(DebugStreamSlotHdr) + self.descs = (DebugStreamSectionDescriptor * num_sections).from_buffer_copy( + self.slot, hsize + ) + self.circdec = [ + CircularBufferDecoder(self.descs[i], i, self.rec_printer) + for i in range(len(self.descs)) + ] + logging.info( + "Header hdr_size %u total_size %u num_sections %u", + hdr.contents.hdr.hdr_size, + hdr.contents.total_size, + hdr.contents.num_sections, + ) + return True + + def catch_up_all(self): + """ + Checks all circular buffers if there is records behind w_ptr. If there + is valid seqno number available the decoding stops when there previous + printed record is reached. + """ + if len(self.descs) == 0 or self.slot is None: + return + for i in range(len(self.descs)): + self.circdec[i].catch_up(self.slot) + + def poll(self): + """ + Poll all circular buffers for more records. Returns True if nothing new + was found and its time to go to sleep. + """ + if len(self.descs) == 0 or self.slot is None: + return False + sleep = True + for i in range(len(self.descs)): + if self.circdec[i].poll_buffer(self.slot): + sleep = False + return sleep + + def check_slot(self): + """ + Check if previously updated slot contents is valid + """ + hdr = ctypes.cast(self.slot, ctypes.POINTER(DebugStreamSlotHdr)) + if hdr.contents.hdr.magic != DEBUG_STREAM_PAYLOAD_MAGIC: + self.slot = None + return False + if hdr.contents.num_sections != len(self.descs): + self.slot = None + return False + for i in range(len(self.descs)): + if self.circdec[i].check_error_count(): + self.circdec[i] = CircularBufferDecoder( + self.descs[i], i, self.rec_printer + ) + return True + + def reset(self): + """ + Reset decoder + """ + self.file = None + self.slot = None + + +def main_f(my_args): + """ + Open debug stream slot file and pass it to decoder + + This tool and the protocol it implements is for debugging and optimized + to interfere with SOF DSP as little as possible, but not worrying too much + about the host CPU load. That is why there where no synchronous mechanism + done and the host simply polls for new records. + """ + decoder = DebugStreamDecoder() + prev_error = None + while True: + try: + with open(my_args.debugstream_file, "rb") as file: + decoder.set_file(file) + decoder.update_slot() + if not decoder.get_descriptors(): + break + decoder.catch_up_all() + while True: + if decoder.poll(): + time.sleep(my_args.update_interval) + decoder.update_slot() + if not decoder.check_slot(): + break + + except FileNotFoundError: + print(f"File {my_args.debugstream_file} not found!") + break + except OSError as err: + if str(err) != prev_error: + print(f"Open {my_args.debugstream_file} failed '{err}'") + prev_error = str(err) + decoder.reset() + time.sleep(args.update_interval) + + +def parse_params(): + """Parses parameters""" + parser = argparse.ArgumentParser(description="SOF DebugStream thread info client. ") + parser.add_argument( + "-t", + "--update-interval", + type=float, + help="Telemetry2 window polling interval in seconds, default 1", + default=0.01, + ) + parser.add_argument( + "-f", + "--debugstream-file", + help="File to read the DebugStream data from, default /sys/kernel/debug/sof/debug_stream", + default="/sys/kernel/debug/sof/debug_stream", + ) + parsed_args = parser.parse_args() + return parsed_args + + +if __name__ == "__main__": + args = parse_params() + main_f(args) diff --git a/zephyr/CMakeLists.txt b/zephyr/CMakeLists.txt index cd21d44eba42..f9393df3f21a 100644 --- a/zephyr/CMakeLists.txt +++ b/zephyr/CMakeLists.txt @@ -188,6 +188,7 @@ endmacro() add_subdirectory(../src/init/ init_unused_install/) add_subdirectory(../src/ipc/ ipc_unused_install/) add_subdirectory(../src/debug/telemetry/ telemetry_unused_install/) +add_subdirectory(../src/debug/debug_stream/ debug_stream_unused_install/) add_subdirectory(test/)