diff --git a/src/cart/README.env b/src/cart/README.env index 656f2ab73e47..ad84d8c1b31b 100644 --- a/src/cart/README.env +++ b/src/cart/README.env @@ -167,3 +167,15 @@ This file lists the environment variables used in CaRT. . CRT_TEST_CONT When set to 1, orterun does not automatically shut down other servers when one server is shutdown. Used in cart internal testing. + + . D_CLIENT_METRICS_ENABLE + When set to 1, client side metrics will be collected on each daos client, which + can be retrieved by daos_metrics -j job_id on each client. + + . D_CLIENT_METRICS_RETAIN + When set to 1, client side metrics will be retained even after the job exits, i.e. + those metrics can be retrieved by daos_metrics even after the job exits. + + . D_CLIENT_METRICS_DUMP_PATH + Set client side metrics dump path (file) for each client, so these metrics will be + dumped to the specified file when the job exits. diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 3766753c0596..df243b1dce98 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -18,6 +18,50 @@ static volatile int gdata_init_flag; struct crt_plugin_gdata crt_plugin_gdata; static bool g_prov_settings_applied[CRT_PROV_COUNT]; +/* List of the environment variables used in CaRT */ +static const char *crt_env_names[] = { + "D_PROVIDER", + "D_INTERFACE", + "D_DOMAIN", + "D_PORT", + "CRT_PHY_ADDR_STR", + "D_LOG_STDERR_IN_LOG", + "D_LOG_SIZE", + "D_LOG_FILE", + "D_LOG_FILE_APPEND_PID", + "D_LOG_MASK", + "DD_MASK", + "DD_STDERR", + "DD_SUBSYS", + "CRT_TIMEOUT", + "CRT_ATTACH_INFO_PATH", + "OFI_PORT", + "OFI_INTERFACE", + "OFI_DOMAIN", + "CRT_CREDIT_EP_CTX", + "CRT_CTX_SHARE_ADDR", + "CRT_CTX_NUM", + "D_FI_CONFIG", + "FI_UNIVERSE_SIZE", + "CRT_ENABLE_MEM_PIN", + "FI_OFI_RXM_USE_SRX", + "D_LOG_FLUSH", + "CRT_MRC_ENABLE", + "CRT_SECONDARY_PROVIDER", + "D_PROVIDER_AUTH_KEY", + "D_PORT_AUTO_ADJUST", + "D_POLL_TIMEOUT", + "D_LOG_FILE_APPEND_RANK", + "D_QUOTA_RPCS", + "D_POST_INIT", + "D_POST_INCR", + "DAOS_SIGNAL_REGISTER", +
"D_CLIENT_METRICS_ENABLE", + "D_CLIENT_METRICS_RETAIN", + "D_CLIENT_METRICS_DUMP_PATH", + +}; + static void crt_lib_init(void) __attribute__((__constructor__)); @@ -62,53 +106,19 @@ crt_lib_fini(void) static void dump_envariables(void) { - int i; - char *val; - static const char *var_names[] = {"D_PROVIDER", - "D_INTERFACE", - "D_DOMAIN", - "D_PORT", - "CRT_PHY_ADDR_STR", - "D_LOG_STDERR_IN_LOG", - "D_LOG_SIZE", - "D_LOG_FILE", - "D_LOG_FILE_APPEND_PID", - "D_LOG_MASK", - "DD_MASK", - "DD_STDERR", - "DD_SUBSYS", - "CRT_TIMEOUT", - "CRT_ATTACH_INFO_PATH", - "OFI_PORT", - "OFI_INTERFACE", - "OFI_DOMAIN", - "CRT_CREDIT_EP_CTX", - "CRT_CTX_SHARE_ADDR", - "CRT_CTX_NUM", - "D_FI_CONFIG", - "FI_UNIVERSE_SIZE", - "CRT_ENABLE_MEM_PIN", - "FI_OFI_RXM_USE_SRX", - "D_LOG_FLUSH", - "CRT_MRC_ENABLE", - "CRT_SECONDARY_PROVIDER", - "D_PROVIDER_AUTH_KEY", - "D_PORT_AUTO_ADJUST", - "D_POLL_TIMEOUT", - "D_LOG_FILE_APPEND_RANK", - "D_QUOTA_RPCS", - "D_POST_INIT", - "D_POST_INCR"}; + int i; D_INFO("-- ENVARS: --\n"); - for (i = 0; i < ARRAY_SIZE(var_names); i++) { - d_agetenv_str(&val, var_names[i]); + for (i = 0; i < ARRAY_SIZE(crt_env_names); i++) { + char *val = NULL; + + d_agetenv_str(&val, crt_env_names[i]); if (val == NULL) continue; - if (strcmp(var_names[i], "D_PROVIDER_AUTH_KEY") == 0) - D_INFO("%s = %s\n", var_names[i], "********"); + if (strcmp(crt_env_names[i], "D_PROVIDER_AUTH_KEY") == 0) + D_INFO("%s = %s\n", crt_env_names[i], "********"); else - D_INFO("%s = %s\n", var_names[i], val); + D_INFO("%s = %s\n", crt_env_names[i], val); d_freeenv_str(&val); } } diff --git a/src/client/api/SConscript b/src/client/api/SConscript index e12aa93eaa92..b7e93f516d53 100644 --- a/src/client/api/SConscript +++ b/src/client/api/SConscript @@ -1,7 +1,7 @@ """Build DAOS client""" LIBDAOS_SRC = ['agent.c', 'array.c', 'container.c', 'event.c', 'init.c', 'job.c', 'kv.c', 'mgmt.c', - 'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c'] + 'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c', 
'metrics.c'] def scons(): diff --git a/src/client/api/init.c b/src/client/api/init.c index c93fd6393216..f574169d8c7b 100644 --- a/src/client/api/init.c +++ b/src/client/api/init.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -23,6 +23,7 @@ #include #include #include +#include #include "task_internal.h" #include @@ -219,6 +220,13 @@ daos_init(void) if (rc != 0) D_GOTO(out_pl, rc); + /** set up client telemetry */ + rc = dc_tm_init(); + if (rc != 0) { + /* should not be fatal */ + DL_WARN(rc, "failed to initialize client telemetry"); + } + /** set up pool */ rc = dc_pool_init(); if (rc != 0) @@ -242,6 +250,7 @@ daos_init(void) out_pool: dc_pool_fini(); out_mgmt: + dc_tm_fini(); dc_mgmt_fini(); out_pl: pl_fini(); @@ -291,6 +300,8 @@ daos_fini(void) D_GOTO(unlock, rc); } + /** clean up all registered per-module metrics */ + daos_metrics_fini(); dc_obj_fini(); dc_cont_fini(); dc_pool_fini(); @@ -301,6 +312,7 @@ daos_fini(void) D_ERROR("failed to disconnect some resources may leak, " DF_RC"\n", DP_RC(rc)); + dc_tm_fini(); dc_agent_fini(); dc_job_fini(); diff --git a/src/client/api/metrics.c b/src/client/api/metrics.c new file mode 100644 index 000000000000..2395d9b40f5e --- /dev/null +++ b/src/client/api/metrics.c @@ -0,0 +1,216 @@ +/* + * (C) Copyright 2020-2024 Intel Corporation. + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +#define D_LOGFAC DD_FAC(client) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define INIT_JOB_NUM 1024 +bool daos_client_metric; +bool daos_client_metric_retain; + +#define MAX_IDS_SIZE(num) (num * D_TM_METRIC_SIZE) +/* The client side metrics structure looks like + * root/job_id/pid/.... 
+ */ + +static int +shm_chown(key_t key, uid_t new_owner) +{ + struct shmid_ds shmid_ds; + int shmid; + int rc; + + rc = shmget(key, 0, 0); + if (rc < 0) { + D_ERROR("shmget(0x%x) failed: %s (%d)\n", key, strerror(errno), errno); + return daos_errno2der(errno); + } + shmid = rc; + + rc = shmctl(shmid, IPC_STAT, &shmid_ds); + if (rc < 0) { + D_ERROR("shmctl(0x%x, IPC_STAT) failed: %s (%d)\n", shmid, strerror(errno), errno); + return daos_errno2der(errno); + } + + shmid_ds.shm_perm.uid = new_owner; + rc = shmctl(shmid, IPC_SET, &shmid_ds); + if (rc < 0) { + D_ERROR("shmctl(0x%x, IPC_SET) failed: %s (%d)\n", shmid, strerror(errno), errno); + return daos_errno2der(errno); + } + + return 0; +} + +static int +init_managed_root(const char *name, pid_t pid, int flags) +{ + uid_t agent_uid; + key_t key; + int rc; + + /* Set the key based on our pid so that it can be easily found. */ + key = pid - D_TM_SHARED_MEMORY_KEY; + rc = d_tm_init_with_name(key, MAX_IDS_SIZE(INIT_JOB_NUM), flags, name); + if (rc != 0) { + DL_ERROR(rc, "failed to initialize root for %s.", name); + return rc; + } + + /* Request that the agent adds our segment into the tree. */ + rc = dc_mgmt_tm_register(NULL, dc_jobid, pid, &agent_uid); + if (rc != 0) { + DL_ERROR(rc, "client telemetry setup failed."); + return rc; + } + + /* Change ownership of the segment so that the agent can manage it.
*/ + D_INFO("setting shm segment 0x%x to be owned by uid %d\n", pid, agent_uid); + rc = shm_chown(pid, agent_uid); + if (rc != 0) { + DL_ERROR(rc, "failed to chown shm segment."); + return rc; + } + + return 0; +} + +int +dc_tm_init(void) +{ + struct d_tm_node_t *started_at; + pid_t pid = getpid(); + int metrics_tag; + char root_name[D_TM_MAX_NAME_LEN]; + int rc; + + d_getenv_bool(DAOS_CLIENT_METRICS_ENABLE, &daos_client_metric); + if (!daos_client_metric && d_isenv_def(DAOS_CLIENT_METRICS_DUMP_PATH)) + daos_client_metric = true; + + if (!daos_client_metric) + return 0; + + D_INFO("Setting up client telemetry for %s/%d\n", dc_jobid, pid); + + rc = dc_tls_key_create(); + if (rc) + D_GOTO(out, rc = daos_errno2der(rc)); + + metrics_tag = D_TM_OPEN_OR_CREATE | D_TM_MULTIPLE_WRITER_LOCK; + d_getenv_bool(DAOS_CLIENT_METRICS_RETAIN, &daos_client_metric_retain); + if (daos_client_metric_retain) + metrics_tag |= D_TM_RETAIN_SHMEM; + + snprintf(root_name, sizeof(root_name), "%d", pid); + rc = init_managed_root(root_name, pid, metrics_tag); + if (rc != 0) { + DL_ERROR(rc, "failed to initialize client telemetry"); + D_GOTO(out, rc); + } + + rc = d_tm_add_metric(&started_at, D_TM_TIMESTAMP, "Timestamp of client startup", NULL, + "started_at"); + if (rc != 0) { + DL_ERROR(rc, "add metric started_at failed."); + D_GOTO(out, rc); + } + + d_tm_record_timestamp(started_at); +out: + if (rc != 0) { + daos_client_metric = false; + d_tm_fini(); + } + + return rc; +} + +static void +iter_dump(struct d_tm_context *ctx, struct d_tm_node_t *node, int level, char *path, int format, + int opt_fields, void *arg) +{ + d_tm_print_node(ctx, node, level, path, format, opt_fields, (FILE *)arg); +} + +static int +dump_tm_file(const char *dump_path) +{ + struct d_tm_context *ctx; + struct d_tm_node_t *root; + char dirname[D_TM_MAX_NAME_LEN] = {0}; + uint32_t filter; + FILE *dump_file; + int rc = 0; + + dump_file = fopen(dump_path, "w+"); + if (dump_file == NULL) { + D_ERROR("cannot open %s: %s (%d)\n", dump_path, strerror(errno), errno); + return
-DER_INVAL; + } + + filter = D_TM_COUNTER | D_TM_DURATION | D_TM_TIMESTAMP | D_TM_MEMINFO | + D_TM_TIMER_SNAPSHOT | D_TM_GAUGE | D_TM_STATS_GAUGE; + + ctx = d_tm_open(DC_TM_JOB_ROOT_ID); + if (ctx == NULL) + D_GOTO(close, rc = -DER_NOMEM); + + snprintf(dirname, sizeof(dirname), "%s/%d", dc_jobid, getpid()); + root = d_tm_find_metric(ctx, dirname); + if (root == NULL) { + D_INFO("No metrics found at: '%s'\n", dirname); + D_GOTO(close_ctx, rc = -DER_NONEXIST); + } + + d_tm_print_field_descriptors(0, dump_file); + + d_tm_iterate(ctx, root, 0, filter, NULL, D_TM_CSV, 0, iter_dump, dump_file); + +close_ctx: + d_tm_close(&ctx); +close: + fclose(dump_file); + return rc; +} + +void +dc_tm_fini(void) +{ + char *dump_path = NULL; + int rc; + + if (!daos_client_metric) + return; + + rc = d_agetenv_str(&dump_path, DAOS_CLIENT_METRICS_DUMP_PATH); + if (rc != 0) + D_GOTO(out, rc); + if (dump_path != NULL) { + D_INFO("dump path is %s\n", dump_path); + dump_tm_file(dump_path); + } + d_freeenv_str(&dump_path); + +out: + dc_tls_fini(); + dc_tls_key_delete(); + + d_tm_fini(); +} diff --git a/src/common/SConscript b/src/common/SConscript index c61ecdeebe3a..38bd221793e2 100644 --- a/src/common/SConscript +++ b/src/common/SConscript @@ -9,7 +9,7 @@ COMMON_FILES = ['debug.c', 'mem.c', 'fail_loc.c', 'lru.c', 'dedup.c', 'profile.c', 'compression.c', 'compression_isal.c', 'compression_qat.c', 'multihash.c', 'multihash_isal.c', 'cipher.c', 'cipher_isal.c', 'qat.c', 'fault_domain.c', - 'policy.c'] + 'policy.c', 'tls.c', 'metrics.c'] def build_daos_common(denv, client): diff --git a/src/common/metrics.c b/src/common/metrics.c new file mode 100644 index 000000000000..b6c88a3ea0d0 --- /dev/null +++ b/src/common/metrics.c @@ -0,0 +1,137 @@ +/** + * (C) Copyright 2016-2024 Intel Corporation. + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +/** + * It implements registration and setup/teardown of per-module metrics for DAOS.
+ */ +#include +#include +#include + +struct metrics_list { + struct daos_module_metrics *mm_metrics; + d_list_t mm_list; + uint32_t mm_id; +}; + +/* Track list of loaded modules */ +D_LIST_HEAD(metrics_mod_list); +pthread_mutex_t metrics_mod_list_lock = PTHREAD_MUTEX_INITIALIZER; + +/* Register a module's metrics callbacks under module \a id (\a tag is not used here). */ +int +daos_metrics_init(enum daos_module_tag tag, uint32_t id, struct daos_module_metrics *metrics) +{ + struct metrics_list *ml; + + D_ALLOC_PTR(ml); + if (ml == NULL) + return -DER_NOMEM; + ml->mm_metrics = metrics; + ml->mm_id = id; + D_MUTEX_LOCK(&metrics_mod_list_lock); + d_list_add_tail(&ml->mm_list, &metrics_mod_list); + D_MUTEX_UNLOCK(&metrics_mod_list_lock); + + return 0; +} + +/* Drop every list entry added by daos_metrics_init(); the descriptors are not freed. */ +void +daos_metrics_fini(void) +{ + struct metrics_list *ml; + struct metrics_list *tmp; + + D_MUTEX_LOCK(&metrics_mod_list_lock); + d_list_for_each_entry_safe(ml, tmp, &metrics_mod_list, mm_list) { + d_list_del_init(&ml->mm_list); + D_FREE(ml); + } + D_MUTEX_UNLOCK(&metrics_mod_list_lock); +} + +/* Run dmm_fini on each non-NULL metrics[] slot owned by a module matching \a tag. */ +void +daos_module_fini_metrics(enum dss_module_tag tag, void **metrics) +{ + struct metrics_list *ml; + + D_MUTEX_LOCK(&metrics_mod_list_lock); + d_list_for_each_entry(ml, &metrics_mod_list, mm_list) { + struct daos_module_metrics *met = ml->mm_metrics; + + if (met == NULL) + continue; + if ((met->dmm_tags & tag) == 0) + continue; + if (met->dmm_fini == NULL) + continue; + if (metrics[ml->mm_id] == NULL) + continue; + + met->dmm_fini(metrics[ml->mm_id]); + } + D_MUTEX_UNLOCK(&metrics_mod_list_lock); +} + +/* Run dmm_init for each module matching \a tag; on failure undo and return -DER_NOMEM. */ +int +daos_module_init_metrics(enum dss_module_tag tag, void **metrics, const char *path, int tgt_id) +{ + struct metrics_list *ml; + + D_MUTEX_LOCK(&metrics_mod_list_lock); + d_list_for_each_entry(ml, &metrics_mod_list, mm_list) { + struct daos_module_metrics *met = ml->mm_metrics; + + if (met == NULL) + continue; + if ((met->dmm_tags & tag) == 0) + continue; + if (met->dmm_init == NULL) + continue; + + metrics[ml->mm_id] = met->dmm_init(path, tgt_id); + if (metrics[ml->mm_id] == NULL) { + D_ERROR("failed to
allocate per-pool metrics for module %u\n", ml->mm_id); + D_MUTEX_UNLOCK(&metrics_mod_list_lock); + daos_module_fini_metrics(tag, metrics); + return -DER_NOMEM; + } + } + D_MUTEX_UNLOCK(&metrics_mod_list_lock); + + return 0; +} + +/** + * Query DAOS_CLI_TAG modules for the number of per-pool metrics they create. + * + * \return Total number of metrics for those modules + */ +int +daos_module_nr_pool_metrics(void) +{ + struct metrics_list *ml; + int total = 0; + + D_MUTEX_LOCK(&metrics_mod_list_lock); + d_list_for_each_entry(ml, &metrics_mod_list, mm_list) { + struct daos_module_metrics *met = ml->mm_metrics; + + if (met == NULL) + continue; + if (met->dmm_nr_metrics == NULL) + continue; + if (!(met->dmm_tags & DAOS_CLI_TAG)) + continue; + + total += met->dmm_nr_metrics(); + } + D_MUTEX_UNLOCK(&metrics_mod_list_lock); + + return total; +} diff --git a/src/common/tls.c b/src/common/tls.c new file mode 100644 index 000000000000..89b9baf13e87 --- /dev/null +++ b/src/common/tls.c @@ -0,0 +1,242 @@ +/** + * (C) Copyright 2016-2023 Intel Corporation. + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +/** + * It implements thread-local storage (TLS) for DAOS. + */ +#include +#include + +/* This array remembers all module keys registered on one node.
*/ +static struct daos_module_key *daos_module_keys[DAOS_MODULE_KEYS_NR] = {NULL}; +pthread_mutex_t daos_module_keys_lock = PTHREAD_MUTEX_INITIALIZER; + +static __thread bool dc_tls_thread_init; + +static pthread_key_t dss_tls_key; +static pthread_key_t dc_tls_key; + +void +daos_register_key(struct daos_module_key *key) +{ + int i; + + D_MUTEX_LOCK(&daos_module_keys_lock); + for (i = 0; i < DAOS_MODULE_KEYS_NR; i++) { + if (daos_module_keys[i] == NULL) { + daos_module_keys[i] = key; + key->dmk_index = i; + break; + } + } + D_MUTEX_UNLOCK(&daos_module_keys_lock); + D_ASSERT(i < DAOS_MODULE_KEYS_NR); +} + +void +daos_unregister_key(struct daos_module_key *key) +{ + if (key == NULL) + return; + D_ASSERT(key->dmk_index >= 0); + D_ASSERT(key->dmk_index < DAOS_MODULE_KEYS_NR); + D_MUTEX_LOCK(&daos_module_keys_lock); + daos_module_keys[key->dmk_index] = NULL; + D_MUTEX_UNLOCK(&daos_module_keys_lock); +} + +struct daos_module_key * +daos_get_module_key(int index) +{ + D_ASSERT(index < DAOS_MODULE_KEYS_NR); + D_ASSERT(index >= 0); + + return daos_module_keys[index]; +} + +static int +daos_thread_local_storage_init(struct daos_thread_local_storage *dtls, int xs_id, int tgt_id) +{ + int rc = 0; + int i; + + if (dtls->dtls_values == NULL) { + D_ALLOC_ARRAY(dtls->dtls_values, DAOS_MODULE_KEYS_NR); + if (dtls->dtls_values == NULL) + return -DER_NOMEM; + } + + for (i = 0; i < DAOS_MODULE_KEYS_NR; i++) { + struct daos_module_key *dmk = daos_module_keys[i]; + + if (dmk != NULL && dtls->dtls_tag & dmk->dmk_tags) { + D_ASSERT(dmk->dmk_init != NULL); + dtls->dtls_values[i] = dmk->dmk_init(dtls->dtls_tag, xs_id, tgt_id); + if (dtls->dtls_values[i] == NULL) { + rc = -DER_NOMEM; + break; + } + } + } + + if (rc != 0) { + /* Release what was set up before the failure; the caller only frees dtls. */ + while (--i >= 0) { + struct daos_module_key *dmk = daos_module_keys[i]; + + if (dmk == NULL || dmk->dmk_fini == NULL || dtls->dtls_values[i] == NULL) + continue; + dmk->dmk_fini(dtls->dtls_tag, dtls->dtls_values[i]); + } + D_FREE(dtls->dtls_values); + } + return rc; +} + +static void +daos_thread_local_storage_fini(struct daos_thread_local_storage *dtls) +{ + int i; + + if (dtls->dtls_values != NULL) { + for (i = DAOS_MODULE_KEYS_NR - 1; i >= 0; i--) { + struct daos_module_key *dmk = daos_module_keys[i]; + + if (dmk != NULL && dtls->dtls_tag
& dmk->dmk_tags) { + D_ASSERT(dtls->dtls_values[i] != NULL); + D_ASSERT(dmk->dmk_fini != NULL); + dmk->dmk_fini(dtls->dtls_tag, dtls->dtls_values[i]); + } + } + } + + D_FREE(dtls->dtls_values); +} + +/* + * Allocate daos_thread_local_storage for the calling thread (server or client) and + * store the pointer in a thread-specific value which can be fetched at any + * time with dss_tls_get() or dc_tls_get(). + */ +static struct daos_thread_local_storage * +daos_tls_init(int tag, int xs_id, int tgt_id, bool server) +{ + struct daos_thread_local_storage *dtls; + int rc; + + D_ALLOC_PTR(dtls); + if (dtls == NULL) + return NULL; + + dtls->dtls_tag = tag; + rc = daos_thread_local_storage_init(dtls, xs_id, tgt_id); + if (rc != 0) { + D_FREE(dtls); + return NULL; + } + + if (server) { + rc = pthread_setspecific(dss_tls_key, dtls); + } else { + rc = pthread_setspecific(dc_tls_key, dtls); + if (rc == 0) + dc_tls_thread_init = true; + } + + if (rc) { + D_ERROR("failed to initialize tls: %d\n", rc); + daos_thread_local_storage_fini(dtls); + D_FREE(dtls); + return NULL; + } + + return dtls; +} + +int +ds_tls_key_create(void) +{ + return pthread_key_create(&dss_tls_key, NULL); +} + +int +dc_tls_key_create(void) +{ + return pthread_key_create(&dc_tls_key, NULL); +} + +void +ds_tls_key_delete(void) +{ + pthread_key_delete(dss_tls_key); +} + +void +dc_tls_key_delete(void) +{ + pthread_key_delete(dc_tls_key); +} + +/* Free DTC for a particular thread. */ +static void +daos_tls_fini(struct daos_thread_local_storage *dtls, bool server) +{ + daos_thread_local_storage_fini(dtls); + D_FREE(dtls); + if (server) { + pthread_setspecific(dss_tls_key, NULL); + } else { + pthread_setspecific(dc_tls_key, NULL); + /* The slot is empty now; let dc_tls_get() re-initialize instead of returning NULL. */ + dc_tls_thread_init = false; + } +} + +/* Allocate local per thread storage. */ +struct daos_thread_local_storage * +dc_tls_init(int tag, uint32_t pid) +{ + return daos_tls_init(tag, -1, pid, false); +} + +/* Free DTC for a particular thread.
*/ +void +dc_tls_fini(void) +{ + struct daos_thread_local_storage *dtls; + + dtls = (struct daos_thread_local_storage *)pthread_getspecific(dc_tls_key); + if (dtls != NULL) + daos_tls_fini(dtls, false); +} + +struct daos_thread_local_storage * +dc_tls_get(unsigned int tag) +{ + if (!dc_tls_thread_init) + return dc_tls_init(tag, getpid()); + + return (struct daos_thread_local_storage *)pthread_getspecific(dc_tls_key); +} + +struct daos_thread_local_storage * +dss_tls_get() +{ + return (struct daos_thread_local_storage *)pthread_getspecific(dss_tls_key); +} + +/* Allocate local per thread storage. */ +struct daos_thread_local_storage * +dss_tls_init(int tag, int xs_id, int tgt_id) +{ + return daos_tls_init(tag, xs_id, tgt_id, true); +} + +/* Free DTC for a particular thread. */ +void +dss_tls_fini(struct daos_thread_local_storage *dtls) +{ + daos_tls_fini(dtls, true); +} diff --git a/src/container/srv.c b/src/container/srv.c index 80650f7c16ca..05760d9439e6 100644 --- a/src/container/srv.c +++ b/src/container/srv.c @@ -12,6 +12,7 @@ #define D_LOGFAC DD_FAC(container) #include +#include #include #include "rpc.h" #include "srv_internal.h" @@ -142,11 +143,11 @@ struct dss_module_key cont_module_key = { .dmk_fini = dsm_tls_fini, }; -struct dss_module_metrics cont_metrics = { - .dmm_tags = DAOS_SYS_TAG, - .dmm_init = ds_cont_metrics_alloc, - .dmm_fini = ds_cont_metrics_free, - .dmm_nr_metrics = ds_cont_metrics_count, +struct daos_module_metrics cont_metrics = { + .dmm_tags = DAOS_SYS_TAG, + .dmm_init = ds_cont_metrics_alloc, + .dmm_fini = ds_cont_metrics_free, + .dmm_nr_metrics = ds_cont_metrics_count, }; struct dss_module cont_module = { diff --git a/src/control/cmd/daos_agent/config.go b/src/control/cmd/daos_agent/config.go index 3a6f7a14368c..c9d08d197448 100644 --- a/src/control/cmd/daos_agent/config.go +++ b/src/control/cmd/daos_agent/config.go @@ -55,6 +55,14 @@ type Config struct { DisableAutoEvict bool `yaml:"disable_auto_evict,omitempty"` ExcludeFabricIfaces 
common.StringSet `yaml:"exclude_fabric_ifaces,omitempty"` FabricInterfaces []*NUMAFabricConfig `yaml:"fabric_ifaces,omitempty"` + TelemetryPort int `yaml:"telemetry_port,omitempty"` + TelemetryEnabled bool `yaml:"telemetry_enabled,omitempty"` + TelemetryRetain time.Duration `yaml:"telemetry_retain,omitempty"` +} + +// TelemetryExportEnabled returns true if client telemetry export is enabled. +func (c *Config) TelemetryExportEnabled() bool { + return c.TelemetryPort > 0 } // NUMAFabricConfig defines a list of fabric interfaces that belong to a NUMA @@ -89,6 +97,14 @@ func LoadConfig(cfgPath string) (*Config, error) { return nil, fmt.Errorf("invalid system name: %q", cfg.SystemName) } + if cfg.TelemetryRetain > 0 && cfg.TelemetryPort == 0 { + return nil, errors.New("telemetry_retain requires telemetry_port") + } + + if cfg.TelemetryEnabled && cfg.TelemetryPort == 0 { + return nil, errors.New("telemetry_enabled requires telemetry_port") + } + return cfg, nil } diff --git a/src/control/cmd/daos_agent/infocache.go b/src/control/cmd/daos_agent/infocache.go index 0dbdf4fc645d..cb777396ff1b 100644 --- a/src/control/cmd/daos_agent/infocache.go +++ b/src/control/cmd/daos_agent/infocache.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -8,6 +8,7 @@ package main import ( "context" + "fmt" "net" "strings" "sync" @@ -22,6 +23,7 @@ import ( "github.com/daos-stack/daos/src/control/lib/control" "github.com/daos-stack/daos/src/control/lib/hardware" "github.com/daos-stack/daos/src/control/lib/hardware/hwprov" + "github.com/daos-stack/daos/src/control/lib/telemetry" "github.com/daos-stack/daos/src/control/logging" ) @@ -36,17 +38,20 @@ type fabricScanFn func(ctx context.Context, providers ...string) (*NUMAFabric, e // NewInfoCache creates a new InfoCache with appropriate parameters set. 
func NewInfoCache(ctx context.Context, log logging.Logger, client control.UnaryInvoker, cfg *Config) *InfoCache { ic := &InfoCache{ - log: log, - ignoreIfaces: cfg.ExcludeFabricIfaces, - client: client, - cache: cache.NewItemCache(log), - getAttachInfo: control.GetAttachInfo, - fabricScan: getFabricScanFn(log, cfg, hwprov.DefaultFabricScanner(log)), - netIfaces: net.Interfaces, - devClassGetter: hwprov.DefaultNetDevClassProvider(log), - devStateGetter: hwprov.DefaultNetDevStateProvider(log), + log: log, + ignoreIfaces: cfg.ExcludeFabricIfaces, + client: client, + cache: cache.NewItemCache(log), + getAttachInfoCb: control.GetAttachInfo, + fabricScan: getFabricScanFn(log, cfg, hwprov.DefaultFabricScanner(log)), + netIfaces: net.Interfaces, + devClassGetter: hwprov.DefaultNetDevClassProvider(log), + devStateGetter: hwprov.DefaultNetDevStateProvider(log), } + ic.clientTelemetryEnabled.Store(cfg.TelemetryEnabled) + ic.clientTelemetryRetain.Store(cfg.TelemetryRetain > 0) + if cfg.DisableCache { ic.DisableAttachInfoCache() ic.DisableFabricCache() @@ -198,12 +203,14 @@ type InfoCache struct { cache *cache.ItemCache fabricCacheDisabled atm.Bool attachInfoCacheDisabled atm.Bool + clientTelemetryEnabled atm.Bool + clientTelemetryRetain atm.Bool - getAttachInfo getAttachInfoFn - fabricScan fabricScanFn - netIfaces func() ([]net.Interface, error) - devClassGetter hardware.NetDevClassProvider - devStateGetter hardware.NetDevStateProvider + getAttachInfoCb getAttachInfoFn + fabricScan fabricScanFn + netIfaces func() ([]net.Interface, error) + devClassGetter hardware.NetDevClassProvider + devStateGetter hardware.NetDevStateProvider client control.UnaryInvoker attachInfoRefresh time.Duration @@ -292,6 +299,41 @@ func (c *InfoCache) EnableStaticFabricCache(ctx context.Context, nf *NUMAFabric) c.EnableFabricCache() } +func (c *InfoCache) getAttachInfo(ctx context.Context, rpcClient control.UnaryInvoker, req *control.GetAttachInfoReq) (*control.GetAttachInfoResp, error) { + if c == 
nil { + return nil, errors.New("InfoCache is nil") + } + if c.getAttachInfoCb == nil { + return nil, errors.New("getAttachInfoFn is nil") + } + + resp, err := c.getAttachInfoCb(ctx, rpcClient, req) + if err != nil { + return nil, err + } + c.addTelemetrySettings(resp) + return resp, nil +} + +// addTelemetrySettings modifies the response by adding telemetry settings +// before returning it. +func (c *InfoCache) addTelemetrySettings(resp *control.GetAttachInfoResp) { + if c == nil || resp == nil { + return + } + + if c.clientTelemetryEnabled.IsTrue() { + resp.ClientNetHint.EnvVars = append(resp.ClientNetHint.EnvVars, + fmt.Sprintf("%s=1", telemetry.ClientMetricsEnabledEnv), + ) + if c.clientTelemetryRetain.IsTrue() { + resp.ClientNetHint.EnvVars = append(resp.ClientNetHint.EnvVars, + fmt.Sprintf("%s=1", telemetry.ClientMetricsRetainEnv), + ) + } + } +} + // GetAttachInfo fetches the attach info from the cache, and refreshes if necessary. func (c *InfoCache) GetAttachInfo(ctx context.Context, sys string) (*control.GetAttachInfoResp, error) { if c == nil { @@ -308,7 +350,8 @@ func (c *InfoCache) GetAttachInfo(ctx context.Context, sys string) (*control.Get } createItem := func() (cache.Item, error) { c.log.Debugf("cache miss for %s", sysAttachInfoKey(sys)) - return newCachedAttachInfo(c.attachInfoRefresh, sys, c.client, c.getAttachInfo), nil + cai := newCachedAttachInfo(c.attachInfoRefresh, sys, c.client, c.getAttachInfo) + return cai, nil } item, release, err := c.cache.GetOrCreate(ctx, sysAttachInfoKey(sys), createItem) diff --git a/src/control/cmd/daos_agent/infocache_test.go b/src/control/cmd/daos_agent/infocache_test.go index 54571d006a7b..e86c44bfc0ce 100644 --- a/src/control/cmd/daos_agent/infocache_test.go +++ b/src/control/cmd/daos_agent/infocache_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -8,20 +8,23 @@ package main import ( "context" + "fmt" "net" "testing" "time" + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/pkg/errors" + "github.com/daos-stack/daos/src/control/build" "github.com/daos-stack/daos/src/control/common" "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/lib/cache" "github.com/daos-stack/daos/src/control/lib/control" "github.com/daos-stack/daos/src/control/lib/hardware" + "github.com/daos-stack/daos/src/control/lib/telemetry" "github.com/daos-stack/daos/src/control/logging" - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - "github.com/pkg/errors" ) type testInfoCacheParams struct { @@ -32,6 +35,8 @@ type testInfoCacheParams struct { mockNetDevStateGetter hardware.NetDevStateProvider disableFabricCache bool disableAttachInfoCache bool + enableClientTelemetry bool + retainClientTelemetry bool ctlInvoker control.Invoker cachedItems []cache.Item } @@ -43,16 +48,19 @@ func newTestInfoCache(t *testing.T, log logging.Logger, params testInfoCachePara } ic := &InfoCache{ - log: log, - getAttachInfo: params.mockGetAttachInfo, - fabricScan: params.mockScanFabric, - devClassGetter: params.mockNetDevClassGetter, - devStateGetter: params.mockNetDevStateGetter, - netIfaces: params.mockNetIfaces, - client: params.ctlInvoker, - cache: c, + log: log, + getAttachInfoCb: params.mockGetAttachInfo, + fabricScan: params.mockScanFabric, + devClassGetter: params.mockNetDevClassGetter, + devStateGetter: params.mockNetDevStateGetter, + netIfaces: params.mockNetIfaces, + client: params.ctlInvoker, + cache: c, } + ic.clientTelemetryEnabled.Store(params.enableClientTelemetry) + ic.clientTelemetryRetain.Store(params.retainClientTelemetry) + if ic.netIfaces == nil { ic.netIfaces = func() ([]net.Interface, error) { return []net.Interface{ @@ -714,6 +722,14 @@ func TestAgent_InfoCache_GetAttachInfo(t 
*testing.T) { NetDevClass: uint32(hardware.Ether), }, } + telemEnabledResp := copyGetAttachInfoResp(ctlResp) + telemEnabledResp.ClientNetHint.EnvVars = append(telemEnabledResp.ClientNetHint.EnvVars, + fmt.Sprintf("%s=1", telemetry.ClientMetricsEnabledEnv), + ) + telemRetainedResp := copyGetAttachInfoResp(telemEnabledResp) + telemRetainedResp.ClientNetHint.EnvVars = append(telemRetainedResp.ClientNetHint.EnvVars, + fmt.Sprintf("%s=1", telemetry.ClientMetricsRetainEnv), + ) for name, tc := range map[string]struct { getInfoCache func(logging.Logger) *InfoCache @@ -734,7 +750,7 @@ func TestAgent_InfoCache_GetAttachInfo(t *testing.T) { disableAttachInfoCache: true, }) }, - remoteResp: ctlResp, + remoteResp: copyGetAttachInfoResp(ctlResp), expResp: ctlResp, expRemote: true, }, @@ -748,11 +764,45 @@ func TestAgent_InfoCache_GetAttachInfo(t *testing.T) { expErr: errors.New("mock remote"), expRemote: true, }, + "cache disabled; client telemetry enabled": { + getInfoCache: func(l logging.Logger) *InfoCache { + return newTestInfoCache(t, l, testInfoCacheParams{ + disableAttachInfoCache: true, + enableClientTelemetry: true, + }) + }, + remoteResp: copyGetAttachInfoResp(ctlResp), + expResp: telemEnabledResp, + expRemote: true, + }, + "cache enabled; client telemetry enabled": { + getInfoCache: func(l logging.Logger) *InfoCache { + return newTestInfoCache(t, l, testInfoCacheParams{ + enableClientTelemetry: true, + }) + }, + remoteResp: copyGetAttachInfoResp(ctlResp), + expResp: telemEnabledResp, + expRemote: true, + expCached: true, + }, + "cache enabled; client telemetry enabled; client telemetry retained": { + getInfoCache: func(l logging.Logger) *InfoCache { + return newTestInfoCache(t, l, testInfoCacheParams{ + enableClientTelemetry: true, + retainClientTelemetry: true, + }) + }, + remoteResp: copyGetAttachInfoResp(ctlResp), + expResp: telemRetainedResp, + expRemote: true, + expCached: true, + }, "enabled but empty": { getInfoCache: func(l logging.Logger) *InfoCache { return 
newTestInfoCache(t, l, testInfoCacheParams{}) }, - remoteResp: ctlResp, + remoteResp: copyGetAttachInfoResp(ctlResp), expResp: ctlResp, expRemote: true, expCached: true, @@ -772,7 +822,7 @@ func TestAgent_InfoCache_GetAttachInfo(t *testing.T) { fetch: func(_ context.Context, _ control.UnaryInvoker, _ *control.GetAttachInfoReq) (*control.GetAttachInfoResp, error) { return nil, errors.New("shouldn't call cached remote") }, - lastResponse: ctlResp, + lastResponse: copyGetAttachInfoResp(ctlResp), cacheItem: cacheItem{lastCached: time.Now()}, system: "test", }) @@ -790,7 +840,7 @@ func TestAgent_InfoCache_GetAttachInfo(t *testing.T) { fetch: func(_ context.Context, _ control.UnaryInvoker, _ *control.GetAttachInfoReq) (*control.GetAttachInfoResp, error) { return nil, errors.New("shouldn't call cached remote") }, - lastResponse: ctlResp, + lastResponse: copyGetAttachInfoResp(ctlResp), cacheItem: cacheItem{lastCached: time.Now()}, system: build.DefaultSystemName, }) @@ -814,7 +864,7 @@ func TestAgent_InfoCache_GetAttachInfo(t *testing.T) { return ic }, system: "somethingelse", - remoteResp: ctlResp, + remoteResp: copyGetAttachInfoResp(ctlResp), expResp: ctlResp, expCached: true, expRemote: true, @@ -831,7 +881,7 @@ func TestAgent_InfoCache_GetAttachInfo(t *testing.T) { calledRemote := false if ic != nil { - ic.getAttachInfo = func(_ context.Context, _ control.UnaryInvoker, _ *control.GetAttachInfoReq) (*control.GetAttachInfoResp, error) { + ic.getAttachInfoCb = func(_ context.Context, _ control.UnaryInvoker, _ *control.GetAttachInfoReq) (*control.GetAttachInfoResp, error) { calledRemote = true return tc.remoteResp, tc.remoteErr } diff --git a/src/control/cmd/daos_agent/main.go b/src/control/cmd/daos_agent/main.go index 73788a7cb496..1518207a3cbb 100644 --- a/src/control/cmd/daos_agent/main.go +++ b/src/control/cmd/daos_agent/main.go @@ -20,6 +20,7 @@ import ( "github.com/daos-stack/daos/src/control/common/cmdutil" "github.com/daos-stack/daos/src/control/lib/atm" 
"github.com/daos-stack/daos/src/control/lib/control" + "github.com/daos-stack/daos/src/control/lib/daos" "github.com/daos-stack/daos/src/control/lib/hardware/hwprov" "github.com/daos-stack/daos/src/control/logging" ) @@ -112,6 +113,17 @@ func parseOpts(args []string, opts *cliOptions, invoker control.Invoker, log *lo logCmd.SetLog(log) } + daosLogMask := daos.DefaultErrorMask + if opts.Debug { + log.SetLevel(logging.LogLevelTrace) + daosLogMask = daos.DefaultDebugMask + } + fini, err := daos.InitLogging(daosLogMask) + if err != nil { + return err + } + defer fini() + if jsonCmd, ok := cmd.(cmdutil.JSONOutputter); ok && opts.JSON { jsonCmd.EnableJSONOutput(os.Stdout, &wroteJSON) // disable output on stdout other than JSON @@ -194,7 +206,6 @@ func parseOpts(args []string, opts *cliOptions, invoker control.Invoker, log *lo return errors.Wrap(err, "Unable to load Certificate Data") } - var err error if cfg.AccessPoints, err = common.ParseHostList(cfg.AccessPoints, cfg.ControlPort); err != nil { return errors.Wrap(err, "Failed to parse config access_points") } diff --git a/src/control/cmd/daos_agent/mgmt_rpc.go b/src/control/cmd/daos_agent/mgmt_rpc.go index 17c07b4a2f62..75dc337e3138 100644 --- a/src/control/cmd/daos_agent/mgmt_rpc.go +++ b/src/control/cmd/daos_agent/mgmt_rpc.go @@ -25,6 +25,8 @@ import ( "github.com/daos-stack/daos/src/control/lib/control" "github.com/daos-stack/daos/src/control/lib/daos" "github.com/daos-stack/daos/src/control/lib/hardware" + "github.com/daos-stack/daos/src/control/lib/telemetry" + "github.com/daos-stack/daos/src/control/lib/telemetry/promexp" "github.com/daos-stack/daos/src/control/logging" ) @@ -40,6 +42,7 @@ type mgmtModule struct { ctlInvoker control.Invoker cache *InfoCache monitor *procMon + cliMetricsSrc *promexp.ClientSource useDefaultNUMA bool numaGetter hardware.ProcessNUMAProvider @@ -71,6 +74,8 @@ func (mod *mgmtModule) HandleCall(ctx context.Context, session *drpc.Session, me switch method { case drpc.MethodGetAttachInfo: 
return mod.handleGetAttachInfo(ctx, req, cred.Pid) + case drpc.MethodSetupClientTelemetry: + return mod.handleSetupClientTelemetry(ctx, req, cred) case drpc.MethodNotifyPoolConnect: return nil, mod.handleNotifyPoolConnect(ctx, req, cred.Pid) case drpc.MethodNotifyPoolDisconnect: @@ -214,6 +219,33 @@ func (mod *mgmtModule) getFabricInterface(ctx context.Context, numaNode int, net return mod.cache.GetFabricDevice(ctx, numaNode, netDevClass, provider) } +func (mod *mgmtModule) handleSetupClientTelemetry(ctx context.Context, reqb []byte, cred *unix.Ucred) ([]byte, error) { + if len(reqb) == 0 { + return nil, errors.New("empty request") + } + + pbReq := new(mgmtpb.ClientTelemetryReq) + if err := proto.Unmarshal(reqb, pbReq); err != nil { + return nil, drpc.UnmarshalingPayloadFailure() + } + if pbReq.Jobid == "" { + return nil, errors.New("empty jobid") + } + if pbReq.ShmKey == 0 { + return nil, errors.New("unset shm key") + } + if cred == nil { + return nil, errors.New("nil user credentials") + } + + if err := telemetry.SetupClientRoot(ctx, pbReq.Jobid, int(cred.Pid), int(pbReq.ShmKey)); err != nil { + return nil, err + } + resp := &mgmtpb.ClientTelemetryResp{AgentUid: int32(unix.Getuid())} + mod.log.Tracef("%d: %s", cred.Pid, pblog.Debug(resp)) + return proto.Marshal(resp) +} + func (mod *mgmtModule) handleNotifyPoolConnect(ctx context.Context, reqb []byte, pid int32) error { pbReq := new(mgmtpb.PoolMonitorReq) if err := proto.Unmarshal(reqb, pbReq); err != nil { diff --git a/src/control/cmd/daos_agent/mgmt_rpc_test.go b/src/control/cmd/daos_agent/mgmt_rpc_test.go index 9bd85decf08d..59fcb507a810 100644 --- a/src/control/cmd/daos_agent/mgmt_rpc_test.go +++ b/src/control/cmd/daos_agent/mgmt_rpc_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2023 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -15,18 +15,22 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/pkg/errors" + "golang.org/x/sys/unix" "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/testing/protocmp" "github.com/daos-stack/daos/src/control/build" "github.com/daos-stack/daos/src/control/common" "github.com/daos-stack/daos/src/control/common/proto/convert" mgmtpb "github.com/daos-stack/daos/src/control/common/proto/mgmt" "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/drpc" "github.com/daos-stack/daos/src/control/fault" "github.com/daos-stack/daos/src/control/fault/code" "github.com/daos-stack/daos/src/control/lib/control" "github.com/daos-stack/daos/src/control/lib/daos" "github.com/daos-stack/daos/src/control/lib/hardware" + "github.com/daos-stack/daos/src/control/lib/telemetry" "github.com/daos-stack/daos/src/control/logging" ) @@ -388,3 +392,116 @@ func TestAgent_mgmtModule_RefreshCache(t *testing.T) { }) } } + +func TestAgent_handleSetupClientTelemetry(t *testing.T) { + testCreds := &unix.Ucred{ + Uid: 123, + Gid: 456, + } + testSysName := "test-sys" + testJobID := "test-job" + testShmKey := int32(42) + + for name, tc := range map[string]struct { + clientBytes []byte + clientReq *mgmtpb.ClientTelemetryReq + clientCred *unix.Ucred + expResp *mgmtpb.ClientTelemetryResp + expErr error + }{ + "nil client request": { + clientReq: nil, + clientCred: testCreds, + expErr: errors.New("empty request"), + }, + "garbage client request": { + clientBytes: []byte("invalid"), + clientCred: testCreds, + expErr: drpc.UnmarshalingPayloadFailure(), + }, + "unset jobid": { + clientReq: &mgmtpb.ClientTelemetryReq{ + Sys: testSysName, + Jobid: "", + ShmKey: testShmKey, + }, + clientCred: testCreds, + expErr: errors.New("empty jobid"), + }, + "unset shm key": { + clientReq: &mgmtpb.ClientTelemetryReq{ + Sys: testSysName, + Jobid: testJobID, + ShmKey: 0, + 
}, + clientCred: testCreds, + expErr: errors.New("unset shm key"), + }, + "nil user creds": { + clientReq: &mgmtpb.ClientTelemetryReq{ + Sys: testSysName, + Jobid: testJobID, + ShmKey: testShmKey, + }, + clientCred: nil, + expErr: errors.New("nil user credentials"), + }, + "success": { + clientReq: &mgmtpb.ClientTelemetryReq{ + Sys: testSysName, + Jobid: testJobID, + ShmKey: testShmKey, + }, + clientCred: testCreds, + expResp: &mgmtpb.ClientTelemetryResp{ + AgentUid: int32(unix.Getuid()), + }, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + mod := &mgmtModule{ + log: log, + } + + var reqBytes []byte + if len(tc.clientBytes) > 0 { + reqBytes = tc.clientBytes + } else { + var err error + reqBytes, err = proto.Marshal(tc.clientReq) + if err != nil { + t.Fatal(err) + } + } + + testID := uint32(telemetry.NextTestID(telemetry.AgentIDBase)) + telemetry.InitTestMetricsProducer(t, int(testID), 2048) + defer telemetry.CleanupTestMetricsProducer(t) + + parent := test.MustLogContext(t, log) + ctx, err := telemetry.Init(parent, testID) + if err != nil { + t.Fatal(err) + } + defer telemetry.Fini() + + gotResp, gotErr := mod.handleSetupClientTelemetry(ctx, reqBytes, tc.clientCred) + test.CmpErr(t, tc.expErr, gotErr) + if tc.expErr != nil { + return + } + + expRespBytes, err := proto.Marshal(tc.expResp) + if err != nil { + t.Fatal(err) + } + + if diff := cmp.Diff(expRespBytes, gotResp, protocmp.Transform()); diff != "" { + t.Fatalf("-want, +got:\n%s", diff) + } + }) + } +} diff --git a/src/control/cmd/daos_agent/start.go b/src/control/cmd/daos_agent/start.go index cb5505234d52..e5416ee874bd 100644 --- a/src/control/cmd/daos_agent/start.go +++ b/src/control/cmd/daos_agent/start.go @@ -23,6 +23,7 @@ import ( "github.com/daos-stack/daos/src/control/lib/hardware/hwloc" "github.com/daos-stack/daos/src/control/lib/hardware/hwprov" "github.com/daos-stack/daos/src/control/lib/systemd" + 
"github.com/daos-stack/daos/src/control/lib/telemetry/promexp" ) type ctxKey string @@ -98,15 +99,30 @@ func (cmd *startCmd) Execute(_ []string) error { procmon.startMonitoring(ctx) cmd.Debugf("started process monitor: %s", time.Since(procmonStart)) + var clientMetricSource *promexp.ClientSource + if cmd.cfg.TelemetryExportEnabled() { + if clientMetricSource, err = promexp.NewClientSource(ctx); err != nil { + return errors.Wrap(err, "unable to create client metrics source") + } + telemetryStart := time.Now() + shutdown, err := startPrometheusExporter(ctx, cmd, clientMetricSource, cmd.cfg) + if err != nil { + return errors.Wrap(err, "unable to start prometheus exporter") + } + defer shutdown() + cmd.Debugf("telemetry exporter started: %s", time.Since(telemetryStart)) + } + drpcRegStart := time.Now() drpcServer.RegisterRPCModule(NewSecurityModule(cmd.Logger, cmd.cfg.TransportConfig)) mgmtMod := &mgmtModule{ - log: cmd.Logger, - sys: cmd.cfg.SystemName, - ctlInvoker: cmd.ctlInvoker, - cache: cache, - numaGetter: hwprov.DefaultProcessNUMAProvider(cmd.Logger), - monitor: procmon, + log: cmd.Logger, + sys: cmd.cfg.SystemName, + ctlInvoker: cmd.ctlInvoker, + cache: cache, + numaGetter: hwprov.DefaultProcessNUMAProvider(cmd.Logger), + monitor: procmon, + cliMetricsSrc: clientMetricSource, } drpcServer.RegisterRPCModule(mgmtMod) cmd.Debugf("registered dRPC modules: %s", time.Since(drpcRegStart)) diff --git a/src/control/cmd/daos_agent/telemetry.go b/src/control/cmd/daos_agent/telemetry.go new file mode 100644 index 000000000000..4c0e2d35b4c7 --- /dev/null +++ b/src/control/cmd/daos_agent/telemetry.go @@ -0,0 +1,36 @@ +// +// (C) Copyright 2024 Intel Corporation. 
+// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package main + +import ( + "context" + + "github.com/prometheus/client_golang/prometheus" + + "github.com/daos-stack/daos/src/control/lib/telemetry/promexp" + "github.com/daos-stack/daos/src/control/logging" +) + +func startPrometheusExporter(ctx context.Context, log logging.Logger, cs *promexp.ClientSource, cfg *Config) (func(), error) { + expCfg := &promexp.ExporterConfig{ + Port: cfg.TelemetryPort, + Title: "DAOS Client Telemetry", + Register: func(ctx context.Context, log logging.Logger) error { + c, err := promexp.NewClientCollector(ctx, log, cs, &promexp.CollectorOpts{ + RetainDuration: cfg.TelemetryRetain, + }) + if err != nil { + return err + } + prometheus.MustRegister(c) + + return nil + }, + } + + return promexp.StartExporter(ctx, log, expCfg) +} diff --git a/src/control/common/proto/mgmt/svc.pb.go b/src/control/common/proto/mgmt/svc.pb.go index 444f64c57693..86c11e72f08d 100644 --- a/src/control/common/proto/mgmt/svc.pb.go +++ b/src/control/common/proto/mgmt/svc.pb.go @@ -1,13 +1,13 @@ // -// (C) Copyright 2018-2023 Intel Corporation. +// (C) Copyright 2018-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // // Code generated by protoc-gen-go. DO NOT EDIT. 
// versions: -// protoc-gen-go v1.28.1 -// protoc v3.11.4 +// protoc-gen-go v1.31.0 +// protoc v3.21.12 // source: mgmt/svc.proto package mgmt @@ -990,6 +990,124 @@ func (x *PoolMonitorReq) GetJobid() string { return "" } +type ClientTelemetryReq struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Sys string `protobuf:"bytes,1,opt,name=sys,proto3" json:"sys,omitempty"` // DAOS system identifier + Jobid string `protobuf:"bytes,2,opt,name=jobid,proto3" json:"jobid,omitempty"` // Job ID used for client telemetry + ShmKey int32 `protobuf:"varint,3,opt,name=shm_key,json=shmKey,proto3" json:"shm_key,omitempty"` // Client's shared memory segment key +} + +func (x *ClientTelemetryReq) Reset() { + *x = ClientTelemetryReq{} + if protoimpl.UnsafeEnabled { + mi := &file_mgmt_svc_proto_msgTypes[14] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ClientTelemetryReq) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ClientTelemetryReq) ProtoMessage() {} + +func (x *ClientTelemetryReq) ProtoReflect() protoreflect.Message { + mi := &file_mgmt_svc_proto_msgTypes[14] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ClientTelemetryReq.ProtoReflect.Descriptor instead. 
+func (*ClientTelemetryReq) Descriptor() ([]byte, []int) { + return file_mgmt_svc_proto_rawDescGZIP(), []int{14} +} + +func (x *ClientTelemetryReq) GetSys() string { + if x != nil { + return x.Sys + } + return "" +} + +func (x *ClientTelemetryReq) GetJobid() string { + if x != nil { + return x.Jobid + } + return "" +} + +func (x *ClientTelemetryReq) GetShmKey() int32 { + if x != nil { + return x.ShmKey + } + return 0 +} + +type ClientTelemetryResp struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Status int32 `protobuf:"varint,1,opt,name=status,proto3" json:"status,omitempty"` // DAOS status code + AgentUid int32 `protobuf:"varint,2,opt,name=agent_uid,json=agentUid,proto3" json:"agent_uid,omitempty"` // UID of agent process +} + +func (x *ClientTelemetryResp) Reset() { + *x = ClientTelemetryResp{} + if protoimpl.UnsafeEnabled { + mi := &file_mgmt_svc_proto_msgTypes[15] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ClientTelemetryResp) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ClientTelemetryResp) ProtoMessage() {} + +func (x *ClientTelemetryResp) ProtoReflect() protoreflect.Message { + mi := &file_mgmt_svc_proto_msgTypes[15] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ClientTelemetryResp.ProtoReflect.Descriptor instead. 
+func (*ClientTelemetryResp) Descriptor() ([]byte, []int) { + return file_mgmt_svc_proto_rawDescGZIP(), []int{15} +} + +func (x *ClientTelemetryResp) GetStatus() int32 { + if x != nil { + return x.Status + } + return 0 +} + +func (x *ClientTelemetryResp) GetAgentUid() int32 { + if x != nil { + return x.AgentUid + } + return 0 +} + type GroupUpdateReq_Engine struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -1003,7 +1121,7 @@ type GroupUpdateReq_Engine struct { func (x *GroupUpdateReq_Engine) Reset() { *x = GroupUpdateReq_Engine{} if protoimpl.UnsafeEnabled { - mi := &file_mgmt_svc_proto_msgTypes[14] + mi := &file_mgmt_svc_proto_msgTypes[16] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1016,7 +1134,7 @@ func (x *GroupUpdateReq_Engine) String() string { func (*GroupUpdateReq_Engine) ProtoMessage() {} func (x *GroupUpdateReq_Engine) ProtoReflect() protoreflect.Message { - mi := &file_mgmt_svc_proto_msgTypes[14] + mi := &file_mgmt_svc_proto_msgTypes[16] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1065,7 +1183,7 @@ type GetAttachInfoResp_RankUri struct { func (x *GetAttachInfoResp_RankUri) Reset() { *x = GetAttachInfoResp_RankUri{} if protoimpl.UnsafeEnabled { - mi := &file_mgmt_svc_proto_msgTypes[15] + mi := &file_mgmt_svc_proto_msgTypes[17] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1078,7 +1196,7 @@ func (x *GetAttachInfoResp_RankUri) String() string { func (*GetAttachInfoResp_RankUri) ProtoMessage() {} func (x *GetAttachInfoResp_RankUri) ProtoReflect() protoreflect.Message { - mi := &file_mgmt_svc_proto_msgTypes[15] + mi := &file_mgmt_svc_proto_msgTypes[17] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1221,11 +1339,21 @@ var file_mgmt_svc_proto_rawDesc = []byte{ 0x6c, 0x65, 0x55, 0x55, 0x49, 
0x44, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0e, 0x70, 0x6f, 0x6f, 0x6c, 0x48, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x55, 0x55, 0x49, 0x44, 0x12, 0x14, 0x0a, 0x05, 0x6a, 0x6f, 0x62, 0x69, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x6a, 0x6f, 0x62, - 0x69, 0x64, 0x42, 0x3a, 0x5a, 0x38, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, - 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, - 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, - 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x6d, 0x67, 0x6d, 0x74, 0x62, 0x06, - 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x69, 0x64, 0x22, 0x55, 0x0a, 0x12, 0x43, 0x6c, 0x69, 0x65, 0x6e, 0x74, 0x54, 0x65, 0x6c, 0x65, + 0x6d, 0x65, 0x74, 0x72, 0x79, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x14, 0x0a, 0x05, 0x6a, 0x6f, + 0x62, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x6a, 0x6f, 0x62, 0x69, 0x64, + 0x12, 0x17, 0x0a, 0x07, 0x73, 0x68, 0x6d, 0x5f, 0x6b, 0x65, 0x79, 0x18, 0x03, 0x20, 0x01, 0x28, + 0x05, 0x52, 0x06, 0x73, 0x68, 0x6d, 0x4b, 0x65, 0x79, 0x22, 0x4a, 0x0a, 0x13, 0x43, 0x6c, 0x69, + 0x65, 0x6e, 0x74, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, 0x52, 0x65, 0x73, 0x70, + 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, + 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x61, 0x67, 0x65, 0x6e, + 0x74, 0x5f, 0x75, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x08, 0x61, 0x67, 0x65, + 0x6e, 0x74, 0x55, 0x69, 0x64, 0x42, 0x3a, 0x5a, 0x38, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, + 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, + 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, + 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 
0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x6d, 0x67, 0x6d, + 0x74, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( @@ -1241,7 +1369,7 @@ func file_mgmt_svc_proto_rawDescGZIP() []byte { } var file_mgmt_svc_proto_enumTypes = make([]protoimpl.EnumInfo, 1) -var file_mgmt_svc_proto_msgTypes = make([]protoimpl.MessageInfo, 16) +var file_mgmt_svc_proto_msgTypes = make([]protoimpl.MessageInfo, 18) var file_mgmt_svc_proto_goTypes = []interface{}{ (JoinResp_State)(0), // 0: mgmt.JoinResp.State (*DaosResp)(nil), // 1: mgmt.DaosResp @@ -1258,13 +1386,15 @@ var file_mgmt_svc_proto_goTypes = []interface{}{ (*PingRankReq)(nil), // 12: mgmt.PingRankReq (*SetRankReq)(nil), // 13: mgmt.SetRankReq (*PoolMonitorReq)(nil), // 14: mgmt.PoolMonitorReq - (*GroupUpdateReq_Engine)(nil), // 15: mgmt.GroupUpdateReq.Engine - (*GetAttachInfoResp_RankUri)(nil), // 16: mgmt.GetAttachInfoResp.RankUri + (*ClientTelemetryReq)(nil), // 15: mgmt.ClientTelemetryReq + (*ClientTelemetryResp)(nil), // 16: mgmt.ClientTelemetryResp + (*GroupUpdateReq_Engine)(nil), // 17: mgmt.GroupUpdateReq.Engine + (*GetAttachInfoResp_RankUri)(nil), // 18: mgmt.GetAttachInfoResp.RankUri } var file_mgmt_svc_proto_depIdxs = []int32{ - 15, // 0: mgmt.GroupUpdateReq.engines:type_name -> mgmt.GroupUpdateReq.Engine + 17, // 0: mgmt.GroupUpdateReq.engines:type_name -> mgmt.GroupUpdateReq.Engine 0, // 1: mgmt.JoinResp.state:type_name -> mgmt.JoinResp.State - 16, // 2: mgmt.GetAttachInfoResp.rank_uris:type_name -> mgmt.GetAttachInfoResp.RankUri + 18, // 2: mgmt.GetAttachInfoResp.rank_uris:type_name -> mgmt.GetAttachInfoResp.RankUri 9, // 3: mgmt.GetAttachInfoResp.client_net_hint:type_name -> mgmt.ClientNetHint 4, // [4:4] is the sub-list for method output_type 4, // [4:4] is the sub-list for method input_type @@ -1448,7 +1578,7 @@ func file_mgmt_svc_proto_init() { } } file_mgmt_svc_proto_msgTypes[14].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*GroupUpdateReq_Engine); i { + switch v := 
v.(*ClientTelemetryReq); i { case 0: return &v.state case 1: @@ -1460,6 +1590,30 @@ func file_mgmt_svc_proto_init() { } } file_mgmt_svc_proto_msgTypes[15].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ClientTelemetryResp); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_mgmt_svc_proto_msgTypes[16].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GroupUpdateReq_Engine); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_mgmt_svc_proto_msgTypes[17].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*GetAttachInfoResp_RankUri); i { case 0: return &v.state @@ -1478,7 +1632,7 @@ func file_mgmt_svc_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_mgmt_svc_proto_rawDesc, NumEnums: 1, - NumMessages: 16, + NumMessages: 18, NumExtensions: 0, NumServices: 0, }, diff --git a/src/control/common/test/utils.go b/src/control/common/test/utils.go index cd88b5acf25c..4d27fb78b2a2 100644 --- a/src/control/common/test/utils.go +++ b/src/control/common/test/utils.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2018-2022 Intel Corporation. +// (C) Copyright 2018-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -26,6 +26,8 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" "golang.org/x/sys/unix" "google.golang.org/protobuf/testing/protocmp" + + "github.com/daos-stack/daos/src/control/logging" ) // AssertTrue asserts b is true @@ -408,3 +410,14 @@ func Context(t *testing.T) context.Context { t.Cleanup(cancel) return ctx } + +// MustLogContext returns a context containing the supplied logger. +// Canceled when the test is done. 
+func MustLogContext(t *testing.T, log logging.Logger) context.Context { + t.Helper() + ctx, err := logging.ToContext(Context(t), log) + if err != nil { + t.Fatal(err) + } + return ctx +} diff --git a/src/control/drpc/modules.go b/src/control/drpc/modules.go index 1a51bc2f67c3..0aacbae1d4a3 100644 --- a/src/control/drpc/modules.go +++ b/src/control/drpc/modules.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2022 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -157,6 +157,7 @@ func (m MgmtMethod) String() string { MethodPoolGetProp: "PoolGetProp", MethodPoolUpgrade: "PoolUpgrade", MethodLedManage: "LedManage", + MethodSetupClientTelemetry: "SetupClientTelemetry", }[m]; ok { return s } @@ -244,6 +245,8 @@ const ( MethodPoolUpgrade MgmtMethod = C.DRPC_METHOD_MGMT_POOL_UPGRADE // MethodLedManage defines a method to manage a VMD device LED state MethodLedManage MgmtMethod = C.DRPC_METHOD_MGMT_LED_MANAGE + // MethodSetupClientTelemetry defines a method to setup client telemetry + MethodSetupClientTelemetry MgmtMethod = C.DRPC_METHOD_MGMT_SETUP_CLIENT_TELEM ) type srvMethod int32 diff --git a/src/control/lib/daos/logging.go b/src/control/lib/daos/logging.go new file mode 100644 index 000000000000..9891adba0bed --- /dev/null +++ b/src/control/lib/daos/logging.go @@ -0,0 +1,47 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package daos + +import ( + "os" + "strings" + + "github.com/pkg/errors" +) + +/* +#cgo LDFLAGS: -lgurt + +#include +*/ +import "C" + +const ( + // DefaultDebugMask defines the basic debug mask. + DefaultDebugMask = "DEBUG,MEM=ERR,OBJECT=ERR,PLACEMENT=ERR" + // DefaultInfoMask defines the basic info mask. + DefaultInfoMask = "INFO" + // DefaultErrorMask defines the basic error mask. + DefaultErrorMask = "ERROR" +) + +// InitLogging initializes the DAOS logging system. 
+func InitLogging(masks ...string) (func(), error) { + mask := strings.Join(masks, ",") + if mask == "" { + mask = DefaultInfoMask + } + os.Setenv("D_LOG_MASK", mask) + + if rc := C.daos_debug_init(nil); rc != 0 { + return func() {}, errors.Wrap(Status(rc), "daos_debug_init() failed") + } + + return func() { + C.daos_debug_fini() + }, nil +} diff --git a/src/control/lib/telemetry/promexp/client.go b/src/control/lib/telemetry/promexp/client.go new file mode 100644 index 000000000000..e6eefeaf3968 --- /dev/null +++ b/src/control/lib/telemetry/promexp/client.go @@ -0,0 +1,176 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build linux && (amd64 || arm64) +// +build linux +// +build amd64 arm64 + +package promexp + +import ( + "context" + "regexp" + "strconv" + "strings" + "time" + + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + + "github.com/daos-stack/daos/src/control/lib/atm" + "github.com/daos-stack/daos/src/control/lib/telemetry" + "github.com/daos-stack/daos/src/control/logging" +) + +const ( + // defaultCleanupInterval is the default interval for pruning unused + // shared memory segments. + defaultCleanupInterval = 1 * time.Minute +) + +type ( + // ClientCollector is a metrics collector for DAOS client metrics. + ClientCollector struct { + metricsCollector + } + + // ClientSource is a metrics source for DAOS client metrics. 
+ ClientSource struct { + MetricSource + cleanup func() + } +) + +func extractClientLabels(log logging.Logger, in string) (labels labelMap, name string) { + log.Tracef("in: %q", in) + + labels = make(labelMap) + compsIdx := 0 + comps := strings.Split(in, string(telemetry.PathSep)) + if len(comps) == 0 { + return labels, "" + } + + if strings.HasPrefix(comps[compsIdx], "ID") { + if len(comps) == 1 { + return labels, "" + } + compsIdx++ + } + + for i, label := range []string{"job", "pid", "tid"} { + if i > 0 { + // After jobid, we should have a pid and/or tid, and + // then move on to the engine labels. + _, err := strconv.Atoi(comps[compsIdx]) + if err != nil { + break + } + } + + if len(comps) == compsIdx+1 { + // If we have a weird path ending on a pid or tid, treat it + // as empty of labels. + if _, err := strconv.Atoi(comps[compsIdx]); err == nil && i > 0 { + return labelMap{}, "" + } + return labels, comps[compsIdx] + } + labels[label] = comps[compsIdx] + compsIdx++ + } + + var engLabels labelMap + engLabels, name = extractLabels(log, strings.Join(comps[compsIdx:], string(telemetry.PathSep))) + for k, v := range engLabels { + labels[k] = v + } + + return +} + +func newClientMetric(log logging.Logger, m telemetry.Metric) *sourceMetric { + labels, name := extractClientLabels(log, m.FullPath()) + baseName := "client_" + name + + return newSourceMetric(log, m, baseName, labels) +} + +// NewClientSource creates a new ClientSource for client metrics. 
+func NewClientSource(parent context.Context) (*ClientSource, error) {
+	ctx, err := telemetry.InitClientRoot(parent)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to init telemetry")
+	}
+
+	// Detach the telemetry context as soon as the parent context is done.
+	go func(outer, inner context.Context) {
+		<-outer.Done()
+		telemetry.Detach(inner)
+	}(parent, ctx)
+
+	return &ClientSource{
+		MetricSource: MetricSource{
+			ctx:      ctx,
+			enabled:  atm.NewBool(true),
+			tmSchema: telemetry.NewSchema(),
+			smSchema: newSourceMetricSchema(newClientMetric),
+		},
+	}, nil
+}
+
+// NewClientCollector creates a new ClientCollector instance.
+//
+// If opts is nil, the default collector options are used. The supplied
+// opts are never modified. Every pattern in opts.Ignores is compiled as a
+// regular expression; a pattern that fails to compile yields an error and
+// no background work is started.
+//
+// On success a goroutine is started which calls source.PruneSegments every
+// opts.RetainDuration (defaultCleanupInterval is used when RetainDuration
+// is not a positive duration). That goroutine exits when ctx is done.
+func NewClientCollector(ctx context.Context, log logging.Logger, source *ClientSource, opts *CollectorOpts) (*ClientCollector, error) {
+	if opts == nil {
+		opts = defaultCollectorOpts()
+	}
+
+	c := &ClientCollector{
+		metricsCollector: metricsCollector{
+			log: log,
+			summary: prometheus.NewSummaryVec(
+				prometheus.SummaryOpts{
+					Namespace: "client",
+					Subsystem: "exporter",
+					Name:      "scrape_duration_seconds",
+					Help:      "daos_client_exporter: Duration of a scrape job.",
+				},
+				[]string{"source", "result"},
+			),
+			collectFn: func(ch chan *sourceMetric) {
+				source.Collect(log, ch)
+			},
+		},
+	}
+
+	// Validate the ignore patterns before starting any background work, so
+	// that an error return does not leave a pruning goroutine behind.
+	for _, pat := range opts.Ignores {
+		re, err := regexp.Compile(pat)
+		if err != nil {
+			return nil, errors.Wrapf(err, "failed to compile %q", pat)
+		}
+		c.ignoredMetrics = append(c.ignoredMetrics, re)
+	}
+
+	// Work on a local copy of the interval rather than writing back into
+	// the caller's options.
+	pruneInterval := opts.RetainDuration
+	if pruneInterval <= 0 {
+		// Clients will clean up after themselves, but we still need to
+		// periodically remove the top-level jobid segments.
+		// time.NewTicker panics on a non-positive interval, so negative
+		// values fall back to the default as well.
+		pruneInterval = defaultCleanupInterval
+	}
+
+	log.Debugf("pruning unused client metric segments every %s", pruneInterval)
+	go func() {
+		pruneTicker := time.NewTicker(pruneInterval)
+		defer pruneTicker.Stop()
+
+		for {
+			select {
+			case <-ctx.Done():
+				// A closed Done channel is always ready to receive;
+				// return here so the loop does not spin forever.
+				return
+			case <-pruneTicker.C:
+				source.PruneSegments(log, pruneInterval)
+			}
+		}
+	}()
+
+	return c, nil
+}
diff --git a/src/control/lib/telemetry/promexp/client_test.go
b/src/control/lib/telemetry/promexp/client_test.go new file mode 100644 index 000000000000..d0274f157b50 --- /dev/null +++ b/src/control/lib/telemetry/promexp/client_test.go @@ -0,0 +1,163 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package promexp + +import ( + "fmt" + "regexp" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/prometheus/client_golang/prometheus" + + "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/logging" +) + +func TestPromExp_extractClientLabels(t *testing.T) { + shmID := 256 + jobID := "testJob" + pid := "12345" + tid := "67890" + + testPath := func(suffix string) string { + return fmt.Sprintf("ID: %d/%s/%s/%s/%s", shmID, jobID, pid, tid, suffix) + } + + for name, tc := range map[string]struct { + input string + expName string + expLabels labelMap + }{ + "empty": { + expLabels: labelMap{}, + }, + "ID stripped": { + input: "ID: 123", + expLabels: labelMap{}, + }, + "weird truncation": { + input: "ID: 123/jobbo/6783/90", + expLabels: labelMap{}, + }, + "active update ops": { + input: testPath("io/ops/update/active"), + expName: "io_ops_update_active", + expLabels: labelMap{ + "job": jobID, + "pid": pid, + "tid": tid, + }, + }, + "fetch latency 1MB": { + input: testPath("io/latency/fetch/1MB"), + expName: "io_latency_fetch", + expLabels: labelMap{ + "job": jobID, + "pid": pid, + "tid": tid, + "size": "1MB", + }, + }, + "started_at": { + input: fmt.Sprintf("ID: %d/%s/%s/started_at", shmID, jobID, pid), + expName: "started_at", + expLabels: labelMap{ + "job": jobID, + "pid": pid, + }, + }, + "pool ops": { + input: fmt.Sprintf("ID: %d/%s/%s/pool/%s/ops/foo", shmID, jobID, pid, test.MockPoolUUID(1)), + expName: "pool_ops_foo", + expLabels: labelMap{ + "job": jobID, + "pid": pid, + "pool": test.MockPoolUUID(1).String(), + }, + }, + } { + t.Run(name, func(t *testing.T) { + log, 
buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + labels, name := extractClientLabels(log, tc.input) + + test.AssertEqual(t, name, tc.expName, "") + if diff := cmp.Diff(labels, tc.expLabels); diff != "" { + t.Errorf("labels mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func TestPromExp_NewClientCollector(t *testing.T) { + for name, tc := range map[string]struct { + opts *CollectorOpts + expErr error + expResult *ClientCollector + }{ + "defaults": { + expResult: &ClientCollector{ + metricsCollector: metricsCollector{ + summary: &prometheus.SummaryVec{ + MetricVec: &prometheus.MetricVec{}, + }, + }, + }, + }, + "opts with ignores": { + opts: &CollectorOpts{Ignores: []string{"one", "two"}}, + expResult: &ClientCollector{ + metricsCollector: metricsCollector{ + summary: &prometheus.SummaryVec{ + MetricVec: &prometheus.MetricVec{}, + }, + ignoredMetrics: []*regexp.Regexp{ + regexp.MustCompile("one"), + regexp.MustCompile("two"), + }, + }, + }, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + ctx := test.MustLogContext(t, log) + cs, err := NewClientSource(ctx) + if err != nil { + t.Fatal(err) + } + result, err := NewClientCollector(ctx, log, cs, tc.opts) + + test.CmpErr(t, tc.expErr, err) + + cmpOpts := []cmp.Option{ + cmpopts.IgnoreUnexported(MetricSource{}), + cmpopts.IgnoreUnexported(prometheus.SummaryVec{}), + cmpopts.IgnoreUnexported(prometheus.MetricVec{}), + cmpopts.IgnoreUnexported(regexp.Regexp{}), + cmp.AllowUnexported(ClientCollector{}), + cmp.AllowUnexported(metricsCollector{}), + cmp.FilterPath(func(p cmp.Path) bool { + // Ignore a few specific fields + return (strings.HasSuffix(p.String(), "log") || + strings.HasSuffix(p.String(), "sourceMutex") || + strings.HasSuffix(p.String(), "cleanupSource") || + strings.HasSuffix(p.String(), "collectFn")) + }, cmp.Ignore()), + } + if diff := cmp.Diff(tc.expResult, result, cmpOpts...); diff 
!= "" { + t.Fatalf("(-want, +got)\n%s", diff) + } + }) + } +} diff --git a/src/control/lib/telemetry/promexp/collector.go b/src/control/lib/telemetry/promexp/collector.go index 03e6fa40dd50..ec70c0e8fbdb 100644 --- a/src/control/lib/telemetry/promexp/collector.go +++ b/src/control/lib/telemetry/promexp/collector.go @@ -7,445 +7,34 @@ // +build linux // +build amd64 arm64 -// - package promexp import ( - "context" - "fmt" "regexp" - "strings" - "sync" - "unicode" + "time" - "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" - "github.com/daos-stack/daos/src/control/lib/atm" "github.com/daos-stack/daos/src/control/lib/telemetry" "github.com/daos-stack/daos/src/control/logging" ) type ( - Collector struct { - log logging.Logger - summary *prometheus.SummaryVec - ignoredMetrics []*regexp.Regexp - sources []*EngineSource - cleanupSource map[uint32]func() - sourceMutex sync.RWMutex // To protect sources - } - + // CollectorOpts contains options for the metrics collector. 
CollectorOpts struct { - Ignores []string - } - - EngineSource struct { - ctx context.Context - tmMutex sync.RWMutex // To protect telemetry collection - Index uint32 - Rank uint32 - enabled atm.Bool - tmSchema *telemetry.Schema - rmSchema rankMetricSchema + Ignores []string + RetainDuration time.Duration } - rankMetricSchema struct { - mu sync.Mutex - rankMetrics map[string]*rankMetric - seen map[string]struct{} + metricsCollector struct { + log logging.Logger + summary *prometheus.SummaryVec + ignoredMetrics []*regexp.Regexp + collectFn func(ch chan *sourceMetric) } ) -func (s *rankMetricSchema) Prune() { - s.mu.Lock() - defer s.mu.Unlock() - - for id := range s.rankMetrics { - if _, found := s.seen[id]; !found { - delete(s.rankMetrics, id) - } - } - s.seen = make(map[string]struct{}) -} - -func (s *rankMetricSchema) add(log logging.Logger, rank uint32, metric telemetry.Metric) (rm *rankMetric) { - s.mu.Lock() - defer s.mu.Unlock() - - id := metric.FullPath() - s.seen[id] = struct{}{} - - var found bool - if rm, found = s.rankMetrics[id]; !found { - rm = newRankMetric(log, rank, metric) - s.rankMetrics[id] = rm - } else { - rm.resetVecs() - } - - return -} - -func NewEngineSource(parent context.Context, idx uint32, rank uint32) (*EngineSource, func(), error) { - ctx, err := telemetry.Init(parent, idx) - if err != nil { - return nil, nil, errors.Wrap(err, "failed to init telemetry") - } - - cleanupFn := func() { - telemetry.Detach(ctx) - } - - return &EngineSource{ - ctx: ctx, - Index: idx, - Rank: rank, - enabled: atm.NewBool(true), - tmSchema: telemetry.NewSchema(), - rmSchema: rankMetricSchema{ - rankMetrics: make(map[string]*rankMetric), - seen: make(map[string]struct{}), - }, - }, cleanupFn, nil -} - -func defaultCollectorOpts() *CollectorOpts { - return &CollectorOpts{} -} - -func NewCollector(log logging.Logger, opts *CollectorOpts, sources ...*EngineSource) (*Collector, error) { - if opts == nil { - opts = defaultCollectorOpts() - } - - c := &Collector{ - 
log: log, - sources: sources, - cleanupSource: make(map[uint32]func()), - summary: prometheus.NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: "engine", - Subsystem: "exporter", - Name: "scrape_duration_seconds", - Help: "daos_exporter: Duration of a scrape job.", - }, - []string{"source", "result"}, - ), - } - - for _, pat := range opts.Ignores { - re, err := regexp.Compile(pat) - if err != nil { - return nil, errors.Wrapf(err, "failed to compile %q", pat) - } - c.ignoredMetrics = append(c.ignoredMetrics, re) - } - - return c, nil -} - -type labelMap map[string]string - -func (lm labelMap) keys() (keys []string) { - for label := range lm { - keys = append(keys, label) - } - - return -} - -func sanitizeMetricName(in string) string { - return strings.Map(func(r rune) rune { - switch { - // Valid names for Prometheus are limited to: - case r >= 'a' && r <= 'z': // lowercase letters - case r >= 'A' && r <= 'Z': // uppercase letters - case unicode.IsDigit(r): // digits - default: // sanitize any other character - return '_' - } - - return r - }, strings.TrimLeft(in, "/")) -} - -func matchLabel(labels labelMap, input, match, label string) bool { - if !strings.HasPrefix(input, match) { - return false - } - - splitStr := strings.SplitN(input, "_", 2) - if len(splitStr) == 2 { - labels[label] = splitStr[1] - return true - } - return false -} - -func appendName(cur, name string) string { - if cur == "" { - return name - } - return cur + "_" + name -} - -// extractLabels takes a "/"-separated DAOS metric name in order to -// create a normalized Prometheus name and label map. -// -// NB: Prometheus metric names should follow best practices as -// outlined at https://prometheus.io/docs/practices/naming/ -// -// In particular, a metric name should describe the measurement, -// not the entity the measurement is about. 
In other words, if 4 -// different entities share the same measurement, then there should -// be a single metric with a label that distinguishes between -// individual measurement values. -// -// Good: pool_started_at {pool="00000000-1111-2222-3333-4444444444"} -// Bad: pool_00000000_1111_2222_3333_4444444444_started_at -func extractLabels(in string) (labels labelMap, name string) { - labels = make(labelMap) - compsIdx := 0 - comps := strings.Split(in, string(telemetry.PathSep)) - if len(comps) == 0 { - return labels, in - } - - if strings.HasPrefix(comps[compsIdx], "ID") { - if len(comps) == 1 { - return labels, "" - } - compsIdx++ - } - - switch comps[compsIdx] { - case "pool": - name = "pool" - compsIdx++ - labels["pool"] = comps[compsIdx] - compsIdx++ - switch comps[compsIdx] { - case "ops": - compsIdx++ - name += "_ops_" + comps[compsIdx] - compsIdx++ - } - case "io": - name = "io" - compsIdx++ - switch comps[compsIdx] { - case "latency": - compsIdx++ - name += "_latency_" + comps[compsIdx] - compsIdx++ - labels["size"] = comps[compsIdx] - compsIdx++ - case "ops": - compsIdx++ - name += "_ops_" + comps[compsIdx] - compsIdx++ - default: - name += "_" + comps[compsIdx] - compsIdx++ - } - case "net": - compsIdx++ - if comps[compsIdx] == "uri" { - compsIdx++ - name = "net_uri_" + comps[compsIdx] - compsIdx++ - break - } - - name = "net" - labels["provider"] = comps[compsIdx] - compsIdx++ - case "nvme": - name = "nvme" - compsIdx++ - labels["device"] = comps[compsIdx] - compsIdx++ - } - - for { - if len(comps) == compsIdx { - break - } - - switch { - case matchLabel(labels, comps[compsIdx], "tgt_", "target"): - compsIdx++ - case matchLabel(labels, comps[compsIdx], "xs_", "xstream"): - compsIdx++ - case matchLabel(labels, comps[compsIdx], "ctx_", "context"): - compsIdx++ - default: - name = appendName(name, comps[compsIdx]) - compsIdx++ - } - } - - name = sanitizeMetricName(name) - return -} - -func (es *EngineSource) Collect(log logging.Logger, ch chan<- 
*rankMetric) { - if es == nil { - log.Error("nil engine source") - return - } - if !es.IsEnabled() { - return - } - if ch == nil { - log.Error("nil channel") - return - } - - es.tmMutex.RLock() - defer es.tmMutex.RUnlock() - - metrics := make(chan telemetry.Metric) - go func() { - if err := telemetry.CollectMetrics(es.ctx, es.tmSchema, metrics); err != nil { - log.Errorf("failed to collect metrics for engine rank %d: %s", es.Rank, err) - return - } - es.tmSchema.Prune() - }() - - for metric := range metrics { - ch <- es.rmSchema.add(log, es.Rank, metric) - } - es.rmSchema.Prune() -} - -// IsEnabled checks if the engine source is enabled. -func (es *EngineSource) IsEnabled() bool { - return es.enabled.IsTrue() -} - -// Enable enables the engine source. -func (es *EngineSource) Enable() { - es.enabled.SetTrue() -} - -// Disable disables the engine source. -func (es *EngineSource) Disable() { - es.enabled.SetFalse() -} - -type gvMap map[string]*prometheus.GaugeVec - -func (m gvMap) add(name, help string, labels labelMap) { - if _, found := m[name]; !found { - gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: name, - Help: help, - }, labels.keys()) - m[name] = gv - } -} - -func (m gvMap) set(name string, value float64, labels labelMap) error { - gv, found := m[name] - if !found { - return errors.Errorf("gauge vector %s not found", name) - } - gv.With(prometheus.Labels(labels)).Set(value) - - return nil -} - -type cvMap map[string]*prometheus.CounterVec - -func (m cvMap) add(name, help string, labels labelMap) { - if _, found := m[name]; !found { - cv := prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: name, - Help: help, - }, labels.keys()) - m[name] = cv - } -} - -func (m cvMap) set(name string, value float64, labels labelMap) error { - cv, found := m[name] - if !found { - return errors.Errorf("counter vector %s not found", name) - } - cv.With(prometheus.Labels(labels)).Add(value) - - return nil -} - -type rankMetric struct { - rank uint32 - metric 
telemetry.Metric - baseName string - labels labelMap - gvm gvMap - cvm cvMap -} - -func (rm *rankMetric) collect(ch chan<- prometheus.Metric) { - for _, gv := range rm.gvm { - gv.Collect(ch) - } - for _, cv := range rm.cvm { - cv.Collect(ch) - } -} - -func (rm *rankMetric) resetVecs() { - for _, gv := range rm.gvm { - gv.Reset() - } - for _, cv := range rm.cvm { - cv.Reset() - } -} - -func newRankMetric(log logging.Logger, rank uint32, m telemetry.Metric) *rankMetric { - rm := &rankMetric{ - metric: m, - rank: rank, - gvm: make(gvMap), - cvm: make(cvMap), - } - - var name string - rm.labels, name = extractLabels(m.FullPath()) - rm.labels["rank"] = fmt.Sprintf("%d", rm.rank) - rm.baseName = "engine_" + name - - desc := m.Desc() - - switch rm.metric.Type() { - case telemetry.MetricTypeGauge, telemetry.MetricTypeTimestamp, - telemetry.MetricTypeSnapshot: - rm.gvm.add(rm.baseName, desc, rm.labels) - case telemetry.MetricTypeStatsGauge, telemetry.MetricTypeDuration: - rm.gvm.add(rm.baseName, desc, rm.labels) - for _, ms := range getMetricStats(rm.baseName, rm.metric) { - if ms.isCounter { - rm.cvm.add(ms.name, ms.desc, rm.labels) - } else { - rm.gvm.add(ms.name, ms.desc, rm.labels) - } - } - case telemetry.MetricTypeCounter: - rm.cvm.add(rm.baseName, desc, rm.labels) - default: - log.Errorf("[%s]: metric type %d not supported", name, rm.metric.Type()) - } - - return rm -} - -func (c *Collector) isIgnored(name string) bool { +func (c *metricsCollector) isIgnored(name string) bool { for _, re := range c.ignoredMetrics { // TODO: We may want to look into removing the use of regexp here // in favor of a less-flexible but more efficient approach. 
@@ -458,121 +47,7 @@ func (c *Collector) isIgnored(name string) bool { return false } -type metricStat struct { - name string - desc string - value float64 - isCounter bool -} - -func getMetricStats(baseName string, m telemetry.Metric) (stats []*metricStat) { - ms, ok := m.(telemetry.StatsMetric) - if !ok { - return - } - - for name, s := range map[string]struct { - fn func() float64 - desc string - isCounter bool - }{ - "min": { - fn: func() float64 { return float64(ms.Min()) }, - desc: " (min value)", - }, - "max": { - fn: func() float64 { return float64(ms.Max()) }, - desc: " (max value)", - }, - "mean": { - fn: ms.Mean, - desc: " (mean)", - }, - "sum": { - fn: func() float64 { return float64(ms.Sum()) }, - desc: " (sum)", - }, - "stddev": { - fn: ms.StdDev, - desc: " (std dev)", - }, - "sumsquares": { - fn: ms.SumSquares, - desc: " (sum of squares)", - }, - "samples": { - fn: func() float64 { return float64(ms.SampleSize()) }, - desc: " (samples)", - isCounter: true, - }, - } { - stats = append(stats, &metricStat{ - name: baseName + "_" + name, - desc: m.Desc() + s.desc, - value: s.fn(), - isCounter: s.isCounter, - }) - } - - return -} - -// AddSource adds an EngineSource to the Collector. -func (c *Collector) AddSource(es *EngineSource, cleanup func()) { - if es == nil { - c.log.Error("attempted to add nil EngineSource") - return - } - - c.sourceMutex.Lock() - defer c.sourceMutex.Unlock() - - // If we attempt to add a duplicate, remove the old one. - c.removeSourceNoLock(es.Index) - - c.sources = append(c.sources, es) - if cleanup != nil { - c.cleanupSource[es.Index] = cleanup - } -} - -// RemoveSource removes an EngineSource with a given index from the Collector. 
-func (c *Collector) RemoveSource(engineIdx uint32) { - c.sourceMutex.Lock() - defer c.sourceMutex.Unlock() - - c.removeSourceNoLock(engineIdx) -} - -func (c *Collector) removeSourceNoLock(engineIdx uint32) { - for i, es := range c.sources { - if es.Index == engineIdx { - es.Disable() - c.sources = append(c.sources[:i], c.sources[i+1:]...) - - // Ensure that EngineSource isn't collecting during cleanup - es.tmMutex.Lock() - if cleanup, found := c.cleanupSource[engineIdx]; found && cleanup != nil { - cleanup() - } - es.tmMutex.Unlock() - delete(c.cleanupSource, engineIdx) - break - } - } -} - -func (c *Collector) getSources() []*EngineSource { - c.sourceMutex.RLock() - defer c.sourceMutex.RUnlock() - - sourceCopy := make([]*EngineSource, len(c.sources)) - _ = copy(sourceCopy, c.sources) - return sourceCopy -} - -// Collect collects metrics from all EngineSources. -func (c *Collector) Collect(ch chan<- prometheus.Metric) { +func (c *metricsCollector) Collect(ch chan<- prometheus.Metric) { if c == nil { return } @@ -580,55 +55,57 @@ func (c *Collector) Collect(ch chan<- prometheus.Metric) { c.log.Error("passed a nil channel") return } + if c.collectFn == nil { + c.log.Error("collectFn is nil") + return + } - rankMetrics := make(chan *rankMetric) - go func(sources []*EngineSource) { - for _, source := range sources { - source.Collect(c.log, rankMetrics) - } - close(rankMetrics) - }(c.getSources()) + sourceMetrics := make(chan *sourceMetric) + go func() { + c.collectFn(sourceMetrics) + close(sourceMetrics) + }() - for rm := range rankMetrics { - if c.isIgnored(rm.baseName) { + for sm := range sourceMetrics { + if c.isIgnored(sm.baseName) { continue } var err error - switch rm.metric.Type() { + switch sm.metric.Type() { case telemetry.MetricTypeGauge, telemetry.MetricTypeTimestamp, telemetry.MetricTypeSnapshot: - err = rm.gvm.set(rm.baseName, rm.metric.FloatValue(), rm.labels) + err = sm.gvm.set(sm.baseName, sm.metric.FloatValue(), sm.labels) case 
telemetry.MetricTypeStatsGauge, telemetry.MetricTypeDuration: - if err = rm.gvm.set(rm.baseName, rm.metric.FloatValue(), rm.labels); err != nil { + if err = sm.gvm.set(sm.baseName, sm.metric.FloatValue(), sm.labels); err != nil { break } - for _, ms := range getMetricStats(rm.baseName, rm.metric) { + for _, ms := range getMetricStats(sm.baseName, sm.metric) { if ms.isCounter { - if err = rm.cvm.set(ms.name, ms.value, rm.labels); err != nil { + if err = sm.cvm.set(ms.name, ms.value, sm.labels); err != nil { break } } else { - if err = rm.gvm.set(ms.name, ms.value, rm.labels); err != nil { + if err = sm.gvm.set(ms.name, ms.value, sm.labels); err != nil { break } } } case telemetry.MetricTypeCounter: - err = rm.cvm.set(rm.baseName, rm.metric.FloatValue(), rm.labels) + err = sm.cvm.set(sm.baseName, sm.metric.FloatValue(), sm.labels) default: - c.log.Errorf("[%s]: metric type %d not supported", rm.baseName, rm.metric.Type()) + c.log.Errorf("[%s]: metric type %d not supported", sm.baseName, sm.metric.Type()) } if err != nil { - c.log.Errorf("[%s]: %s", rm.baseName, err) + c.log.Errorf("[%s]: %s", sm.baseName, err) continue } - rm.collect(ch) + sm.collect(ch) } } -func (c *Collector) Describe(ch chan<- *prometheus.Desc) { +func (c *metricsCollector) Describe(ch chan<- *prometheus.Desc) { c.summary.Describe(ch) } diff --git a/src/control/lib/telemetry/promexp/engine.go b/src/control/lib/telemetry/promexp/engine.go new file mode 100644 index 000000000000..bb0481f12a9a --- /dev/null +++ b/src/control/lib/telemetry/promexp/engine.go @@ -0,0 +1,271 @@ +// +// (C) Copyright 2021-2024 Intel Corporation. 
+// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build linux && (amd64 || arm64) +// +build linux +// +build amd64 arm64 + +package promexp + +import ( + "context" + "fmt" + "regexp" + "strings" + "sync" + + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + + "github.com/daos-stack/daos/src/control/lib/atm" + "github.com/daos-stack/daos/src/control/lib/telemetry" + "github.com/daos-stack/daos/src/control/logging" +) + +type ( + // EngineCollector collects metrics from DAOS Engine sources. + EngineCollector struct { + metricsCollector + sources []*EngineSource + cleanupSource map[uint32]func() + sourceMutex sync.RWMutex // To protect sources + } + + // EngineSource provides metrics for a single DAOS Engine. + EngineSource struct { + MetricSource + Index uint32 + Rank uint32 + } +) + +// NewEngineSource initializes a new metrics source for a DAOS Engine. +func NewEngineSource(parent context.Context, idx uint32, rank uint32) (*EngineSource, func(), error) { + ctx, err := telemetry.Init(parent, idx) + if err != nil { + return nil, nil, errors.Wrap(err, "failed to init telemetry") + } + + cleanupFn := func() { + telemetry.Detach(ctx) + } + + return &EngineSource{ + MetricSource: MetricSource{ + ctx: ctx, + enabled: atm.NewBool(true), + tmSchema: telemetry.NewSchema(), + smSchema: newSourceMetricSchema(func(l logging.Logger, m telemetry.Metric) *sourceMetric { + return newRankMetric(l, rank, m) + }), + }, + Index: idx, + Rank: rank, + }, cleanupFn, nil +} + +// NewEngineCollector initializes a new collector for DAOS Engine sources. 
+func NewEngineCollector(log logging.Logger, opts *CollectorOpts, sources ...*EngineSource) (*EngineCollector, error) { + if opts == nil { + opts = defaultCollectorOpts() + } + + c := &EngineCollector{ + metricsCollector: metricsCollector{ + log: log, + summary: prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: "engine", + Subsystem: "exporter", + Name: "scrape_duration_seconds", + Help: "daos_exporter: Duration of a scrape job.", + }, + []string{"source", "result"}, + ), + }, + sources: sources, + cleanupSource: make(map[uint32]func()), + } + + c.collectFn = func(metrics chan *sourceMetric) { + for _, source := range c.getSources() { + source.Collect(c.log, metrics) + } + } + + for _, pat := range opts.Ignores { + re, err := regexp.Compile(pat) + if err != nil { + return nil, errors.Wrapf(err, "failed to compile %q", pat) + } + c.ignoredMetrics = append(c.ignoredMetrics, re) + } + + return c, nil +} + +// extractLabels takes a "/"-separated DAOS metric name in order to +// create a normalized Prometheus name and label map. +// +// NB: Prometheus metric names should follow best practices as +// outlined at https://prometheus.io/docs/practices/naming/ +// +// In particular, a metric name should describe the measurement, +// not the entity the measurement is about. In other words, if 4 +// different entities share the same measurement, then there should +// be a single metric with a label that distinguishes between +// individual measurement values. 
+// +// Good: pool_started_at {pool="00000000-1111-2222-3333-4444444444"} +// Bad: pool_00000000_1111_2222_3333_4444444444_started_at +func extractLabels(log logging.Logger, in string) (labels labelMap, name string) { + log.Tracef("in: %q", in) + + labels = make(labelMap) + compsIdx := 0 + comps := strings.Split(in, string(telemetry.PathSep)) + if len(comps) == 0 { + return labels, "" + } + + if strings.HasPrefix(comps[compsIdx], "ID") { + if len(comps) == 1 { + return labels, "" + } + compsIdx++ + } + + switch comps[compsIdx] { + case "pool": + name = "pool" + compsIdx++ + labels["pool"] = comps[compsIdx] + compsIdx++ + switch comps[compsIdx] { + case "ops": + compsIdx++ + name += "_ops_" + comps[compsIdx] + compsIdx++ + } + case "io": + name = "io" + compsIdx++ + switch comps[compsIdx] { + case "latency": + compsIdx++ + name += "_latency_" + comps[compsIdx] + compsIdx++ + labels["size"] = comps[compsIdx] + compsIdx++ + case "ops": + compsIdx++ + name += "_ops_" + comps[compsIdx] + compsIdx++ + default: + name += "_" + comps[compsIdx] + compsIdx++ + } + case "net": + compsIdx++ + if comps[compsIdx] == "uri" { + compsIdx++ + name = "net_uri_" + comps[compsIdx] + compsIdx++ + break + } + + name = "net" + labels["provider"] = comps[compsIdx] + compsIdx++ + case "nvme": + name = "nvme" + compsIdx++ + labels["device"] = comps[compsIdx] + compsIdx++ + } + + for { + if len(comps) == compsIdx { + break + } + + switch { + case matchLabel(labels, comps[compsIdx], "tgt_", "target"): + compsIdx++ + case matchLabel(labels, comps[compsIdx], "xs_", "xstream"): + compsIdx++ + case matchLabel(labels, comps[compsIdx], "ctx_", "context"): + compsIdx++ + default: + name = appendName(name, comps[compsIdx]) + compsIdx++ + } + } + + name = sanitizeMetricName(name) + return +} + +func newRankMetric(log logging.Logger, rank uint32, m telemetry.Metric) *sourceMetric { + labels, name := extractLabels(log, m.FullPath()) + baseName := "engine_" + name + labels["rank"] = fmt.Sprintf("%d", 
rank) + + return newSourceMetric(log, m, baseName, labels) +} + +// AddSource adds an EngineSource to the Collector. +func (c *EngineCollector) AddSource(es *EngineSource, cleanup func()) { + if es == nil { + c.log.Error("attempted to add nil EngineSource") + return + } + + c.sourceMutex.Lock() + defer c.sourceMutex.Unlock() + + // If we attempt to add a duplicate, remove the old one. + c.removeSourceNoLock(es.Index) + + c.sources = append(c.sources, es) + if cleanup != nil { + c.cleanupSource[es.Index] = cleanup + } +} + +// RemoveSource removes an EngineSource with a given index from the Collector. +func (c *EngineCollector) RemoveSource(engineIdx uint32) { + c.sourceMutex.Lock() + defer c.sourceMutex.Unlock() + + c.removeSourceNoLock(engineIdx) +} + +func (c *EngineCollector) removeSourceNoLock(engineIdx uint32) { + for i, es := range c.sources { + if es.Index == engineIdx { + es.Disable() + c.sources = append(c.sources[:i], c.sources[i+1:]...) + + // Ensure that EngineSource isn't collecting during cleanup + es.tmMutex.Lock() + if cleanup, found := c.cleanupSource[engineIdx]; found && cleanup != nil { + cleanup() + } + es.tmMutex.Unlock() + delete(c.cleanupSource, engineIdx) + break + } + } +} + +func (c *EngineCollector) getSources() []*EngineSource { + c.sourceMutex.RLock() + defer c.sourceMutex.RUnlock() + + sourceCopy := make([]*EngineSource, len(c.sources)) + _ = copy(sourceCopy, c.sources) + return sourceCopy +} diff --git a/src/control/lib/telemetry/promexp/collector_test.go b/src/control/lib/telemetry/promexp/engine_test.go similarity index 88% rename from src/control/lib/telemetry/promexp/collector_test.go rename to src/control/lib/telemetry/promexp/engine_test.go index e50605a033a0..b21839b7ba02 100644 --- a/src/control/lib/telemetry/promexp/collector_test.go +++ b/src/control/lib/telemetry/promexp/engine_test.go @@ -2,11 +2,6 @@ // (C) Copyright 2021-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent -// -//go:build linux && (amd64 || arm64) -// +build linux -// +build amd64 arm64 - // package promexp @@ -62,7 +57,10 @@ func TestPromexp_NewEngineSource(t *testing.T) { test.CmpErr(t, tc.expErr, err) - if diff := cmp.Diff(tc.expResult, result, cmpopts.IgnoreUnexported(EngineSource{})); diff != "" { + cmpOpts := cmp.Options{ + cmpopts.IgnoreUnexported(MetricSource{}), + } + if diff := cmp.Diff(tc.expResult, result, cmpOpts...); diff != "" { t.Fatalf("(-want, +got)\n%s", diff) } @@ -155,31 +153,20 @@ func TestPromExp_EngineSource_Collect(t *testing.T) { for name, tc := range map[string]struct { es *EngineSource - resultChan chan *rankMetric + resultChan chan *sourceMetric expMetrics telemetry.TestMetricsMap }{ - "nil source": { - resultChan: make(chan *rankMetric), - }, "nil channel": { es: validSrc, }, - "bad source": { - es: &EngineSource{ - ctx: test.Context(t), - Rank: 123, - Index: testIdx + 1, - }, - resultChan: make(chan *rankMetric), - }, "success": { es: validSrc, - resultChan: make(chan *rankMetric), + resultChan: make(chan *sourceMetric), expMetrics: realMetrics, }, "disabled": { es: disabledSrc, - resultChan: make(chan *rankMetric), + resultChan: make(chan *sourceMetric), expMetrics: telemetry.TestMetricsMap{}, }, } { @@ -189,7 +176,7 @@ func TestPromExp_EngineSource_Collect(t *testing.T) { go tc.es.Collect(log, tc.resultChan) - gotMetrics := []*rankMetric{} + gotMetrics := []*sourceMetric{} for { done := false select { @@ -206,7 +193,7 @@ func TestPromExp_EngineSource_Collect(t *testing.T) { test.AssertEqual(t, len(tc.expMetrics), len(gotMetrics), "wrong number of metrics returned") for _, got := range gotMetrics { - test.AssertEqual(t, testRank, got.rank, "wrong rank") + test.AssertEqual(t, fmt.Sprintf("%d", testRank), got.labels["rank"], "wrong rank") expM, ok := tc.expMetrics[got.metric.Type()] if !ok { t.Fatalf("metric type %d not expected", got.metric.Type()) @@ -220,7 +207,7 @@ func 
TestPromExp_EngineSource_Collect(t *testing.T) { } } -func TestPromExp_NewCollector(t *testing.T) { +func TestPromExp_NewEngineCollector(t *testing.T) { testSrc := []*EngineSource{ { Rank: 1, @@ -234,20 +221,24 @@ func TestPromExp_NewCollector(t *testing.T) { sources []*EngineSource opts *CollectorOpts expErr error - expResult *Collector + expResult *EngineCollector }{ "no sources": { - expResult: &Collector{ - summary: &prometheus.SummaryVec{ - MetricVec: &prometheus.MetricVec{}, + expResult: &EngineCollector{ + metricsCollector: metricsCollector{ + summary: &prometheus.SummaryVec{ + MetricVec: &prometheus.MetricVec{}, + }, }, }, }, "defaults": { sources: testSrc, - expResult: &Collector{ - summary: &prometheus.SummaryVec{ - MetricVec: &prometheus.MetricVec{}, + expResult: &EngineCollector{ + metricsCollector: metricsCollector{ + summary: &prometheus.SummaryVec{ + MetricVec: &prometheus.MetricVec{}, + }, }, sources: testSrc, }, @@ -255,15 +246,17 @@ func TestPromExp_NewCollector(t *testing.T) { "opts with ignores": { sources: testSrc, opts: &CollectorOpts{Ignores: []string{"one", "two"}}, - expResult: &Collector{ - summary: &prometheus.SummaryVec{ - MetricVec: &prometheus.MetricVec{}, + expResult: &EngineCollector{ + metricsCollector: metricsCollector{ + summary: &prometheus.SummaryVec{ + MetricVec: &prometheus.MetricVec{}, + }, + ignoredMetrics: []*regexp.Regexp{ + regexp.MustCompile("one"), + regexp.MustCompile("two"), + }, }, sources: testSrc, - ignoredMetrics: []*regexp.Regexp{ - regexp.MustCompile("one"), - regexp.MustCompile("two"), - }, }, }, "bad regexp in ignores": { @@ -276,21 +269,23 @@ func TestPromExp_NewCollector(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) - result, err := NewCollector(log, tc.opts, tc.sources...) + result, err := NewEngineCollector(log, tc.opts, tc.sources...) 
test.CmpErr(t, tc.expErr, err) cmpOpts := []cmp.Option{ - cmpopts.IgnoreUnexported(EngineSource{}), + cmpopts.IgnoreUnexported(MetricSource{}), cmpopts.IgnoreUnexported(prometheus.SummaryVec{}), cmpopts.IgnoreUnexported(prometheus.MetricVec{}), cmpopts.IgnoreUnexported(regexp.Regexp{}), - cmp.AllowUnexported(Collector{}), + cmp.AllowUnexported(EngineCollector{}), + cmp.AllowUnexported(metricsCollector{}), cmp.FilterPath(func(p cmp.Path) bool { // Ignore a few specific fields return (strings.HasSuffix(p.String(), "log") || strings.HasSuffix(p.String(), "sourceMutex") || - strings.HasSuffix(p.String(), "cleanupSource")) + strings.HasSuffix(p.String(), "cleanupSource") || + strings.HasSuffix(p.String(), "collectFn")) }, cmp.Ignore()), } if diff := cmp.Diff(tc.expResult, result, cmpOpts...); diff != "" { @@ -338,7 +333,7 @@ func TestPromExp_Collector_Prune(t *testing.T) { } defer cleanup() - defaultCollector, err := NewCollector(log, nil, engSrc) + defaultCollector, err := NewEngineCollector(log, nil, engSrc) if err != nil { t.Fatalf("failed to create collector: %s", err.Error()) } @@ -357,12 +352,12 @@ func TestPromExp_Collector_Prune(t *testing.T) { } } - engSrc.rmSchema.mu.Lock() - for m := range engSrc.rmSchema.rankMetrics { - _, name := extractLabels(m) + engSrc.smSchema.mu.Lock() + for m := range engSrc.smSchema.sourceMetrics { + _, name := extractLabels(log, m) names = append(names, name) } - engSrc.rmSchema.mu.Unlock() + engSrc.smSchema.mu.Unlock() sort.Strings(names) return @@ -373,7 +368,7 @@ func TestPromExp_Collector_Prune(t *testing.T) { for _, m := range maps { for t, m := range m { if t != telemetry.MetricTypeDirectory && t != telemetry.MetricTypeLink { - _, name := extractLabels(m.FullPath()) + _, name := extractLabels(log, m.FullPath()) unique[name] = struct{}{} } } @@ -422,7 +417,7 @@ func TestPromExp_Collector_Collect(t *testing.T) { } defer cleanup() - defaultCollector, err := NewCollector(log, nil, engSrc) + defaultCollector, err := 
NewEngineCollector(log, nil, engSrc) if err != nil { t.Fatalf("failed to create collector: %s", err.Error()) } @@ -433,7 +428,7 @@ func TestPromExp_Collector_Collect(t *testing.T) { "engine_stats_gauge2", "engine_timer_duration", } - ignoreCollector, err := NewCollector(log, &CollectorOpts{ + ignoreCollector, err := NewEngineCollector(log, &CollectorOpts{ Ignores: ignores, }, engSrc) if err != nil { @@ -441,13 +436,10 @@ func TestPromExp_Collector_Collect(t *testing.T) { } for name, tc := range map[string]struct { - collector *Collector + collector *EngineCollector resultChan chan prometheus.Metric expMetricNames []string }{ - "nil collector": { - resultChan: make(chan prometheus.Metric), - }, "nil channel": { collector: defaultCollector, }, @@ -518,7 +510,7 @@ func TestPromExp_Collector_Collect(t *testing.T) { } } -func TestPromExp_extractLabels(t *testing.T) { +func TestPromExp_extractEngineLabels(t *testing.T) { for name, tc := range map[string]struct { input string expName string @@ -632,7 +624,10 @@ func TestPromExp_extractLabels(t *testing.T) { }, } { t.Run(name, func(t *testing.T) { - labels, name := extractLabels(tc.input) + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + labels, name := extractLabels(log, tc.input) test.AssertEqual(t, name, tc.expName, "") if diff := cmp.Diff(labels, tc.expLabels); diff != "" { @@ -692,7 +687,7 @@ func TestPromExp_Collector_AddSource(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) - collector, err := NewCollector(log, nil, tc.startSrc...) + collector, err := NewEngineCollector(log, nil, tc.startSrc...) if err != nil { t.Fatalf("failed to set up collector: %s", err) } @@ -795,7 +790,7 @@ func TestPromExp_Collector_RemoveSource(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) - collector, err := NewCollector(log, nil, tc.startSrc...) 
+ collector, err := NewEngineCollector(log, nil, tc.startSrc...) if err != nil { t.Fatalf("failed to set up collector: %s", err) } @@ -805,7 +800,10 @@ func TestPromExp_Collector_RemoveSource(t *testing.T) { collector.RemoveSource(tc.idx) - if diff := cmp.Diff(tc.expSrc, collector.sources, cmpopts.IgnoreUnexported(EngineSource{})); diff != "" { + cmpOpts := cmp.Options{ + cmpopts.IgnoreUnexported(MetricSource{}), + } + if diff := cmp.Diff(tc.expSrc, collector.sources, cmpOpts...); diff != "" { t.Fatalf("(-want, +got)\n%s", diff) } diff --git a/src/control/lib/telemetry/promexp/httpd.go b/src/control/lib/telemetry/promexp/httpd.go new file mode 100644 index 000000000000..2f4c86d485dc --- /dev/null +++ b/src/control/lib/telemetry/promexp/httpd.go @@ -0,0 +1,100 @@ +// +// (C) Copyright 2021-2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build linux && (amd64 || arm64) +// +build linux +// +build amd64 arm64 + +package promexp + +import ( + "context" + "fmt" + "net/http" + "time" + + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + + "github.com/daos-stack/daos/src/control/logging" +) + +type ( + // RegMonFn defines a function signature for registering a Prometheus + // monitor. + RegMonFn func(context.Context, logging.Logger) error + + // ExporterConfig defines the configuration for the Prometheus exporter. + ExporterConfig struct { + Port int + Title string + Register RegMonFn + } +) + +const ( + // EngineTelemetryPort specifies the default port for engine telemetry. + EngineTelemetryPort = 9191 + // ClientTelemetryPort specifies the default port for client telemetry. + ClientTelemetryPort = 9192 +) + +// StartExporter starts the Prometheus exporter. 
+func StartExporter(ctx context.Context, log logging.Logger, cfg *ExporterConfig) (func(), error) { + if cfg == nil { + return nil, errors.New("invalid exporter config: nil config") + } + + if cfg.Port <= 0 { + return nil, errors.New("invalid exporter config: bad port") + } + + if cfg.Register == nil { + return nil, errors.New("invalid exporter config: nil register function") + } + + if err := cfg.Register(ctx, log); err != nil { + return nil, errors.Wrap(err, "failed to register client monitor") + } + + listenAddress := fmt.Sprintf("0.0.0.0:%d", cfg.Port) + + srv := http.Server{Addr: listenAddress} + http.Handle("/metrics", promhttp.HandlerFor( + prometheus.DefaultGatherer, promhttp.HandlerOpts{}, + )) + http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + num, err := w.Write([]byte(fmt.Sprintf(` + %s + +

%s

+

Metrics

+ + `, cfg.Title, cfg.Title))) + if err != nil { + log.Errorf("%d: %s", num, err) + } + }) + + // http listener is a blocking call + go func() { + log.Infof("Listening on %s", listenAddress) + err := srv.ListenAndServe() + log.Infof("Prometheus web exporter stopped: %s", err.Error()) + }() + + return func() { + log.Debug("Shutting down Prometheus web exporter") + + // When this cleanup function is called, the original context + // will probably have already been canceled. + timedCtx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + defer cancel() + if err := srv.Shutdown(timedCtx); err != nil { + log.Noticef("HTTP server didn't shut down within timeout: %s", err.Error()) + } + }, nil +} diff --git a/src/control/lib/telemetry/promexp/httpd_test.go b/src/control/lib/telemetry/promexp/httpd_test.go new file mode 100644 index 000000000000..db69e122b714 --- /dev/null +++ b/src/control/lib/telemetry/promexp/httpd_test.go @@ -0,0 +1,118 @@ +// +// (C) Copyright 2021-2024 Intel Corporation. 
+// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package promexp_test + +import ( + "context" + "fmt" + "io" + "net/http" + "strings" + "testing" + "time" + + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/lib/telemetry/promexp" + "github.com/daos-stack/daos/src/control/logging" +) + +func TestPromExp_StartExporter(t *testing.T) { + for name, tc := range map[string]struct { + cfg *promexp.ExporterConfig + expErr error + }{ + "nil cfg": { + expErr: errors.New("invalid exporter config"), + }, + "empty cfg invalid": { + cfg: &promexp.ExporterConfig{}, + expErr: errors.New("invalid exporter config"), + }, + "negative port": { + cfg: &promexp.ExporterConfig{ + Port: -1, + }, + expErr: errors.New("invalid exporter config"), + }, + "nil register fn": { + cfg: &promexp.ExporterConfig{ + Port: 1234, + }, + expErr: errors.New("invalid exporter config"), + }, + "register fn fails": { + cfg: &promexp.ExporterConfig{ + Port: 1234, + Register: func(context.Context, logging.Logger) error { + return errors.New("whoops") + }, + }, + expErr: errors.New("failed to register"), + }, + "success": { + cfg: &promexp.ExporterConfig{ + Port: promexp.ClientTelemetryPort, + Register: func(ctx context.Context, log logging.Logger) error { + return nil + }, + }, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + if tc.cfg != nil { + tc.cfg.Title = t.Name() + } + cleanup, err := promexp.StartExporter(test.Context(t), log, tc.cfg) + test.CmpErr(t, tc.expErr, err) + if tc.expErr != nil { + return + } + + // Quick tests to make sure the exporter is listening and + // that our handlers are invoked. 
+ var resp *http.Response + for { + var err error + resp, err = http.Get(fmt.Sprintf("http://localhost:%d/", tc.cfg.Port)) + if err == nil { + break + } + log.Errorf("failed to connect to exporter: %+v", err) + time.Sleep(100 * time.Millisecond) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + if !strings.Contains(string(body), tc.cfg.Title) { + t.Fatalf("expected %q to contain %q", string(body), tc.cfg.Title) + } + resp.Body.Close() + + resp, err = http.Get(fmt.Sprintf("http://localhost:%d/metrics", tc.cfg.Port)) + if err != nil { + t.Fatal(err) + } + resp.Body.Close() + + cleanup() + time.Sleep(1 * time.Second) + + // Make sure the exporter is no longer listening. + _, err = http.Get(fmt.Sprintf("http://localhost:%d/", tc.cfg.Port)) + if err == nil { + t.Fatal("expected http Get to fail on closed port") + } + }) + } +} diff --git a/src/control/lib/telemetry/promexp/source.go b/src/control/lib/telemetry/promexp/source.go new file mode 100644 index 000000000000..2212b319ff7e --- /dev/null +++ b/src/control/lib/telemetry/promexp/source.go @@ -0,0 +1,214 @@ +// +// (C) Copyright 2021-2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build linux && (amd64 || arm64) +// +build linux +// +build amd64 arm64 + +package promexp + +import ( + "context" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "github.com/daos-stack/daos/src/control/lib/atm" + "github.com/daos-stack/daos/src/control/lib/telemetry" + "github.com/daos-stack/daos/src/control/logging" +) + +type ( + sourceMetricSchema struct { + mu sync.Mutex + sourceMetrics map[string]*sourceMetric + seen map[string]struct{} + addFn func(logging.Logger, telemetry.Metric) *sourceMetric + } + + // MetricSource encapsulates the logic and data for collecting telemetry + // from a DAOS metrics source. 
+ MetricSource struct { + ctx context.Context + tmMutex sync.RWMutex // To protect telemetry collection + enabled atm.Bool + tmSchema *telemetry.Schema + smSchema *sourceMetricSchema + } +) + +func newSourceMetricSchema(addFn func(logging.Logger, telemetry.Metric) *sourceMetric) *sourceMetricSchema { + return &sourceMetricSchema{ + sourceMetrics: make(map[string]*sourceMetric), + seen: make(map[string]struct{}), + addFn: addFn, + } +} + +// Prune removes any metrics that have not been seen since the last call to Prune. +func (s *sourceMetricSchema) Prune() { + s.mu.Lock() + defer s.mu.Unlock() + + for id := range s.sourceMetrics { + if _, found := s.seen[id]; !found { + delete(s.sourceMetrics, id) + } + } + s.seen = make(map[string]struct{}) +} + +func (s *sourceMetricSchema) add(log logging.Logger, metric telemetry.Metric) (sm *sourceMetric) { + s.mu.Lock() + defer s.mu.Unlock() + + id := metric.FullPath() + s.seen[id] = struct{}{} + + var found bool + if sm, found = s.sourceMetrics[id]; !found { + sm = s.addFn(log, metric) + s.sourceMetrics[id] = sm + } else { + sm.resetVecs() + } + + return +} + +func defaultCollectorOpts() *CollectorOpts { + return &CollectorOpts{} +} + +// sourceMetric defines a wrapper for the wrapped telemetry.Metric instance. +type sourceMetric struct { + metric telemetry.Metric + baseName string + labels labelMap + gvm gvMap + cvm cvMap +} + +// collect sends the metrics vectors in the sourceMetric struct to the provided channel. +func (bm *sourceMetric) collect(ch chan<- prometheus.Metric) { + for _, gv := range bm.gvm { + gv.Collect(ch) + } + for _, cv := range bm.cvm { + cv.Collect(ch) + } +} + +// resetVecs resets all the metrics vectors in the sourceMetric struct. +func (bm *sourceMetric) resetVecs() { + for _, gv := range bm.gvm { + gv.Reset() + } + for _, cv := range bm.cvm { + cv.Reset() + } +} + +// newSourceMetric initializes a new sourceMetric struct. 
+func newSourceMetric(log logging.Logger, m telemetry.Metric, baseName string, labels labelMap) *sourceMetric { + sm := &sourceMetric{ + metric: m, + baseName: baseName, + labels: labels, + gvm: make(gvMap), + cvm: make(cvMap), + } + + desc := m.Desc() + + switch sm.metric.Type() { + case telemetry.MetricTypeGauge, telemetry.MetricTypeTimestamp, + telemetry.MetricTypeSnapshot: + sm.gvm.add(sm.baseName, desc, sm.labels) + case telemetry.MetricTypeStatsGauge, telemetry.MetricTypeDuration: + sm.gvm.add(sm.baseName, desc, sm.labels) + for _, ms := range getMetricStats(sm.baseName, sm.metric) { + if ms.isCounter { + sm.cvm.add(ms.name, ms.desc, sm.labels) + } else { + sm.gvm.add(ms.name, ms.desc, sm.labels) + } + } + case telemetry.MetricTypeCounter: + sm.cvm.add(sm.baseName, desc, sm.labels) + default: + log.Errorf("[%s]: metric type %d not supported", baseName, sm.metric.Type()) + } + + return sm +} + +// IsEnabled checks if the source is enabled. +func (s *MetricSource) IsEnabled() bool { + return s.enabled.IsTrue() +} + +// Enable enables the source. +func (s *MetricSource) Enable() { + s.enabled.SetTrue() +} + +// Disable disables the source. +func (s *MetricSource) Disable() { + s.enabled.SetFalse() +} + +// Collect invokes telemetry.CollectMetrics() for the metrics context +// managed by this source. The collected metrics are sent to the provided channel. 
+func (s *MetricSource) Collect(log logging.Logger, ch chan<- *sourceMetric) { + if s == nil { + log.Error("nil source") + return + } + if !s.IsEnabled() { + return + } + if ch == nil { + log.Error("nil channel") + return + } + + s.tmMutex.RLock() + defer s.tmMutex.RUnlock() + + metrics := make(chan telemetry.Metric) + go func() { + if err := telemetry.CollectMetrics(s.ctx, s.tmSchema, metrics); err != nil { + log.Errorf("failed to collect metrics: %s", err) + return + } + s.tmSchema.Prune() + }() + + for metric := range metrics { + ch <- s.smSchema.add(log, metric) + } + s.smSchema.Prune() +} + +// PruneSegments prunes unused telemetry segments. +func (s *MetricSource) PruneSegments(log logging.Logger, maxSegAge time.Duration) { + if s == nil { + log.Error("nil source") + return + } + if !s.IsEnabled() { + return + } + + if err := telemetry.PruneUnusedSegments(s.ctx, maxSegAge); err != nil { + log.Errorf("failed to prune segments: %s", err) + return + } + + s.tmSchema.Prune() + s.smSchema.Prune() +} diff --git a/src/control/lib/telemetry/promexp/util.go b/src/control/lib/telemetry/promexp/util.go new file mode 100644 index 000000000000..6ddc46623d30 --- /dev/null +++ b/src/control/lib/telemetry/promexp/util.go @@ -0,0 +1,170 @@ +// +// (C) Copyright 2021-2024 Intel Corporation. 
+// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build linux && (amd64 || arm64) +// +build linux +// +build amd64 arm64 + +package promexp + +import ( + "sort" + "strings" + "unicode" + + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + + "github.com/daos-stack/daos/src/control/lib/telemetry" +) + +type labelMap map[string]string + +func (lm labelMap) keys() (keys []string) { + for label := range lm { + keys = append(keys, label) + } + sort.Strings(keys) + + return +} + +func sanitizeMetricName(in string) string { + return strings.Map(func(r rune) rune { + switch { + // Valid names for Prometheus are limited to: + case r >= 'a' && r <= 'z': // lowercase letters + case r >= 'A' && r <= 'Z': // uppercase letters + case unicode.IsDigit(r): // digits + default: // sanitize any other character + return '_' + } + + return r + }, strings.TrimLeft(in, "/")) +} + +func matchLabel(labels labelMap, input, match, label string) bool { + if !strings.HasPrefix(input, match) { + return false + } + + splitStr := strings.SplitN(input, "_", 2) + if len(splitStr) == 2 { + labels[label] = splitStr[1] + return true + } + return false +} + +func appendName(cur, name string) string { + if cur == "" { + return name + } + return cur + "_" + name +} + +type gvMap map[string]*prometheus.GaugeVec + +func (m gvMap) add(name, help string, labels labelMap) { + if _, found := m[name]; !found { + gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: name, + Help: help, + }, labels.keys()) + m[name] = gv + } +} + +func (m gvMap) set(name string, value float64, labels labelMap) error { + gv, found := m[name] + if !found { + return errors.Errorf("gauge vector %s not found", name) + } + gv.With(prometheus.Labels(labels)).Set(value) + + return nil +} + +type cvMap map[string]*prometheus.CounterVec + +func (m cvMap) add(name, help string, labels labelMap) { + if _, found := m[name]; !found { + cv := prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: 
name, + Help: help, + }, labels.keys()) + m[name] = cv + } +} + +func (m cvMap) set(name string, value float64, labels labelMap) error { + cv, found := m[name] + if !found { + return errors.Errorf("counter vector %s not found", name) + } + cv.With(prometheus.Labels(labels)).Add(value) + + return nil +} + +type metricStat struct { + name string + desc string + value float64 + isCounter bool +} + +func getMetricStats(baseName string, m telemetry.Metric) (stats []*metricStat) { + ms, ok := m.(telemetry.StatsMetric) + if !ok { + return []*metricStat{} + } + + for name, s := range map[string]struct { + fn func() float64 + desc string + isCounter bool + }{ + "min": { + fn: func() float64 { return float64(ms.Min()) }, + desc: " (min value)", + }, + "max": { + fn: func() float64 { return float64(ms.Max()) }, + desc: " (max value)", + }, + "mean": { + fn: ms.Mean, + desc: " (mean)", + }, + "sum": { + fn: func() float64 { return float64(ms.Sum()) }, + desc: " (sum)", + }, + "stddev": { + fn: ms.StdDev, + desc: " (std dev)", + }, + "sumsquares": { + fn: ms.SumSquares, + desc: " (sum of squares)", + }, + "samples": { + fn: func() float64 { return float64(ms.SampleSize()) }, + desc: " (samples)", + isCounter: true, + }, + } { + stats = append(stats, &metricStat{ + name: baseName + "_" + name, + desc: m.Desc() + s.desc, + value: s.fn(), + isCounter: s.isCounter, + }) + } + + return +} diff --git a/src/control/lib/telemetry/promexp/util_test.go b/src/control/lib/telemetry/promexp/util_test.go new file mode 100644 index 000000000000..104da9ec3836 --- /dev/null +++ b/src/control/lib/telemetry/promexp/util_test.go @@ -0,0 +1,135 @@ +// +// (C) Copyright 2024 Intel Corporation. 
+// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package promexp + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + + "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/lib/telemetry" +) + +func TestPromExp_sanitizeMetricName(t *testing.T) { + for input, tc := range map[string]struct { + expOutput string + }{ + "": { + expOutput: "", + }, + "azAZ09": { + expOutput: "azAZ09", + }, + "/a-z A-Z 0-9/": { + expOutput: "a_z_A_Z_0_9_", + }, + } { + t.Run(input, func(t *testing.T) { + got := sanitizeMetricName(input) + if got != tc.expOutput { + t.Errorf("sanitizeMetricName(%q) = %q, want %q", input, got, tc.expOutput) + } + }) + } +} + +func TestPromExp_getMetricStats(t *testing.T) { + segID := telemetry.NextTestID(telemetry.PromexpIDBase) + telemetry.InitTestMetricsProducer(t, segID, 4096) + defer telemetry.CleanupTestMetricsProducer(t) + testValues := []uint64{1, 2, 3, 4, 5} + + ctx, err := telemetry.Init(test.Context(t), uint32(segID)) + if err != nil { + t.Fatalf("Init: %v", err) + } + + for name, tc := range map[string]struct { + baseName string + metric *telemetry.TestMetric + expStats []*metricStat + }{ + "non-stats gauge": { + baseName: "gauge", + metric: &telemetry.TestMetric{ + Name: "gauge", + Type: telemetry.MetricTypeGauge, + Cur: 1.0, + }, + expStats: []*metricStat{}, + }, + "stats gauge": { + baseName: "stats_gauge", + metric: &telemetry.TestMetric{ + Name: "stats_gauge", + Type: telemetry.MetricTypeStatsGauge, + Values: testValues, + }, + expStats: []*metricStat{ + { + name: "stats_gauge_min", + desc: " (min value)", + value: 1.0, + }, + { + name: "stats_gauge_max", + desc: " (max value)", + value: 5.0, + }, + { + name: "stats_gauge_mean", + desc: " (mean)", + value: 3.0, + }, + { + name: "stats_gauge_sum", + desc: " (sum)", + value: 15.0, + }, + { + name: "stats_gauge_samples", + desc: " (samples)", + value: 5, + isCounter: true, + }, + { + name: 
"stats_gauge_stddev", + desc: " (std dev)", + value: 1.58113883, + }, + { + name: "stats_gauge_sumsquares", + desc: " (sum of squares)", + value: 55, + }, + }, + }, + } { + t.Run(name, func(t *testing.T) { + telemetry.AddTestMetric(t, tc.metric) + + m, err := tc.metric.GetMetric(ctx) + if err != nil { + t.Fatalf("GetMetric: %v", err) + } + + got := getMetricStats(tc.baseName, m) + cmpOpts := cmp.Options{ + cmp.AllowUnexported(metricStat{}), + cmpopts.EquateApprox(0.000000001, 0.0), + cmpopts.SortSlices(func(a, b *metricStat) bool { + return a.name < b.name + }), + } + if diff := cmp.Diff(tc.expStats, got, cmpOpts...); diff != "" { + t.Fatalf("(-want, +got)\n%s", diff) + } + }) + } +} diff --git a/src/control/lib/telemetry/shm.go b/src/control/lib/telemetry/shm.go new file mode 100644 index 000000000000..99fd95aaa3a1 --- /dev/null +++ b/src/control/lib/telemetry/shm.go @@ -0,0 +1,103 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package telemetry + +/* +#include +#include +#include +*/ +import "C" + +import ( + "time" + + "github.com/pkg/errors" +) + +type shmidStat struct { + id C.int + ds C.struct_shmid_ds +} + +// Size returns the size of segment in bytes. +func (s *shmidStat) Size() int { + return int(s.ds.shm_segsz) +} + +// Atime returns the time of last shmat(2). +func (s *shmidStat) Atime() time.Time { + return time.Unix(int64(s.ds.shm_atime), 0) +} + +// Dtime returns the time of last shmdt(2). +func (s *shmidStat) Dtime() time.Time { + return time.Unix(int64(s.ds.shm_dtime), 0) +} + +// Ctime returns the time of last shmctl(2) or creation time. +func (s *shmidStat) Ctime() time.Time { + return time.Unix(int64(s.ds.shm_ctime), 0) +} + +// Cpid returns the creator pid. +func (s *shmidStat) Cpid() int { + return int(s.ds.shm_cpid) +} + +// Lpid returns the last shmat(2)/shmdt(2) pid. 
+func (s *shmidStat) Lpid() int { + return int(s.ds.shm_lpid) +} + +// Nattach returns the number of attached processes. +func (s *shmidStat) Nattach() int { + return int(s.ds.shm_nattch) +} + +// C returns the C struct. +func (s *shmidStat) C() *C.struct_shmid_ds { + return &s.ds +} + +func shmStat(id C.int) (*shmidStat, error) { + st := shmidStat{ + id: id, + } + rc, err := C.shmctl(id, C.IPC_STAT, &st.ds) + if rc != 0 { + return nil, errors.Wrapf(err, "shmctl(IPC_STAT, %d)", id) + } + + return &st, nil +} + +func shmStatKey(key C.key_t) (*shmidStat, error) { + id, err := C.shmget(key, 0, 0) + if id < 0 { + return nil, errors.Wrapf(err, "shmget(%d, 0, 0)", key) + } + + return shmStat(id) +} + +func shmChown(key C.key_t, uid C.uid_t, gid C.gid_t) error { + st, err := shmStatKey(key) + if err != nil { + return err + } + + st.ds.shm_perm.gid = gid + st.ds.shm_perm.uid = uid + + rc, err := C.shmctl(st.id, C.IPC_SET, st.C()) + if rc != 0 { + return errors.Wrapf(err, "shmctl(IPC_SET, %d)", st.id) + } + + return nil +} diff --git a/src/control/lib/telemetry/telemetry.go b/src/control/lib/telemetry/telemetry.go index da93ffa55a40..9a626c85634b 100644 --- a/src/control/lib/telemetry/telemetry.go +++ b/src/control/lib/telemetry/telemetry.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -14,8 +14,28 @@ package telemetry /* #cgo LDFLAGS: -lgurt -#include "gurt/telemetry_common.h" -#include "gurt/telemetry_consumer.h" +#include +#include +#include +#include + +static int +rm_ephemeral_dir(const char *path) +{ + return d_tm_del_ephemeral_dir(path); +} + +static int +add_ephemeral_dir(struct d_tm_node_t **node, size_t size_bytes, char *path) +{ + return d_tm_add_ephemeral_dir(node, size_bytes, path); +} + +static int +attach_segment_path(key_t key, char *path) +{ + return d_tm_attach_path_segment(key, path); +} */ import "C" @@ -25,12 +45,19 @@ import ( "io" "os" "path/filepath" + "sort" + "strconv" "strings" "sync" "time" "unsafe" "github.com/pkg/errors" + "golang.org/x/sys/unix" + + "github.com/daos-stack/daos/src/control/common" + "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/logging" ) type MetricType int @@ -46,6 +73,11 @@ const ( MetricTypeDirectory MetricType = C.D_TM_DIRECTORY MetricTypeLink MetricType = C.D_TM_LINK + ClientJobRootID = C.DC_TM_JOB_ROOT_ID + ClientJobMax = 1024 + ClientMetricsEnabledEnv = C.DAOS_CLIENT_METRICS_ENABLE + ClientMetricsRetainEnv = C.DAOS_CLIENT_METRICS_RETAIN + BadUintVal = ^uint64(0) BadFloatVal = float64(BadUintVal) BadIntVal = int64(BadUintVal >> 1) @@ -81,7 +113,7 @@ type ( type ( handle struct { sync.RWMutex - idx uint32 + id uint32 rank *uint32 ctx *C.struct_d_tm_context root *C.struct_d_tm_node_t @@ -109,6 +141,34 @@ const ( handleKey telemetryKey = "handle" ) +func (mt MetricType) String() string { + strFmt := func(name string) string { + numStr := strconv.Itoa(int(mt)) + return name + " (" + numStr + ")" + } + + switch mt { + case MetricTypeDirectory: + return strFmt("directory") + case MetricTypeCounter: + return strFmt("counter") + case MetricTypeTimestamp: + return strFmt("timestamp") + case MetricTypeSnapshot: + return strFmt("snapshot") + case MetricTypeDuration: + return strFmt("duration") + case 
MetricTypeGauge: + return strFmt("gauge") + case MetricTypeStatsGauge: + return strFmt("gauge (stats)") + case MetricTypeLink: + return strFmt("link") + default: + return strFmt("unknown") + } +} + func (h *handle) isValid() bool { return h != nil && h.ctx != nil && h.root != nil } @@ -295,24 +355,43 @@ func collectGarbageLoop(ctx context.Context, ticker *time.Ticker) { } } +func initClientRoot(parent context.Context, shmID uint32) (context.Context, error) { + if parent == nil { + return nil, errors.New("nil parent context") + } + + shmSize := C.ulong(ClientJobMax * C.D_TM_METRIC_SIZE) + + rc := C.d_tm_init(C.int(shmID), shmSize, C.D_TM_OPEN_OR_CREATE) + if rc != 0 { + return nil, errors.Errorf("failed to init client root: %s", daos.Status(rc)) + } + + return Init(parent, shmID) +} + +func InitClientRoot(ctx context.Context) (context.Context, error) { + return initClientRoot(ctx, ClientJobRootID) +} + // Init initializes the telemetry bindings -func Init(parent context.Context, idx uint32) (context.Context, error) { +func Init(parent context.Context, id uint32) (context.Context, error) { if parent == nil { return nil, errors.New("nil parent context") } - tmCtx := C.d_tm_open(C.int(idx)) + tmCtx := C.d_tm_open(C.int(id)) if tmCtx == nil { - return nil, errors.Errorf("no shared memory segment found for idx: %d", idx) + return nil, errors.Errorf("no shared memory segment found for key: %d", id) } root := C.d_tm_get_root(tmCtx) if root == nil { - return nil, errors.Errorf("no root node found in shared memory segment for idx: %d", idx) + return nil, errors.Errorf("no root node found in shared memory segment for key: %d", id) } handle := &handle{ - idx: idx, + id: id, ctx: tmCtx, root: root, } @@ -323,6 +402,11 @@ func Init(parent context.Context, idx uint32) (context.Context, error) { return newCtx, nil } +// Fini releases resources claimed by Init(). 
+func Fini() { + C.d_tm_fini() +} + // Detach detaches from the telemetry handle func Detach(ctx context.Context) { if hdl, err := getHandle(ctx); err == nil { @@ -333,6 +417,38 @@ func Detach(ctx context.Context) { } } +func addEphemeralDir(path string, shmSize uint64) error { + cPath := C.CString(path) + defer C.free(unsafe.Pointer(cPath)) + if rc := C.add_ephemeral_dir(nil, C.ulong(shmSize), cPath); rc != 0 { + return daos.Status(rc) + } + + return nil +} + +// SetupClientRoot performs the necessary actions to get the client telemetry +// segment linked into the agent-managed tree. +func SetupClientRoot(ctx context.Context, jobid string, pid, shm_key int) error { + log := logging.FromContext(ctx) + + if err := addEphemeralDir(jobid, ClientJobMax*C.D_TM_METRIC_SIZE); err != nil { + if err != daos.Exists { + return errors.Wrapf(err, "failed to add client job path %q", jobid) + } + } + + pidPath := filepath.Join(jobid, string(PathSep), strconv.Itoa(pid)) + cPidPath := C.CString(pidPath) + defer C.free(unsafe.Pointer(cPidPath)) + if rc := C.attach_segment_path(C.key_t(shm_key), cPidPath); rc != 0 { + return errors.Wrapf(daos.Status(rc), "failed to attach client segment 0x%x at %q", shm_key, pidPath) + } + + log.Tracef("attached client segment @ %q (key: 0x%x)", pidPath, shm_key) + return nil +} + type Schema struct { mu sync.RWMutex metrics map[string]Metric @@ -413,10 +529,12 @@ func NewSchema() *Schema { } -func visit(hdl *handle, s *Schema, node *C.struct_d_tm_node_t, pathComps string, out chan<- Metric) { +type procNodeFn func(hdl *handle, id string, node *C.struct_d_tm_node_t) + +func visit(hdl *handle, node *C.struct_d_tm_node_t, pathComps string, procLinks bool, procNode procNodeFn) { var next *C.struct_d_tm_node_t - if node == nil { + if node == nil || procNode == nil { return } name := C.GoString(C.d_tm_get_name(hdl.ctx, node)) @@ -425,29 +543,30 @@ func visit(hdl *handle, s *Schema, node *C.struct_d_tm_node_t, pathComps string, id = name } - cType := 
node.dtn_type - switch cType { + switch node.dtn_type { case C.D_TM_DIRECTORY: next = C.d_tm_get_child(hdl.ctx, node) if next != nil { - visit(hdl, s, next, id, out) + visit(hdl, next, id, procLinks, procNode) } case C.D_TM_LINK: next = C.d_tm_follow_link(hdl.ctx, node) if next != nil { + if procLinks { + // Use next to get the linked shm key + procNode(hdl, id, next) + } + // link leads to a directory with the same name - visit(hdl, s, next, pathComps, out) + visit(hdl, next, pathComps, procLinks, procNode) } default: - m := s.Add(hdl, id, cType, node) - if m != nil { - out <- m - } + procNode(hdl, id, node) } next = C.d_tm_get_sibling(hdl.ctx, node) if next != nil && next != node { - visit(hdl, s, next, pathComps, out) + visit(hdl, next, pathComps, procLinks, procNode) } } @@ -465,8 +584,98 @@ func CollectMetrics(ctx context.Context, s *Schema, out chan<- Metric) error { return errors.New("invalid handle") } - node := hdl.root - visit(hdl, s, node, "", out) + procNode := func(hdl *handle, id string, node *C.struct_d_tm_node_t) { + m := s.Add(hdl, id, node.dtn_type, node) + if m != nil { + out <- m + } + } + + visit(hdl, hdl.root, "", false, procNode) + + return nil +} + +// PruneUnusedSegments removes shared memory segments associated with +// unused ephemeral subdirectories. 
+func PruneUnusedSegments(ctx context.Context, maxSegAge time.Duration) error { + log := logging.FromContext(ctx) + + hdl, err := getHandle(ctx) + if err != nil { + return err + } + hdl.Lock() + defer hdl.Unlock() + + if !hdl.isValid() { + return errors.New("invalid handle") + } + + var toPrune []string + procNode := func(hdl *handle, id string, node *C.struct_d_tm_node_t) { + if node == nil || node.dtn_type != C.D_TM_DIRECTORY { + return + } + + path := id + comps := strings.SplitN(path, string(PathSep), 2) + if strings.HasPrefix(comps[0], "ID:") && len(comps) > 1 { + path = comps[1] + } + + st, err := shmStatKey(node.dtn_shmem_key) + if err != nil { + log.Errorf("failed to shmStat(%s): %s", path, err) + return + } + + log.Tracef("path:%s shmid:%d spid:%d cpid:%d lpid:%d age:%s", + path, st.id, os.Getpid(), st.Cpid(), st.Lpid(), time.Since(st.Ctime())) + + // If the creator process was someone other than us, and it's still + // around, don't mess with the segment. + if _, err := common.GetProcName(st.Cpid()); err == nil && st.Cpid() != unix.Getpid() { + return + } + + if time.Since(st.Ctime()) <= maxSegAge { + return + } + + log.Tracef("adding %s to prune list", path) + toPrune = append(toPrune, path) + } + + visit(hdl, hdl.root, "", true, procNode) + + sort.Sort(sort.Reverse(sort.StringSlice(toPrune))) + for _, path := range toPrune { + log.Tracef("pruning %s", path) + if err := removeLink(hdl, path); err != nil { + log.Errorf("failed to prune %s: %s", path, err) + } + } + + return nil +} + +func removeLink(hdl *handle, path string) error { + _, err := findNode(hdl, path) + if err != nil { + return err + } + + cPath := C.CString(path) + defer C.free(unsafe.Pointer(cPath)) + rc := C.rm_ephemeral_dir(cPath) + if rc != 0 { + return errors.Wrapf(daos.Status(rc), "failed to remove link %q", path) + } + + if _, err := findNode(hdl, path); err == nil { + return errors.Errorf("failed to remove %s", path) + } return nil } diff --git 
a/src/control/lib/telemetry/telemetry_test.go b/src/control/lib/telemetry/telemetry_test.go index a645f0e60e4d..bc63cc813998 100644 --- a/src/control/lib/telemetry/telemetry_test.go +++ b/src/control/lib/telemetry/telemetry_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -9,6 +9,9 @@ package telemetry import ( "context" "fmt" + "os" + "os/exec" + "strconv" "sync" "testing" "time" @@ -16,6 +19,7 @@ import ( "github.com/pkg/errors" "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/logging" ) func TestTelemetry_Init(t *testing.T) { @@ -50,7 +54,7 @@ func TestTelemetry_Init(t *testing.T) { t.Fatalf("can't get handle from result ctx: %v", err) } - test.AssertEqual(t, uint32(producerID), hdl.idx, "handle.idx doesn't match shmem ID") + test.AssertEqual(t, uint32(producerID), hdl.id, "handle.id doesn't match shmem ID") hdl.RLock() defer hdl.RUnlock() @@ -179,6 +183,106 @@ func TestTelemetry_GetRank(t *testing.T) { } } +func childErrExit(err error) { + if err == nil { + err = errors.New("unknown error") + } + fmt.Fprintf(os.Stderr, "CHILD ERROR: %s\n", err) + os.Exit(1) +} + +const ( + childModeEnvVar = "TEST_CHILD_MODE" + childModeLinkTest = "CHILD_MODE_LINK_TEST" + childShmIDEnvVar = "TEST_CHILD_SHM_ID" +) + +func TestMain(m *testing.M) { + mode := os.Getenv(childModeEnvVar) + switch mode { + case "": + // default; run the test binary + os.Exit(m.Run()) + case childModeLinkTest: + runChildTelemProc() + default: + childErrExit(errors.Errorf("Unknown child mode: %q", mode)) + } +} + +func runChildTelemProc() { + pid := os.Getpid() + shmID, err := strconv.Atoi(os.Getenv(childShmIDEnvVar)) + if err != nil { + childErrExit(err) + } + + jobDir := TestMetricsMap{ + MetricTypeDirectory: &TestMetric{ + Name: "job", + }, + } + pidLink := TestMetricsMap{ + MetricTypeLink: &TestMetric{ + Name: 
fmt.Sprintf("job/%d", pid), + }, + } + startedAt := TestMetricsMap{ + MetricTypeTimestamp: &TestMetric{ + Name: fmt.Sprintf("job/%d/started_at", pid), + }, + } + + t := &testing.T{} + + InitTestMetricsProducer(t, shmID, 1024) + + AddTestMetrics(t, jobDir) + AddTestMetrics(t, pidLink) + AddTestMetrics(t, startedAt) + + if t.Failed() { + childErrExit(errors.New("test failed")) + } +} + +func TestTelemetry_PruneSegments(t *testing.T) { + shmID := uint32(NextTestID()) + + cmd := exec.Command(os.Args[0]) + cmd.Env = append(os.Environ(), + fmt.Sprintf("%s=%s", childModeEnvVar, childModeLinkTest), + fmt.Sprintf("%s=%d", childShmIDEnvVar, shmID), + ) + if out, err := cmd.CombinedOutput(); err != nil { + t.Errorf("child failed: %s", out) + t.Fatal(err) + } + + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + ctx, err := initClientRoot(test.MustLogContext(t, log), shmID) + if err != nil { + t.Fatal(err) + } + defer func() { + Fini() + }() + + path := fmt.Sprintf("job/%d/started_at", cmd.Process.Pid) + _, err = GetTimestamp(ctx, path) + test.CmpErr(t, nil, err) + + err = PruneUnusedSegments(ctx, time.Nanosecond) + test.CmpErr(t, nil, err) + + _, err = GetTimestamp(ctx, path) + if err == nil { + t.Fatal("expected GetTimestamp() to fail after prune") + } +} + func TestTelemetry_CollectMetrics(t *testing.T) { testMetrics := TestMetricsMap{ MetricTypeCounter: &TestMetric{ diff --git a/src/control/lib/telemetry/test_helpers.go b/src/control/lib/telemetry/test_helpers.go index c0cbdda72ef1..bc014eb2502c 100644 --- a/src/control/lib/telemetry/test_helpers.go +++ b/src/control/lib/telemetry/test_helpers.go @@ -19,6 +19,8 @@ import ( "testing" "time" + "github.com/pkg/errors" + "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/lib/daos" ) @@ -60,6 +62,7 @@ var nextIDMutex sync.Mutex const ( telemetryIDBase = 100 PromexpIDBase = 200 + AgentIDBase = 300 ) // NextTestID gets the next available ID for a 
shmem segment. This helps avoid @@ -80,6 +83,7 @@ func NextTestID(base ...int) int { type ( TestMetric struct { + Type MetricType Name string path string desc string @@ -87,6 +91,7 @@ type ( min uint64 max uint64 Cur float64 // value - may be exact or approximate + Values []uint64 sum uint64 mean float64 stddev float64 @@ -106,6 +111,25 @@ func (tm *TestMetric) FullPath() string { return fullName } +func (tm *TestMetric) GetMetric(ctx context.Context) (Metric, error) { + switch tm.Type { + case MetricTypeCounter: + return GetCounter(ctx, tm.FullPath()) + case MetricTypeTimestamp: + return GetTimestamp(ctx, tm.FullPath()) + case MetricTypeSnapshot: + return GetSnapshot(ctx, tm.FullPath()) + case MetricTypeDuration: + return GetDuration(ctx, tm.FullPath()) + case MetricTypeGauge: + return GetGauge(ctx, tm.FullPath()) + case MetricTypeStatsGauge: + return GetStatsGauge(ctx, tm.FullPath()) + default: + return nil, errors.Errorf("unsupported metric type %s", tm.Type) + } +} + func InitTestMetricsProducer(t *testing.T, id int, size uint64) { t.Helper() @@ -115,65 +139,82 @@ func InitTestMetricsProducer(t *testing.T, id int, size uint64) { } } +func AddTestMetric(t *testing.T, tm *TestMetric) { + t.Helper() + + fullName := tm.FullPath() + switch tm.Type { + case MetricTypeGauge: + rc := C.add_metric(&tm.node, C.D_TM_GAUGE, C.CString(tm.desc), C.CString(tm.units), C.CString(fullName)) + if rc != 0 { + t.Fatalf("failed to add %s: %s", fullName, daos.Status(rc)) + } + C.d_tm_set_gauge(tm.node, C.uint64_t(tm.Cur)) + case MetricTypeStatsGauge: + rc := C.add_metric(&tm.node, C.D_TM_STATS_GAUGE, C.CString(tm.desc), C.CString(tm.units), C.CString(fullName)) + if rc != 0 { + t.Fatalf("failed to add %s: %s", tm.Name, daos.Status(rc)) + } + + vals := make([]uint64, len(tm.Values)) + if len(tm.Values) > 0 { + copy(vals, tm.Values) + } else { + vals = []uint64{tm.min, tm.max, uint64(tm.Cur)} + } + t.Logf("setting values for %s: %+v\n", tm.FullPath(), vals) + + for _, val := range vals 
{ + C.d_tm_set_gauge(tm.node, C.uint64_t(val)) + t.Logf("set %s to %d\n", tm.FullPath(), val) + } + case MetricTypeCounter: + rc := C.add_metric(&tm.node, C.D_TM_COUNTER, C.CString(tm.desc), C.CString(tm.units), C.CString(fullName)) + if rc != 0 { + t.Fatalf("failed to add %s: %s", fullName, daos.Status(rc)) + } + C.d_tm_inc_counter(tm.node, C.ulong(tm.Cur)) + case MetricTypeDuration: + rc := C.add_metric(&tm.node, C.D_TM_DURATION|C.D_TM_CLOCK_REALTIME, C.CString(tm.desc), C.CString(tm.units), C.CString(fullName)) + if rc != 0 { + t.Fatalf("failed to add %s: %s", fullName, daos.Status(rc)) + } + C.d_tm_mark_duration_start(tm.node, C.D_TM_CLOCK_REALTIME) + time.Sleep(time.Duration(tm.Cur)) + C.d_tm_mark_duration_end(tm.node) + case MetricTypeTimestamp: + rc := C.add_metric(&tm.node, C.D_TM_TIMESTAMP, C.CString(tm.desc), C.CString(tm.units), C.CString(fullName)) + if rc != 0 { + t.Fatalf("failed to add %s: %s", fullName, daos.Status(rc)) + } + C.d_tm_record_timestamp(tm.node) + case MetricTypeSnapshot: + rc := C.add_metric(&tm.node, C.D_TM_TIMER_SNAPSHOT|C.D_TM_CLOCK_REALTIME, C.CString(tm.desc), C.CString(tm.units), C.CString(fullName)) + if rc != 0 { + t.Fatalf("failed to add %s: %s", fullName, daos.Status(rc)) + } + C.d_tm_take_timer_snapshot(tm.node, C.D_TM_CLOCK_REALTIME) + case MetricTypeDirectory: + rc := C.add_metric(&tm.node, C.D_TM_DIRECTORY, C.CString(tm.desc), C.CString(tm.units), C.CString(fullName)) + if rc != 0 { + t.Fatalf("failed to add %s: %s", fullName, daos.Status(rc)) + } + case MetricTypeLink: + rc := C.add_eph_dir(&tm.node, 1024, C.CString(fullName)) + if rc != 0 { + t.Fatalf("failed to add %s: %s", fullName, daos.Status(rc)) + } + default: + t.Fatalf("metric type %s not supported", tm.Type) + } +} + func AddTestMetrics(t *testing.T, testMetrics TestMetricsMap) { t.Helper() for mt, tm := range testMetrics { - fullName := tm.FullPath() - switch mt { - case MetricTypeGauge: - rc := C.add_metric(&tm.node, C.D_TM_GAUGE, C.CString(tm.desc), 
C.CString(tm.units), C.CString(fullName)) - if rc != 0 { - t.Fatalf("failed to add %s: %s", fullName, daos.Status(rc)) - } - C.d_tm_set_gauge(tm.node, C.uint64_t(tm.Cur)) - case MetricTypeStatsGauge: - rc := C.add_metric(&tm.node, C.D_TM_STATS_GAUGE, C.CString(tm.desc), C.CString(tm.units), C.CString(fullName)) - if rc != 0 { - t.Fatalf("failed to add %s: %s", tm.Name, daos.Status(rc)) - } - for _, val := range []uint64{tm.min, tm.max, uint64(tm.Cur)} { - C.d_tm_set_gauge(tm.node, C.uint64_t(val)) - } - case MetricTypeCounter: - rc := C.add_metric(&tm.node, C.D_TM_COUNTER, C.CString(tm.desc), C.CString(tm.units), C.CString(fullName)) - if rc != 0 { - t.Fatalf("failed to add %s: %s", fullName, daos.Status(rc)) - } - C.d_tm_inc_counter(tm.node, C.ulong(tm.Cur)) - case MetricTypeDuration: - rc := C.add_metric(&tm.node, C.D_TM_DURATION|C.D_TM_CLOCK_REALTIME, C.CString(tm.desc), C.CString(tm.units), C.CString(fullName)) - if rc != 0 { - t.Fatalf("failed to add %s: %s", fullName, daos.Status(rc)) - } - C.d_tm_mark_duration_start(tm.node, C.D_TM_CLOCK_REALTIME) - time.Sleep(time.Duration(tm.Cur)) - C.d_tm_mark_duration_end(tm.node) - case MetricTypeTimestamp: - rc := C.add_metric(&tm.node, C.D_TM_TIMESTAMP, C.CString(tm.desc), C.CString(tm.units), C.CString(fullName)) - if rc != 0 { - t.Fatalf("failed to add %s: %s", fullName, daos.Status(rc)) - } - C.d_tm_record_timestamp(tm.node) - case MetricTypeSnapshot: - rc := C.add_metric(&tm.node, C.D_TM_TIMER_SNAPSHOT|C.D_TM_CLOCK_REALTIME, C.CString(tm.desc), C.CString(tm.units), C.CString(fullName)) - if rc != 0 { - t.Fatalf("failed to add %s: %s", fullName, daos.Status(rc)) - } - C.d_tm_take_timer_snapshot(tm.node, C.D_TM_CLOCK_REALTIME) - case MetricTypeDirectory: - rc := C.add_metric(&tm.node, C.D_TM_DIRECTORY, C.CString(tm.desc), C.CString(tm.units), C.CString(fullName)) - if rc != 0 { - t.Fatalf("failed to add %s: %s", fullName, daos.Status(rc)) - } - case MetricTypeLink: - rc := C.add_eph_dir(&tm.node, 1024, 
C.CString(fullName)) - if rc != 0 { - t.Fatalf("failed to add %s: %s", fullName, daos.Status(rc)) - } - default: - t.Fatalf("metric type %d not supported", mt) - } + tm.Type = mt + AddTestMetric(t, tm) } } diff --git a/src/control/server/telemetry.go b/src/control/server/telemetry.go index f7f094ffe7e9..4b2f624aff2a 100644 --- a/src/control/server/telemetry.go +++ b/src/control/server/telemetry.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2018-2022 Intel Corporation. +// (C) Copyright 2018-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -8,13 +8,9 @@ package server import ( "context" - "fmt" - "net/http" - "time" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/lib/telemetry/promexp" @@ -27,7 +23,7 @@ func regPromEngineSources(ctx context.Context, log logging.Logger, engines []Eng return nil } - c, err := promexp.NewCollector(log, &promexp.CollectorOpts{}) + c, err := promexp.NewEngineCollector(log, &promexp.CollectorOpts{}) if err != nil { return err } @@ -73,45 +69,13 @@ func regPromEngineSources(ctx context.Context, log logging.Logger, engines []Eng } func startPrometheusExporter(ctx context.Context, log logging.Logger, port int, engines []Engine) (func(), error) { - if err := regPromEngineSources(ctx, log, engines); err != nil { - return nil, err + expCfg := &promexp.ExporterConfig{ + Port: port, + Title: "DAOS Engine Telemetry", + Register: func(ctx context.Context, log logging.Logger) error { + return regPromEngineSources(ctx, log, engines) + }, } - listenAddress := fmt.Sprintf("0.0.0.0:%d", port) - - srv := http.Server{Addr: listenAddress} - http.Handle("/metrics", promhttp.HandlerFor( - prometheus.DefaultGatherer, promhttp.HandlerOpts{}, - )) - http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { - num, err := w.Write([]byte(` - DAOS 
Exporter - -

DAOS Exporter

-

Metrics

- - `)) - if err != nil { - log.Errorf("%d: %s", num, err) - } - }) - - // http listener is a blocking call - go func() { - log.Infof("Listening on %s", listenAddress) - err := srv.ListenAndServe() - log.Infof("Prometheus web exporter stopped: %s", err.Error()) - }() - - return func() { - log.Debug("Shutting down Prometheus web exporter") - - // When this cleanup function is called, the original context - // will probably have already been canceled. - timedCtx, cancel := context.WithTimeout(context.Background(), 1*time.Second) - defer cancel() - if err := srv.Shutdown(timedCtx); err != nil { - log.Noticef("HTTP server didn't shut down within timeout: %s", err.Error()) - } - }, nil + return promexp.StartExporter(ctx, log, expCfg) } diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 2936acfcbeaf..18e32463bb0b 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -9,6 +9,7 @@ #define D_LOGFAC DD_FAC(dtx) #include +#include #include #include #include @@ -132,11 +133,11 @@ dtx_metrics_count(void) return (sizeof(struct dtx_pool_metrics) / sizeof(struct d_tm_node_t *)); } -struct dss_module_metrics dtx_metrics = { - .dmm_tags = DAOS_TGT_TAG, - .dmm_init = dtx_metrics_alloc, - .dmm_fini = dtx_metrics_free, - .dmm_nr_metrics = dtx_metrics_count, +struct daos_module_metrics dtx_metrics = { + .dmm_tags = DAOS_TGT_TAG, + .dmm_init = dtx_metrics_alloc, + .dmm_fini = dtx_metrics_free, + .dmm_nr_metrics = dtx_metrics_count, }; static void diff --git a/src/engine/SConscript b/src/engine/SConscript index ceb00a409d09..e94b6a83dd61 100644 --- a/src/engine/SConscript +++ b/src/engine/SConscript @@ -29,7 +29,7 @@ def scons(): 'drpc_handler.c', 'drpc_listener.c', 'drpc_progress.c', 'init.c', 'module.c', 'srv_cli.c', 'profile.c', 'rpc.c', - 'server_iv.c', 'srv.c', 'srv.pb-c.c', 'tls.c', + 'server_iv.c', 'srv.c', 'srv.pb-c.c', 'sched.c', 'ult.c', 'event.pb-c.c', 'srv_metrics.c'] + libdaos_tgts diff --git a/src/engine/init.c b/src/engine/init.c index c4dfb6e19970..d639456eeb15 
100644 --- a/src/engine/init.c +++ b/src/engine/init.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "srv_internal.h" #include "drpc_internal.h" #include @@ -628,14 +629,14 @@ server_id_cb(uint32_t *tid, uint64_t *uid) } if (tid != NULL) { - struct dss_thread_local_storage *dtc; - struct dss_module_info *dmi; + struct daos_thread_local_storage *dtc; + struct daos_module_info *dmi; int index = daos_srv_modkey.dmk_index; - /* Avoid assertion in dss_module_key_get() */ + /* Avoid assertion in daos_module_key_get() */ dtc = dss_tls_get(); if (dtc != NULL && index >= 0 && index < DAOS_MODULE_KEYS_NR && - dss_module_keys[index] == &daos_srv_modkey) { + daos_get_module_key(index) == &daos_srv_modkey) { dmi = dss_get_module_info(); if (dmi != NULL) *tid = dmi->dmi_xs_id; diff --git a/src/engine/module.c b/src/engine/module.c index ce33609aeba3..4ee74235ff52 100644 --- a/src/engine/module.c +++ b/src/engine/module.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include "drpc_handler.h" @@ -387,7 +388,7 @@ dss_module_init_metrics(enum dss_module_tag tag, void **metrics, struct loaded_mod *mod; d_list_for_each_entry(mod, &loaded_mod_list, lm_lk) { - struct dss_module_metrics *met = mod->lm_dss_mod->sm_metrics; + struct daos_module_metrics *met = mod->lm_dss_mod->sm_metrics; if (met == NULL) continue; @@ -415,7 +416,7 @@ dss_module_fini_metrics(enum dss_module_tag tag, void **metrics) struct loaded_mod *mod; d_list_for_each_entry(mod, &loaded_mod_list, lm_lk) { - struct dss_module_metrics *met = mod->lm_dss_mod->sm_metrics; + struct daos_module_metrics *met = mod->lm_dss_mod->sm_metrics; if (met == NULL) continue; @@ -442,7 +443,7 @@ dss_module_nr_pool_metrics(void) int total = 0, nr; d_list_for_each_entry(mod, &loaded_mod_list, lm_lk) { - struct dss_module_metrics *met = mod->lm_dss_mod->sm_metrics; + struct daos_module_metrics *met = mod->lm_dss_mod->sm_metrics; if (met == NULL) continue; diff --git a/src/engine/srv.c b/src/engine/srv.c 
index 986d8ed04c4d..e0c985c38f63 100644 --- a/src/engine/srv.c +++ b/src/engine/srv.c @@ -364,9 +364,9 @@ wait_all_exited(struct dss_xstream *dx, struct dss_module_info *dmi) static void dss_srv_handler(void *arg) { - struct dss_xstream *dx = (struct dss_xstream *)arg; - struct dss_thread_local_storage *dtc; - struct dss_module_info *dmi; + struct dss_xstream *dx = (struct dss_xstream *)arg; + struct daos_thread_local_storage *dtc; + struct dss_module_info *dmi; int rc; bool track_mem = false; bool signal_caller = true; @@ -1300,7 +1300,7 @@ dss_srv_fini(bool force) vos_standalone_tls_fini(); /* fall through */ case XD_INIT_TLS_REG: - pthread_key_delete(dss_tls_key); + ds_tls_key_delete(); /* fall through */ case XD_INIT_ULT_BARRIER: ABT_cond_free(&xstream_data.xd_ult_barrier); @@ -1402,7 +1402,7 @@ dss_srv_init(void) xstream_data.xd_init_step = XD_INIT_ULT_BARRIER; /* register xstream-local storage key */ - rc = pthread_key_create(&dss_tls_key, NULL); + rc = ds_tls_key_create(); if (rc) { rc = dss_abterr2der(rc); D_ERROR("Failed to register storage key: "DF_RC"\n", DP_RC(rc)); diff --git a/src/engine/srv_internal.h b/src/engine/srv_internal.h index 8621175b44fd..1d4278a98cfa 100644 --- a/src/engine/srv_internal.h +++ b/src/engine/srv_internal.h @@ -319,10 +319,6 @@ sched_create_thread(struct dss_xstream *dx, void (*func)(void *), void *arg, return dss_abterr2der(rc); } -/* tls.c */ -void dss_tls_fini(struct dss_thread_local_storage *dtls); -struct dss_thread_local_storage *dss_tls_init(int tag, int xs_id, int tgt_id); - /* server_iv.c */ void ds_iv_init(void); void ds_iv_fini(void); diff --git a/src/engine/tls.c b/src/engine/tls.c deleted file mode 100644 index 90ea6cce7c58..000000000000 --- a/src/engine/tls.c +++ /dev/null @@ -1,155 +0,0 @@ -/** - * (C) Copyright 2016-2021 Intel Corporation. - * - * SPDX-License-Identifier: BSD-2-Clause-Patent - */ -/** - * This file is part of the DAOS server. 
It implements thread-local storage - * (TLS) for DAOS service threads. - */ -#define D_LOGFAC DD_FAC(server) - -#include -#include "srv_internal.h" - -/* The array remember all of registered module keys on one node. */ -struct dss_module_key *dss_module_keys[DAOS_MODULE_KEYS_NR] = { NULL }; - -pthread_mutex_t dss_module_keys_lock = PTHREAD_MUTEX_INITIALIZER; - -void -dss_register_key(struct dss_module_key *key) -{ - int i; - - D_MUTEX_LOCK(&dss_module_keys_lock); - for (i = 0; i < DAOS_MODULE_KEYS_NR; i++) { - if (dss_module_keys[i] == NULL) { - dss_module_keys[i] = key; - key->dmk_index = i; - break; - } - } - D_MUTEX_UNLOCK(&dss_module_keys_lock); - D_ASSERT(i < DAOS_MODULE_KEYS_NR); -} - -void -dss_unregister_key(struct dss_module_key *key) -{ - if (key == NULL) - return; - D_ASSERT(key->dmk_index >= 0); - D_ASSERT(key->dmk_index < DAOS_MODULE_KEYS_NR); - D_MUTEX_LOCK(&dss_module_keys_lock); - dss_module_keys[key->dmk_index] = NULL; - D_MUTEX_UNLOCK(&dss_module_keys_lock); -} - -/** - * Init thread context - * - * \param[in]dtls Init the thread context to allocate the - * local thread variable for each module. 
- * - * \retval 0 if initialization succeeds - * \retval negative errno if initialization fails - */ -static int -dss_thread_local_storage_init(struct dss_thread_local_storage *dtls, - int xs_id, int tgt_id) -{ - int rc = 0; - int i; - - if (dtls->dtls_values == NULL) { - D_ALLOC_ARRAY(dtls->dtls_values, - (int)ARRAY_SIZE(dss_module_keys)); - if (dtls->dtls_values == NULL) - return -DER_NOMEM; - } - - for (i = 0; i < DAOS_MODULE_KEYS_NR; i++) { - struct dss_module_key *dmk = dss_module_keys[i]; - - if (dmk != NULL && dtls->dtls_tag & dmk->dmk_tags) { - D_ASSERT(dmk->dmk_init != NULL); - dtls->dtls_values[i] = dmk->dmk_init(dtls->dtls_tag, xs_id, tgt_id); - if (dtls->dtls_values[i] == NULL) { - rc = -DER_NOMEM; - break; - } - } - } - return rc; -} - -/** - * Finish module context - * - * \param[in]dtls Finish the thread context to free the - * local thread variable for each module. - */ -static void -dss_thread_local_storage_fini(struct dss_thread_local_storage *dtls) -{ - int i; - - if (dtls->dtls_values != NULL) { - for (i = DAOS_MODULE_KEYS_NR - 1; i >= 0; i--) { - struct dss_module_key *dmk = dss_module_keys[i]; - - if (dmk != NULL && dtls->dtls_tag & dmk->dmk_tags) { - D_ASSERT(dtls->dtls_values[i] != NULL); - D_ASSERT(dmk->dmk_fini != NULL); - dmk->dmk_fini(dtls->dtls_tag, dtls->dtls_values[i]); - } - } - } - - D_FREE(dtls->dtls_values); -} - -pthread_key_t dss_tls_key; - -/* - * Allocate dss_thread_local_storage for a particular thread and - * store the pointer in a thread-specific value which can be - * fetched at any time with dss_tls_get(). 
- */ -struct dss_thread_local_storage * -dss_tls_init(int tag, int xs_id, int tgt_id) -{ - struct dss_thread_local_storage *dtls; - int rc; - - D_ALLOC_PTR(dtls); - if (dtls == NULL) - return NULL; - - dtls->dtls_tag = tag; - rc = dss_thread_local_storage_init(dtls, xs_id, tgt_id); - if (rc != 0) { - D_FREE(dtls); - return NULL; - } - - rc = pthread_setspecific(dss_tls_key, dtls); - if (rc) { - D_ERROR("failed to initialize tls: %d\n", rc); - dss_thread_local_storage_fini(dtls); - D_FREE(dtls); - return NULL; - } - - return dtls; -} - -/* Free DTC for a particular thread. */ -void -dss_tls_fini(struct dss_thread_local_storage *dtls) -{ - dss_thread_local_storage_fini(dtls); - D_FREE(dtls); - pthread_setspecific(dss_tls_key, NULL); -} diff --git a/src/gurt/examples/telem_consumer_example.c b/src/gurt/examples/telem_consumer_example.c index 6b7b1653a163..cac33fc7077f 100644 --- a/src/gurt/examples/telem_consumer_example.c +++ b/src/gurt/examples/telem_consumer_example.c @@ -147,6 +147,13 @@ void read_metrics(struct d_tm_context *ctx, struct d_tm_node_t *root, d_tm_list_free(head); } +static void +iter_print(struct d_tm_context *ctx, struct d_tm_node_t *node, int level, char *path, int format, + int opt_fields, void *arg) +{ + d_tm_print_node(ctx, node, level, path, format, opt_fields, (FILE *)arg); +} + int main(int argc, char **argv) { @@ -177,8 +184,8 @@ main(int argc, char **argv) filter = (D_TM_COUNTER | D_TM_TIMESTAMP | D_TM_TIMER_SNAPSHOT | D_TM_DURATION | D_TM_GAUGE | D_TM_DIRECTORY); show_meta = true; - d_tm_iterate(ctx, root, 0, filter, NULL, D_TM_STANDARD, - D_TM_INCLUDE_METADATA, D_TM_ITER_READ, stdout); + d_tm_iterate(ctx, root, 0, filter, NULL, D_TM_STANDARD, D_TM_INCLUDE_METADATA, iter_print, + stdout); sprintf(dirname, "manually added"); filter = (D_TM_COUNTER | D_TM_TIMESTAMP | D_TM_TIMER_SNAPSHOT | diff --git a/src/gurt/telemetry.c b/src/gurt/telemetry.c index f91d1e72919f..6bd3a4952465 100644 --- a/src/gurt/telemetry.c +++ b/src/gurt/telemetry.c @@ 
-1,5 +1,5 @@ /** - * (C) Copyright 2020-2023 Intel Corporation. + * (C) Copyright 2020-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -16,9 +16,11 @@ #include #include #include -#include "gurt/telemetry_common.h" -#include "gurt/telemetry_producer.h" -#include "gurt/telemetry_consumer.h" +#include +#include +#include +#include +#include /** minimal list of shared memory regions with a global ID */ struct shmem_region_list { @@ -31,12 +33,17 @@ struct shmem_region_list { struct d_tm_shmem_hdr { uint64_t sh_base_addr; /** address of this struct */ key_t sh_key; /** key to access region */ - bool sh_deleted; /** marked for deletion */ + uint32_t sh_deleted : 1, /** marked for deletion */ + sh_multiple_writer : 1; /** require lock to protect */ uint8_t sh_reserved[3]; /** for alignment */ uint64_t sh_bytes_total; /** total size of region */ uint64_t sh_bytes_free; /** free bytes in this region */ void *sh_free_addr; /** start of free space */ struct d_tm_node_t *sh_root; /** root of metric tree */ + + /* lock to protect update, mostly for create and remove ephemeral dir */ + pthread_mutex_t sh_multiple_writer_lock; + /** * List of all ephemeral regions attached to this shmem region. 
*/ @@ -69,8 +76,10 @@ static struct d_tm_shmem { struct d_tm_context *ctx; /** context for the producer */ struct d_tm_node_t *root; /** root node of shmem */ pthread_mutex_t add_lock; /** for synchronized access */ - bool sync_access; /** whether to sync access */ - bool retain; /** retain shmem region on exit */ + uint32_t retain : 1, /** retain shmem region during exit */ + sync_access : 1, /** enable sync access to shmem */ + retain_non_empty : 1, /** retain shmem region if it is not empty */ + multiple_writer_lock : 1; /** lock for multiple writer */ int id; /** Instance ID */ } tm_shmem; @@ -168,13 +177,49 @@ d_tm_get_name(struct d_tm_context *ctx, struct d_tm_node_t *node) static int d_tm_lock_shmem(void) { - return D_MUTEX_LOCK(&tm_shmem.add_lock); + struct d_tm_context *ctx = tm_shmem.ctx; + int rc; + + if (tm_shmem.multiple_writer_lock) { + rc = D_MUTEX_LOCK(&ctx->shmem_root->sh_multiple_writer_lock); + if (unlikely(rc != 0)) { + DL_ERROR(rc, "failed to take multiple writer lock"); + return rc; + } + } + + rc = D_MUTEX_LOCK(&tm_shmem.add_lock); + if (unlikely(rc != 0)) { + DL_ERROR(rc, "failed to take shared memory lock"); + if (tm_shmem.multiple_writer_lock) + D_MUTEX_UNLOCK(&ctx->shmem_root->sh_multiple_writer_lock); + return rc; + } + + return 0; } static int d_tm_unlock_shmem(void) { - return D_MUTEX_UNLOCK(&tm_shmem.add_lock); + struct d_tm_context *ctx = tm_shmem.ctx; + int rc; + + rc = D_MUTEX_UNLOCK(&tm_shmem.add_lock); + if (unlikely(rc != 0)) { + DL_ERROR(rc, "failed to release shared memory lock"); + return rc; + } + + if (tm_shmem.multiple_writer_lock) { + rc = D_MUTEX_UNLOCK(&ctx->shmem_root->sh_multiple_writer_lock); + if (unlikely(rc != 0)) { + DL_ERROR(rc, "failed to release multiple writer lock"); + return rc; + } + } + + return 0; } /* @@ -200,6 +245,8 @@ attach_shmem(key_t key, size_t size, int flags, struct d_tm_shmem_hdr **shmem) return -DER_SHMEM_PERMS; } + D_INFO("%s shmid %d key 0x%x addr %p\n", size > 0 ? 
"allocated" : "attached", shmid, key, + addr); *shmem = addr; return shmid; } @@ -208,7 +255,6 @@ static int new_shmem(key_t key, size_t size, struct d_tm_shmem_hdr **shmem) { int rc; - D_INFO("creating new shared memory segment, key=0x%x, size=%lu\n", key, size); rc = attach_shmem(key, size, IPC_CREAT | 0660, shmem); @@ -331,7 +377,7 @@ close_local_shmem_entry(struct local_shmem_list *entry, bool destroy) { d_list_del(&entry->link); if (destroy) - entry->region->sh_deleted = true; + entry->region->sh_deleted = 1; close_shmem(entry->region); if (destroy) @@ -529,7 +575,7 @@ init_node(struct d_tm_shmem_hdr *shmem, struct d_tm_node_t *node, D_ERROR("cannot allocate node name [%s]\n", name); return -DER_NO_SHMEM; } - strncpy(node->dtn_name, name, buff_len); + strncpy(conv_ptr(shmem, node->dtn_name), name, buff_len); node->dtn_shmem_key = shmem->sh_key; node->dtn_child = NULL; /* may be reinitializing an existing node, in which case we shouldn't @@ -557,6 +603,7 @@ alloc_node(struct d_tm_shmem_hdr *shmem, struct d_tm_node_t **newnode, const char *name) { struct d_tm_node_t *node = NULL; + struct d_tm_node_t *tmp; int rc = DER_SUCCESS; if (shmem == NULL || newnode == NULL || name == NULL) { @@ -569,14 +616,19 @@ alloc_node(struct d_tm_shmem_hdr *shmem, struct d_tm_node_t **newnode, rc = -DER_NO_SHMEM; goto out; } - rc = init_node(shmem, node, name); + + tmp = conv_ptr(shmem, node); + + rc = init_node(shmem, tmp, name); if (rc != 0) goto out; - node->dtn_metric = NULL; - node->dtn_sibling = NULL; - *newnode = node; + tmp->dtn_metric = NULL; + tmp->dtn_sibling = NULL; + *newnode = node; out: + if (rc != 0) + DL_ERROR(rc, "failed to alloc node for %s", name); return rc; } @@ -624,10 +676,10 @@ add_child(struct d_tm_node_t **newnode, struct d_tm_node_t *parent, * 1) a previously-cleared link node that can be reused, or * 2) the right place to attach a newly allocated node. 
*/ - child = parent->dtn_child; + child = conv_ptr(shmem, parent->dtn_child); while (child != NULL && !is_cleared_link(tm_shmem.ctx, child)) { sibling = child; - child = child->dtn_sibling; + child = conv_ptr(shmem, child->dtn_sibling); } if (is_cleared_link(tm_shmem.ctx, child)) { @@ -657,6 +709,7 @@ add_child(struct d_tm_node_t **newnode, struct d_tm_node_t *parent, else sibling->dtn_sibling = *newnode; + *newnode = conv_ptr(shmem, *newnode); return 0; failure: @@ -751,7 +804,7 @@ destroy_shmem_with_key(key_t key) /** * Initialize an instance of the telemetry and metrics API for the producer - * process. + * process with the root set to the provided name. * * \param[in] id Identifies the producer process amongst others * on the same machine. @@ -763,6 +816,7 @@ destroy_shmem_with_key(key_t key) * Use D_TM_RETAIN_SHMEM to retain the shared * memory segment created for these metrics after * this process exits. + * \param[in] root_name The name of this node in the telemetry tree. * * \return DER_SUCCESS Success * -DER_NO_SHMEM Out of shared memory @@ -770,41 +824,70 @@ destroy_shmem_with_key(key_t key) * -DER_INVAL Invalid \a flag(s) */ int -d_tm_init(int id, uint64_t mem_size, int flags) +d_tm_init_with_name(int id, uint64_t mem_size, int flags, const char *root_name) { - struct d_tm_shmem_hdr *new_shmem; + struct d_tm_shmem_hdr *new_shmem = NULL; key_t key; - int shmid; - char tmp[D_TM_MAX_NAME_LEN]; + int shmid; int rc = DER_SUCCESS; + if (root_name == NULL || strnlen(root_name, D_TM_MAX_NAME_LEN) == 0) { + D_ERROR("root name cannot be empty\n"); + return -DER_INVAL; + } + + if (strnlen(root_name, D_TM_MAX_NAME_LEN) == D_TM_MAX_NAME_LEN) { + D_ERROR("root name too long (max=%d)\n", D_TM_MAX_NAME_LEN); + return -DER_EXCEEDS_PATH_LEN; + } + memset(&tm_shmem, 0, sizeof(tm_shmem)); - if ((flags & ~(D_TM_SERIALIZATION | D_TM_RETAIN_SHMEM)) != 0) { - D_ERROR("Invalid flags\n"); + if ((flags & ~(D_TM_SERIALIZATION | D_TM_RETAIN_SHMEM | D_TM_RETAIN_SHMEM_IF_NON_EMPTY | + 
D_TM_OPEN_OR_CREATE | D_TM_MULTIPLE_WRITER_LOCK)) != 0) { + D_ERROR("Invalid flags 0x%x\n", flags); rc = -DER_INVAL; goto failure; } if (flags & D_TM_SERIALIZATION) { - tm_shmem.sync_access = true; + tm_shmem.sync_access = 1; D_INFO("Serialization enabled for id %d\n", id); } if (flags & D_TM_RETAIN_SHMEM) { - tm_shmem.retain = true; + tm_shmem.retain = 1; D_INFO("Retaining shared memory for id %d\n", id); } + if (flags & D_TM_RETAIN_SHMEM_IF_NON_EMPTY) { + tm_shmem.retain_non_empty = 1; + D_INFO("Retaining shared memory for id %d if not empty\n", id); + } + + if (flags & D_TM_MULTIPLE_WRITER_LOCK) { + tm_shmem.multiple_writer_lock = 1; + D_INFO("Require multiple write protection for id %d\n", id); + } + tm_shmem.id = id; - snprintf(tmp, sizeof(tmp), "ID: %d", id); key = d_tm_get_srv_key(id); - rc = destroy_shmem_with_key(key); - if (rc != 0) - goto failure; - rc = create_shmem(tmp, key, mem_size, &shmid, &new_shmem); - if (rc != 0) - goto failure; + if (flags & D_TM_OPEN_OR_CREATE) { + rc = open_shmem(key, &new_shmem); + if (rc > 0) { + D_ASSERT(new_shmem != NULL); + shmid = rc; + } + } + + if (new_shmem == NULL) { + rc = destroy_shmem_with_key(key); + if (rc != 0) + goto failure; + rc = create_shmem(root_name, key, mem_size, &shmid, &new_shmem); + if (rc != 0) + goto failure; + } rc = alloc_ctx(&tm_shmem.ctx, new_shmem, shmid); if (rc != 0) @@ -831,19 +914,76 @@ d_tm_init(int id, uint64_t mem_size, int flags) return rc; } +/** + * Initialize an instance of the telemetry and metrics API for the producer + * process. + * + * \param[in] id Identifies the producer process amongst others + * on the same machine. + * \param[in] mem_size Size in bytes of the shared memory segment that + * is allocated. + * \param[in] flags Optional flags to control initialization. + * Use D_TM_SERIALIZATION to enable read/write + * synchronization of individual nodes. 
+ * Use D_TM_RETAIN_SHMEM to retain the shared + * memory segment created for these metrics after + * this process exits. + * + * \return DER_SUCCESS Success + * -DER_NO_SHMEM Out of shared memory + * -DER_EXCEEDS_PATH_LEN Root node name exceeds path len + * -DER_INVAL Invalid \a flag(s) + */ +int +d_tm_init(int id, uint64_t mem_size, int flags) +{ + char tmp[D_TM_MAX_NAME_LEN]; + + snprintf(tmp, sizeof(tmp), "ID: %d", id); + + return d_tm_init_with_name(id, mem_size, flags, tmp); +} + +/* Check if all children are invalid */ +static bool +is_node_empty(struct d_tm_node_t *node) +{ + struct d_tm_context *ctx = tm_shmem.ctx; + struct d_tm_shmem_hdr *shmem; + struct d_tm_node_t *child; + + shmem = get_shmem_for_key(ctx, node->dtn_shmem_key); + child = conv_ptr(shmem, node->dtn_child); + while (child != NULL && !is_cleared_link(ctx, child)) { + child = conv_ptr(shmem, child->dtn_sibling); + if (child->dtn_name != NULL) + return false; + } + + return true; +} + /** * Releases resources claimed by init */ void d_tm_fini(void) { - bool destroy_shmem = false; + bool destroy_shmem = true; if (tm_shmem.ctx == NULL) goto out; - if (!tm_shmem.retain) - destroy_shmem = true; + if (tm_shmem.retain) + destroy_shmem = false; + + if (tm_shmem.retain_non_empty) { + struct d_tm_node_t *root; + + root = d_tm_get_root(tm_shmem.ctx); + if (!is_node_empty(root)) + destroy_shmem = false; + } /* close with the option to destroy the shmem region if needed */ close_all_shmem(tm_shmem.ctx, destroy_shmem); @@ -1451,9 +1591,9 @@ _reset_node(struct d_tm_context *ctx, struct d_tm_node_t *node) return DER_SUCCESS; } -static void -reset_node(struct d_tm_context *ctx, struct d_tm_node_t *node, int level, - char *path, int format, int opt_fields, FILE *stream) +void +d_tm_reset_node(struct d_tm_context *ctx, struct d_tm_node_t *node, int level, char *path, + int format, int opt_fields, FILE *stream) { char *name = NULL; @@ -1467,7 +1607,7 @@ reset_node(struct d_tm_context *ctx, struct d_tm_node_t 
*node, int level, switch (node->dtn_type) { case D_TM_LINK: node = d_tm_follow_link(ctx, node); - reset_node(ctx, node, level, path, format, opt_fields, stream); + d_tm_reset_node(ctx, node, level, path, format, opt_fields, stream); break; case D_TM_DIRECTORY: case D_TM_COUNTER: @@ -1507,20 +1647,18 @@ reset_node(struct d_tm_context *ctx, struct d_tm_node_t *node, int level, * Choose D_TM_CSV for comma separated values. * \param[in] opt_fields A bitmask. Set D_TM_INCLUDE_* as desired for * the optional output fields. - * \param[in] show_timestamp Set to true to print the timestamp the metric - * was read by the consumer. - * \param[in] stream Direct output to this stream (stdout, stderr) + * \param[in] iter_cb iterate callback. + * \param[in] cb_arg argument for iterate callback. */ void -d_tm_iterate(struct d_tm_context *ctx, struct d_tm_node_t *node, - int level, int filter, char *path, int format, - int opt_fields, uint32_t ops, FILE *stream) +d_tm_iterate(struct d_tm_context *ctx, struct d_tm_node_t *node, int level, int filter, char *path, + int format, int opt_fields, d_tm_iter_cb_t iter_cb, void *cb_arg) { struct d_tm_shmem_hdr *shmem = NULL; char *fullpath = NULL; char *parent_name = NULL; - if ((node == NULL) || (stream == NULL)) + if (node == NULL) return; if (node->dtn_type == D_TM_LINK) { @@ -1533,14 +1671,8 @@ d_tm_iterate(struct d_tm_context *ctx, struct d_tm_node_t *node, if (shmem == NULL) return; - if (node->dtn_type & filter) { - if (ops & D_TM_ITER_READ) - d_tm_print_node(ctx, node, level, path, format, - opt_fields, stream); - if (ops & D_TM_ITER_RESET) - reset_node(ctx, node, level, path, format, - opt_fields, stream); - } + if (node->dtn_type & filter) + iter_cb(ctx, node, level, path, format, opt_fields, cb_arg); parent_name = conv_ptr(shmem, node->dtn_name); node = node->dtn_child; @@ -1555,8 +1687,8 @@ d_tm_iterate(struct d_tm_context *ctx, struct d_tm_node_t *node, else D_ASPRINTF(fullpath, "%s/%s", path, parent_name); - d_tm_iterate(ctx, 
node, level + 1, filter, fullpath, format, - opt_fields, ops, stream); + d_tm_iterate(ctx, node, level + 1, filter, fullpath, format, opt_fields, iter_cb, + cb_arg); D_FREE(fullpath); node = node->dtn_sibling; node = conv_ptr(shmem, node); @@ -2105,6 +2237,29 @@ is_initialized(void) tm_shmem.ctx->shmem_root != NULL; } +/* + * Get a pointer to the last token in the path without modifying the original + * string. + */ +static const char * +get_last_token(const char *path) +{ + const char *substr = path; + const char *ch; + bool next_token = false; + + for (ch = path; *ch != '\0'; ch++) { + if (*ch == '/') { + next_token = true; + } else if (next_token) { + substr = ch; + next_token = false; + } + } + + return substr; +} + static int add_metric(struct d_tm_context *ctx, struct d_tm_node_t **node, int metric_type, char *desc, char *units, char *path) @@ -2113,6 +2268,7 @@ add_metric(struct d_tm_context *ctx, struct d_tm_node_t **node, int metric_type, struct d_tm_node_t *parent_node; struct d_tm_node_t *temp = NULL; struct d_tm_shmem_hdr *shmem; + struct d_tm_metric_t *metric; char *token; char *rest; char *unit_string; @@ -2154,11 +2310,11 @@ add_metric(struct d_tm_context *ctx, struct d_tm_node_t **node, int metric_type, } } - temp->dtn_metric->dtm_stats = NULL; + metric = conv_ptr(shmem, temp->dtn_metric); + metric->dtm_stats = NULL; if (has_stats(temp)) { - temp->dtn_metric->dtm_stats = - shmalloc(shmem, sizeof(struct d_tm_stats_t)); - if (temp->dtn_metric->dtm_stats == NULL) { + metric->dtm_stats = shmalloc(shmem, sizeof(struct d_tm_stats_t)); + if (metric->dtm_stats == NULL) { rc = -DER_NO_SHMEM; goto out; } @@ -2175,14 +2331,14 @@ add_metric(struct d_tm_context *ctx, struct d_tm_node_t **node, int metric_type, if (buff_len > 0) { buff_len += 1; /** make room for the trailing null */ - temp->dtn_metric->dtm_desc = shmalloc(shmem, buff_len); - if (temp->dtn_metric->dtm_desc == NULL) { + metric->dtm_desc = shmalloc(shmem, buff_len); + if (metric->dtm_desc == NULL) 
{ rc = -DER_NO_SHMEM; goto out; } - strncpy(temp->dtn_metric->dtm_desc, desc, buff_len); + strncpy(conv_ptr(shmem, metric->dtm_desc), desc, buff_len); } else { - temp->dtn_metric->dtm_desc = NULL; + metric->dtm_desc = NULL; } unit_string = units; @@ -2216,14 +2372,14 @@ add_metric(struct d_tm_context *ctx, struct d_tm_node_t **node, int metric_type, if (buff_len > 0) { buff_len += 1; /** make room for the trailing null */ - temp->dtn_metric->dtm_units = shmalloc(shmem, buff_len); - if (temp->dtn_metric->dtm_units == NULL) { + metric->dtm_units = shmalloc(shmem, buff_len); + if (metric->dtm_units == NULL) { rc = -DER_NO_SHMEM; goto out; } - strncpy(temp->dtn_metric->dtm_units, unit_string, buff_len); + strncpy(conv_ptr(shmem, metric->dtm_units), unit_string, buff_len); } else { - temp->dtn_metric->dtm_units = NULL; + metric->dtm_units = NULL; } temp->dtn_protect = false; @@ -2344,26 +2500,35 @@ int d_tm_add_metric(struct d_tm_node_t **node, int metric_type, char *desc, } static void -invalidate_link_node(struct d_tm_node_t *node) +invalidate_link_node(struct d_tm_shmem_hdr *parent, struct d_tm_node_t *node) { if (node == NULL || node->dtn_type != D_TM_LINK) return; node->dtn_name = NULL; - if (node->dtn_metric != NULL) - node->dtn_metric->dtm_data.value = 0; + if (node->dtn_metric != NULL) { + struct d_tm_metric_t *link_metric; + + link_metric = conv_ptr(parent, node->dtn_metric); + link_metric->dtm_data.value = 0; + } } static int get_free_region_entry(struct d_tm_shmem_hdr *shmem, struct shmem_region_list **entry) { + d_list_t *cur; + d_list_t *head; + d_list_t *next; struct shmem_region_list *tmp; D_ASSERT(shmem != NULL); D_ASSERT(entry != NULL); - d_list_for_each_entry(tmp, &shmem->sh_subregions, rl_link) { + head = &shmem->sh_subregions; + for (cur = conv_ptr(shmem, head->next); cur != head; cur = conv_ptr(shmem, cur->next)) { + tmp = d_list_entry(cur, __typeof__(*tmp), rl_link); if (tmp->rl_link_node == NULL) { *entry = tmp; return 0; @@ -2376,7 +2541,23 @@ 
get_free_region_entry(struct d_tm_shmem_hdr *shmem, shmem->sh_key); return -DER_NO_SHMEM; } - d_list_add(&tmp->rl_link, &shmem->sh_subregions); + + next = conv_ptr(shmem, head->next); + /* NB: sh_subregions is initialized by D_INIT_LIST_HEAD(), so it is not shmem address */ + if (d_list_empty(&shmem->sh_subregions)) + cur = (d_list_t *)(shmem->sh_base_addr + + (uint64_t)(&((struct d_tm_shmem_hdr *)(0))->sh_subregions)); + else + cur = head->next; + + head->next = &tmp->rl_link; + next->prev = &tmp->rl_link; + + tmp = conv_ptr(shmem, tmp); + tmp->rl_link.next = cur; + tmp->rl_link.prev = + (d_list_t *)(shmem->sh_base_addr + + (uint64_t)(&((struct d_tm_shmem_hdr *)(0))->sh_subregions)); *entry = tmp; return 0; @@ -2413,27 +2594,199 @@ get_unique_shmem_key(const char *path, int id) return (key_t)d_hash_string_u32(salted, sizeof(salted)); } +static int +shm_stat_key(key_t key, struct shmid_ds *shminfo, int *shmid_ptr) +{ + int shmid; + int rc; + + if (unlikely(shminfo == NULL)) { + D_ERROR("NULL shminfo\n"); + return -DER_INVAL; + } + + rc = shmget(key, 0, 0); + if (rc < 0) { + D_ERROR("shmget(0x%x) failed: %s (%d)\n", key, strerror(errno), errno); + return daos_errno2der(errno); + } + shmid = rc; + + rc = shmctl(shmid, IPC_STAT, shminfo); + if (rc < 0) { + D_ERROR("shmctl(%d, IPC_STAT) failed: %s (%d)\n", shmid, strerror(errno), errno); + return daos_errno2der(errno); + } + + if (shmid_ptr != NULL) + *shmid_ptr = shmid; + + return 0; +} + /* - * Get a pointer to the last token in the path without modifying the original - * string. + * Set the child segment's ownership to match the parent segment. + * Needed in the client telemetry case where the client is allowing + * the agent to manage its telemetry segments. 
*/ -static const char * -get_last_token(const char *path) +static int +sync_attached_segment_uid(char *path, key_t child_key) { - const char *substr = path; - const char *ch; - bool next_token = false; + struct d_tm_node_t *link_node; + struct d_tm_context *ctx = tm_shmem.ctx; + struct shmid_ds shminfo = {0}; + uid_t o_uid; + int child_shmid; + int rc; + + if (unlikely(path == NULL)) { + D_ERROR("NULL inputs\n"); + return -DER_INVAL; + } - for (ch = path; *ch != '\0'; ch++) { - if (*ch == '/') { - next_token = true; - } else if (next_token) { - substr = ch; - next_token = false; - } + link_node = d_tm_find_metric(ctx, path); + if (link_node == NULL) { + D_ERROR("nonexistent metric: %s", path); + D_GOTO(out, rc = -DER_NONEXIST); } - return substr; + rc = shm_stat_key(link_node->dtn_shmem_key, &shminfo, NULL); + if (unlikely(rc != 0)) { + DL_ERROR(rc, "failed to stat parent segment"); + goto out; + } + o_uid = shminfo.shm_perm.uid; + + rc = shm_stat_key(child_key, &shminfo, &child_shmid); + if (unlikely(rc != 0)) { + DL_ERROR(rc, "failed to stat child segment"); + goto out; + } + + if (o_uid == shminfo.shm_perm.uid) + D_GOTO(out, rc = 0); + + shminfo.shm_perm.uid = o_uid; + rc = shmctl(child_shmid, IPC_SET, &shminfo); + if (rc != 0) { + DL_ERROR(rc, "failed to set child segment ownership"); + } + +out: + return rc; +} + +static int +attach_path_segment(key_t key, char *path) +{ + struct d_tm_node_t *link_node; + struct d_tm_context *ctx = tm_shmem.ctx; + struct d_tm_shmem_hdr *parent_shmem; + struct d_tm_metric_t *link_metric; + struct shmem_region_list *region_entry; + int rc; + + if (unlikely(path == NULL)) { + D_ERROR("NULL inputs\n"); + D_GOTO(fail, rc = -DER_INVAL); + } + + /* Add a link to the new region */ + rc = add_metric(ctx, &link_node, D_TM_LINK, NULL, NULL, path); + if (unlikely(rc != 0)) { + D_ERROR("can't set up the link node, " DF_RC "\n", DP_RC(rc)); + D_GOTO(fail, rc); + } + + /* track attached regions within the parent shmem */ + parent_shmem = 
get_shmem_for_key(ctx, link_node->dtn_shmem_key); + if (unlikely(parent_shmem == NULL)) { + D_ERROR("failed to get parent shmem pointer\n"); + D_GOTO(fail_link, rc = -DER_NO_SHMEM); + } + + D_ASSERT(link_node->dtn_type == D_TM_LINK); + link_metric                 = conv_ptr(parent_shmem, link_node->dtn_metric); + link_metric->dtm_data.value = key; + + rc = get_free_region_entry(parent_shmem, &region_entry); + if (unlikely(rc != 0)) + D_GOTO(fail_link, rc); + region_entry->rl_key       = key; + region_entry->rl_link_node = link_node; + + if (tm_shmem.multiple_writer_lock) + D_MUTEX_UNLOCK(&ctx->shmem_root->sh_multiple_writer_lock); /* NOTE(review): no matching lock in this function; presumably taken inside add_metric() - confirm, and confirm the fail paths below release it too */ + + return 0; +fail_link: + invalidate_link_node(parent_shmem, link_node); +fail: + return rc; +} + +/** + * Attach an existing telemetry segment into the tree at the path designated + * by fmt. This segment will be treated the same as an ephemeral directory + * that can be deleted later along with its children. + * + * \param[in] key Key to the shared memory segment + * \param[in] fmt Path constructed via variadic arguments + * + * \return 0 Success + * -DER_INVAL Invalid input + * -DER_EXIST Requested path already exists + */ +int +d_tm_attach_path_segment(key_t key, const char *fmt, ...)
+{ + struct d_tm_node_t *link_node; + struct d_tm_context *ctx = tm_shmem.ctx; + va_list args; + char path[D_TM_MAX_NAME_LEN] = {0}; + int rc; + + if (!is_initialized()) + D_GOTO(fail, rc = -DER_UNINIT); + + if (unlikely(fmt == NULL)) { + D_ERROR("NULL inputs\n"); + D_GOTO(fail, rc = -DER_INVAL); + } + + if (strnlen(fmt, D_TM_MAX_NAME_LEN) == 0) { + D_ERROR("cannot attach segment at root\n"); + D_GOTO(fail, rc = -DER_INVAL); + } + + va_start(args, fmt); + rc = parse_path_fmt(path, sizeof(path), fmt, args); + va_end(args); + if (unlikely(rc != 0)) + D_GOTO(fail, rc); + + rc = d_tm_lock_shmem(); + if (rc != 0) + D_GOTO(fail, rc); + + link_node = d_tm_find_metric(ctx, path); + if (link_node != NULL) { + D_INFO("metric [%s] already exists\n", path); + D_GOTO(fail_unlock, rc = -DER_EXIST); + } + + rc = attach_path_segment(key, path); + if (unlikely(rc != 0)) + D_GOTO(fail_unlock, rc); + + d_tm_unlock_shmem(); + return 0; +fail_unlock: + d_tm_unlock_shmem(); +fail: + if (rc != -DER_EXIST) + DL_ERROR(rc, "Failed to add path segment [%s] for key %d", path, key); + return rc; } /** @@ -2455,12 +2808,9 @@ int d_tm_add_ephemeral_dir(struct d_tm_node_t **node, size_t size_bytes, const char *fmt, ...) 
{ - struct d_tm_node_t *new_node; - struct d_tm_node_t *link_node; - struct d_tm_context *ctx = tm_shmem.ctx; - struct d_tm_shmem_hdr *parent_shmem; - struct d_tm_shmem_hdr *new_shmem; - struct shmem_region_list *region_entry; + struct d_tm_node_t    *new_node; + struct d_tm_context   *ctx = tm_shmem.ctx; + struct d_tm_shmem_hdr *new_shmem; va_list args; key_t key; char path[D_TM_MAX_NAME_LEN] = {0}; @@ -2495,57 +2845,52 @@ d_tm_add_ephemeral_dir(struct d_tm_node_t **node, size_t size_bytes, rc = d_tm_lock_shmem(); if (unlikely(rc != 0)) { D_ERROR("failed to get producer mutex\n"); D_GOTO(fail, rc); } new_node = d_tm_find_metric(ctx, path); if (new_node != NULL) { - D_ERROR("metric [%s] already exists\n", path); + D_INFO("metric [%s] already exists\n", path); D_GOTO(fail_unlock, rc = -DER_EXIST); } key = get_unique_shmem_key(path, tm_shmem.id); rc = create_shmem(get_last_token(path), key, size_bytes, &new_shmid, &new_shmem); - if (rc != 0) + if (unlikely(rc != 0)) { + DL_ERROR(rc, "failed to create shmem for %s", path); D_GOTO(fail_unlock, rc); + } new_node = new_shmem->sh_root; /* track at the process level */ rc = track_open_shmem(ctx, new_shmem, new_shmid, key); - if (rc != 0) + if (unlikely(rc != 0)) { + DL_ERROR(rc, "failed to track shmem for %s", path); D_GOTO(fail_shmem, rc); + } - /* Add a link to the new region */ - rc = add_metric(ctx, &link_node, D_TM_LINK, NULL, NULL, path); - if (rc != 0) { - D_ERROR("can't set up the link node, " DF_RC "\n", DP_RC(rc)); - D_GOTO(fail_tracking, rc); + rc = attach_path_segment(key, path); + if (unlikely(rc != 0)) { + DL_ERROR(rc, "failed to attach 0x%x at %s", key, path); + D_GOTO(fail_attach, rc); } - D_ASSERT(link_node->dtn_type == D_TM_LINK); - link_node->dtn_metric->dtm_data.value = key; - /* track attached regions within the parent shmem */ - parent_shmem = get_shmem_for_key(ctx, link_node->dtn_shmem_key); - if (parent_shmem == NULL) { - D_ERROR("failed to get parent shmem pointer\n"); -
D_GOTO(fail_link, rc = -DER_NO_SHMEM); + rc = sync_attached_segment_uid(path, key); + if (unlikely(rc != 0)) { + DL_ERROR(rc, "failed to sync %s permissions", path); + D_GOTO(fail_sync, rc); } - rc = get_free_region_entry(parent_shmem, &region_entry); - if (rc != 0) - D_GOTO(fail_link, rc); - region_entry->rl_key = key; - region_entry->rl_link_node = link_node; if (node != NULL) *node = new_node; d_tm_unlock_shmem(); return 0; - -fail_link: - invalidate_link_node(link_node); -fail_tracking: +fail_sync: + d_tm_del_ephemeral_dir("%s", path); /* path is data, not a format; NOTE(review): runs with the shmem lock held and the delete path calls d_tm_lock_shmem() again - confirm the lock is recursive */ + goto fail_unlock; /* shmem will be closed/destroyed already */ +fail_attach: close_shmem_for_key(ctx, key, true); goto fail_unlock; /* shmem will be closed/destroyed already */ fail_shmem: @@ -2554,17 +2899,21 @@ d_tm_add_ephemeral_dir(struct d_tm_node_t **node, size_t size_bytes, fail_unlock: d_tm_unlock_shmem(); fail: - D_ERROR("Failed to add ephemeral dir [%s]: " DF_RC "\n", path, - DP_RC(rc)); + if (rc != -DER_EXIST) + DL_ERROR(rc, "Failed to add ephemeral dir [%s]", path); return rc; } static void clear_region_entry_for_key(struct d_tm_shmem_hdr *shmem, key_t key) { + d_list_t *cur; + d_list_t *head; struct shmem_region_list *tmp; - d_list_for_each_entry(tmp, &shmem->sh_subregions, rl_link) { + head = &shmem->sh_subregions; + for (cur = conv_ptr(shmem, head->next); cur != head; cur = conv_ptr(shmem, cur->next)) { + tmp = d_list_entry(cur, __typeof__(*tmp), rl_link); if (tmp->rl_key == key) { D_DEBUG(DB_TRACE, "cleared shmem metadata for key 0x%x\n", key); @@ -2583,6 +2932,8 @@ rm_ephemeral_dir(struct d_tm_context *ctx, struct d_tm_node_t *link) struct d_tm_shmem_hdr *parent_shmem; struct d_tm_shmem_hdr *shmem; struct d_tm_node_t *node; + d_list_t *cur; + d_list_t *head; struct shmem_region_list *curr; key_t key; int rc = 0; @@ -2616,8 +2967,10 @@ rm_ephemeral_dir(struct d_tm_context *ctx, struct d_tm_node_t *link) } /* delete sub-regions recursively */ - d_list_for_each_entry(curr, &shmem->sh_subregions, rl_link) { - rc =
rm_ephemeral_dir(ctx, curr->rl_link_node); + head = &shmem->sh_subregions; + for (cur = conv_ptr(shmem, head->next); cur != head; cur = conv_ptr(shmem, cur->next)) { + curr = d_list_entry(cur, __typeof__(*curr), rl_link); + rc = rm_ephemeral_dir(ctx, conv_ptr(shmem, curr->rl_link_node)); if (rc != 0) /* nothing much we can do to recover here */ D_ERROR("error removing tmp dir [%s]: "DF_RC"\n", link->dtn_name, DP_RC(rc)); @@ -2629,11 +2982,35 @@ rm_ephemeral_dir(struct d_tm_context *ctx, struct d_tm_node_t *link) out_link: /* invalidate since the link node can't be deleted from parent */ - invalidate_link_node(link); + invalidate_link_node(parent_shmem, link); out: return rc; } +static int +try_del_ephemeral_dir(char *path, bool force) +{ + struct d_tm_context *ctx = tm_shmem.ctx; + struct d_tm_node_t  *link; + int                  rc = 0; + + rc = d_tm_lock_shmem(); + if (unlikely(rc != 0)) { + D_ERROR("failed to get producer mutex\n"); + return rc; /* lock was not acquired, so there is nothing to unlock */ + } + + link = get_node(ctx, path); + if (!force && !is_node_empty(link)) + D_GOTO(unlock, rc = -DER_BUSY); /* non-forced delete refuses a non-empty directory */ + + rc = rm_ephemeral_dir(ctx, link); + +unlock: + d_tm_unlock_shmem(); + + return rc; +} /** * Deletes an ephemeral metrics directory from the metric tree. * @@ -2645,11 +3022,9 @@ rm_ephemeral_dir(struct d_tm_context *ctx, struct d_tm_node_t *link) int d_tm_del_ephemeral_dir(const char *fmt, ...) { - struct d_tm_context *ctx = tm_shmem.ctx; - struct d_tm_node_t *link; - va_list args; - char path[D_TM_MAX_NAME_LEN] = {0}; - int rc = 0; + va_list args; + char    path[D_TM_MAX_NAME_LEN] = {0}; + int     rc                      = 0; if (!is_initialized()) D_GOTO(out, rc = -DER_UNINIT); @@ -2665,16 +3040,45 @@ d_tm_del_ephemeral_dir(const char *fmt, ...)
if (rc != 0) D_GOTO(out, rc); - rc = d_tm_lock_shmem(); - if (unlikely(rc != 0)) { - D_ERROR("failed to get producer mutex\n"); - D_GOTO(out, rc); + rc = try_del_ephemeral_dir(path, true); +out: + if (rc != 0) + D_ERROR("Failed to remove ephemeral dir: " DF_RC "\n", DP_RC(rc)); + else + D_INFO("Removed ephemeral directory [%s]\n", path); + return rc; +} + +/** + * Deletes an ephemeral metrics directory from the metric tree, only if it is empty. + * + * \param[in] fmt Used to construct the path to be removed + * + * \return 0 Success + * -DER_INVAL Invalid input + */ +int +d_tm_try_del_ephemeral_dir(const char *fmt, ...) +{ + va_list args; + char path[D_TM_MAX_NAME_LEN] = {0}; + int rc = 0; + + if (!is_initialized()) + D_GOTO(out, rc = -DER_UNINIT); + + if (fmt == NULL || strnlen(fmt, D_TM_MAX_NAME_LEN) == 0) { + D_ERROR("telemetry root cannot be deleted\n"); + D_GOTO(out, rc = -DER_INVAL); } - link = get_node(ctx, path); - rc = rm_ephemeral_dir(ctx, link); + va_start(args, fmt); + rc = parse_path_fmt(path, sizeof(path), fmt, args); + va_end(args); + if (rc != 0) + D_GOTO(out, rc); - d_tm_unlock_shmem(); + rc = try_del_ephemeral_dir(path, false); out: if (rc != 0) D_ERROR("Failed to remove ephemeral dir: " DF_RC "\n", @@ -3538,6 +3942,7 @@ allocate_shared_memory(key_t key, size_t mem_size, { int shmid; struct d_tm_shmem_hdr *header; + int rc; D_ASSERT(shmem != NULL); @@ -3559,8 +3964,17 @@ allocate_shared_memory(key_t key, size_t mem_size, D_INIT_LIST_HEAD(&header->sh_subregions); - D_DEBUG(DB_MEM, "Created shared memory region for key 0x%x, size=%lu\n", - key, mem_size); + if (tm_shmem.multiple_writer_lock) { + rc = D_MUTEX_INIT(&header->sh_multiple_writer_lock, NULL); + if (rc) { + DL_ERROR(rc, "multiple writer lock failed"); + return -DER_NO_SHMEM; + } + } + + D_DEBUG(DB_MEM, + "Created shared memory region for key 0x%x, size=%lu header %p base %p free %p\n", + key, mem_size, header, (void *)header->sh_base_addr, (void *)header->sh_free_addr); *shmem = header; @@ 
-3664,10 +4078,9 @@ shmalloc(struct d_tm_shmem_hdr *shmem, int length) shmem->sh_bytes_free -= length; shmem->sh_free_addr += length; - D_DEBUG(DB_TRACE, - "Allocated %d bytes. Now %" PRIu64 " remain\n", - length, shmem->sh_bytes_free); - memset(new_mem, 0, length); + D_DEBUG(DB_TRACE, "Allocated %d bytes. Now %" PRIu64 " remain %p/%p\n", length, + shmem->sh_bytes_free, shmem, new_mem); + memset(conv_ptr(shmem, new_mem), 0, length); return new_mem; } diff --git a/src/gurt/tests/test_gurt_telem_producer.c b/src/gurt/tests/test_gurt_telem_producer.c index bf3db9d19c95..32d4c4f7b893 100644 --- a/src/gurt/tests/test_gurt_telem_producer.c +++ b/src/gurt/tests/test_gurt_telem_producer.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2020-2022 Intel Corporation. + * (C) Copyright 2020-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1226,6 +1226,13 @@ test_verify_object_count(void **state) assert_int_equal(num, exp_total); } +static void +iter_print(struct d_tm_context *ctx, struct d_tm_node_t *node, int level, char *path, int format, + int opt_fields, void *arg) +{ + d_tm_print_node(ctx, node, level, path, format, opt_fields, (FILE *)arg); +} + static void test_print_metrics(void **state) { @@ -1238,15 +1245,15 @@ test_print_metrics(void **state) filter = (D_TM_COUNTER | D_TM_TIMESTAMP | D_TM_TIMER_SNAPSHOT | D_TM_DURATION | D_TM_GAUGE | D_TM_DIRECTORY); - d_tm_iterate(cli_ctx, node, 0, filter, NULL, D_TM_STANDARD, - D_TM_INCLUDE_METADATA, D_TM_ITER_READ, stdout); + d_tm_iterate(cli_ctx, node, 0, filter, NULL, D_TM_STANDARD, D_TM_INCLUDE_METADATA, + iter_print, stdout); d_tm_print_field_descriptors(D_TM_INCLUDE_TIMESTAMP | D_TM_INCLUDE_METADATA, stdout); filter &= ~D_TM_DIRECTORY; - d_tm_iterate(cli_ctx, node, 0, filter, NULL, D_TM_CSV, - D_TM_INCLUDE_METADATA, D_TM_ITER_READ, stdout); + d_tm_iterate(cli_ctx, node, 0, filter, NULL, D_TM_CSV, D_TM_INCLUDE_METADATA, iter_print, + stdout); } static void diff --git a/src/include/daos/drpc_modules.h 
b/src/include/daos/drpc_modules.h index 69aaf568673c..a8821d9f079f 100644 --- a/src/include/daos/drpc_modules.h +++ b/src/include/daos/drpc_modules.h @@ -1,5 +1,5 @@ /* - * (C) Copyright 2019-2022 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -33,43 +33,44 @@ enum drpc_sec_agent_method { }; enum drpc_mgmt_method { - DRPC_METHOD_MGMT_KILL_RANK = 201, - DRPC_METHOD_MGMT_SET_RANK = 202, - DRPC_METHOD_MGMT_GET_ATTACH_INFO = 206, - DRPC_METHOD_MGMT_POOL_CREATE = 207, - DRPC_METHOD_MGMT_POOL_DESTROY = 208, - DRPC_METHOD_MGMT_SET_UP = 209, - DRPC_METHOD_MGMT_BIO_HEALTH_QUERY = 210, - DRPC_METHOD_MGMT_SMD_LIST_DEVS = 211, - DRPC_METHOD_MGMT_SMD_LIST_POOLS = 212, - DRPC_METHOD_MGMT_POOL_GET_ACL = 213, - DRPC_METHOD_MGMT_POOL_OVERWRITE_ACL = 215, - DRPC_METHOD_MGMT_POOL_UPDATE_ACL = 216, - DRPC_METHOD_MGMT_POOL_DELETE_ACL = 217, - DRPC_METHOD_MGMT_PREP_SHUTDOWN = 218, - DRPC_METHOD_MGMT_DEV_SET_FAULTY = 220, - DRPC_METHOD_MGMT_DEV_REPLACE = 221, - DRPC_METHOD_MGMT_LIST_CONTAINERS = 222, - DRPC_METHOD_MGMT_POOL_QUERY = 223, - DRPC_METHOD_MGMT_POOL_SET_PROP = 224, - DRPC_METHOD_MGMT_PING_RANK = 225, - DRPC_METHOD_MGMT_REINTEGRATE = 226, - DRPC_METHOD_MGMT_CONT_SET_OWNER = 227, - DRPC_METHOD_MGMT_EXCLUDE = 228, - DRPC_METHOD_MGMT_EXTEND = 229, - DRPC_METHOD_MGMT_POOL_EVICT = 230, - DRPC_METHOD_MGMT_DRAIN = 231, - DRPC_METHOD_MGMT_GROUP_UPDATE = 232, - DRPC_METHOD_MGMT_NOTIFY_EXIT = 233, - DRPC_METHOD_MGMT_NOTIFY_POOL_CONNECT = 235, - DRPC_METHOD_MGMT_NOTIFY_POOL_DISCONNECT = 236, - DRPC_METHOD_MGMT_POOL_GET_PROP = 237, - DRPC_METHOD_MGMT_SET_LOG_MASKS = 238, - DRPC_METHOD_MGMT_POOL_UPGRADE = 239, - DRPC_METHOD_MGMT_POOL_QUERY_TARGETS = 240, - DRPC_METHOD_MGMT_LED_MANAGE = 241, + DRPC_METHOD_MGMT_KILL_RANK = 201, + DRPC_METHOD_MGMT_SET_RANK = 202, + DRPC_METHOD_MGMT_GET_ATTACH_INFO = 206, + DRPC_METHOD_MGMT_POOL_CREATE = 207, + DRPC_METHOD_MGMT_POOL_DESTROY = 208, + DRPC_METHOD_MGMT_SET_UP = 209, + 
DRPC_METHOD_MGMT_BIO_HEALTH_QUERY = 210, + DRPC_METHOD_MGMT_SMD_LIST_DEVS = 211, + DRPC_METHOD_MGMT_SMD_LIST_POOLS = 212, + DRPC_METHOD_MGMT_POOL_GET_ACL = 213, + DRPC_METHOD_MGMT_POOL_OVERWRITE_ACL = 215, + DRPC_METHOD_MGMT_POOL_UPDATE_ACL = 216, + DRPC_METHOD_MGMT_POOL_DELETE_ACL = 217, + DRPC_METHOD_MGMT_PREP_SHUTDOWN = 218, + DRPC_METHOD_MGMT_DEV_SET_FAULTY = 220, + DRPC_METHOD_MGMT_DEV_REPLACE = 221, + DRPC_METHOD_MGMT_LIST_CONTAINERS = 222, + DRPC_METHOD_MGMT_POOL_QUERY = 223, + DRPC_METHOD_MGMT_POOL_SET_PROP = 224, + DRPC_METHOD_MGMT_PING_RANK = 225, + DRPC_METHOD_MGMT_REINTEGRATE = 226, + DRPC_METHOD_MGMT_CONT_SET_OWNER = 227, + DRPC_METHOD_MGMT_EXCLUDE = 228, + DRPC_METHOD_MGMT_EXTEND = 229, + DRPC_METHOD_MGMT_POOL_EVICT = 230, + DRPC_METHOD_MGMT_DRAIN = 231, + DRPC_METHOD_MGMT_GROUP_UPDATE = 232, + DRPC_METHOD_MGMT_NOTIFY_EXIT = 233, + DRPC_METHOD_MGMT_NOTIFY_POOL_CONNECT = 235, + DRPC_METHOD_MGMT_NOTIFY_POOL_DISCONNECT = 236, + DRPC_METHOD_MGMT_POOL_GET_PROP = 237, + DRPC_METHOD_MGMT_SET_LOG_MASKS = 238, + DRPC_METHOD_MGMT_POOL_UPGRADE = 239, + DRPC_METHOD_MGMT_POOL_QUERY_TARGETS = 240, + DRPC_METHOD_MGMT_LED_MANAGE = 241, + DRPC_METHOD_MGMT_SETUP_CLIENT_TELEM = 242, - NUM_DRPC_MGMT_METHODS /* Must be last */ + NUM_DRPC_MGMT_METHODS /* Must be last */ }; enum drpc_srv_method { diff --git a/src/include/daos/metrics.h b/src/include/daos/metrics.h new file mode 100644 index 000000000000..a0b6f16f144f --- /dev/null +++ b/src/include/daos/metrics.h @@ -0,0 +1,82 @@ +/** + * (C) Copyright 2016-2024 Intel Corporation. 
+ * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +/** + * This file is part of daos + * + * src/include/daos/metrics.h + */ + +#ifndef __DAOS_METRICS_H__ +#define __DAOS_METRICS_H__ + +#include +#include +#include +#include + +#define DC_TM_JOB_ROOT_ID 256 +/* For now TLS is only enabled if metrics are enabled */ +#define DAOS_CLIENT_METRICS_DUMP_PATH "D_CLIENT_METRICS_DUMP_PATH" +#define DAOS_CLIENT_METRICS_ENABLE "D_CLIENT_METRICS_ENABLE" +#define DAOS_CLIENT_METRICS_RETAIN "D_CLIENT_METRICS_RETAIN" +extern bool daos_client_metric; +extern bool daos_client_metric_retain; + +struct daos_module_metrics { + /* Indicate where the keys should be instantiated */ + enum daos_module_tag dmm_tags; + + /** + * allocate metrics with path to ephemeral shmem for to the + * newly-created pool + */ + void *(*dmm_init)(const char *path, int tgt_id); + void (*dmm_fini)(void *data); + + /** + * Get the number of metrics allocated by this module in total (including all targets). + */ + int (*dmm_nr_metrics)(void); +}; + +/* Estimate of bytes per typical metric node */ +#define NODE_BYTES \ + (sizeof(struct d_tm_node_t) + sizeof(struct d_tm_metric_t) + 64 /* buffer for metadata */) +/* Estimate of bytes per histogram bucket */ +#define BUCKET_BYTES (sizeof(struct d_tm_bucket_t) + NODE_BYTES) +/* + Estimate of bytes per metric. + This is a generous high-water mark assuming most metrics are not using + histograms. May need adjustment if the balance of metrics changes. 
+*/ +#define PER_METRIC_BYTES \ + (NODE_BYTES + sizeof(struct d_tm_stats_t) + sizeof(struct d_tm_histogram_t) + BUCKET_BYTES) + +int +daos_metrics_init(enum daos_module_tag tag, uint32_t id, struct daos_module_metrics *metrics); +void +daos_metrics_fini(void); +int +daos_module_init_metrics(enum dss_module_tag tag, void **metrics, const char *path, int tgt_id); +void +daos_module_fini_metrics(enum dss_module_tag tag, void **metrics); + +int +daos_module_nr_pool_metrics(void); + +/** + * Called during library initialization to init metrics. + */ +int +dc_tm_init(void); + +/** + * Called during library finalization to free metrics resources + */ +void +dc_tm_fini(void); + +#endif /*__DAOS_METRICS_H__*/ diff --git a/src/include/daos/mgmt.h b/src/include/daos/mgmt.h index 4d999428c8c7..eee326c761bc 100644 --- a/src/include/daos/mgmt.h +++ b/src/include/daos/mgmt.h @@ -10,6 +10,7 @@ #ifndef __DC_MGMT_H__ #define __DC_MGMT_H__ +#include #include #include #include @@ -71,6 +72,9 @@ int dc_mgmt_net_get_num_srv_ranks(void); int dc_mgmt_get_sys_info(const char *sys, struct daos_sys_info **info); void dc_mgmt_put_sys_info(struct daos_sys_info *info); +int + dc_mgmt_tm_register(const char *sys, const char *jobid, key_t shm_key, uid_t *owner_uid); + int dc_get_attach_info(const char *name, bool all_ranks, struct dc_mgmt_sys_info *info, Mgmt__GetAttachInfoResp **respp); diff --git a/src/include/daos/pool.h b/src/include/daos/pool.h index 5764e9d4002c..0807dcfcf0d8 100644 --- a/src/include/daos/pool.h +++ b/src/include/daos/pool.h @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -91,6 +92,7 @@ struct dc_pool { pthread_rwlock_t dp_map_lock; struct pool_map *dp_map; tse_task_t *dp_map_task; + void **dp_metrics; /* highest known pool map version */ uint32_t dp_map_version_known; uint32_t dp_disconnecting:1, diff --git a/src/include/daos/tls.h b/src/include/daos/tls.h new file mode 100644 index 000000000000..8e9628b39daa --- /dev/null +++ 
b/src/include/daos/tls.h @@ -0,0 +1,121 @@ +/** + * (C) Copyright 2016-2024 Intel Corporation. + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +/** + * This file is part of daos + * + * src/include/daos/tls.h + */ + +#ifndef __DAOS_TLS_H__ +#define __DAOS_TLS_H__ + +#include +#include + +/** + * Stackable Module API + * Provides a modular interface to load and register server-side code on + * demand. A module is composed of: + * - a set of request handlers which are registered when the module is loaded. + * - a server-side API (see header files suffixed by "_srv") used for + * inter-module direct calls. + * + * For now, all loaded modules are assumed to be trustful, but sandboxes can be + * implemented in the future. + */ +/* + * Thead-local storage + */ +struct daos_thread_local_storage { + uint32_t dtls_tag; + void **dtls_values; +}; + +enum daos_module_tag { + DAOS_SYS_TAG = 1 << 0, /** only run on system xstream */ + DAOS_TGT_TAG = 1 << 1, /** only run on target xstream */ + DAOS_RDB_TAG = 1 << 2, /** only run on rdb xstream */ + DAOS_OFF_TAG = 1 << 3, /** only run on offload/helper xstream */ + DAOS_CLI_TAG = 1 << 4, /** only run on client stack */ + DAOS_SERVER_TAG = 0xff, /** run on all xstream */ +}; + +/* The module key descriptor for each xstream */ +struct daos_module_key { + /* Indicate where the keys should be instantiated */ + enum daos_module_tag dmk_tags; + + /* The position inside the daos_module_keys */ + int dmk_index; + /* init keys for context */ + void *(*dmk_init)(int tags, int xs_id, int tgt_id); + + /* fini keys for context */ + void (*dmk_fini)(int tags, void *data); +}; + +#define DAOS_MODULE_KEYS_NR 10 +struct daos_thread_local_storage * +dss_tls_get(void); +struct daos_thread_local_storage * +dc_tls_get(unsigned int tag); + +int +ds_tls_key_create(void); +int +dc_tls_key_create(void); +void +ds_tls_key_delete(void); +void +dc_tls_key_delete(void); + +struct daos_module_key * +daos_get_module_key(int index); + +/** + * Get 
value from context by the key + * + * Get value inside dtls by key. So each module will use this API to + * retrieve their own value in the thread context. + * + * \param[in] dtls the thread context. + * \param[in] key key used to retrieve the dtls_value. + * + * \retval the dtls_value retrieved by key. + */ +static inline void * +daos_module_key_get(struct daos_thread_local_storage *dtls, struct daos_module_key *key) +{ + D_ASSERT(key->dmk_index >= 0); + D_ASSERT(key->dmk_index < DAOS_MODULE_KEYS_NR); + D_ASSERT(daos_get_module_key(key->dmk_index) == key); + D_ASSERT(dtls != NULL); + + return dtls->dtls_values[key->dmk_index]; +} + +#define dss_module_key_get daos_module_key_get +#define dss_register_key daos_register_key +#define dss_unregister_key daos_unregister_key +#define dss_module_info daos_module_info +#define dss_module_tag daos_module_tag +#define dss_module_key daos_module_key +#define dss_thread_local_storage daos_thread_local_storage + +void +daos_register_key(struct daos_module_key *key); +void +daos_unregister_key(struct daos_module_key *key); +struct daos_thread_local_storage * +dc_tls_init(int tag, uint32_t pid); +void +dc_tls_fini(void); +struct daos_thread_local_storage * +dss_tls_init(int tag, int xs_id, int tgt_id); +void +dss_tls_fini(struct daos_thread_local_storage *dtls); + +#endif /*__DAOS_TLS_H__*/ diff --git a/src/include/daos_srv/daos_engine.h b/src/include/daos_srv/daos_engine.h index 06a927b8d3f0..116c486e9439 100644 --- a/src/include/daos_srv/daos_engine.h +++ b/src/include/daos_srv/daos_engine.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -54,84 +55,6 @@ extern unsigned int dss_instance_idx; /** Bypass for the nvme health check */ extern bool dss_nvme_bypass_health_check; -/** - * Stackable Module API - * Provides a modular interface to load and register server-side code on - * demand. A module is composed of: - * - a set of request handlers which are registered when the module is loaded. - * - a server-side API (see header files suffixed by "_srv") used for - * inter-module direct calls. - * - * For now, all loaded modules are assumed to be trustful, but sandboxes can be - * implemented in the future. - */ -/* - * Thead-local storage - */ -struct dss_thread_local_storage { - uint32_t dtls_tag; - void **dtls_values; -}; - -enum dss_module_tag { - DAOS_SYS_TAG = 1 << 0, /** only run on system xstream */ - DAOS_TGT_TAG = 1 << 1, /** only run on target xstream */ - DAOS_RDB_TAG = 1 << 2, /** only run on rdb xstream */ - DAOS_OFF_TAG = 1 << 3, /** only run on offload/helper xstream */ - DAOS_SERVER_TAG = 0xff, /** run on all xstream */ -}; - -/* The module key descriptor for each xstream */ -struct dss_module_key { - /* Indicate where the keys should be instantiated */ - enum dss_module_tag dmk_tags; - - /* The position inside the dss_module_keys */ - int dmk_index; - /* init keys for context */ - void *(*dmk_init)(int tags, int xs_id, int tgt_id); - - /* fini keys for context */ - void (*dmk_fini)(int tags, void *data); -}; - -extern pthread_key_t dss_tls_key; -extern struct dss_module_key *dss_module_keys[]; -#define DAOS_MODULE_KEYS_NR 10 - -static inline struct dss_thread_local_storage * -dss_tls_get() -{ - return (struct dss_thread_local_storage *) - pthread_getspecific(dss_tls_key); -} - -/** - * Get value from context by the key - * - * Get value inside dtls by key. So each module will use this API to - * retrieve their own value in the thread context. 
- * - * \param[in] dtls the thread context. - * \param[in] key key used to retrieve the dtls_value. - * - * \retval the dtls_value retrieved by key. - */ -static inline void * -dss_module_key_get(struct dss_thread_local_storage *dtls, - struct dss_module_key *key) -{ - D_ASSERT(key->dmk_index >= 0); - D_ASSERT(key->dmk_index < DAOS_MODULE_KEYS_NR); - D_ASSERT(dss_module_keys[key->dmk_index] == key); - D_ASSERT(dtls != NULL); - - return dtls->dtls_values[key->dmk_index]; -} - -void dss_register_key(struct dss_module_key *key); -void dss_unregister_key(struct dss_module_key *key); - /** pthread names are limited to 16 chars */ #define DSS_XS_NAME_LEN (32) @@ -172,7 +95,7 @@ static inline struct dss_module_info * dss_get_module_info(void) { struct dss_module_info *dmi; - struct dss_thread_local_storage *dtc; + struct daos_thread_local_storage *dtc; dtc = dss_tls_get(); dmi = (struct dss_module_info *) @@ -419,23 +342,6 @@ struct dss_module_ops { int srv_profile_stop(); int srv_profile_start(char *path, int avg); -struct dss_module_metrics { - /* Indicate where the keys should be instantiated */ - enum dss_module_tag dmm_tags; - - /** - * allocate metrics with path to ephemeral shmem for to the - * newly-created pool - */ - void *(*dmm_init)(const char *path, int tgt_id); - void (*dmm_fini)(void *data); - - /** - * Get the number of metrics allocated by this module in total (including all targets). - */ - int (*dmm_nr_metrics)(void); -}; - /** * Each module should provide a dss_module structure which defines the module * interface. 
The name of the allocated structure must be the library name @@ -481,7 +387,7 @@ struct dss_module { struct dss_module_ops *sm_mod_ops; /* Per-pool metrics (optional) */ - struct dss_module_metrics *sm_metrics; + struct daos_module_metrics *sm_metrics; }; /** diff --git a/src/include/gurt/telemetry_common.h b/src/include/gurt/telemetry_common.h index 12039c24a731..efb838befaef 100644 --- a/src/include/gurt/telemetry_common.h +++ b/src/include/gurt/telemetry_common.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2020-2023 Intel Corporation. + * (C) Copyright 2020-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -145,9 +145,12 @@ enum { }; enum { - D_TM_SERVER_PROCESS = 0x000, - D_TM_SERIALIZATION = 0x001, - D_TM_RETAIN_SHMEM = 0x002, + D_TM_SERVER_PROCESS = 0x000, + D_TM_SERIALIZATION = 0x001, + D_TM_RETAIN_SHMEM = 0x002, + D_TM_RETAIN_SHMEM_IF_NON_EMPTY = 0x004, + D_TM_OPEN_OR_CREATE = 0x008, + D_TM_MULTIPLE_WRITER_LOCK = 0x010, }; /** Output formats */ diff --git a/src/include/gurt/telemetry_consumer.h b/src/include/gurt/telemetry_consumer.h index f0b1d706be71..138633ced918 100644 --- a/src/include/gurt/telemetry_consumer.h +++ b/src/include/gurt/telemetry_consumer.h @@ -49,12 +49,21 @@ int d_tm_list(struct d_tm_context *ctx, struct d_tm_nodeList_t **head, int d_tm_list_subdirs(struct d_tm_context *ctx, struct d_tm_nodeList_t **head, struct d_tm_node_t *node, uint64_t *node_count, int max_depth); -void d_tm_iterate(struct d_tm_context *ctx, struct d_tm_node_t *node, - int level, int filter, char *path, int format, - int opt_fields, uint32_t ops, FILE *stream); + +typedef void (*d_tm_iter_cb_t)(struct d_tm_context *ctx, struct d_tm_node_t *node, int level, + char *path, int format, int opt_fields, void *cb_arg); + +void +d_tm_iterate(struct d_tm_context *ctx, struct d_tm_node_t *node, int level, int filter, char *path, + int format, int opt_fields, d_tm_iter_cb_t iter_cb, void *cb_arg); void d_tm_print_node(struct d_tm_context *ctx, struct 
d_tm_node_t *node, int level, char *name, int format, int opt_fields, FILE *stream); + +void + d_tm_reset_node(struct d_tm_context *ctx, struct d_tm_node_t *node, int level, char *path, + int format, int opt_fields, FILE *stream); + void d_tm_print_field_descriptors(int opt_fields, FILE *stream); void d_tm_print_counter(uint64_t val, char *name, int format, char *units, int opt_fields, FILE *stream); diff --git a/src/include/gurt/telemetry_producer.h b/src/include/gurt/telemetry_producer.h index 21f506fba383..0046acf12409 100644 --- a/src/include/gurt/telemetry_producer.h +++ b/src/include/gurt/telemetry_producer.h @@ -1,11 +1,12 @@ /** - * (C) Copyright 2020-2023 Intel Corporation. + * (C) Copyright 2020-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ #ifndef __TELEMETRY_PRODUCER_H__ #define __TELEMETRY_PRODUCER_H__ +#include #include /* Developer facing server API to write data */ @@ -23,12 +24,19 @@ void d_tm_dec_gauge(struct d_tm_node_t *metric, uint64_t value); /* Other server functions */ int d_tm_init(int id, uint64_t mem_size, int flags); +int + d_tm_init_with_name(int id, uint64_t mem_size, int flags, const char *root_name); int d_tm_init_histogram(struct d_tm_node_t *node, char *path, int num_buckets, int initial_width, int multiplier); int d_tm_add_metric(struct d_tm_node_t **node, int metric_type, char *desc, char *units, const char *fmt, ...); int d_tm_add_ephemeral_dir(struct d_tm_node_t **node, size_t size_bytes, const char *fmt, ...); +int + d_tm_attach_path_segment(key_t key, const char *fmt, ...); int d_tm_del_ephemeral_dir(const char *fmt, ...); +int + d_tm_try_del_ephemeral_dir(const char *fmt, ...); void d_tm_fini(void); + #endif /* __TELEMETRY_PRODUCER_H__ */ diff --git a/src/mgmt/cli_mgmt.c b/src/mgmt/cli_mgmt.c index aa640f4c99f0..2eff852eb9f1 100644 --- a/src/mgmt/cli_mgmt.c +++ b/src/mgmt/cli_mgmt.c @@ -24,6 +24,7 @@ #include "rpc.h" #include #include +#include int dc_cp(tse_task_t *task, void *data) @@ -1180,6 
+1181,90 @@ dc_mgmt_pool_find(struct dc_mgmt_sys *sys, const char *label, uuid_t puuid, return rc; } +int +dc_mgmt_tm_register(const char *sys, const char *jobid, key_t shm_key, uid_t *owner_uid) +{ + struct drpc_alloc alloc = PROTO_ALLOCATOR_INIT(alloc); + struct drpc *ctx; + Mgmt__ClientTelemetryReq req = MGMT__CLIENT_TELEMETRY_REQ__INIT; + Mgmt__ClientTelemetryResp *resp; + uint8_t *reqb; + size_t reqb_size; + Drpc__Call *dreq; + Drpc__Response *dresp; + int rc; + + if (owner_uid == NULL) + return -DER_INVAL; + + /* Connect to daos_agent. */ + D_ASSERT(dc_agent_sockpath != NULL); + rc = drpc_connect(dc_agent_sockpath, &ctx); + if (rc != -DER_SUCCESS) { + DL_ERROR(rc, "failed to connect to %s ", dc_agent_sockpath); + D_GOTO(out, 0); + } + + req.sys = (char *)sys; + req.jobid = dc_jobid; + req.shm_key = shm_key; + + reqb_size = mgmt__client_telemetry_req__get_packed_size(&req); + D_ALLOC(reqb, reqb_size); + if (reqb == NULL) { + D_GOTO(out_ctx, rc = -DER_NOMEM); + } + mgmt__client_telemetry_req__pack(&req, reqb); + + rc = drpc_call_create(ctx, DRPC_MODULE_MGMT, DRPC_METHOD_MGMT_SETUP_CLIENT_TELEM, &dreq); + if (rc != 0) { + D_FREE(reqb); + goto out_ctx; + } + dreq->body.len = reqb_size; + dreq->body.data = reqb; + + /* Make the call and get the response. 
*/ + rc = drpc_call(ctx, R_SYNC, dreq, &dresp); + if (rc != 0) { + DL_ERROR(rc, "Sending client telemetry setup request failed"); + goto out_dreq; + } + if (dresp->status != DRPC__STATUS__SUCCESS) { + D_ERROR("Client telemetry setup request unsuccessful: %d\n", dresp->status); + rc = -DER_UNINIT; + goto out_dresp; + } + + resp = mgmt__client_telemetry_resp__unpack(&alloc.alloc, dresp->body.len, dresp->body.data); + if (alloc.oom) + D_GOTO(out_dresp, rc = -DER_NOMEM); + if (resp == NULL) { + D_ERROR("failed to unpack SetupClientTelemetry response\n"); + rc = -DER_NOMEM; + goto out_dresp; + } + if (resp->status != 0) { + D_ERROR("SetupClientTelemetry(%s) failed: " DF_RC "\n", req.sys, + DP_RC(resp->status)); + rc = resp->status; + goto out_resp; + } + + *owner_uid = resp->agent_uid; + +out_resp: + mgmt__client_telemetry_resp__free_unpacked(resp, &alloc.alloc); +out_dresp: + drpc_response_free(dresp); +out_dreq: + drpc_call_free(dreq); +out_ctx: + drpc_close(ctx); +out: + return rc; +} + /** * Initialize management interface */ diff --git a/src/mgmt/svc.pb-c.c b/src/mgmt/svc.pb-c.c index c599d8f8aaf8..f8e4e7e52998 100644 --- a/src/mgmt/svc.pb-c.c +++ b/src/mgmt/svc.pb-c.c @@ -649,6 +649,86 @@ void mgmt__pool_monitor_req__free_unpacked assert(message->base.descriptor == &mgmt__pool_monitor_req__descriptor); protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator); } +void +mgmt__client_telemetry_req__init(Mgmt__ClientTelemetryReq *message) +{ + static const Mgmt__ClientTelemetryReq init_value = MGMT__CLIENT_TELEMETRY_REQ__INIT; + *message = init_value; +} +size_t +mgmt__client_telemetry_req__get_packed_size(const Mgmt__ClientTelemetryReq *message) +{ + assert(message->base.descriptor == &mgmt__client_telemetry_req__descriptor); + return protobuf_c_message_get_packed_size((const ProtobufCMessage *)(message)); +} +size_t +mgmt__client_telemetry_req__pack(const Mgmt__ClientTelemetryReq *message, uint8_t *out) +{ + assert(message->base.descriptor == 
&mgmt__client_telemetry_req__descriptor); + return protobuf_c_message_pack((const ProtobufCMessage *)message, out); +} +size_t +mgmt__client_telemetry_req__pack_to_buffer(const Mgmt__ClientTelemetryReq *message, + ProtobufCBuffer *buffer) +{ + assert(message->base.descriptor == &mgmt__client_telemetry_req__descriptor); + return protobuf_c_message_pack_to_buffer((const ProtobufCMessage *)message, buffer); +} +Mgmt__ClientTelemetryReq * +mgmt__client_telemetry_req__unpack(ProtobufCAllocator *allocator, size_t len, const uint8_t *data) +{ + return (Mgmt__ClientTelemetryReq *)protobuf_c_message_unpack( + &mgmt__client_telemetry_req__descriptor, allocator, len, data); +} +void +mgmt__client_telemetry_req__free_unpacked(Mgmt__ClientTelemetryReq *message, + ProtobufCAllocator *allocator) +{ + if (!message) + return; + assert(message->base.descriptor == &mgmt__client_telemetry_req__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void +mgmt__client_telemetry_resp__init(Mgmt__ClientTelemetryResp *message) +{ + static const Mgmt__ClientTelemetryResp init_value = MGMT__CLIENT_TELEMETRY_RESP__INIT; + *message = init_value; +} +size_t +mgmt__client_telemetry_resp__get_packed_size(const Mgmt__ClientTelemetryResp *message) +{ + assert(message->base.descriptor == &mgmt__client_telemetry_resp__descriptor); + return protobuf_c_message_get_packed_size((const ProtobufCMessage *)(message)); +} +size_t +mgmt__client_telemetry_resp__pack(const Mgmt__ClientTelemetryResp *message, uint8_t *out) +{ + assert(message->base.descriptor == &mgmt__client_telemetry_resp__descriptor); + return protobuf_c_message_pack((const ProtobufCMessage *)message, out); +} +size_t +mgmt__client_telemetry_resp__pack_to_buffer(const Mgmt__ClientTelemetryResp *message, + ProtobufCBuffer *buffer) +{ + assert(message->base.descriptor == &mgmt__client_telemetry_resp__descriptor); + return protobuf_c_message_pack_to_buffer((const ProtobufCMessage *)message, buffer); +} 
+Mgmt__ClientTelemetryResp * +mgmt__client_telemetry_resp__unpack(ProtobufCAllocator *allocator, size_t len, const uint8_t *data) +{ + return (Mgmt__ClientTelemetryResp *)protobuf_c_message_unpack( + &mgmt__client_telemetry_resp__descriptor, allocator, len, data); +} +void +mgmt__client_telemetry_resp__free_unpacked(Mgmt__ClientTelemetryResp *message, + ProtobufCAllocator *allocator) +{ + if (!message) + return; + assert(message->base.descriptor == &mgmt__client_telemetry_resp__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} static const ProtobufCFieldDescriptor mgmt__daos_resp__field_descriptors[1] = { { @@ -1740,3 +1820,77 @@ const ProtobufCMessageDescriptor mgmt__pool_monitor_req__descriptor = (ProtobufCMessageInit) mgmt__pool_monitor_req__init, NULL,NULL,NULL /* reserved[123] */ }; +static const ProtobufCFieldDescriptor mgmt__client_telemetry_req__field_descriptors[3] = { + { + "sys", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ + offsetof(Mgmt__ClientTelemetryReq, sys), NULL, &protobuf_c_empty_string, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "jobid", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ + offsetof(Mgmt__ClientTelemetryReq, jobid), NULL, &protobuf_c_empty_string, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "shm_key", 3, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_INT32, 0, /* quantifier_offset */ + offsetof(Mgmt__ClientTelemetryReq, shm_key), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned mgmt__client_telemetry_req__field_indices_by_name[] = { + 1, /* field[1] = jobid */ + 2, /* field[2] = shm_key */ + 0, /* field[0] = sys */ +}; +static const ProtobufCIntRange mgmt__client_telemetry_req__number_ranges[1 + 1] = {{1, 0}, {0, 3}}; +const ProtobufCMessageDescriptor mgmt__client_telemetry_req__descriptor = { + 
PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "mgmt.ClientTelemetryReq", + "ClientTelemetryReq", + "Mgmt__ClientTelemetryReq", + "mgmt", + sizeof(Mgmt__ClientTelemetryReq), + 3, + mgmt__client_telemetry_req__field_descriptors, + mgmt__client_telemetry_req__field_indices_by_name, + 1, + mgmt__client_telemetry_req__number_ranges, + (ProtobufCMessageInit)mgmt__client_telemetry_req__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor mgmt__client_telemetry_resp__field_descriptors[2] = { + { + "status", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_INT32, 0, /* quantifier_offset */ + offsetof(Mgmt__ClientTelemetryResp, status), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "agent_uid", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_INT32, 0, /* quantifier_offset */ + offsetof(Mgmt__ClientTelemetryResp, agent_uid), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned mgmt__client_telemetry_resp__field_indices_by_name[] = { + 1, /* field[1] = agent_uid */ + 0, /* field[0] = status */ +}; +static const ProtobufCIntRange mgmt__client_telemetry_resp__number_ranges[1 + 1] = {{1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor mgmt__client_telemetry_resp__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "mgmt.ClientTelemetryResp", + "ClientTelemetryResp", + "Mgmt__ClientTelemetryResp", + "mgmt", + sizeof(Mgmt__ClientTelemetryResp), + 2, + mgmt__client_telemetry_resp__field_descriptors, + mgmt__client_telemetry_resp__field_indices_by_name, + 1, + mgmt__client_telemetry_resp__number_ranges, + (ProtobufCMessageInit)mgmt__client_telemetry_resp__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; diff --git a/src/mgmt/svc.pb-c.h b/src/mgmt/svc.pb-c.h index 381b45534f35..789a636509b4 100644 --- a/src/mgmt/svc.pb-c.h +++ b/src/mgmt/svc.pb-c.h @@ -31,7 +31,8 @@ typedef struct _Mgmt__PrepShutdownReq Mgmt__PrepShutdownReq; typedef struct _Mgmt__PingRankReq 
Mgmt__PingRankReq; typedef struct _Mgmt__SetRankReq Mgmt__SetRankReq; typedef struct _Mgmt__PoolMonitorReq Mgmt__PoolMonitorReq; - +typedef struct _Mgmt__ClientTelemetryReq Mgmt__ClientTelemetryReq; +typedef struct _Mgmt__ClientTelemetryResp Mgmt__ClientTelemetryResp; /* --- enums --- */ @@ -223,7 +224,7 @@ struct _Mgmt__ClientNetHint { ProtobufCMessage base; /* - * CaRT OFI provider + * CaRT provider */ char *provider; /* @@ -378,6 +379,43 @@ struct _Mgmt__PoolMonitorReq { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_monitor_req__descriptor) \ , (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string } +struct _Mgmt__ClientTelemetryReq { + ProtobufCMessage base; + /* + * DAOS system identifier + */ + char *sys; + /* + * Job ID used for client telemetry + */ + char *jobid; + /* + * Client's shared memory segment key + */ + int32_t shm_key; +}; +#define MGMT__CLIENT_TELEMETRY_REQ__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT(&mgmt__client_telemetry_req__descriptor) \ + , (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0 \ + } + +struct _Mgmt__ClientTelemetryResp { + ProtobufCMessage base; + /* + * DAOS status code + */ + int32_t status; + /* + * UID of agent process + */ + int32_t agent_uid; +}; +#define MGMT__CLIENT_TELEMETRY_RESP__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT(&mgmt__client_telemetry_resp__descriptor) \ + , 0, 0 \ + } /* Mgmt__DaosResp methods */ void mgmt__daos_resp__init @@ -651,6 +689,36 @@ Mgmt__PoolMonitorReq * void mgmt__pool_monitor_req__free_unpacked (Mgmt__PoolMonitorReq *message, ProtobufCAllocator *allocator); +/* Mgmt__ClientTelemetryReq methods */ +void +mgmt__client_telemetry_req__init(Mgmt__ClientTelemetryReq *message); +size_t +mgmt__client_telemetry_req__get_packed_size(const Mgmt__ClientTelemetryReq *message); +size_t +mgmt__client_telemetry_req__pack(const Mgmt__ClientTelemetryReq *message, uint8_t *out); +size_t 
+mgmt__client_telemetry_req__pack_to_buffer(const Mgmt__ClientTelemetryReq *message, + ProtobufCBuffer *buffer); +Mgmt__ClientTelemetryReq * +mgmt__client_telemetry_req__unpack(ProtobufCAllocator *allocator, size_t len, const uint8_t *data); +void +mgmt__client_telemetry_req__free_unpacked(Mgmt__ClientTelemetryReq *message, + ProtobufCAllocator *allocator); +/* Mgmt__ClientTelemetryResp methods */ +void +mgmt__client_telemetry_resp__init(Mgmt__ClientTelemetryResp *message); +size_t +mgmt__client_telemetry_resp__get_packed_size(const Mgmt__ClientTelemetryResp *message); +size_t +mgmt__client_telemetry_resp__pack(const Mgmt__ClientTelemetryResp *message, uint8_t *out); +size_t +mgmt__client_telemetry_resp__pack_to_buffer(const Mgmt__ClientTelemetryResp *message, + ProtobufCBuffer *buffer); +Mgmt__ClientTelemetryResp * +mgmt__client_telemetry_resp__unpack(ProtobufCAllocator *allocator, size_t len, const uint8_t *data); +void +mgmt__client_telemetry_resp__free_unpacked(Mgmt__ClientTelemetryResp *message, + ProtobufCAllocator *allocator); /* --- per-message closures --- */ typedef void (*Mgmt__DaosResp_Closure) @@ -701,6 +769,10 @@ typedef void (*Mgmt__SetRankReq_Closure) typedef void (*Mgmt__PoolMonitorReq_Closure) (const Mgmt__PoolMonitorReq *message, void *closure_data); +typedef void (*Mgmt__ClientTelemetryReq_Closure)(const Mgmt__ClientTelemetryReq *message, + void *closure_data); +typedef void (*Mgmt__ClientTelemetryResp_Closure)(const Mgmt__ClientTelemetryResp *message, + void *closure_data); /* --- services --- */ @@ -724,6 +796,8 @@ extern const ProtobufCMessageDescriptor mgmt__prep_shutdown_req__descriptor; extern const ProtobufCMessageDescriptor mgmt__ping_rank_req__descriptor; extern const ProtobufCMessageDescriptor mgmt__set_rank_req__descriptor; extern const ProtobufCMessageDescriptor mgmt__pool_monitor_req__descriptor; +extern const ProtobufCMessageDescriptor mgmt__client_telemetry_req__descriptor; +extern const ProtobufCMessageDescriptor 
mgmt__client_telemetry_resp__descriptor; PROTOBUF_C__END_DECLS diff --git a/src/object/cli_mod.c b/src/object/cli_mod.c index 9bc4f14362c4..f39f95600f69 100644 --- a/src/object/cli_mod.c +++ b/src/object/cli_mod.c @@ -12,6 +12,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #include "obj_rpc.h" #include "obj_internal.h" @@ -19,14 +24,121 @@ unsigned int srv_io_mode = DIM_DTX_FULL_ENABLED; int dc_obj_proto_version; +static void * +dc_obj_tls_init(int tags, int xs_id, int pid) /* Allocates a struct dc_obj_tls and registers its telemetry nodes under paths prefixed with the caller's pthread_self() value: one active gauge per opcode, one latency gauge per opcode other than UPDATE, TGT_UPDATE and FETCH, and per-I/O-size latency gauges for update and fetch. Returns the new structure, or NULL if the allocation or any registration fails. tags and xs_id are not used; pid is only forwarded to obj_latency_tm_init(). */ +{ + struct dc_obj_tls *tls; + int opc; + int rc; + unsigned long tid = pthread_self(); /* NOTE(review): relies on pthread_t converting to an integer, which POSIX does not guarantee */ + + D_ALLOC_PTR(tls); + if (tls == NULL) + return NULL; + + /** register different per-opcode sensors */ + for (opc = 0; opc < OBJ_PROTO_CLI_COUNT; opc++) { + /** Start with number of active requests, of type gauge */ + rc = d_tm_add_metric(&tls->cot_op_active[opc], D_TM_STATS_GAUGE, + "number of active object RPCs", "ops", "%lu/io/ops/%s/active", + tid, obj_opc_to_str(opc)); + if (rc) { + D_WARN("Failed to create active counter: " DF_RC "\n", DP_RC(rc)); + D_GOTO(out, rc); + } + + if (opc == DAOS_OBJ_RPC_UPDATE || opc == DAOS_OBJ_RPC_TGT_UPDATE || + opc == DAOS_OBJ_RPC_FETCH) + /** See below, latency reported per size for those */ + continue; /* NOTE(review): TGT_UPDATE is skipped here as well, but only update and fetch get per-size gauges below */ + + /** And finally the per-opcode latency, of type gauge */ + rc = d_tm_add_metric(&tls->cot_op_lat[opc], D_TM_STATS_GAUGE, + "object RPC processing time", "us", "%lu/io/ops/%s/latency", + tid, obj_opc_to_str(opc)); + if (rc) { + D_WARN("Failed to create latency sensor: " DF_RC "\n", DP_RC(rc)); + D_GOTO(out, rc); + } + } + + /** + * Maintain per-I/O size latency for update & fetch RPCs + * of type gauge + */ + rc = obj_latency_tm_init(DAOS_OBJ_RPC_UPDATE, pid, tls->cot_update_lat, + obj_opc_to_str(DAOS_OBJ_RPC_UPDATE), "update RPC processing time", + false); + if (rc) + D_GOTO(out, rc); + + rc = obj_latency_tm_init(DAOS_OBJ_RPC_FETCH, pid, tls->cot_fetch_lat, + obj_opc_to_str(DAOS_OBJ_RPC_FETCH), "fetch RPC processing 
time", + false); + if (rc) + D_GOTO(out, rc); + +out: /* reached on success too: tls is freed and set to NULL only when rc is non-zero */ + if (rc) { + D_FREE(tls); + tls = NULL; + } + + return tls; +} + +static void +dc_obj_tls_fini(int tags, void *data) /* Frees the structure allocated by dc_obj_tls_init(); tags is not used. */ +{ + struct dc_obj_tls *tls = data; + + D_FREE(tls); +} + +struct daos_module_key dc_obj_module_key = { + .dmk_tags = DAOS_CLI_TAG, + .dmk_index = -1, + .dmk_init = dc_obj_tls_init, + .dmk_fini = dc_obj_tls_fini, +}; + +static void * +dc_obj_metrics_alloc(const char *path, int tgt_id) /* dmm_init hook: forwards to obj_metrics_alloc_internal() with server set to false. */ +{ + return obj_metrics_alloc_internal(path, tgt_id, false); +} + +static void +dc_obj_metrics_free(void *data) +{ + D_FREE(data); +} + +/* metrics per pool */ +struct daos_module_metrics dc_obj_metrics = { + .dmm_tags = DAOS_CLI_TAG, + .dmm_init = dc_obj_metrics_alloc, + .dmm_fini = dc_obj_metrics_free, + .dmm_nr_metrics = obj_metrics_count, +}; + /** * Initialize object interface */ int dc_obj_init(void) { - uint32_t ver_array[2] = {DAOS_OBJ_VERSION - 1, DAOS_OBJ_VERSION}; - int rc; + uint32_t ver_array[2] = {DAOS_OBJ_VERSION - 1, DAOS_OBJ_VERSION}; + int rc; + + if (daos_client_metric) { + daos_register_key(&dc_obj_module_key); + rc = daos_metrics_init(DAOS_CLI_TAG, DAOS_OBJ_MODULE, &dc_obj_metrics); + if (rc) { + DL_ERROR(rc, "register object failed"); + return rc; /* NOTE(review): dc_obj_module_key stays registered on this return; the visible out_utils path does not unregister it either -- confirm */ + } + } rc = obj_utils_init(); if (rc) @@ -78,6 +190,7 @@ dc_obj_init(void) out_utils: if (rc) obj_utils_fini(); + return rc; } @@ -94,4 +207,6 @@ dc_obj_fini(void) obj_ec_codec_fini(); obj_class_fini(); obj_utils_fini(); + if (daos_client_metric) + daos_unregister_key(&dc_obj_module_key); } diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c index 6179e1deb25b..71b65cdf025a 100644 --- a/src/object/cli_shard.c +++ b/src/object/cli_shard.c @@ -14,7 +14,9 @@ #include #include #include -#include "obj_rpc.h" +#include +#include +#include #include "obj_internal.h" static inline struct dc_obj_layout * @@ -104,6 +106,7 @@ struct rw_cb_args { daos_iom_t *maps; crt_endpoint_t tgt_ep; struct shard_rw_args *shard_args; + uint64_t send_time; /* set in dc_obj_shard_rw(): daos_get_ntime() when daos_client_metric is set, otherwise 0 */ }; static 
struct dcs_layout * @@ -886,6 +889,99 @@ dc_shard_update_size(struct rw_cb_args *rw_args, int fetch_rc) return rc; } +daos_size_t +obj_get_fetch_size(struct rw_cb_args *arg) /* Returns the byte count of a fetch reply: the packed size of the inline orw_sgls when any are present, otherwise the sum of orw_data_sizes if the request carried rwaa_sgls, otherwise 0. NOTE(review): not declared static -- confirm external linkage is wanted. */ +{ + struct obj_rw_out *orwo; + daos_size_t size = 0; + + orwo = crt_reply_get(arg->rpc); + + if (orwo->orw_sgls.ca_count > 0) { + /* inline transfer */ + size = + daos_sgls_packed_size(orwo->orw_sgls.ca_arrays, orwo->orw_sgls.ca_count, NULL); + } else if (arg->rwaa_sgls != NULL) { + /* bulk transfer */ + daos_size_t *replied_sizes = orwo->orw_data_sizes.ca_arrays; + int i; + + for (i = 0; i < orwo->orw_data_sizes.ca_count; i++) + size += replied_sizes[i]; + } + + return size; +} + +static void +obj_shard_update_metrics_begin(crt_rpc_t *rpc) /* Increments the per-opcode active gauge in the dc_obj_tls returned by dc_obj_tls_get(); does nothing when daos_client_metric is unset. */ +{ + struct dc_obj_tls *tls; + int opc; + + if (!daos_client_metric) + return; + + tls = dc_obj_tls_get(); + D_ASSERT(tls != NULL); + opc = opc_get(rpc->cr_opc); + d_tm_inc_gauge(tls->cot_op_active[opc], 1); +} + +static void +obj_shard_update_metrics_end(crt_rpc_t *rpc, uint64_t send_time, void *arg, int ret) /* Counterpart of obj_shard_update_metrics_begin(): decrements the active gauge and records the time elapsed since send_time in a latency gauge (per I/O size for update and fetch, per opcode otherwise); update and fetch also add the transferred bytes to the pool counters. arg is read as struct rw_cb_args only for DAOS_OBJ_RPC_UPDATE and DAOS_OBJ_RPC_FETCH. */ +{ + struct dc_obj_tls *tls; + struct rw_cb_args *rw_args; + struct dc_pool *pool; + struct obj_rw_in *orw; + struct d_tm_node_t *lat = NULL; + struct obj_pool_metrics *opm = NULL; + daos_size_t size; + uint64_t time; + int opc; + + if (!daos_client_metric || ret != 0) + return; /* NOTE(review): when ret != 0 this returns before d_tm_dec_gauge() below, so the increment made by obj_shard_update_metrics_begin() is not undone for failed RPCs -- confirm */ + tls = dc_obj_tls_get(); + D_ASSERT(tls != NULL); + opc = opc_get(rpc->cr_opc); + orw = crt_req_get(rpc); + d_tm_dec_gauge(tls->cot_op_active[opc], 1); + /** + * Measure latency of successful I/O only. + * Use bit shift for performance and tolerate some inaccuracy. 
+ */ + time = daos_get_ntime() - send_time; + time >>= 10; /* divides by 1024; approximates ns to us if daos_get_ntime() is in nanoseconds -- TODO confirm */ + + switch (opc) { + case DAOS_OBJ_RPC_UPDATE: + case DAOS_OBJ_RPC_FETCH: + rw_args = arg; + pool = rw_args->shard_args->auxi.obj_auxi->obj->cob_pool; + D_ASSERT(pool != NULL); + opm = pool->dp_metrics[DAOS_OBJ_MODULE]; + D_ASSERTF(opm != NULL, "pool %p\n", pool); + if (opc == DAOS_OBJ_RPC_UPDATE) { + size = daos_sgls_packed_size(rw_args->rwaa_sgls, orw->orw_nr, NULL); + d_tm_inc_counter(opm->opm_update_bytes, size); + lat = tls->cot_update_lat[lat_bucket(size)]; + } else { + size = obj_get_fetch_size(rw_args); + lat = tls->cot_fetch_lat[lat_bucket(size)]; + d_tm_inc_counter(opm->opm_fetch_bytes, size); + } + break; + default: + lat = tls->cot_op_lat[opc]; + break; + } + + if (lat != NULL) + d_tm_set_gauge(lat, time); +} + static int dc_rw_cb(tse_task_t *task, void *arg) { @@ -1191,10 +1287,15 @@ dc_rw_cb(tse_task_t *task, void *arg) out: if (rc == -DER_CSUM && opc == DAOS_OBJ_RPC_FETCH) dc_shard_csum_report(task, &rw_args->tgt_ep, rw_args->rpc); + + obj_shard_update_metrics_end(rw_args->rpc, rw_args->send_time, rw_args, + ret == 0 ? rc : ret); + crt_req_decref(rw_args->rpc); if (ret == 0 || obj_retry_error(rc)) ret = rc; + return ret; } @@ -1362,7 +1463,9 @@ dc_obj_shard_rw(struct dc_obj_shard *shard, enum obj_rpc_opc opc, rw_args.co = shard->do_co; rw_args.shard_args = args; /* remember the sgl to copyout the data inline for fetch */ - rw_args.rwaa_sgls = (opc == DAOS_OBJ_RPC_FETCH) ? sgls : NULL; + rw_args.rwaa_sgls = sgls; /* now kept for every opcode (previously fetch only); obj_shard_update_metrics_end() reads it to size updates */ + rw_args.send_time = daos_client_metric ? 
daos_get_ntime() : 0; + obj_shard_update_metrics_begin(req); if (args->reasb_req && args->reasb_req->orr_recov) { rw_args.maps = NULL; orw->orw_flags |= ORF_EC_RECOV; @@ -1421,6 +1524,7 @@ dc_obj_shard_rw(struct dc_obj_shard *shard, enum obj_rpc_opc opc, struct obj_punch_cb_args { crt_rpc_t *rpc; unsigned int *map_ver; + uint64_t send_time; }; static int @@ -1436,7 +1540,10 @@ obj_shard_punch_cb(tse_task_t *task, void *data) *cb_args->map_ver = obj_reply_map_version_get(rpc); } + obj_shard_update_metrics_end(cb_args->rpc, cb_args->send_time, cb_args, task->dt_result); + crt_req_decref(rpc); + return task->dt_result; } @@ -1480,6 +1587,8 @@ dc_obj_shard_punch(struct dc_obj_shard *shard, enum obj_rpc_opc opc, crt_req_addref(req); cb_args.rpc = req; cb_args.map_ver = &args->pa_auxi.map_ver; + cb_args.send_time = daos_client_metric ? daos_get_ntime() : 0; + obj_shard_update_metrics_begin(req); rc = tse_task_register_comp_cb(task, obj_shard_punch_cb, &cb_args, sizeof(cb_args)); if (rc != 0) @@ -1540,6 +1649,7 @@ struct obj_enum_args { d_iov_t *csum; struct dtx_epoch *epoch; daos_handle_t *th; + uint64_t send_time; }; /** @@ -1858,10 +1968,15 @@ dc_enumerate_cb(tse_task_t *task, void *arg) crt_bulk_free(oei->oei_bulk); if (oei->oei_kds_bulk != NULL) crt_bulk_free(oei->oei_kds_bulk); + + obj_shard_update_metrics_end(enum_args->rpc, enum_args->send_time, enum_args, + ret == 0 ? rc : ret); + crt_req_decref(enum_args->rpc); if (ret == 0 || obj_retry_error(rc)) ret = rc; + return ret; } @@ -2007,6 +2122,8 @@ dc_obj_shard_list(struct dc_obj_shard *obj_shard, enum obj_rpc_opc opc, enum_args.eaa_recxs = args->la_recxs; enum_args.epoch = &args->la_auxi.epoch; enum_args.th = &obj_args->th; + enum_args.send_time = daos_client_metric ? 
daos_get_ntime() : 0; + obj_shard_update_metrics_begin(req); rc = tse_task_register_comp_cb(task, dc_enumerate_cb, &enum_args, sizeof(enum_args)); if (rc != 0) @@ -2038,6 +2155,7 @@ struct obj_query_key_cb_args { struct dc_obj_shard *shard; struct dtx_epoch epoch; daos_handle_t th; + uint64_t send_time; }; static void @@ -2235,6 +2353,7 @@ obj_shard_query_key_cb(tse_task_t *task, void *data) D_SPIN_UNLOCK(&cb_args->obj->cob_spin); out: + obj_shard_update_metrics_end(rpc, cb_args->send_time, cb_args, rc); crt_req_decref(rpc); if (ret == 0 || obj_retry_error(rc)) ret = rc; @@ -2285,6 +2404,8 @@ dc_obj_shard_query_key(struct dc_obj_shard *shard, struct dtx_epoch *epoch, uint cb_args.epoch = *epoch; cb_args.th = th; cb_args.max_epoch = max_epoch; + cb_args.send_time = daos_client_metric ? daos_get_ntime() : 0; + obj_shard_update_metrics_begin(req); rc = tse_task_register_comp_cb(task, obj_shard_query_key_cb, &cb_args, sizeof(cb_args)); if (rc != 0) @@ -2328,6 +2449,7 @@ struct obj_shard_sync_cb_args { crt_rpc_t *rpc; daos_epoch_t *epoch; uint32_t *map_ver; + uint64_t send_time; }; static int @@ -2377,6 +2499,8 @@ obj_shard_sync_cb(tse_task_t *task, void *data) oso->oso_epoch, oso->oso_map_version); out: + obj_shard_update_metrics_end(rpc, cb_args->send_time, cb_args, rc); + crt_req_decref(rpc); return rc; } @@ -2418,10 +2542,11 @@ dc_obj_shard_sync(struct dc_obj_shard *shard, enum obj_rpc_opc opc, D_GOTO(out, rc); crt_req_addref(req); - cb_args.rpc = req; - cb_args.epoch = args->sa_epoch; - cb_args.map_ver = &args->sa_auxi.map_ver; - + cb_args.rpc = req; + cb_args.epoch = args->sa_epoch; + cb_args.map_ver = &args->sa_auxi.map_ver; + cb_args.send_time = daos_client_metric ? 
daos_get_ntime() : 0; + obj_shard_update_metrics_begin(req); rc = tse_task_register_comp_cb(task, obj_shard_sync_cb, &cb_args, sizeof(cb_args)); if (rc != 0) @@ -2455,8 +2580,9 @@ struct obj_k2a_args { unsigned int *eaa_map_ver; struct dtx_epoch *epoch; daos_handle_t *th; - daos_anchor_t *anchor; - uint32_t shard; + daos_anchor_t *anchor; + uint64_t send_time; + uint32_t shard; }; static int @@ -2511,6 +2637,8 @@ dc_k2a_cb(tse_task_t *task, void *arg) enum_anchor_copy(k2a_args->anchor, &oko->oko_anchor); dc_obj_shard2anchor(k2a_args->anchor, k2a_args->shard); out: + obj_shard_update_metrics_end(k2a_args->rpc, k2a_args->send_time, k2a_args, + ret == 0 ? rc : ret); if (k2a_args->eaa_obj != NULL) obj_shard_decref(k2a_args->eaa_obj); crt_req_decref(k2a_args->rpc); @@ -2584,6 +2712,8 @@ dc_obj_shard_key2anchor(struct dc_obj_shard *obj_shard, enum obj_rpc_opc opc, cb_args.th = &obj_args->th; cb_args.anchor = args->ka_anchor; cb_args.shard = obj_shard->do_shard_idx; + cb_args.send_time = daos_client_metric ? daos_get_ntime() : 0; + obj_shard_update_metrics_begin(req); rc = tse_task_register_comp_cb(task, dc_k2a_cb, &cb_args, sizeof(cb_args)); if (rc != 0) D_GOTO(out_eaa, rc); diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index 149915c10fa7..4d750c873328 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "obj_rpc.h" #include "obj_ec.h" @@ -535,6 +536,87 @@ struct dc_obj_verify_args { struct dc_obj_verify_cursor cursor; }; +/* + * Report latency on a per-I/O size. + * Buckets starts at [0; 256B[ and are increased by power of 2 + * (i.e. [256B; 512B[, [512B; 1KB[) up to [4MB; infinity[ + * Since 4MB = 2^22 and 256B = 2^8, this means + * (22 - 8 + 1) = 15 buckets plus the 4MB+ bucket, so + * 16 buckets in total. 
+ */ +#define NR_LATENCY_BUCKETS 16 + +struct dc_obj_tls { /* client-side object telemetry nodes, allocated and registered by dc_obj_tls_init() */ + /** Measure update/fetch latency based on I/O size (type = gauge) */ + struct d_tm_node_t *cot_update_lat[NR_LATENCY_BUCKETS]; + struct d_tm_node_t *cot_fetch_lat[NR_LATENCY_BUCKETS]; + + /** Measure per-operation latency in us (type = gauge) */ + struct d_tm_node_t *cot_op_lat[OBJ_PROTO_CLI_COUNT]; + /** Count number of per-opcode active requests (type = gauge) */ + struct d_tm_node_t *cot_op_active[OBJ_PROTO_CLI_COUNT]; +}; + +int +obj_latency_tm_init(uint32_t opc, int tgt_id, struct d_tm_node_t **tm, char *op, char *desc, + bool server); /* tm must have room for NR_LATENCY_BUCKETS entries */ +extern struct daos_module_key dc_obj_module_key; + +static inline struct dc_obj_tls * +dc_obj_tls_get() /* Looks up this module's dc_obj_tls through dc_tls_get() and daos_module_key_get(); asserts that the thread-local storage exists. */ +{ + struct daos_thread_local_storage *dtls; + + dtls = dc_tls_get(dc_obj_module_key.dmk_tags); + D_ASSERT(dtls != NULL); + return daos_module_key_get(dtls, &dc_obj_module_key); +} + +struct obj_pool_metrics { /* every member must remain a struct d_tm_node_t pointer: obj_metrics_count() derives the metric count from the size of this struct */ + /** Count number of total per-opcode requests (type = counter) */ + struct d_tm_node_t *opm_total[OBJ_PROTO_CLI_COUNT]; + /** Total number of bytes fetched (type = counter) */ + struct d_tm_node_t *opm_fetch_bytes; + /** Total number of bytes updated (type = counter) */ + struct d_tm_node_t *opm_update_bytes; + + /** Total number of silently restarted updates (type = counter) */ + struct d_tm_node_t *opm_update_restart; + /** Total number of resent update operations (type = counter) */ + struct d_tm_node_t *opm_update_resent; + /** Total number of retry update operations (type = counter) */ + struct d_tm_node_t *opm_update_retry; + /** Total number of EC full-stripe update operations (type = counter) */ + struct d_tm_node_t *opm_update_ec_full; + /** Total number of EC partial update operations (type = counter) */ + struct d_tm_node_t *opm_update_ec_partial; +}; + +void +obj_metrics_free(void *data); +int +obj_metrics_count(void); +void * +obj_metrics_alloc_internal(const char *path, int tgt_id, bool server); + +static inline unsigned int 
+lat_bucket(uint64_t size) /* Maps an I/O size to a latency bucket index: 0 for size up to and including 256, then one bucket per doubling (257..512 gives 1), and NR_LATENCY_BUCKETS - 1 for sizes above 4MB. */ +{ + int nr; + + if (size <= 256) + return 0; + + /** return number of leading zero-bits */ + nr = __builtin_clzl(size - 1); /* NOTE(review): __builtin_clzl takes unsigned long; the constants 42 and 56 below assume it is 64 bits wide */ + + /** >4MB, return last bucket */ + if (nr < 42) + return NR_LATENCY_BUCKETS - 1; + + return 56 - nr; +} + static inline int dc_cont2uuid(struct dc_cont *dc_cont, uuid_t *hdl_uuid, uuid_t *uuid) { diff --git a/src/object/obj_utils.c b/src/object/obj_utils.c index 8312c6719d89..f85409aee9b2 100644 --- a/src/object/obj_utils.c +++ b/src/object/obj_utils.c @@ -10,6 +10,10 @@ #define DDSUBSYS DDFAC(object) #include +#include +#include +#include +#include #include "obj_internal.h" static daos_size_t @@ -86,6 +90,150 @@ daos_iods_free(daos_iod_t *iods, int nr, bool need_free) D_FREE(iods); } +int +obj_latency_tm_init(uint32_t opc, int tgt_id, struct d_tm_node_t **tm, char *op, char *desc, + bool server) /* Registers NR_LATENCY_BUCKETS latency gauges in tm[], one per I/O size bucket (256B doubling up to 4MB, then GT4MB). Server paths end in tgt_N built from tgt_id; client paths start with the pthread_self() value and do not use tgt_id. opc is not used. */ +{ + unsigned int bucket_max = 256; + int i; + int rc = 0; + + for (i = 0; i < NR_LATENCY_BUCKETS; i++) { + char *path; /* NOTE(review): path is not checked after D_ASPRINTF before it is passed on -- confirm D_ASPRINTF failure handling */ + + if (server) { + if (bucket_max < 1024) /** B */ + D_ASPRINTF(path, "io/latency/%s/%uB/tgt_%u", op, bucket_max, + tgt_id); + else if (bucket_max < 1024 * 1024) /** KB */ + D_ASPRINTF(path, "io/latency/%s/%uKB/tgt_%u", op, bucket_max / 1024, + tgt_id); + else if (bucket_max <= 1024 * 1024 * 4) /** MB */ + D_ASPRINTF(path, "io/latency/%s/%uMB/tgt_%u", op, + bucket_max / (1024 * 1024), tgt_id); + else /** >4MB */ + D_ASPRINTF(path, "io/latency/%s/GT4MB/tgt_%u", op, tgt_id); + } else { + unsigned long tid = pthread_self(); + + if (bucket_max < 1024) /** B */ + D_ASPRINTF(path, "%lu/io/latency/%s/%uB", tid, op, bucket_max); + else if (bucket_max < 1024 * 1024) /** KB */ + D_ASPRINTF(path, "%lu/io/latency/%s/%uKB", tid, op, + bucket_max / 1024); + else if (bucket_max <= 1024 * 1024 * 4) /** MB */ + D_ASPRINTF(path, "%lu/io/latency/%s/%uMB", tid, op, + bucket_max / (1024 * 1024)); + else /** >4MB */ + D_ASPRINTF(path, "%lu/io/latency/%s/GT4MB", tid, op); + } + rc = d_tm_add_metric(&tm[i], D_TM_STATS_GAUGE, 
desc, "us", path); /* NOTE(review): path lands in the format-string position of d_tm_add_metric(); a percent sign in op would be interpreted as a conversion */ + if (rc) + D_WARN("Failed to create per-I/O size latency " + "sensor: " DF_RC "\n", + DP_RC(rc)); + D_FREE(path); + + bucket_max <<= 1; + } + + return rc; /* only the status of the last bucket: rc is overwritten on every iteration and earlier failures are just logged */ +} + +void +obj_metrics_free(void *data) +{ + D_FREE(data); +} + +int +obj_metrics_count(void) /* Number of pointer-sized slots in struct obj_pool_metrics, i.e. its metric count provided every member is a struct d_tm_node_t pointer. */ +{ + return (sizeof(struct obj_pool_metrics) / sizeof(struct d_tm_node_t *)); +} + +void * +obj_metrics_alloc_internal(const char *path, int tgt_id, bool server) /* Allocates a struct obj_pool_metrics and registers its counters under path; when server is true each metric path gets a /tgt_N suffix built from tgt_id. Registration failures are only logged with D_WARN, so the structure is returned even if some counters could not be created; NULL is returned only when the allocation fails. */ +{ + struct obj_pool_metrics *metrics; + char tgt_path[32]; + uint32_t opc; + int rc; + + D_ASSERT(tgt_id >= 0); + if (server) + snprintf(tgt_path, sizeof(tgt_path), "/tgt_%u", tgt_id); + else + tgt_path[0] = '\0'; + + D_ALLOC_PTR(metrics); + if (metrics == NULL) { + D_ERROR("failed to alloc object metrics"); /* NOTE(review): unlike the other log calls in this function, this message has no trailing newline */ + return NULL; + } + + /** register different per-opcode counters */ + for (opc = 0; opc < OBJ_PROTO_CLI_COUNT; opc++) { + /** Then the total number of requests, of type counter */ + rc = d_tm_add_metric(&metrics->opm_total[opc], D_TM_COUNTER, + "total number of processed object RPCs", "ops", "%s/ops/%s%s", + path, obj_opc_to_str(opc), tgt_path); + if (rc) + D_WARN("Failed to create total counter: " DF_RC "\n", DP_RC(rc)); + } + + /** Total number of silently restarted updates, of type counter */ + rc = d_tm_add_metric(&metrics->opm_update_restart, D_TM_COUNTER, + "total number of restarted update ops", "updates", "%s/restarted%s", + path, tgt_path); + if (rc) + D_WARN("Failed to create restarted counter: " DF_RC "\n", DP_RC(rc)); + + /** Total number of resent updates, of type counter */ + rc = d_tm_add_metric(&metrics->opm_update_resent, D_TM_COUNTER, + "total number of resent update RPCs", "updates", "%s/resent%s", path, + tgt_path); + if (rc) + D_WARN("Failed to create resent counter: " DF_RC "\n", DP_RC(rc)); + + /** Total number of retry updates locally, of type counter */ + rc = d_tm_add_metric(&metrics->opm_update_retry, D_TM_COUNTER, + "total number of retried update RPCs", "updates", "%s/retry%s", path, + tgt_path); + if (rc) 
+ D_WARN("Failed to create retry cnt sensor: " DF_RC "\n", DP_RC(rc)); + + /** Total bytes read */ + rc = d_tm_add_metric(&metrics->opm_fetch_bytes, D_TM_COUNTER, + "total number of bytes fetched/read", "bytes", "%s/xferred/fetch%s", + path, tgt_path); + if (rc) + D_WARN("Failed to create bytes fetch counter: " DF_RC "\n", DP_RC(rc)); + + /** Total bytes written */ + rc = d_tm_add_metric(&metrics->opm_update_bytes, D_TM_COUNTER, + "total number of bytes updated/written", "bytes", + "%s/xferred/update%s", path, tgt_path); + if (rc) + D_WARN("Failed to create bytes update counter: " DF_RC "\n", DP_RC(rc)); + + /** Total number of EC full-stripe update operations, of type counter */ + rc = d_tm_add_metric(&metrics->opm_update_ec_full, D_TM_COUNTER, + "total number of EC full-stripe updates", "updates", + "%s/EC_update/full_stripe%s", path, tgt_path); + if (rc) + D_WARN("Failed to create EC full stripe update counter: " DF_RC "\n", DP_RC(rc)); + + /** Total number of EC partial update operations, of type counter */ + rc = d_tm_add_metric(&metrics->opm_update_ec_partial, D_TM_COUNTER, + "total number of EC partial updates", "updates", + "%s/EC_update/partial%s", path, tgt_path); + if (rc) + D_WARN("Failed to create EC partial update counter: " DF_RC "\n", DP_RC(rc)); + + return metrics; /* returned even when some d_tm_add_metric() calls above failed */ +} + struct recx_rec { daos_recx_t *rr_recx; }; diff --git a/src/object/srv_internal.h b/src/object/srv_internal.h index 368595bbfb46..885a966c55cb 100644 --- a/src/object/srv_internal.h +++ b/src/object/srv_internal.h @@ -114,36 +114,6 @@ struct migrate_cont_hdl { void migrate_pool_tls_destroy(struct migrate_pool_tls *tls); -/* - * Report latency on a per-I/O size. - * Buckets starts at [0; 256B[ and are increased by power of 2 - * (i.e. [256B; 512B[, [512B; 1KB[) up to [4MB; infinity[ - * Since 4MB = 2^22 and 256B = 2^8, this means - * (22 - 8 + 1) = 15 buckets plus the 4MB+ bucket, so - * 16 buckets in total. 
- */ -#define NR_LATENCY_BUCKETS 16 - -struct obj_pool_metrics { - /** Count number of total per-opcode requests (type = counter) */ - struct d_tm_node_t *opm_total[OBJ_PROTO_CLI_COUNT]; - /** Total number of bytes fetched (type = counter) */ - struct d_tm_node_t *opm_fetch_bytes; - /** Total number of bytes updated (type = counter) */ - struct d_tm_node_t *opm_update_bytes; - - /** Total number of silently restarted updates (type = counter) */ - struct d_tm_node_t *opm_update_restart; - /** Total number of resent update operations (type = counter) */ - struct d_tm_node_t *opm_update_resent; - /** Total number of retry update operations (type = counter) */ - struct d_tm_node_t *opm_update_retry; - /** Total number of EC full-stripe update operations (type = counter) */ - struct d_tm_node_t *opm_update_ec_full; - /** Total number of EC partial update operations (type = counter) */ - struct d_tm_node_t *opm_update_ec_partial; -}; - struct obj_tls { d_sg_list_t ot_echo_sgl; d_list_t ot_pool_list; @@ -175,24 +145,6 @@ obj_tls_get() return dss_module_key_get(dss_tls_get(), &obj_module_key); } -static inline unsigned int -lat_bucket(uint64_t size) -{ - int nr; - - if (size <= 256) - return 0; - - /** return number of leading zero-bits */ - nr = __builtin_clzl(size - 1); - - /** >4MB, return last bucket */ - if (nr < 42) - return NR_LATENCY_BUCKETS - 1; - - return 56 - nr; -} - enum latency_type { BULK_LATENCY, BIO_LATENCY, diff --git a/src/object/srv_mod.c b/src/object/srv_mod.c index 4fd889bb7de0..ddb39b8e9fb7 100644 --- a/src/object/srv_mod.c +++ b/src/object/srv_mod.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "obj_rpc.h" #include "srv_internal.h" @@ -73,41 +74,6 @@ static struct daos_rpc_handler obj_handlers[] = { #undef X -static int -obj_latency_tm_init(uint32_t opc, int tgt_id, struct d_tm_node_t **tm, char *op, char *desc) -{ - unsigned int bucket_max = 256; - int i; - int rc = 0; - - for (i = 0; i < NR_LATENCY_BUCKETS; i++) { - char *path; - 
- if (bucket_max < 1024) /** B */ - D_ASPRINTF(path, "io/latency/%s/%uB/tgt_%u", - op, bucket_max, tgt_id); - else if (bucket_max < 1024 * 1024) /** KB */ - D_ASPRINTF(path, "io/latency/%s/%uKB/tgt_%u", - op, bucket_max / 1024, tgt_id); - else if (bucket_max <= 1024 * 1024 * 4) /** MB */ - D_ASPRINTF(path, "io/latency/%s/%uMB/tgt_%u", - op, bucket_max / (1024 * 1024), tgt_id); - else /** >4MB */ - D_ASPRINTF(path, "io/latency/%s/GT4MB/tgt_%u", - op, tgt_id); - - rc = d_tm_add_metric(&tm[i], D_TM_STATS_GAUGE, desc, "us", path); - if (rc) - D_WARN("Failed to create per-I/O size latency " - "sensor: "DF_RC"\n", DP_RC(rc)); - D_FREE(path); - - bucket_max <<= 1; - } - - return rc; -} - static void * obj_tls_init(int tags, int xs_id, int tgt_id) { @@ -158,27 +124,28 @@ obj_tls_init(int tags, int xs_id, int tgt_id) */ obj_latency_tm_init(DAOS_OBJ_RPC_UPDATE, tgt_id, tls->ot_update_lat, - obj_opc_to_str(DAOS_OBJ_RPC_UPDATE), "update RPC processing time"); + obj_opc_to_str(DAOS_OBJ_RPC_UPDATE), "update RPC processing time", + true); obj_latency_tm_init(DAOS_OBJ_RPC_FETCH, tgt_id, tls->ot_fetch_lat, - obj_opc_to_str(DAOS_OBJ_RPC_FETCH), "fetch RPC processing time"); + obj_opc_to_str(DAOS_OBJ_RPC_FETCH), "fetch RPC processing time", true); obj_latency_tm_init(DAOS_OBJ_RPC_TGT_UPDATE, tgt_id, tls->ot_tgt_update_lat, obj_opc_to_str(DAOS_OBJ_RPC_TGT_UPDATE), - "update tgt RPC processing time"); - obj_latency_tm_init(DAOS_OBJ_RPC_UPDATE, tgt_id, tls->ot_update_bulk_lat, - "bulk_update", "Bulk update processing time"); - obj_latency_tm_init(DAOS_OBJ_RPC_FETCH, tgt_id, tls->ot_fetch_bulk_lat, - "bulk_fetch", "Bulk fetch processing time"); - - obj_latency_tm_init(DAOS_OBJ_RPC_UPDATE, tgt_id, tls->ot_update_vos_lat, - "vos_update", "VOS update processing time"); - obj_latency_tm_init(DAOS_OBJ_RPC_FETCH, tgt_id, tls->ot_fetch_vos_lat, - "vos_fetch", "VOS fetch processing time"); - - obj_latency_tm_init(DAOS_OBJ_RPC_UPDATE, tgt_id, tls->ot_update_bio_lat, - "bio_update", "BIO update 
processing time"); - obj_latency_tm_init(DAOS_OBJ_RPC_FETCH, tgt_id, tls->ot_fetch_bio_lat, - "bio_fetch", "BIO fetch processing time"); + "update tgt RPC processing time", true); + obj_latency_tm_init(DAOS_OBJ_RPC_UPDATE, tgt_id, tls->ot_update_bulk_lat, "bulk_update", + "Bulk update processing time", true); + obj_latency_tm_init(DAOS_OBJ_RPC_FETCH, tgt_id, tls->ot_fetch_bulk_lat, "bulk_fetch", + "Bulk fetch processing time", true); + + obj_latency_tm_init(DAOS_OBJ_RPC_UPDATE, tgt_id, tls->ot_update_vos_lat, "vos_update", + "VOS update processing time", true); + obj_latency_tm_init(DAOS_OBJ_RPC_FETCH, tgt_id, tls->ot_fetch_vos_lat, "vos_fetch", + "VOS fetch processing time", true); + + obj_latency_tm_init(DAOS_OBJ_RPC_UPDATE, tgt_id, tls->ot_update_bio_lat, "bio_update", + "BIO update processing time", true); + obj_latency_tm_init(DAOS_OBJ_RPC_FETCH, tgt_id, tls->ot_fetch_bio_lat, "bio_fetch", + "BIO fetch processing time", true); return tls; } @@ -239,103 +206,14 @@ static struct dss_module_ops ds_obj_mod_ops = { static void * obj_metrics_alloc(const char *path, int tgt_id) { - struct obj_pool_metrics *metrics; - uint32_t opc; - int rc; - - D_ASSERT(tgt_id >= 0); - - D_ALLOC_PTR(metrics); - if (metrics == NULL) - return NULL; - - /** register different per-opcode counters */ - for (opc = 0; opc < OBJ_PROTO_CLI_COUNT; opc++) { - /** Then the total number of requests, of type counter */ - rc = d_tm_add_metric(&metrics->opm_total[opc], D_TM_COUNTER, - "total number of processed object RPCs", - "ops", "%s/ops/%s/tgt_%u", path, - obj_opc_to_str(opc), tgt_id); - if (rc) - D_WARN("Failed to create total counter: "DF_RC"\n", - DP_RC(rc)); - } - - /** Total number of silently restarted updates, of type counter */ - rc = d_tm_add_metric(&metrics->opm_update_restart, D_TM_COUNTER, - "total number of restarted update ops", "updates", - "%s/restarted/tgt_%u", path, tgt_id); - if (rc) - D_WARN("Failed to create restarted counter: "DF_RC"\n", - DP_RC(rc)); - - /** Total number 
of resent updates, of type counter */ - rc = d_tm_add_metric(&metrics->opm_update_resent, D_TM_COUNTER, - "total number of resent update RPCs", "updates", - "%s/resent/tgt_%u", path, tgt_id); - if (rc) - D_WARN("Failed to create resent counter: "DF_RC"\n", - DP_RC(rc)); - - /** Total number of retry updates locally, of type counter */ - rc = d_tm_add_metric(&metrics->opm_update_retry, D_TM_COUNTER, - "total number of retried update RPCs", "updates", - "%s/retry/tgt_%u", path, tgt_id); - if (rc) - D_WARN("Failed to create retry cnt sensor: "DF_RC"\n", DP_RC(rc)); - - /** Total bytes read */ - rc = d_tm_add_metric(&metrics->opm_fetch_bytes, D_TM_COUNTER, - "total number of bytes fetched/read", "bytes", - "%s/xferred/fetch/tgt_%u", path, tgt_id); - if (rc) - D_WARN("Failed to create bytes fetch counter: "DF_RC"\n", - DP_RC(rc)); - - /** Total bytes written */ - rc = d_tm_add_metric(&metrics->opm_update_bytes, D_TM_COUNTER, - "total number of bytes updated/written", "bytes", - "%s/xferred/update/tgt_%u", path, tgt_id); - if (rc) - D_WARN("Failed to create bytes update counter: "DF_RC"\n", - DP_RC(rc)); - - /** Total number of EC full-stripe update operations, of type counter */ - rc = d_tm_add_metric(&metrics->opm_update_ec_full, D_TM_COUNTER, - "total number of EC sull-stripe updates", "updates", - "%s/EC_update/full_stripe/tgt_%u", path, tgt_id); - if (rc) - D_WARN("Failed to create EC full stripe update counter: "DF_RC"\n", - DP_RC(rc)); - - /** Total number of EC partial update operations, of type counter */ - rc = d_tm_add_metric(&metrics->opm_update_ec_partial, D_TM_COUNTER, - "total number of EC sull-partial updates", "updates", - "%s/EC_update/partial/tgt_%u", path, tgt_id); - if (rc) - D_WARN("Failed to create EC partial update counter: "DF_RC"\n", - DP_RC(rc)); - - return metrics; -} - -static void -obj_metrics_free(void *data) -{ - D_FREE(data); -} - -static int -obj_metrics_count(void) -{ - return (sizeof(struct obj_pool_metrics) / sizeof(struct d_tm_node_t 
*));
+	return obj_metrics_alloc_internal(path, tgt_id, true);
 }
 
-struct dss_module_metrics obj_metrics = {
-	.dmm_tags = DAOS_TGT_TAG,
-	.dmm_init = obj_metrics_alloc,
-	.dmm_fini = obj_metrics_free,
-	.dmm_nr_metrics = obj_metrics_count,
+struct daos_module_metrics obj_metrics = {
+    .dmm_tags       = DAOS_TGT_TAG,
+    .dmm_init       = obj_metrics_alloc,
+    .dmm_fini       = obj_metrics_free,
+    .dmm_nr_metrics = obj_metrics_count,
 };
 
 struct dss_module obj_module = {
diff --git a/src/pool/cli.c b/src/pool/cli.c
index e688cd9ecd33..89f7eb256a14 100644
--- a/src/pool/cli.c
+++ b/src/pool/cli.c
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2016-2022 Intel Corporation.
+ * (C) Copyright 2016-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -15,9 +15,13 @@
 #define D_LOGFAC DD_FAC(pool)
 
 #include
+#include
+#include
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -32,6 +36,170 @@ struct rsvc_client_state {
 
 int dc_pool_proto_version;
 
+struct dc_pool_metrics {
+	d_list_t dp_pool_list; /* pool metrics list on this thread */
+	uuid_t   dp_uuid;
+	char     dp_path[D_TM_MAX_NAME_LEN];
+	void    *dp_metrics[DAOS_NR_MODULE];
+	int      dp_ref;
+};
+
+/**
+ * Destroy metrics for a specific pool.
+ *
+ * \param[in]	metrics	per-pool client metrics to tear down (the structure itself is not freed)
+ */
+static void
+dc_pool_metrics_free(struct dc_pool_metrics *metrics)
+{
+	int rc;
+
+	if (!daos_client_metric)
+		return;
+
+	daos_module_fini_metrics(DAOS_CLI_TAG, metrics->dp_metrics);
+	if (!daos_client_metric_retain) {
+		rc = d_tm_del_ephemeral_dir(metrics->dp_path);
+		if (rc != 0) {
+			D_WARN(DF_UUID ": failed to remove pool metrics dir for pool: " DF_RC "\n",
+			       DP_UUID(metrics->dp_uuid), DP_RC(rc));
+			return;
+		}
+	}
+
+	D_INFO(DF_UUID ": destroyed ds_pool metrics: %s\n", DP_UUID(metrics->dp_uuid),
+	       metrics->dp_path);
+}
+
+/**
+ * Allocate and initialize the client-side metrics of a pool.
+ *
+ * \param[in]	pool_uuid	UUID of the pool
+ * \param[out]	metrics_p	new metrics; only set on success with metrics enabled
+ *
+ * \return	0 on success or when client metrics are disabled, otherwise
+ *		-DER_NOMEM or the error returned while setting up the metrics
+ */
+static int
+dc_pool_metrics_alloc(uuid_t pool_uuid, struct dc_pool_metrics **metrics_p)
+{
+	struct dc_pool_metrics *metrics = NULL;
+	int                     pid;
+	size_t                  size;
+	int                     rc;
+
+	if (!daos_client_metric)
+		return 0;
+
+	D_ALLOC_PTR(metrics);
+	if (metrics == NULL)
+		return -DER_NOMEM;
+
+	uuid_copy(metrics->dp_uuid, pool_uuid);
+	pid = getpid();
+	snprintf(metrics->dp_path, sizeof(metrics->dp_path), "pool/" DF_UUIDF,
+		 DP_UUID(metrics->dp_uuid));
+
+	/** create new shmem space for per-pool metrics */
+	size = daos_module_nr_pool_metrics() * PER_METRIC_BYTES;
+	rc   = d_tm_add_ephemeral_dir(NULL, size, metrics->dp_path);
+	if (rc != 0) {
+		D_WARN(DF_UUID ": failed to create metrics dir for pool: " DF_RC "\n",
+		       DP_UUID(metrics->dp_uuid), DP_RC(rc));
+		D_FREE(metrics); /* nothing else owns the allocation yet */
+		return rc;
+	}
+
+	/* initialize the client metrics of each module for this pool */
+	rc = daos_module_init_metrics(DAOS_CLI_TAG, metrics->dp_metrics, metrics->dp_path, pid);
+	if (rc != 0) {
+		D_WARN(DF_UUID ": failed to initialize module metrics: " DF_RC "\n",
+		       DP_UUID(metrics->dp_uuid), DP_RC(rc));
+		dc_pool_metrics_free(metrics);
+		D_FREE(metrics); /* dc_pool_metrics_free() does not release the struct */
+		return rc;
+	}
+
+	D_INFO(DF_UUID ": created metrics for pool %s\n", DP_UUID(metrics->dp_uuid),
+	       metrics->dp_path);
+	*metrics_p = metrics;
+
+	return 0;
+}
+
+/**
+ * Look up the metrics of a pool in the metrics list of \a tls.
+ *
+ * \return	the matching metrics (no reference is taken), or NULL if not found
+ */
+struct dc_pool_metrics *
+dc_pool_metrics_lookup(struct dc_pool_tls *tls, uuid_t pool_uuid)
+{
+	struct dc_pool_metrics *metrics;
+
+	D_MUTEX_LOCK(&tls->dpc_metrics_list_lock);
+	d_list_for_each_entry(metrics, &tls->dpc_metrics_list, dp_pool_list) {
+		if (uuid_compare(pool_uuid, metrics->dp_uuid) == 0) {
+			D_MUTEX_UNLOCK(&tls->dpc_metrics_list_lock);
+			return metrics;
+		}
+	}
+	D_MUTEX_UNLOCK(&tls->dpc_metrics_list_lock);
+
+	return NULL;
+}
+
+/** Allocate the pool TLS and set up its (empty) metrics list and lock. */
+static void *
+dc_pool_tls_init(int tags, int xs_id, int pid)
+{
+	struct dc_pool_tls *tls;
+	int                 rc;
+
+	D_ALLOC_PTR(tls);
+	if (tls == NULL)
+		return NULL;
+
+	rc = D_MUTEX_INIT(&tls->dpc_metrics_list_lock, NULL);
+	if (rc != 0) {
+		D_FREE(tls);
+		return NULL;
+	}
+
+	D_INIT_LIST_HEAD(&tls->dpc_metrics_list);
+	return tls;
+}
+
+/** Release every pool metrics entry left on the list, then the TLS itself. */
+static void
+dc_pool_tls_fini(int tags, void *data)
+{
+	struct dc_pool_tls     *tls = data;
+	struct dc_pool_metrics *dpm;
+	struct dc_pool_metrics *tmp;
+
+	D_MUTEX_LOCK(&tls->dpc_metrics_list_lock);
+	d_list_for_each_entry_safe(dpm, tmp, &tls->dpc_metrics_list, dp_pool_list) {
+		if (dpm->dp_ref != 0)
+			D_WARN("still reference for pool " DF_UUID " metrics\n",
+			       DP_UUID(dpm->dp_uuid));
+		d_list_del_init(&dpm->dp_pool_list);
+		dc_pool_metrics_free(dpm);
+		D_FREE(dpm);
+	}
+	D_MUTEX_UNLOCK(&tls->dpc_metrics_list_lock);
+
+	D_MUTEX_DESTROY(&tls->dpc_metrics_list_lock);
+	D_FREE(tls);
+}
+
+struct daos_module_key dc_pool_module_key = {
+    .dmk_tags  = DAOS_CLI_TAG,
+    .dmk_index = -1,
+    .dmk_init  = dc_pool_tls_init,
+    .dmk_fini  = dc_pool_tls_fini,
+};
+
 /**
  * Initialize pool interface
  */
@@ -41,6 +209,9 @@ dc_pool_init(void)
 	uint32_t ver_array[2] = {DAOS_POOL_VERSION - 1, DAOS_POOL_VERSION};
 	int rc;
 
+	if (daos_client_metric)
+		daos_register_key(&dc_pool_module_key);
+
 	dc_pool_proto_version = 0;
 	rc = daos_rpc_proto_query(pool_proto_fmt_v4.cpf_base, ver_array, 2, &dc_pool_proto_version);
 	if (rc)
@@ -77,7 +248,76 @@ dc_pool_fini(void)
 	else
 		rc = daos_rpc_unregister(&pool_proto_fmt_v5);
 	if (rc != 0)
-		D_ERROR("failed to unregister pool RPCs: "DF_RC"\n", DP_RC(rc));
+		DL_ERROR(rc, "failed to unregister pool RPCs");
+
+	if (daos_client_metric)
+		daos_unregister_key(&dc_pool_module_key);
+}
+
+/**
+ * Attach \a pool to the TLS metrics entry of its pool UUID, creating the entry
+ * on first use. No-op when client metrics are disabled or already attached.
+ */
+static int
+dc_pool_metrics_start(struct dc_pool *pool)
+{
+	struct dc_pool_tls     *tls;
+	struct dc_pool_metrics *metrics;
+	int                     rc;
+
+	if (!daos_client_metric)
+		return 0;
+
+	if (pool->dp_metrics != NULL)
+		return 0;
+
+	tls = dc_pool_tls_get();
+	D_ASSERT(tls != NULL);
+
+	metrics = dc_pool_metrics_lookup(tls, pool->dp_pool);
+	if (metrics != NULL) {
+		metrics->dp_ref++;
+		pool->dp_metrics = metrics->dp_metrics;
+		return 0;
+	}
+
+	rc = dc_pool_metrics_alloc(pool->dp_pool, &metrics);
+	if (rc != 0)
+		return rc;
+
+	D_MUTEX_LOCK(&tls->dpc_metrics_list_lock);
+	d_list_add(&metrics->dp_pool_list, &tls->dpc_metrics_list);
+	D_MUTEX_UNLOCK(&tls->dpc_metrics_list_lock);
+	metrics->dp_ref++;
+	pool->dp_metrics = metrics->dp_metrics;
+
+	return 0;
+}
+
+/**
+ * Drop the reference taken by dc_pool_metrics_start() and detach \a pool. The
+ * metrics entry is not freed here; dc_pool_tls_fini() releases what is left.
+ */
+static void
+dc_pool_metrics_stop(struct dc_pool *pool)
+{
+	struct dc_pool_metrics *metrics;
+	struct dc_pool_tls     *tls;
+
+	if (!daos_client_metric)
+		return;
+
+	if (pool->dp_metrics == NULL)
+		return;
+
+	tls = dc_pool_tls_get();
+	D_ASSERT(tls != NULL);
+
+	metrics = dc_pool_metrics_lookup(tls, pool->dp_pool);
+	if (metrics != NULL)
+		metrics->dp_ref--;
+
+	pool->dp_metrics = NULL;
 }
 
 static void
@@ -99,6 +339,8 @@ pool_free(struct d_hlink *hlink)
 	if (pool->dp_map != NULL)
 		pool_map_decref(pool->dp_map);
 
+	dc_pool_metrics_stop(pool);
+
 	rsvc_client_fini(&pool->dp_client);
 	if (pool->dp_sys != NULL)
 		dc_mgmt_sys_detach(pool->dp_sys);
@@ -609,6 +851,10 @@ dc_pool_connect_internal(tse_task_t *task, daos_pool_info_t *info,
 		goto out;
 	}
 
+	rc = dc_pool_metrics_start(pool);
+	if (rc != 0)
+		D_GOTO(out, rc);
+
 	/** Pool connect RPC by UUID (provided, or looked up by label above) */
 	rc = pool_req_create(daos_task2ctx(task), &ep, POOL_CONNECT, &rpc);
 	if (rc != 0) {
@@ -1090,6 +1336,10 @@ dc_pool_g2l(struct dc_pool_glob *pool_glob, size_t len, daos_handle_t *poh)
 	if (rc < 0)
 		goto out;
 
+	rc = dc_pool_metrics_start(pool);
+	if (rc != 0)
+		goto out;
+
 	rc = pool_map_create(map_buf,
pool_glob->dpg_map_version, &map); if (rc != 0) { D_ERROR("failed to create local pool map: "DF_RC"\n", diff --git a/src/pool/cli_internal.h b/src/pool/cli_internal.h index f8f965b4469a..fd3b26539f8d 100644 --- a/src/pool/cli_internal.h +++ b/src/pool/cli_internal.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2022 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -16,4 +16,20 @@ struct dc_pool *dc_pool_alloc(unsigned int nr); int dc_pool_map_update(struct dc_pool *pool, struct pool_map *map, bool connect); +struct dc_pool_tls { + pthread_mutex_t dpc_metrics_list_lock; + d_list_t dpc_metrics_list; +}; + +extern struct daos_module_key dc_pool_module_key; + +static inline struct dc_pool_tls * +dc_pool_tls_get() +{ + struct daos_thread_local_storage *dtls; + + dtls = dc_tls_get(dc_pool_module_key.dmk_tags); + D_ASSERT(dtls != NULL); + return daos_module_key_get(dtls, &dc_pool_module_key); +} #endif /* __POOL_CLIENT_INTERNAL_H__ */ diff --git a/src/pool/srv.c b/src/pool/srv.c index 40f1d7d18eb5..8a7ba7d14efd 100644 --- a/src/pool/srv.c +++ b/src/pool/srv.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include "rpc.h" @@ -174,11 +175,11 @@ struct dss_module_key pool_module_key = { .dmk_fini = pool_tls_fini, }; -struct dss_module_metrics pool_metrics = { - .dmm_tags = DAOS_SYS_TAG, - .dmm_init = ds_pool_metrics_alloc, - .dmm_fini = ds_pool_metrics_free, - .dmm_nr_metrics = ds_pool_metrics_count, +struct daos_module_metrics pool_metrics = { + .dmm_tags = DAOS_SYS_TAG, + .dmm_init = ds_pool_metrics_alloc, + .dmm_fini = ds_pool_metrics_free, + .dmm_nr_metrics = ds_pool_metrics_count, }; struct dss_module pool_module = { diff --git a/src/pool/srv_metrics.c b/src/pool/srv_metrics.c index 0ca5b494df17..615af9deba1b 100644 --- a/src/pool/srv_metrics.c +++ b/src/pool/srv_metrics.c @@ -8,24 +8,9 @@ #include "srv_internal.h" #include +#include #include - -/* Estimate of bytes per typical metric 
node */ -#define NODE_BYTES (sizeof(struct d_tm_node_t) + \ - sizeof(struct d_tm_metric_t) + \ - 64 /* buffer for metadata */) -/* Estimate of bytes per histogram bucket */ -#define BUCKET_BYTES (sizeof(struct d_tm_bucket_t) + NODE_BYTES) -/* - Estimate of bytes per metric. - This is a generous high-water mark assuming most metrics are not using - histograms. May need adjustment if the balance of metrics changes. -*/ -#define PER_METRIC_BYTES (NODE_BYTES + sizeof(struct d_tm_stats_t) + \ - sizeof(struct d_tm_histogram_t) + \ - BUCKET_BYTES) - /** * Initializes the pool metrics */ diff --git a/src/proto/mgmt/svc.proto b/src/proto/mgmt/svc.proto index a284d645106a..129fecd53707 100644 --- a/src/proto/mgmt/svc.proto +++ b/src/proto/mgmt/svc.proto @@ -1,5 +1,5 @@ // -// (C) Copyright 2018-2023 Intel Corporation. +// (C) Copyright 2018-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -122,3 +122,16 @@ message PoolMonitorReq { string poolHandleUUID = 3; // Pool Handle UUID for the connection string jobid = 4; // Job ID to associate instance with. } + +message ClientTelemetryReq +{ + string sys = 1; // DAOS system identifier + string jobid = 2; // Job ID used for client telemetry + int32 shm_key = 3; // Client's shared memory segment key +} + +message ClientTelemetryResp +{ + int32 status = 1; // DAOS status code + int32 agent_uid = 2; // UID of agent process +} diff --git a/src/tests/ftest/telemetry/basic_client_telemetry.py b/src/tests/ftest/telemetry/basic_client_telemetry.py new file mode 100644 index 000000000000..1d115b4c95e5 --- /dev/null +++ b/src/tests/ftest/telemetry/basic_client_telemetry.py @@ -0,0 +1,54 @@ +""" + (C) Copyright 2024 Intel Corporation. + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +from ior_utils import read_data, write_data +from telemetry_test_base import TestWithClientTelemetry + + +class BasicClientTelemetry(TestWithClientTelemetry): + """Tests to verify basic client telemetry. 
+ + :avocado: recursive + """ + + def test_client_metrics_exist(self): + """JIRA ID: DAOS-8331. + + Verify that the client-side telemetry captures some throughput metrics. + After performing some I/O, there should be some client telemetry data. + + Test steps: + 1) Create a pool and container + 2) Perform some I/O with IOR + 3) Verify that there is some client telemetry data + + :avocado: tags=all,daily_regression + :avocado: tags=vm + :avocado: tags=telemetry + :avocado: tags=BasicClientTelemetry,test_client_metrics_exist + """ + # create pool and container + pool = self.get_pool(connect=True) + container = self.get_container(pool=pool) + + self.log_step('Writing data to the pool (ior)') + ior = write_data(self, container) + self.log_step('Reading data from the pool (ior)') + read_data(self, ior, container) + + metric_names = [ + "client_pool_xferred_fetch", + "client_pool_xferred_update", + ] + + self.log_step('Reading client telemetry (reads & writes should be > 0)') + after_metrics = self.telemetry.collect_client_data(metric_names) + for metric in metric_names: + msum = 0 + for value in after_metrics[metric].values(): + msum += value + self.assertGreater(msum, 0) + + self.log_step('Test passed') diff --git a/src/tests/ftest/telemetry/basic_client_telemetry.yaml b/src/tests/ftest/telemetry/basic_client_telemetry.yaml new file mode 100644 index 000000000000..d585dc81fda4 --- /dev/null +++ b/src/tests/ftest/telemetry/basic_client_telemetry.yaml @@ -0,0 +1,46 @@ +hosts: + test_servers: 1 + test_clients: 1 + +timeout: 180 + +server_config: + name: daos_server + engines_per_host: 1 + engines: + 0: + targets: 4 + nr_xs_helpers: 0 + storage: + 0: + class: ram + scm_mount: /mnt/daos + system_ram_reserved: 1 + +agent_config: + telemetry_port: 9191 + telemetry_retain: 30s + telemetry_enabled: true + +pool: + scm_size: 2G + +container: + type: POSIX + control_method: daos + dfs_oclass: SX + +ior: &ior_base + ppn: 4 + api: DFS + transfer_size: 512K + block_size: 1M + 
dfs_oclass: SX + +ior_write: + <<: *ior_base + flags: "-k -v -w -W -G 1" + +ior_read: + <<: *ior_base + flags: "-v -r -R -G 1" diff --git a/src/tests/ftest/util/agent_utils_params.py b/src/tests/ftest/util/agent_utils_params.py index 46b793f31ef8..7f92b9f479aa 100644 --- a/src/tests/ftest/util/agent_utils_params.py +++ b/src/tests/ftest/util/agent_utils_params.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -57,10 +57,19 @@ def __init__(self, filename, common_yaml): # Specifies the log level for agent logs. # - exclude_fabric_ifaces: , Ignore a subset of fabric interfaces when selecting # an interface for client applications. + # - telemetry_port: , e.g. 9192 + # Enable Prometheus endpoint for client telemetry. + # - telemetry_enabled: , e.g. True + # Enable client telemetry for all client processes. + # - telemetry_retain: , e.g. 5m + # Time to retain per-client telemetry data. self.runtime_dir = BasicParameter(None, "/var/run/daos_agent") self.log_file = LogParameter(log_dir, None, "daos_agent.log") self.control_log_mask = BasicParameter(None, "debug") self.exclude_fabric_ifaces = BasicParameter(None) + self.telemetry_port = BasicParameter(None) + self.telemetry_enabled = BasicParameter(None) + self.telemetry_retain = BasicParameter(None) def update_log_file(self, name): """Update the log file name for the daos agent. diff --git a/src/tests/ftest/util/telemetry_test_base.py b/src/tests/ftest/util/telemetry_test_base.py index 7641fe8d5465..6a2389935f70 100644 --- a/src/tests/ftest/util/telemetry_test_base.py +++ b/src/tests/ftest/util/telemetry_test_base.py @@ -1,10 +1,10 @@ """ -(C) Copyright 2021-2023 Intel Corporation. +(C) Copyright 2021-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent
 """
 from apricot import TestWithServers
-from telemetry_utils import TelemetryUtils
+from telemetry_utils import ClientTelemetryUtils, TelemetryUtils
 
 
 class TestWithTelemetry(TestWithServers):
@@ -263,3 +263,41 @@ def sum_values(metric_out):
             total += value
 
     return total
+
+
+class TestWithClientTelemetry(TestWithTelemetry):
+    """Test client telemetry metrics.
+
+    :avocado: recursive
+    """
+    def setUp(self):
+        """Set up each test case."""
+        super().setUp()
+        self.telemetry = ClientTelemetryUtils(
+            self.get_dmg_command(), self.server_managers[0].hosts, self.hostlist_clients)
+
+    def verify_client_telemetry_list(self, with_pools=False):
+        """Verify the telemetry metrics list reported for each client host.
+
+        Args:
+            with_pools (bool, optional): include pool metrics in the expected metric names.
+                Defaults to False.
+        """
+        # Define a list of expected telemetry metrics names
+        expected = self.telemetry.get_all_client_metrics_names(
+            with_pools=with_pools)
+
+        # List the telemetry metrics of the client hosts (list_metrics() queries the servers)
+        result = self.telemetry.list_client_metrics()
+
+        # Verify the lists are detected for each agent
+        errors = self.compare_lists(
+            list(result), self.hostlist_clients, 0, "",
+            "telemetry metrics list hosts")
+        for host, host_result in result.items():
+            errors.extend(
+                self.compare_lists(expected, host_result, 2, host, "telemetry metric names"))
+        if errors:
+            self.fail("\n".join(errors))
+
+        self.log.info("Test PASSED")
diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py
index d4d151af68fa..46fdd00c62fe 100644
--- a/src/tests/ftest/util/telemetry_utils.py
+++ b/src/tests/ftest/util/telemetry_utils.py
@@ -1,5 +1,5 @@
 """
-(C) Copyright 2021-2023 Intel Corporation.
+(C) Copyright 2021-2024 Intel Corporation.
SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -30,7 +30,7 @@ def _gen_stats_metrics(basename): class TelemetryUtils(): # pylint: disable=too-many-nested-blocks - """Defines a object used to verify telemetry information.""" + """Defines an object used to verify server telemetry information.""" # Define a set of patterns that shouldn't be used for comparisons. METRIC_EXCLUDE_PATTERNS = [ @@ -342,15 +342,13 @@ class TelemetryUtils(): ENGINE_NET_METRICS = [ "engine_net_glitch", "engine_net_failed_addr", + "engine_net_quota_exceeded", "engine_net_req_timeout", - "engine_net_swim_delay_stddev", - "engine_net_swim_delay_max", - "engine_net_swim_delay_mean", - "engine_net_swim_delay", - "engine_net_swim_delay_min", + *_gen_stats_metrics("engine_net_swim_delay"), "engine_net_uri_lookup_timeout", "engine_net_uri_lookup_other", - "engine_net_uri_lookup_self"] + "engine_net_uri_lookup_self", + "engine_net_waitq_depth"] ENGINE_RANK_METRICS = [ "engine_rank"] ENGINE_NVME_HEALTH_METRICS = [ @@ -475,7 +473,7 @@ def is_excluded_metric(self, name): return True return False - def list_metrics(self): + def list_metrics(self, hosts=None): """List the available metrics for each host. Returns: @@ -483,8 +481,9 @@ def list_metrics(self): """ info = {} - self.log.info("Listing telemetry metrics from %s", self.hosts) - for host in self.hosts: + host_list = hosts or self.hosts + self.log.info("Listing telemetry metrics from %s", host_list) + for host in host_list: data = self.dmg.telemetry_metrics_list(host=host) info[host] = [] if "response" in data: @@ -494,7 +493,7 @@ def list_metrics(self): info[host].append(entry["name"]) return info - def collect_data(self, names): + def collect_data(self, names, hosts=None): """Collect telemetry data for the specified metrics. Args: @@ -510,7 +509,9 @@ def collect_data(self, names): }, ... 
""" - return self._data.collect(self.log, names, self.hosts, self.dmg) + host_list = hosts or self.hosts + self.log.info("Collecting telemetry data from %s", host_list) + return self._data.collect(self.log, names, host_list, self.dmg) def display_data(self): """Display the telemetry metric values.""" @@ -531,7 +532,7 @@ def verify_data(self, ranges): """ return self._data.verify(self.log, ranges) - def get_metrics(self, name): + def get_metrics(self, name, hosts=None): """Obtain the specified metric information for each host. Args: @@ -543,8 +544,9 @@ def get_metrics(self, name): """ info = {} - self.log.info("Querying telemetry metric %s from %s", name, self.hosts) - for host in self.hosts: + host_list = hosts or self.hosts + self.log.info("Querying telemetry metric %s from %s", name, host_list) + for host in host_list: data = self.dmg.telemetry_metrics_query(host=host, metrics=name) info[host] = {} if "response" in data: @@ -812,6 +814,246 @@ def verify_metric_value(self, metrics_data, min_value=None, max_value=None): return status +class ClientTelemetryUtils(TelemetryUtils): + """Defines an object used to verify server and client telemetry information.""" + + CLIENT_EVENT_METRICS = [ + "client_started_at"] + CLIENT_POOL_ACTION_METRICS = [ + "client_pool_resent", + "client_pool_restarted", + "client_pool_retry", + "client_pool_xferred_fetch", + "client_pool_xferred_update"] + CLIENT_POOL_OPS_METRICS = [ + "client_pool_ops_akey_enum", + "client_pool_ops_akey_punch", + "client_pool_ops_compound", + "client_pool_ops_dkey_enum", + "client_pool_ops_dkey_punch", + "client_pool_ops_ec_agg", + "client_pool_ops_ec_rep", + "client_pool_ops_fetch", + "client_pool_ops_key2anchor", + "client_pool_ops_key_query", + "client_pool_ops_migrate", + "client_pool_ops_obj_coll_punch", + "client_pool_ops_obj_coll_query", + "client_pool_ops_obj_enum", + "client_pool_ops_obj_punch", + "client_pool_ops_obj_sync", + "client_pool_ops_recx_enum", + "client_pool_ops_tgt_akey_punch", + 
"client_pool_ops_tgt_dkey_punch", + "client_pool_ops_tgt_punch", + "client_pool_ops_tgt_update", + "client_pool_ops_update"] + CLIENT_POOL_EC_UPDATE_METRICS = [ + "client_pool_EC_update_full_stripe", + "client_pool_EC_update_partial"] + CLIENT_POOL_METRICS = CLIENT_POOL_ACTION_METRICS +\ + CLIENT_POOL_OPS_METRICS +\ + CLIENT_POOL_EC_UPDATE_METRICS + CLIENT_IO_LATENCY_FETCH_METRICS = \ + _gen_stats_metrics("client_io_latency_fetch") + CLIENT_IO_LATENCY_UPDATE_METRICS = \ + _gen_stats_metrics("client_io_latency_update") + CLIENT_IO_OPS_AKEY_ENUM_ACTIVE_METRICS = \ + _gen_stats_metrics("client_io_ops_akey_enum_active") + CLIENT_IO_OPS_AKEY_ENUM_LATENCY_METRICS = \ + _gen_stats_metrics("client_io_ops_akey_enum_latency") + CLIENT_IO_OPS_AKEY_PUNCH_ACTIVE_METRICS = \ + _gen_stats_metrics("client_io_ops_akey_punch_active") + CLIENT_IO_OPS_AKEY_PUNCH_LATENCY_METRICS = \ + _gen_stats_metrics("client_io_ops_akey_punch_latency") + CLIENT_IO_OPS_COMPOUND_ACTIVE_METRICS = \ + _gen_stats_metrics("client_io_ops_compound_active") + CLIENT_IO_OPS_COMPOUND_LATENCY_METRICS = \ + _gen_stats_metrics("client_io_ops_compound_latency") + CLIENT_IO_OPS_DKEY_ENUM_ACTIVE_METRICS = \ + _gen_stats_metrics("client_io_ops_dkey_enum_active") + CLIENT_IO_OPS_DKEY_ENUM_LATENCY_METRICS = \ + _gen_stats_metrics("client_io_ops_dkey_enum_latency") + CLIENT_IO_OPS_DKEY_PUNCH_ACTIVE_METRICS = \ + _gen_stats_metrics("client_io_ops_dkey_punch_active") + CLIENT_IO_OPS_DKEY_PUNCH_LATENCY_METRICS = \ + _gen_stats_metrics("client_io_ops_dkey_punch_latency") + CLIENT_IO_OPS_EC_AGG_ACTIVE_METRICS = \ + _gen_stats_metrics("client_io_ops_ec_agg_active") + CLIENT_IO_OPS_EC_AGG_LATENCY_METRICS = \ + _gen_stats_metrics("client_io_ops_ec_agg_latency") + CLIENT_IO_OPS_EC_REP_ACTIVE_METRICS = \ + _gen_stats_metrics("client_io_ops_ec_rep_active") + CLIENT_IO_OPS_EC_REP_LATENCY_METRICS = \ + _gen_stats_metrics("client_io_ops_ec_rep_latency") + CLIENT_IO_OPS_FETCH_ACTIVE_METRICS = \ + 
_gen_stats_metrics("client_io_ops_fetch_active")
+    CLIENT_IO_OPS_KEY2ANCHOR_ACTIVE_METRICS = \
+        _gen_stats_metrics("client_io_ops_key2anchor_active")
+    CLIENT_IO_OPS_KEY2ANCHOR_LATENCY_METRICS = \
+        _gen_stats_metrics("client_io_ops_key2anchor_latency")
+    CLIENT_IO_OPS_KEY_QUERY_ACTIVE_METRICS = \
+        _gen_stats_metrics("client_io_ops_key_query_active")
+    CLIENT_IO_OPS_KEY_QUERY_LATENCY_METRICS = \
+        _gen_stats_metrics("client_io_ops_key_query_latency")
+    CLIENT_IO_OPS_MIGRATE_ACTIVE_METRICS = \
+        _gen_stats_metrics("client_io_ops_migrate_active")
+    CLIENT_IO_OPS_MIGRATE_LATENCY_METRICS = \
+        _gen_stats_metrics("client_io_ops_migrate_latency")
+    CLIENT_IO_OPS_OBJ_COLL_PUNCH_ACTIVE_METRICS = \
+        _gen_stats_metrics("client_io_ops_obj_coll_punch_active")
+    CLIENT_IO_OPS_OBJ_COLL_PUNCH_LATENCY_METRICS = \
+        _gen_stats_metrics("client_io_ops_obj_coll_punch_latency")
+    CLIENT_IO_OPS_OBJ_COLL_QUERY_ACTIVE_METRICS = \
+        _gen_stats_metrics("client_io_ops_obj_coll_query_active")
+    CLIENT_IO_OPS_OBJ_COLL_QUERY_LATENCY_METRICS = \
+        _gen_stats_metrics("client_io_ops_obj_coll_query_latency")
+    CLIENT_IO_OPS_OBJ_ENUM_ACTIVE_METRICS = \
+        _gen_stats_metrics("client_io_ops_obj_enum_active")
+    CLIENT_IO_OPS_OBJ_ENUM_LATENCY_METRICS = \
+        _gen_stats_metrics("client_io_ops_obj_enum_latency")
+    CLIENT_IO_OPS_OBJ_PUNCH_ACTIVE_METRICS = \
+        _gen_stats_metrics("client_io_ops_obj_punch_active")
+    CLIENT_IO_OPS_OBJ_PUNCH_LATENCY_METRICS = \
+        _gen_stats_metrics("client_io_ops_obj_punch_latency")
+    CLIENT_IO_OPS_OBJ_SYNC_ACTIVE_METRICS = \
+        _gen_stats_metrics("client_io_ops_obj_sync_active")
+    CLIENT_IO_OPS_OBJ_SYNC_LATENCY_METRICS = \
+        _gen_stats_metrics("client_io_ops_obj_sync_latency")
+    CLIENT_IO_OPS_RECX_ENUM_ACTIVE_METRICS = \
+        _gen_stats_metrics("client_io_ops_recx_enum_active")
+    CLIENT_IO_OPS_RECX_ENUM_LATENCY_METRICS = \
+        _gen_stats_metrics("client_io_ops_recx_enum_latency")
+    CLIENT_IO_OPS_TGT_AKEY_PUNCH_ACTIVE_METRICS = \
+        _gen_stats_metrics("client_io_ops_tgt_akey_punch_active")
+    CLIENT_IO_OPS_TGT_AKEY_PUNCH_LATENCY_METRICS = \
+        _gen_stats_metrics("client_io_ops_tgt_akey_punch_latency")
+    CLIENT_IO_OPS_TGT_DKEY_PUNCH_ACTIVE_METRICS = \
+        _gen_stats_metrics("client_io_ops_tgt_dkey_punch_active")
+    CLIENT_IO_OPS_TGT_DKEY_PUNCH_LATENCY_METRICS = \
+        _gen_stats_metrics("client_io_ops_tgt_dkey_punch_latency")
+    CLIENT_IO_OPS_TGT_PUNCH_ACTIVE_METRICS = \
+        _gen_stats_metrics("client_io_ops_tgt_punch_active")
+    CLIENT_IO_OPS_TGT_PUNCH_LATENCY_METRICS = \
+        _gen_stats_metrics("client_io_ops_tgt_punch_latency")
+    CLIENT_IO_OPS_TGT_UPDATE_ACTIVE_METRICS = \
+        _gen_stats_metrics("client_io_ops_tgt_update_active")
+    CLIENT_IO_OPS_UPDATE_ACTIVE_METRICS = \
+        _gen_stats_metrics("client_io_ops_update_active")
+    CLIENT_IO_METRICS = CLIENT_IO_LATENCY_FETCH_METRICS +\
+        CLIENT_IO_LATENCY_UPDATE_METRICS +\
+        CLIENT_IO_OPS_AKEY_ENUM_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_AKEY_ENUM_LATENCY_METRICS +\
+        CLIENT_IO_OPS_AKEY_PUNCH_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_AKEY_PUNCH_LATENCY_METRICS +\
+        CLIENT_IO_OPS_COMPOUND_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_COMPOUND_LATENCY_METRICS +\
+        CLIENT_IO_OPS_DKEY_ENUM_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_DKEY_ENUM_LATENCY_METRICS +\
+        CLIENT_IO_OPS_DKEY_PUNCH_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_DKEY_PUNCH_LATENCY_METRICS +\
+        CLIENT_IO_OPS_EC_AGG_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_EC_AGG_LATENCY_METRICS +\
+        CLIENT_IO_OPS_EC_REP_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_EC_REP_LATENCY_METRICS +\
+        CLIENT_IO_OPS_FETCH_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_KEY2ANCHOR_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_KEY2ANCHOR_LATENCY_METRICS +\
+        CLIENT_IO_OPS_KEY_QUERY_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_KEY_QUERY_LATENCY_METRICS +\
+        CLIENT_IO_OPS_MIGRATE_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_MIGRATE_LATENCY_METRICS +\
+        CLIENT_IO_OPS_OBJ_COLL_PUNCH_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_OBJ_COLL_PUNCH_LATENCY_METRICS +\
+        CLIENT_IO_OPS_OBJ_COLL_QUERY_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_OBJ_COLL_QUERY_LATENCY_METRICS +\
+        CLIENT_IO_OPS_OBJ_ENUM_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_OBJ_ENUM_LATENCY_METRICS +\
+        CLIENT_IO_OPS_OBJ_PUNCH_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_OBJ_PUNCH_LATENCY_METRICS +\
+        CLIENT_IO_OPS_OBJ_SYNC_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_OBJ_SYNC_LATENCY_METRICS +\
+        CLIENT_IO_OPS_RECX_ENUM_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_RECX_ENUM_LATENCY_METRICS +\
+        CLIENT_IO_OPS_TGT_AKEY_PUNCH_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_TGT_AKEY_PUNCH_LATENCY_METRICS +\
+        CLIENT_IO_OPS_TGT_DKEY_PUNCH_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_TGT_DKEY_PUNCH_LATENCY_METRICS +\
+        CLIENT_IO_OPS_TGT_PUNCH_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_TGT_PUNCH_LATENCY_METRICS +\
+        CLIENT_IO_OPS_TGT_UPDATE_ACTIVE_METRICS +\
+        CLIENT_IO_OPS_UPDATE_ACTIVE_METRICS
+
+    def __init__(self, dmg, servers, clients):
+        """Create a ClientTelemetryUtils object.
+
+        Args:
+            dmg (DmgCommand): the DmgCommand object configured to communicate
+                with the servers
+            servers (list): a list of server host names
+            clients (list): a list of client host names
+        """
+        super().__init__(dmg, servers)
+        self.clients = NodeSet.fromlist(clients)
+
+    def get_all_client_metrics_names(self, with_pools=False):
+        """Get all the telemetry metrics names for this client.
+
+        Args:
+            with_pools (bool): if True, include pool metrics in the results
+
+        Returns:
+            list: all of the telemetry metrics names for this client
+
+        """
+        all_metrics_names = list(self.CLIENT_EVENT_METRICS)
+        all_metrics_names.extend(self.CLIENT_IO_METRICS)
+        if with_pools:
+            all_metrics_names.extend(self.CLIENT_POOL_METRICS)
+
+        return all_metrics_names
+
+    def list_client_metrics(self):
+        """List the available metrics for each host.
+
+        Returns:
+            dict: a dictionary of host keys linked to a list of metric names
+
+        """
+        return super().list_metrics(hosts=self.clients)
+
+    def collect_client_data(self, names):
+        """Collect telemetry data for the specified metrics.
+ + Args: + names (list): list of metric names + + Returns: + dict: dictionary of metric values keyed by the metric name and combination of metric + labels and values, e.g. + : { + : , + : , + ... + }, + ... + """ + return super().collect_data(names, hosts=self.clients) + + def get_client_metrics(self, name): + """Obtain the specified metric information for each host. + + Args: + name (str): Comma-separated list of metric names to query. + + Returns: + dict: a dictionary of host keys linked to metric data for each + metric name specified + + """ + return super().get_metrics(name, hosts=self.clients) + + class MetricData(): """Defines a object used to collect, display, and verify telemetry metric data.""" @@ -890,9 +1132,10 @@ def verify(self, log, ranges): log.info(format_str, *['-' * self._display['widths'][name] for name in columns]) for metric in sorted(self._display['data']): for value, labels in self._display['data'][metric].items(): - log.info( - format_str, metric, *self._label_values(labels), value, - *self._label_values(labels, ['check'])) + for label in labels: + log.info( + format_str, metric, *self._label_values(label), value, + *self._label_values(label, ['check'])) return status def _get_metrics(self, log, names, hosts, dmg): diff --git a/src/utils/daos_metrics/daos_metrics.c b/src/utils/daos_metrics/daos_metrics.c index 2b0e9af1b574..f2133237587b 100644 --- a/src/utils/daos_metrics/daos_metrics.c +++ b/src/utils/daos_metrics/daos_metrics.c @@ -10,8 +10,9 @@ #include #include -#include "gurt/telemetry_common.h" -#include "gurt/telemetry_consumer.h" +#include +#include +#include static void print_usage(const char *prog_name) @@ -52,30 +53,90 @@ print_usage(const char *prog_name) "--gauge, -g\n" "\tInclude gauges\n" "--read, -r\n" + "\tInclude timestamp of when metric was read\n" "--reset, -e\n" - "\tInclude timestamp of when metric was read\n", + "\tReset all metrics to zero\n" + "--jobid, -j\n" + "\tDisplay metrics of the specified job\n", prog_name); 
}
 
-int
-main(int argc, char **argv)
+static int
+process_metrics(int metric_id, char *dirname, int format, int filter, int extra_descriptors,
+		int delay, int num_iter, d_tm_iter_cb_t iter_cb, void *arg)
 {
 	struct d_tm_node_t	*root = NULL;
 	struct d_tm_node_t	*node = NULL;
 	struct d_tm_context	*ctx = NULL;
+	int                  iteration = 0;
+	int                  rc        = 0;
+
+	ctx = d_tm_open(metric_id);
+	if (!ctx) /* attach failed: return an error so main() prints its diagnostic and exits non-zero */
+		D_GOTO(out, rc = -DER_NONEXIST);
+
+	root = d_tm_get_root(ctx);
+	if (!root)
+		D_GOTO(out, rc = -DER_NONEXIST);
+
+	if (strncmp(dirname, "/", D_TM_MAX_NAME_LEN) != 0) { /* a sub-path was requested */
+		node = d_tm_find_metric(ctx, dirname);
+		if (node != NULL) {
+			root = node;
+		} else {
+			printf("No metrics found at: '%s'\n", dirname);
+			D_GOTO(out, rc = 0); /* already reported above; not treated as a failure */
+		}
+	}
+
+	if (format == D_TM_CSV)
+		d_tm_print_field_descriptors(extra_descriptors, (FILE *)arg);
+
+	while ((num_iter == 0) || (iteration < num_iter)) { /* num_iter == 0 loops forever */
+		d_tm_iterate(ctx, root, 0, filter, NULL, format, extra_descriptors, iter_cb, arg);
+		iteration++;
+		sleep(delay);
+		if (format == D_TM_STANDARD)
+			printf("\n\n");
+	}
+
+out:
+	if (ctx != NULL)
+		d_tm_close(&ctx);
+	return rc;
+}
+
+static void
+iter_print(struct d_tm_context *ctx, struct d_tm_node_t *node, int level, char *path, int format,
+	   int opt_fields, void *arg)
+{
+	d_tm_print_node(ctx, node, level, path, format, opt_fields, (FILE *)arg);
+}
+
+static void
+iter_reset(struct d_tm_context *ctx, struct d_tm_node_t *node, int level, char *path, int format,
+	   int opt_fields, void *arg)
+{
+	d_tm_reset_node(ctx, node, level, path, format, opt_fields, (FILE *)arg);
+}
+
+int
+main(int argc, char **argv)
+{
 	char		dirname[D_TM_MAX_NAME_LEN] = {0};
+	char		jobid[D_TM_MAX_NAME_LEN] = {0};
 	bool		show_meta = false;
 	bool		show_when_read = false;
 	bool		show_type = false;
-	int		srv_idx = 0;
-	int		iteration = 0;
+	int             srv_idx = 0;
 	int		num_iter = 1;
 	int		filter = 0;
 	int		delay = 1;
 	int		format = D_TM_STANDARD;
 	int		opt;
 	int		extra_descriptors = 0;
-	uint32_t	ops = 0;
+	d_tm_iter_cb_t  iter_cb = NULL;
+	int             rc;
 
 	sprintf(dirname, "/");
 
@@ -96,10 +157,11 @@ main(int argc, char **argv)
 			{"type", no_argument, NULL, 'T'},
 			{"read", no_argument, NULL, 'r'},
 			{"reset", no_argument, NULL, 'e'},
+			{"jobid", required_argument, NULL, 'j'},
 			{"help", no_argument, NULL, 'h'},
 			{NULL, 0, NULL, 0}};
 
-		opt = getopt_long_only(argc, argv, "S:cCdtsgi:p:D:MmTrhe", long_options, NULL);
+		opt = getopt_long_only(argc, argv, "S:cCdtsgi:p:D:MmTrj:he", long_options, NULL);
 		if (opt == -1)
 			break;
 
@@ -147,7 +209,10 @@ main(int argc, char **argv)
 			delay = atoi(optarg);
 			break;
 		case 'e':
-			ops |= D_TM_ITER_RESET;
+			iter_cb = iter_reset;
+			break;
+		case 'j':
+			snprintf(jobid, sizeof(jobid), "%s", optarg);
 			break;
 		case 'h':
 		case '?':
@@ -157,37 +222,13 @@ main(int argc, char **argv)
 		}
 	}
 
-	if (ops == 0)
-		ops |= D_TM_ITER_READ;
+	if (iter_cb == NULL)
+		iter_cb = iter_print;
 
 	if (filter == 0)
 		filter = D_TM_COUNTER | D_TM_DURATION | D_TM_TIMESTAMP | D_TM_MEMINFO |
 			 D_TM_TIMER_SNAPSHOT | D_TM_GAUGE | D_TM_STATS_GAUGE;
 
-	ctx = d_tm_open(srv_idx);
-	if (!ctx)
-		goto failure;
-
-	root = d_tm_get_root(ctx);
-	if (!root)
-		goto failure;
-
-	if (strncmp(dirname, "/", D_TM_MAX_NAME_LEN) != 0) {
-		node = d_tm_find_metric(ctx, dirname);
-		if (node != NULL) {
-			root = node;
-		} else {
-			printf("No metrics found at: '%s'\n", dirname);
-			exit(0);
-		}
-	}
-
-	if (format == D_TM_CSV)
-		filter &= ~D_TM_DIRECTORY;
-	else
-		filter |= D_TM_DIRECTORY;
-
-
 	if (show_when_read)
 		extra_descriptors |= D_TM_INCLUDE_TIMESTAMP;
 	if (show_meta)
@@ -196,27 +237,24 @@ main(int argc, char **argv)
 		extra_descriptors |= D_TM_INCLUDE_TYPE;
 
 	if (format == D_TM_CSV)
-		d_tm_print_field_descriptors(extra_descriptors, stdout);
+		filter &= ~D_TM_DIRECTORY;
+	else
+		filter |= D_TM_DIRECTORY;
 
-	while ((num_iter == 0) || (iteration < num_iter)) {
-		d_tm_iterate(ctx, root, 0, filter, NULL, format, extra_descriptors,
-			     ops, stdout);
-		iteration++;
-		sleep(delay);
-		if (format == D_TM_STANDARD)
-			printf("\n\n");
+	if (strlen(jobid) > 0) { /* -j given: read the job's metrics instead of an engine's */
+		srv_idx = DC_TM_JOB_ROOT_ID;
+		snprintf(dirname, sizeof(dirname), "%s", jobid);
 	}
 
-	d_tm_close(&ctx);
-	return 0;
-
-failure:
-	printf("Unable to attach to the shared memory for the server index: %d"
-	       "\nMake sure to run the I/O Engine with the same index to "
-	       "initialize the shared memory and populate it with metrics.\n"
-	       "Verify user/group settings match those that started the I/O "
-	       "Engine.\n",
-	       srv_idx);
-	d_tm_close(&ctx);
-	return -1;
+	/* read (or reset) the metrics under srv_idx: an engine index, or the job root when -j is set */
+	rc = process_metrics(srv_idx, dirname, format, filter, extra_descriptors, delay, num_iter,
+			     iter_cb, stdout);
+	if (rc)
+		printf("Unable to attach to the shared memory for the server index: %d"
+		       "\nMake sure to run the I/O Engine with the same index to "
+		       "initialize the shared memory and populate it with metrics.\n"
+		       "Verify user/group settings match those that started the I/O "
+		       "Engine.\n",
+		       srv_idx);
+	return rc != 0 ? -1 : 0;
 }
diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c
index 4d62b45b6094..2a59197f709b 100644
--- a/src/vos/vos_common.c
+++ b/src/vos/vos_common.c
@@ -13,6 +13,7 @@
 
 #include
 #include
+#include
 #include
 #include
 #include
@@ -821,11 +822,11 @@ vos_metrics_alloc(const char *path, int tgt_id)
 	return vp_metrics;
 }
 
-struct dss_module_metrics vos_metrics = {
-	.dmm_tags = DAOS_TGT_TAG,
-	.dmm_init = vos_metrics_alloc,
-	.dmm_fini = vos_metrics_free,
-	.dmm_nr_metrics = vos_metrics_count,
+struct daos_module_metrics vos_metrics = {
+    .dmm_tags       = DAOS_TGT_TAG,
+    .dmm_init       = vos_metrics_alloc,
+    .dmm_fini       = vos_metrics_free,
+    .dmm_nr_metrics = vos_metrics_count,
 };
 
 struct dss_module vos_srv_module = {
diff --git a/utils/config/daos_agent.yml b/utils/config/daos_agent.yml
index 3656d4862682..4a7f13b36546 100644
--- a/utils/config/daos_agent.yml
+++ b/utils/config/daos_agent.yml
@@ -26,6 +26,27 @@
 # default: 10001
 #port: 10001
 
+## Enable HTTP endpoint for remote telemetry collection.
+# Note that enabling the endpoint automatically enables
+# client telemetry collection.
+#
+## default endpoint state: disabled
+## default endpoint port: 9192
+#telemetry_port: 9192
+
+## Enable client telemetry for all DAOS clients.
+# If false, each client must opt in to telemetry by setting
+# the D_CLIENT_METRICS_ENABLE environment variable to 1.
+#
+## default: false
+#telemetry_enabled: true
+
+## Retain client telemetry for a period of time after the client
+# process exits.
+#
+## default: 0 (do not retain telemetry after client exit)
+#telemetry_retain: 1m
+
 ## Transport Credentials Specifying certificates to secure communications
 #
 #transport_config: