Skip to content

Commit

Permalink
DAOS-8331 client: Add client side metrics (#14030) (#14204)
Browse files Browse the repository at this point in the history
This commit comprises two separate patches to enable optional
collection and export of client-side telemetry.

The daos_agent configuration file includes new parameters to control
collection and export of per-client telemetry. If the telemetry_port option
is set, then per-client telemetry will be published in Prometheus format
for real-time sampling of client processes. By default, the client telemetry
will be automatically cleaned up on client exit, but may be optionally
retained for some amount of time after client exit in order to allow for
a final sample to be read.

Example daos_agent.yml updates:
telemetry_port: 9192 # export on port 9192
telemetry_enable: true # enable client telemetry for all connected clients
telemetry_retain: 1m # retain metrics for 1 minute after client exit

If telemetry_enable is false (default), client telemetry may be enabled on
a per-process basis by setting D_CLIENT_METRICS_ENABLE=1 in the
environment for clients that should collect telemetry.

Notes from the first patch by Di:

Move TLS to common, so both client and server can have TLS,
to which metrics can be attached.

Add object metrics on the client side, enabled by exporting
D_CLIENT_METRICS_ENABLE=1. Client metrics are organized
as "/jobid/pid/xxxxx".

During each daos thread initialization, another shmem (pid/xxx) is
created, to which all metrics of the thread are attached. These metrics
are destroyed once the thread exits, unless D_CLIENT_METRICS_RETAIN is
set, in which case the client metrics are retained and can be
retrieved by
daos_metrics --jobid
Add D_CLIENT_METRICS_DUMP_PATH to dump the metrics of the current thread
once it exits.

Some fixes in telemetry for conv_ptr when re-opening the
shared memory.

Add the daos_metrics --jobid XXX option to retrieve all metrics
of the job.

Includes some useful ftest updates from the following commit:
* DAOS-11626 test: Adding MD on SSD metrics tests (#13661)
Adding tests for WAL commit, reply, and checkpoint metrics.

Signed-off-by: Phil Henderson <phillip.henderson@intel.com>
Signed-off-by: Michael MacDonald <mjmac@google.com>
Signed-off-by: Di Wang <di.wang@intel.com>
Co-authored-by: Phil Henderson <phillip.henderson@intel.com>
Co-authored-by: Di Wang <di.wang@intel.com>
  • Loading branch information
3 people authored Apr 29, 2024
1 parent 2d68c65 commit 5a027b1
Show file tree
Hide file tree
Showing 81 changed files with 6,174 additions and 1,714 deletions.
12 changes: 12 additions & 0 deletions src/cart/README.env
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,15 @@ This file lists the environment variables used in CaRT.
. CRT_TEST_CONT
When set to 1, orterun does not automatically shut down other servers when
one server is shutdown. Used in cart internal testing.

. D_CLIENT_METRICS_ENABLE
When set to 1, client side metrics will be collected on each daos client, which
can be retrieved by daos_metrics -j job_id on each client.

. D_CLIENT_METRICS_RETAIN
When set to 1, client side metrics will be retained even after the job exits, i.e.
those metrics can be retrieved by daos_metrics even after the job exits.

. D_CLIENT_METRICS_DUMP_PATH
Set the client side metrics dump path (file) for each client, so these metrics will be
dumped to the specified file when the job exits.
94 changes: 52 additions & 42 deletions src/cart/crt_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,50 @@ static volatile int gdata_init_flag;
struct crt_plugin_gdata crt_plugin_gdata;
static bool g_prov_settings_applied[CRT_PROV_COUNT];

/*
 * Names of the environment variables used in CaRT.
 * dump_envariables() walks this table in order and logs every variable
 * that is currently set.
 */
static const char *crt_env_names[] = {
	"D_PROVIDER",
	"D_INTERFACE",
	"D_DOMAIN",
	"D_PORT",
	"CRT_PHY_ADDR_STR",
	"D_LOG_STDERR_IN_LOG",
	"D_LOG_SIZE",
	"D_LOG_FILE",
	"D_LOG_FILE_APPEND_PID",
	"D_LOG_MASK",
	"DD_MASK",
	"DD_STDERR",
	"DD_SUBSYS",
	"CRT_TIMEOUT",
	"CRT_ATTACH_INFO_PATH",
	"OFI_PORT",
	"OFI_INTERFACE",
	"OFI_DOMAIN",
	"CRT_CREDIT_EP_CTX",
	"CRT_CTX_SHARE_ADDR",
	"CRT_CTX_NUM",
	"D_FI_CONFIG",
	"FI_UNIVERSE_SIZE",
	"CRT_ENABLE_MEM_PIN",
	"FI_OFI_RXM_USE_SRX",
	"D_LOG_FLUSH",
	"CRT_MRC_ENABLE",
	"CRT_SECONDARY_PROVIDER",
	"D_PROVIDER_AUTH_KEY",
	"D_PORT_AUTO_ADJUST",
	"D_POLL_TIMEOUT",
	"D_LOG_FILE_APPEND_RANK",
	"D_QUOTA_RPCS",
	"D_POST_INIT",
	"D_POST_INCR",
	"DAOS_SIGNAL_REGISTER",
	"D_CLIENT_METRICS_ENABLE",
	"D_CLIENT_METRICS_RETAIN",
	"D_CLIENT_METRICS_DUMP_PATH",
};

static void
crt_lib_init(void) __attribute__((__constructor__));

Expand Down Expand Up @@ -62,53 +106,19 @@ crt_lib_fini(void)
static void
dump_envariables(void)
{
int i;
char *val;
static const char *var_names[] = {"D_PROVIDER",
"D_INTERFACE",
"D_DOMAIN",
"D_PORT",
"CRT_PHY_ADDR_STR",
"D_LOG_STDERR_IN_LOG",
"D_LOG_SIZE",
"D_LOG_FILE",
"D_LOG_FILE_APPEND_PID",
"D_LOG_MASK",
"DD_MASK",
"DD_STDERR",
"DD_SUBSYS",
"CRT_TIMEOUT",
"CRT_ATTACH_INFO_PATH",
"OFI_PORT",
"OFI_INTERFACE",
"OFI_DOMAIN",
"CRT_CREDIT_EP_CTX",
"CRT_CTX_SHARE_ADDR",
"CRT_CTX_NUM",
"D_FI_CONFIG",
"FI_UNIVERSE_SIZE",
"CRT_ENABLE_MEM_PIN",
"FI_OFI_RXM_USE_SRX",
"D_LOG_FLUSH",
"CRT_MRC_ENABLE",
"CRT_SECONDARY_PROVIDER",
"D_PROVIDER_AUTH_KEY",
"D_PORT_AUTO_ADJUST",
"D_POLL_TIMEOUT",
"D_LOG_FILE_APPEND_RANK",
"D_QUOTA_RPCS",
"D_POST_INIT",
"D_POST_INCR"};
int i;

D_INFO("-- ENVARS: --\n");
for (i = 0; i < ARRAY_SIZE(var_names); i++) {
d_agetenv_str(&val, var_names[i]);
for (i = 0; i < ARRAY_SIZE(crt_env_names); i++) {
char *val = NULL;

d_agetenv_str(&val, crt_env_names[i]);
if (val == NULL)
continue;
if (strcmp(var_names[i], "D_PROVIDER_AUTH_KEY") == 0)
D_INFO("%s = %s\n", var_names[i], "********");
if (strcmp(crt_env_names[i], "D_PROVIDER_AUTH_KEY") == 0)
D_INFO("%s = %s\n", crt_env_names[i], "********");
else
D_INFO("%s = %s\n", var_names[i], val);
D_INFO("%s = %s\n", crt_env_names[i], val);
d_freeenv_str(&val);
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/client/api/SConscript
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Build DAOS client"""

LIBDAOS_SRC = ['agent.c', 'array.c', 'container.c', 'event.c', 'init.c', 'job.c', 'kv.c', 'mgmt.c',
'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c']
'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c', 'metrics.c']


def scons():
Expand Down
14 changes: 13 additions & 1 deletion src/client/api/init.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand All @@ -23,6 +23,7 @@
#include <daos/btree_class.h>
#include <daos/placement.h>
#include <daos/job.h>
#include <daos/metrics.h>
#include "task_internal.h"
#include <pthread.h>

Expand Down Expand Up @@ -219,6 +220,13 @@ daos_init(void)
if (rc != 0)
D_GOTO(out_pl, rc);

/** set up client telemetry */
rc = dc_tm_init();
if (rc != 0) {
/* should not be fatal */
DL_WARN(rc, "failed to initialize client telemetry");
}

/** set up pool */
rc = dc_pool_init();
if (rc != 0)
Expand All @@ -242,6 +250,7 @@ daos_init(void)
out_pool:
dc_pool_fini();
out_mgmt:
dc_tm_fini();
dc_mgmt_fini();
out_pl:
pl_fini();
Expand Down Expand Up @@ -291,6 +300,8 @@ daos_fini(void)
D_GOTO(unlock, rc);
}

/** clean up all registered per-module metrics */
daos_metrics_fini();
dc_obj_fini();
dc_cont_fini();
dc_pool_fini();
Expand All @@ -301,6 +312,7 @@ daos_fini(void)
D_ERROR("failed to disconnect some resources may leak, "
DF_RC"\n", DP_RC(rc));

dc_tm_fini();
dc_agent_fini();
dc_job_fini();

Expand Down
Loading

0 comments on commit 5a027b1

Please sign in to comment.