From 9e3919bf07fa241ec72c3d32b2657220f9ba1ed6 Mon Sep 17 00:00:00 2001 From: Ashley Pittman Date: Thu, 29 Jun 2023 09:01:27 +0100 Subject: [PATCH 01/16] DAOS-13604 dfuse: Fix resource leaks in readdir. (#12408) Ensure that resources are properly freed if errors are encountered during readdir. Signed-off-by: Ashley Pittman --- src/client/dfuse/ops/readdir.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/client/dfuse/ops/readdir.c b/src/client/dfuse/ops/readdir.c index 17d8dcfbe8c..9602c2330df 100644 --- a/src/client/dfuse/ops/readdir.c +++ b/src/client/dfuse/ops/readdir.c @@ -539,9 +539,9 @@ dfuse_do_readdir(struct dfuse_projection_info *fs_handle, fuse_req_t req, struct DFUSE_TRA_DEBUG(oh, "Switching to private handle"); dfuse_dre_drop(fs_handle, oh); oh->doh_rd = _handle_init(oh->doh_ie->ie_dfs); + hdl = oh->doh_rd; if (oh->doh_rd == NULL) D_GOTO(out_reset, rc = ENOMEM); - hdl = oh->doh_rd; DFUSE_TRA_UP(oh->doh_rd, oh, "readdir"); } else { dfuse_readdir_reset(hdl); @@ -647,9 +647,11 @@ dfuse_do_readdir(struct dfuse_projection_info *fs_handle, fuse_req_t req, struct NULL); if (rc == ENOENT) { DFUSE_TRA_DEBUG(oh, "File does not exist"); + D_FREE(drc); continue; } else if (rc != 0) { DFUSE_TRA_DEBUG(oh, "Problem finding file %d", rc); + D_FREE(drc); D_GOTO(reply, rc); } @@ -665,6 +667,8 @@ dfuse_do_readdir(struct dfuse_projection_info *fs_handle, fuse_req_t req, struct rc = create_entry(fs_handle, oh->doh_ie, &stbuf, obj, dre->dre_name, out, attr_len, &rlink); if (rc != 0) { + dfs_release(obj); + D_FREE(drc); D_GOTO(reply, rc); } @@ -769,7 +773,8 @@ dfuse_do_readdir(struct dfuse_projection_info *fs_handle, fuse_req_t req, struct return 0; out_reset: - dfuse_readdir_reset(hdl); + if (hdl) + dfuse_readdir_reset(hdl); D_ASSERT(rc != 0); return rc; } From 50b8b7f7deed61e44a964d1222564ae42b7c8648 Mon Sep 17 00:00:00 2001 From: Ashley Pittman Date: Thu, 29 Jun 2023 09:06:31 +0100 Subject: [PATCH 02/16] DAOS-13393 gurt: Add thread local 
setting to fault injection. (#12274) Allow faults to be enabled/disabled on a per thread basis. This paves the way for running better fault injection testing on asynchronous operations where they are submitted and completed on different threads. Re-format the fault inject code and remove a header file. Signed-off-by: Ashley Pittman --- src/gurt/fault_inject.c | 401 +++++++++++++++++++------------- src/gurt/fi.h | 35 --- src/include/gurt/fault_inject.h | 116 ++++----- 3 files changed, 301 insertions(+), 251 deletions(-) delete mode 100644 src/gurt/fi.h diff --git a/src/gurt/fault_inject.c b/src/gurt/fault_inject.c index a0fcfdfea7f..27fd964fd10 100644 --- a/src/gurt/fault_inject.c +++ b/src/gurt/fault_inject.c @@ -1,34 +1,35 @@ /* - * (C) Copyright 2018-2022 Intel Corporation. + * (C) Copyright 2018-2023 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ -/** - * This file is part of gurt, it implements the fault injection feature. - */ -#define D_LOGFAC DD_FAC(fi) -/** max length of argument string in the yaml config file */ +#define D_LOGFAC DD_FAC(fi) +/* max length of argument string in the yaml config file */ #define FI_CONFIG_ARG_STR_MAX_LEN 4096 /* (1 << D_FA_TABLE_BITS) is the number of buckets of fa hash table */ -#define D_FA_TABLE_BITS (13) +#define D_FA_TABLE_BITS (13) #include #include -#include "fi.h" #include -/** +/* * global switch for fault injection. 
zero globally turns off fault injection, * non-zero turns on fault injection */ -unsigned int d_fault_inject; -unsigned int d_fault_config_file; -struct d_fault_attr_t *d_fault_attr_mem; +unsigned int d_fault_inject; +unsigned int d_fault_config_file; +struct d_fault_attr_t *d_fault_attr_mem; #if FAULT_INJECTION +struct d_fault_attr { + d_list_t fa_link; + struct d_fault_attr_t fa_attr; +}; + static struct d_fault_attr * fa_link2ptr(d_list_t *rlink) { @@ -37,8 +38,7 @@ fa_link2ptr(d_list_t *rlink) } static bool -fa_op_key_cmp(struct d_hash_table *htab, d_list_t *rlink, const void *key, - unsigned int ksize) +fa_op_key_cmp(struct d_hash_table *htab, d_list_t *rlink, const void *key, unsigned int ksize) { struct d_fault_attr *fa_ptr = fa_link2ptr(rlink); @@ -59,21 +59,18 @@ fa_op_rec_hash(struct d_hash_table *htab, d_list_t *link) static void fa_op_rec_free(struct d_hash_table *htab, d_list_t *rlink) { - struct d_fault_attr *ht_rec = fa_link2ptr(rlink); - int rc; + struct d_fault_attr *ht_rec = fa_link2ptr(rlink); + int rc; D_FREE(ht_rec->fa_attr.fa_argument); rc = D_SPIN_DESTROY(&ht_rec->fa_attr.fa_lock); if (rc != DER_SUCCESS) - D_ERROR("Can't destroy spinlock for fault id: %d\n", - ht_rec->fa_attr.fa_id); + D_ERROR("Can't destroy spinlock for fault id: %d\n", ht_rec->fa_attr.fa_id); D_FREE(ht_rec); } -/** - * abuse hop_rec_decref() so that we can safely use it without a - * hop_rec_addref(). The goal is to have d_hash_table_destroy_inplace() - * destroy all records automatically. +/* abuse hop_rec_decref() so that we can safely use it without a hop_rec_addref(). The goal is to + * have d_hash_table_destroy_inplace() destroy all records automatically. 
*/ static bool fa_op_rec_decref(struct d_hash_table *htab, d_list_t *rlink) @@ -82,45 +79,98 @@ fa_op_rec_decref(struct d_hash_table *htab, d_list_t *rlink) } static d_hash_table_ops_t fa_table_ops = { - .hop_key_cmp = fa_op_key_cmp, - .hop_rec_hash = fa_op_rec_hash, - .hop_rec_decref = fa_op_rec_decref, - .hop_rec_free = fa_op_rec_free, + .hop_key_cmp = fa_op_key_cmp, + .hop_rec_hash = fa_op_rec_hash, + .hop_rec_decref = fa_op_rec_decref, + .hop_rec_free = fa_op_rec_free, }; struct d_fi_gdata_t { - unsigned int dfg_refcount; - unsigned int dfg_inited; - pthread_rwlock_t dfg_rwlock; - struct d_hash_table dfg_fa_table; + unsigned int dfg_refcount; + unsigned int dfg_inited; + pthread_rwlock_t dfg_rwlock; + struct d_hash_table dfg_fa_table; + bool dfg_thread_default; }; -/** - * global switch for fault injection. zero globally turns off fault injection, - * non-zero turns on fault injection +/* global switch for fault injection. zero globally turns off fault injection, non-zero turns on + * fault injection + */ +static uint32_t d_fault_inject_seed; +static struct d_fi_gdata_t d_fi_gdata; +static pthread_once_t d_fi_gdata_init_once = PTHREAD_ONCE_INIT; + +/* Check for and set thread specific enabled flag. This is a single function that returns two + * values so it can access a single _thread local. 
+ * + * 0 - no value set; + * 1 - disabled; + * >2 - enabled; */ -static uint32_t d_fault_inject_seed; -static struct d_fi_gdata_t d_fi_gdata; -static pthread_once_t d_fi_gdata_init_once = PTHREAD_ONCE_INIT; +static int +fault_get_thread_enabled_h(int new_value) +{ + static __thread int thread_value; + + if (new_value > 0) + thread_value = new_value; + return thread_value; +} + +static bool +fault_get_thread_enabled() +{ + int rc; + + rc = fault_get_thread_enabled_h(0); + if (rc == 0) + return d_fi_gdata.dfg_thread_default; + else if (rc == 1) + return false; + return true; +} + +static void +fault_set_thread_enable(bool enabled) +{ + if (enabled) + fault_get_thread_enabled_h(2); + else + fault_get_thread_enabled_h(1); +} + +void +d_fault_inject_thread_enable(bool enabled) +{ + D_RWLOCK_RDLOCK(&d_fi_gdata.dfg_rwlock); + fault_set_thread_enable(enabled); + D_RWLOCK_UNLOCK(&d_fi_gdata.dfg_rwlock); +} + +void +d_fault_inject_thread_default_enable(bool enabled) +{ + D_RWLOCK_WRLOCK(&d_fi_gdata.dfg_rwlock); + d_fi_gdata.dfg_thread_default = enabled; + D_RWLOCK_UNLOCK(&d_fi_gdata.dfg_rwlock); +} static inline int fault_attr_set(uint32_t fault_id, struct d_fault_attr_t fa_in, bool take_lock) { - struct d_fault_attr_t *fault_attr; - char *fa_argument = NULL; - bool should_free = true; - struct d_fault_attr *new_rec = NULL; - struct d_fault_attr *rec = NULL; - d_list_t *rlink = NULL; - int rc = DER_SUCCESS; + struct d_fault_attr_t *fault_attr; + char *fa_argument = NULL; + bool should_free = true; + struct d_fault_attr *new_rec; + d_list_t *rlink; + int rc = DER_SUCCESS; D_ALLOC_PTR(new_rec); if (new_rec == NULL) D_GOTO(out, rc = -DER_NOMEM); if (fa_in.fa_argument) { - D_STRNDUP(fa_argument, fa_in.fa_argument, - FI_CONFIG_ARG_STR_MAX_LEN); + D_STRNDUP(fa_argument, fa_in.fa_argument, FI_CONFIG_ARG_STR_MAX_LEN); if (fa_argument == NULL) D_GOTO(out, rc = -DER_NOMEM); } @@ -128,17 +178,19 @@ fault_attr_set(uint32_t fault_id, struct d_fault_attr_t fa_in, bool take_lock) if 
(take_lock) D_RWLOCK_WRLOCK(&d_fi_gdata.dfg_rwlock); - rlink = d_hash_rec_find_insert(&d_fi_gdata.dfg_fa_table, &fault_id, - sizeof(fault_id), &new_rec->fa_link); + rlink = d_hash_rec_find_insert(&d_fi_gdata.dfg_fa_table, &fault_id, sizeof(fault_id), + &new_rec->fa_link); if (rlink == &new_rec->fa_link) { fault_attr = &new_rec->fa_attr; - rc = D_SPIN_INIT(&fault_attr->fa_lock, - PTHREAD_PROCESS_PRIVATE); + + rc = D_SPIN_INIT(&fault_attr->fa_lock, PTHREAD_PROCESS_PRIVATE); if (rc != DER_SUCCESS) D_GOTO(out_unlock, rc); D_DEBUG(DB_ALL, "new fault id: %u added.\n", fault_id); should_free = false; } else { + struct d_fault_attr *rec; + rec = fa_link2ptr(rlink); D_ASSERT(rec->fa_attr.fa_id == fault_id); fault_attr = &rec->fa_attr; @@ -147,17 +199,16 @@ fault_attr_set(uint32_t fault_id, struct d_fault_attr_t fa_in, bool take_lock) D_SPIN_LOCK(&fault_attr->fa_lock); /* at this point, global lock is released, per entry lock is held */ - fault_attr->fa_id = fault_id; + fault_attr->fa_id = fault_id; fault_attr->fa_probability_x = fa_in.fa_probability_x; fault_attr->fa_probability_y = fa_in.fa_probability_y; - fault_attr->fa_interval = fa_in.fa_interval; - fault_attr->fa_max_faults = fa_in.fa_max_faults; - fault_attr->fa_err_code = fa_in.fa_err_code; - fault_attr->fa_argument = fa_argument; + fault_attr->fa_interval = fa_in.fa_interval; + fault_attr->fa_max_faults = fa_in.fa_max_faults; + fault_attr->fa_err_code = fa_in.fa_err_code; + fault_attr->fa_argument = fa_argument; /** - * Let's update fa_num_faults here too, so the user can reset num faults - * by fault_attr_set, then it can use the same fault_attr to inject - * other failures. + * Let's update fa_num_faults here too, so the user can reset num faults by fault_attr_set, + * then it can use the same fault_attr to inject other failures. 
*/ fault_attr->fa_num_faults = fa_in.fa_num_faults; /* nrand48() only takes the high order 48 bits for its seed */ @@ -186,17 +237,15 @@ d_fault_attr_set(uint32_t fault_id, struct d_fault_attr_t fa_in) struct d_fault_attr_t * d_fault_attr_lookup(uint32_t fault_id) { - struct d_fault_attr_t *fault_attr; - struct d_fault_attr *ht_rec; - d_list_t *rlink; + struct d_fault_attr_t *fault_attr; + struct d_fault_attr *ht_rec; + d_list_t *rlink; D_RWLOCK_RDLOCK(&d_fi_gdata.dfg_rwlock); - rlink = d_hash_rec_find(&d_fi_gdata.dfg_fa_table, (void *)&fault_id, - sizeof(fault_id)); + rlink = d_hash_rec_find(&d_fi_gdata.dfg_fa_table, (void *)&fault_id, sizeof(fault_id)); D_RWLOCK_UNLOCK(&d_fi_gdata.dfg_rwlock); if (rlink == NULL) { - D_DEBUG(DB_ALL, "fault attr for fault ID %d not set yet.\n", - fault_id); + D_DEBUG(DB_ALL, "fault attr for fault ID %d not set yet.\n", fault_id); fault_attr = NULL; } else { ht_rec = fa_link2ptr(rlink); @@ -210,8 +259,8 @@ d_fault_attr_lookup(uint32_t fault_id) int d_fault_attr_err_code(uint32_t fault_id) { - struct d_fault_attr_t *fault_attr; - int32_t err_code; + struct d_fault_attr_t *fault_attr; + int32_t err_code; fault_attr = d_fault_attr_lookup(fault_id); if (fault_attr == NULL) { @@ -229,31 +278,29 @@ d_fault_attr_err_code(uint32_t fault_id) static int one_fault_attr_parse(yaml_parser_t *parser) { - yaml_event_t first; - yaml_event_t second; - struct d_fault_attr_t attr = { .fa_probability_x = 1, - .fa_probability_y = 1, - .fa_interval = 1 }; - const char *id = "id"; - const char *probability_x = "probability_x"; - const char *probability_y = "probability_y"; - const char *interval = "interval"; - const char *max_faults = "max_faults"; - const char *err_code = "err_code"; - const char *argument = "argument"; - const char *key_str; - const char *val_str; - uint64_t val; - int has_id = 0; - int yaml_rc; - int rc = DER_SUCCESS; + yaml_event_t first; + yaml_event_t second; + struct d_fault_attr_t attr = { + .fa_probability_x = 1, 
.fa_probability_y = 1, .fa_interval = 1}; + const char *id = "id"; + const char *probability_x = "probability_x"; + const char *probability_y = "probability_y"; + const char *interval = "interval"; + const char *max_faults = "max_faults"; + const char *err_code = "err_code"; + const char *argument = "argument"; + const char *key_str; + const char *val_str; + uint64_t val; + int has_id = 0; + int yaml_rc; + int rc = DER_SUCCESS; do { /* libyaml functions return 1 on success, 0 on error */ yaml_rc = yaml_parser_parse(parser, &first); if (yaml_rc != 1) { - D_ERROR("yaml_parser_parse() failed. rc: %d\n", - yaml_rc); + D_ERROR("yaml_parser_parse() failed. rc: %d\n", yaml_rc); D_GOTO(out, rc = -DER_MISC); } @@ -272,8 +319,7 @@ one_fault_attr_parse(yaml_parser_t *parser) yaml_rc = yaml_parser_parse(parser, &second); if (yaml_rc != 1) { yaml_event_delete(&first); - D_ERROR("yaml_parser_parse() failed. rc: %d\n", - yaml_rc); + D_ERROR("yaml_parser_parse() failed. rc: %d\n", yaml_rc); D_GOTO(out, rc = -DER_MISC); } @@ -284,13 +330,13 @@ one_fault_attr_parse(yaml_parser_t *parser) D_GOTO(out, rc = -DER_MISC); } - key_str = (char *) first.data.scalar.value; - val_str = (const char *) second.data.scalar.value; - val = strtoul(val_str, NULL, 0); + key_str = (char *)first.data.scalar.value; + val_str = (const char *)second.data.scalar.value; + val = strtoul(val_str, NULL, 0); if (!strcmp(key_str, id)) { D_DEBUG(DB_ALL, "id: %lu\n", val); attr.fa_id = val; - has_id = 1; + has_id = 1; } else if (!strcmp(key_str, probability_x)) { attr.fa_probability_x = val; D_DEBUG(DB_ALL, "probability_x: %lu\n", val); @@ -305,11 +351,9 @@ one_fault_attr_parse(yaml_parser_t *parser) D_DEBUG(DB_ALL, "max_faults: %lu\n", val); } else if (!strcmp(key_str, err_code)) { attr.fa_err_code = strtol(val_str, NULL, 0); - D_DEBUG(DB_ALL, "err_code: "DF_RC"\n", - DP_RC(attr.fa_err_code)); + D_DEBUG(DB_ALL, "err_code: " DF_RC "\n", DP_RC(attr.fa_err_code)); } else if (!strcmp(key_str, argument)) { - 
D_STRNDUP(attr.fa_argument, val_str, - FI_CONFIG_ARG_STR_MAX_LEN); + D_STRNDUP(attr.fa_argument, val_str, FI_CONFIG_ARG_STR_MAX_LEN); if (attr.fa_argument == NULL) rc = -DER_NOMEM; D_DEBUG(DB_ALL, "argument: %s\n", attr.fa_argument); @@ -332,8 +376,7 @@ one_fault_attr_parse(yaml_parser_t *parser) rc = fault_attr_set(attr.fa_id, attr, true); if (rc != DER_SUCCESS) - D_ERROR("d_set_fault_attr(%u) failed, rc %d\n", attr.fa_id, - rc); + D_ERROR("d_set_fault_attr(%u) failed, rc %d\n", attr.fa_id, rc); out: D_FREE(attr.fa_argument); @@ -343,17 +386,16 @@ one_fault_attr_parse(yaml_parser_t *parser) static int fault_attr_parse(yaml_parser_t *parser) { - yaml_event_t event; - yaml_event_type_t event_type; - int yaml_rc; - int rc = -DER_SUCCESS; + yaml_event_t event; + yaml_event_type_t event_type; + int yaml_rc; + int rc = -DER_SUCCESS; do { /* libyaml functions return 1 on success, 0 on error */ yaml_rc = yaml_parser_parse(parser, &event); if (yaml_rc != 1) { - D_ERROR("yaml_parser_parse() failed. rc: %d\n", - yaml_rc); + D_ERROR("yaml_parser_parse() failed. rc: %d\n", yaml_rc); yaml_event_delete(&event); return -DER_MISC; } @@ -364,7 +406,8 @@ fault_attr_parse(yaml_parser_t *parser) rc = one_fault_attr_parse(parser); if (rc != DER_SUCCESS) { D_ERROR("yaml_parser_parse() failed. " - "rc: %d\n", rc); + "rc: %d\n", + rc); } break; default: @@ -381,13 +424,41 @@ fault_attr_parse(yaml_parser_t *parser) return rc; } +static int +thread_default_parse(yaml_parser_t *parser) +{ + yaml_event_t event; + int yaml_rc; + int rc = DER_SUCCESS; + + /* libyaml functions return 1 on success, 0 on error */ + yaml_rc = yaml_parser_parse(parser, &event); + if (yaml_rc != 1) { + D_ERROR("yaml_parser_parse() failed. 
rc: %d\n", yaml_rc); + return -DER_MISC; + } + + if (event.type != YAML_SCALAR_EVENT) + D_GOTO(out, rc = -DER_INVAL); + + if (strncasecmp((char *)event.data.scalar.value, "true", event.data.scalar.length) == 0) + d_fi_gdata.dfg_thread_default = true; + else + d_fi_gdata.dfg_thread_default = false; + +out: + yaml_event_delete(&event); + + return rc; +} + static int seed_parse(yaml_parser_t *parser) { - yaml_event_t event; - const char *val_str; - int yaml_rc; - int rc = DER_SUCCESS; + yaml_event_t event; + const char *val_str; + int yaml_rc; + int rc = DER_SUCCESS; /* libyaml functions return 1 on success, 0 on error */ yaml_rc = yaml_parser_parse(parser, &event); @@ -399,7 +470,7 @@ seed_parse(yaml_parser_t *parser) if (event.type != YAML_SCALAR_EVENT) D_GOTO(out, rc = -DER_INVAL); - val_str = (const char *) event.data.scalar.value; + val_str = (const char *)event.data.scalar.value; d_fault_inject_seed = strtoul(val_str, NULL, 10); out: @@ -413,12 +484,12 @@ d_fi_gdata_init(void) { int rc; - d_fi_gdata.dfg_refcount = 0; - d_fi_gdata.dfg_inited = 1; + d_fi_gdata.dfg_refcount = 0; + d_fi_gdata.dfg_inited = 1; + d_fi_gdata.dfg_thread_default = true; D_RWLOCK_INIT(&d_fi_gdata.dfg_rwlock, NULL); - rc = d_hash_table_create_inplace(D_HASH_FT_NOLOCK, D_FA_TABLE_BITS, - NULL, &fa_table_ops, - &d_fi_gdata.dfg_fa_table); + rc = d_hash_table_create_inplace(D_HASH_FT_NOLOCK, D_FA_TABLE_BITS, NULL, &fa_table_ops, + &d_fi_gdata.dfg_fa_table); if (rc != 0) D_ERROR("d_hash_table_create_inplace() failed, rc: %d.\n", rc); } @@ -428,16 +499,13 @@ d_fi_gdata_destroy(void) { int rc; - rc = d_hash_table_destroy_inplace(&d_fi_gdata.dfg_fa_table, - true /* force */); + rc = d_hash_table_destroy_inplace(&d_fi_gdata.dfg_fa_table, true /* force */); if (rc != 0) { - D_ERROR("failed to destroy fault attr data. 
force: %d, " - "d_hash_table_destroy_inplace failed, rc: %d\n", - true, rc); + D_ERROR("d_hash_table_destroy_inplace() failed, rc: %d\n", rc); } D_RWLOCK_DESTROY(&d_fi_gdata.dfg_rwlock); d_fi_gdata.dfg_refcount = 0; - d_fi_gdata.dfg_inited = 0; + d_fi_gdata.dfg_inited = 0; } /** @@ -446,14 +514,14 @@ d_fi_gdata_destroy(void) int d_fault_inject_init(void) { - char *config_file; - FILE *fp = NULL; - yaml_parser_t parser; - yaml_event_t event; - yaml_event_type_t event_type; - int last_errno; - int yaml_rc; - int rc = DER_SUCCESS; + char *config_file; + FILE *fp = NULL; + yaml_parser_t parser; + yaml_event_t event; + yaml_event_type_t event_type; + int last_errno; + int yaml_rc; + int rc = DER_SUCCESS; pthread_once(&d_fi_gdata_init_once, d_fi_gdata_init); D_ASSERT(d_fi_gdata.dfg_inited == 1); @@ -472,11 +540,10 @@ d_fault_inject_init(void) D_GOTO(out, rc); } - fp = fopen(config_file, "r"); + fp = fopen(config_file, "r"); last_errno = errno; if (fp == NULL) { - D_ERROR("Failed to open file %s (%s).\n", - config_file, strerror(last_errno)); + D_ERROR("Failed to open file %s (%s).\n", config_file, strerror(last_errno)); rc = d_errno2der(last_errno); D_GOTO(out, rc); } @@ -492,8 +559,7 @@ d_fault_inject_init(void) /* libyaml functions return 1 on success, 0 on error */ yaml_rc = yaml_parser_parse(&parser, &event); if (yaml_rc != 1) { - D_ERROR("yaml_parser_parse() failed. rc: %d\n", - yaml_rc); + D_ERROR("yaml_parser_parse() failed. rc: %d\n", yaml_rc); D_GOTO(out, rc = -DER_MISC); } @@ -507,17 +573,21 @@ d_fault_inject_init(void) continue; } - if (!strncmp((char *) event.data.scalar.value, - "fault_config", strlen("fault_config") + 1)) { + if (!strncmp((char *)event.data.scalar.value, "fault_config", + event.data.scalar.length)) { rc = fault_attr_parse(&parser); if (rc != DER_SUCCESS) - D_ERROR("fault_attr_parse() failed. rc %d\n", - rc); - } else if (!strncmp((char *) event.data.scalar.value, - "seed", strlen("seed") + 1)) { + D_ERROR("fault_attr_parse() failed. 
rc %d\n", rc); + } else if (!strncmp((char *)event.data.scalar.value, "seed", + event.data.scalar.length)) { rc = seed_parse(&parser); if (rc != DER_SUCCESS) D_ERROR("seed_parse() failed. rc %d\n", rc); + } else if (!strncmp((char *)event.data.scalar.value, "thread_default", + event.data.scalar.length)) { + rc = thread_default_parse(&parser); + if (rc != DER_SUCCESS) + D_ERROR("thread_default_parse() failed. rc %d\n", rc); } else { D_ERROR("unknown key: %s\n", event.data.scalar.value); rc = -DER_INVAL; @@ -530,10 +600,9 @@ d_fault_inject_init(void) yaml_parser_delete(&parser); if (rc == DER_SUCCESS) { - D_INFO("Config file: %s, fault injection is ON.\n", - config_file); + D_INFO("Config file: %s, fault injection is ON.\n", config_file); d_fault_config_file = 1; - d_fault_inject = 1; + d_fault_inject = 1; } else { D_ERROR("Failed to parse fault config file.\n"); D_GOTO(out, rc); @@ -553,7 +622,7 @@ d_fault_inject_init(void) int d_fault_inject_fini() { - int rc = 0; + int rc = 0; if (d_fi_gdata.dfg_inited == 0) { D_DEBUG(DB_TRACE, "fault injection not initialized.\n"); @@ -570,14 +639,13 @@ d_fault_inject_fini() D_RWLOCK_UNLOCK(&d_fi_gdata.dfg_rwlock); d_fi_gdata_destroy(); d_fi_gdata_init_once = PTHREAD_ONCE_INIT; - d_fault_inject = 0; + d_fault_inject = 0; D_DEBUG(DB_ALL, "Finalized.\n"); return rc; } - int d_fault_inject_enable(void) { @@ -611,20 +679,14 @@ d_fault_inject_is_enabled(void) return false; } -/** - * based on the state of fault_id, decide if a fault should be injected - * - * \param[in] fault_id fault injection configuration id +/* based on the state of fault_attr, decide if a fault should be injected * - * \return true if should inject fault, false if should not - * inject fault - * - * support injecting X faults in Y occurrences + * return true if should inject fault, false if should not inject fault */ bool d_should_fail(struct d_fault_attr_t *fault_attr) { - bool rc = true; + bool rc = true; if (!d_fi_initialized()) { D_ERROR("fault injection not 
initialized.\n"); @@ -639,6 +701,10 @@ d_should_fail(struct d_fault_attr_t *fault_attr) return false; D_SPIN_LOCK(&fault_attr->fa_lock); + + if (!fault_get_thread_enabled()) + D_GOTO(out, rc = false); + if (fault_attr->fa_probability_x == 0) D_GOTO(out, rc = false); @@ -654,7 +720,7 @@ d_should_fail(struct d_fault_attr_t *fault_attr) if (fault_attr->fa_probability_y != 0 && fault_attr->fa_probability_x <= - nrand48(fault_attr->fa_rand_state) % fault_attr->fa_probability_y) + nrand48(fault_attr->fa_rand_state) % fault_attr->fa_probability_y) D_GOTO(out, rc = false); fault_attr->fa_num_faults++; @@ -664,25 +730,29 @@ d_should_fail(struct d_fault_attr_t *fault_attr) return rc; }; #else /* FAULT_INJECT */ -int d_fault_inject_init(void) +int +d_fault_inject_init(void) { D_INFO("Fault Injection not initialized feature not included in build"); return -DER_NOSYS; } -int d_fault_inject_fini(void) +int +d_fault_inject_fini(void) { D_INFO("Fault Injection not finalized feature not included in build"); return -DER_NOSYS; } -int d_fault_inject_enable(void) +int +d_fault_inject_enable(void) { D_INFO("Fault Injection not enabled feature not included in build"); return -DER_NOSYS; } -int d_fault_inject_disable(void) +int +d_fault_inject_disable(void) { D_INFO("Fault Injection not disabled feature not included in build"); return -DER_NOSYS; @@ -718,4 +788,15 @@ d_fault_attr_err_code(uint32_t fault_id) { return 0; } + +void +d_fault_inject_thread_enable(bool enabled) +{ +} + +void +d_fault_inject_thread_default_enable(bool enabled) +{ +} + #endif /* FAULT_INJECT */ diff --git a/src/gurt/fi.h b/src/gurt/fi.h deleted file mode 100644 index 93dbce81bb5..00000000000 --- a/src/gurt/fi.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * (C) Copyright 2019-2021 Intel Corporation. - * - * SPDX-License-Identifier: BSD-2-Clause-Patent - */ -/** - * \file - * - * This file is part of gurt, it contains internal variables and functions for - * the fault injection feature. 
- */ - -#ifndef __FI_H__ -#define __FI_H__ - -/** @addtogroup GURT - * @{ - */ - -#if defined(__cplusplus) -extern "C" { -#endif - -struct d_fault_attr { - d_list_t fa_link; - struct d_fault_attr_t fa_attr; -}; - -#if defined(__cplusplus) -} -#endif - -/** @} - */ -#endif /* __FI_H__ */ diff --git a/src/include/gurt/fault_inject.h b/src/include/gurt/fault_inject.h index d0106fcf32c..dd91a4aa275 100644 --- a/src/include/gurt/fault_inject.h +++ b/src/include/gurt/fault_inject.h @@ -1,5 +1,5 @@ /* - * (C) Copyright 2018-2022 Intel Corporation. + * (C) Copyright 2018-2023 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -7,8 +7,7 @@ /** * \file * - * This file is part of gurt, it contains variables and functions for the fault - * injection feature. + * This file is part of gurt, it contains variables and functions for the fault injection feature. */ #ifndef __FAULT_INJECT__ @@ -27,60 +26,51 @@ extern "C" { #endif /** Env to specify fault injection config file */ -#define D_FAULT_CONFIG_ENV "D_FI_CONFIG" +#define D_FAULT_CONFIG_ENV "D_FI_CONFIG" /** global on/off switch for fault injection */ -extern unsigned int d_fault_inject; -extern unsigned int d_fault_config_file; +extern unsigned int d_fault_inject; +extern unsigned int d_fault_config_file; -/* Location used for inecting memory allocation failures into D_ALLOC - * uses fault_id 0 - */ +/* Location used for inecting memory allocation failures into D_ALLOC uses fault_id 0 */ extern struct d_fault_attr_t *d_fault_attr_mem; -/* DFuse uses fault id 100 to force shutdown rather than mount after initialization - * is complete. +/* DFuse uses fault id 100 to force shutdown rather than mount after initialization is complete. * - * daos_init uses fault id 101 to disable memory faults for the duration of daos_init - * so that fault injection testing can avoid replicating coverage across multiple tests. 
+ * daos_init uses fault id 101 to disable memory faults for the duration of daos_init so that fault + * injection testing can avoid replicating coverage across multiple tests. * * Other fault ids used by daos_engine are defined in src/include/daos/common.h */ struct d_fault_attr_t { + /** config id, used to select configuration from the fault_inject config file */ + uint32_t fa_id; /** - * config id, used to select configuration from the fault_inject config - * file - */ - uint32_t fa_id; - /** - * inject faults every n-th occurrence. If interval is set to 5 and - * probability is set to 20, fault injection only occurs on every 5-th - * hit of fault_id with a 20% probability. + * inject faults every n-th occurrence. If interval is set to 5 and probability is set to + * 20, fault injection only occurs on every 5-th hit of fault_id with a 20% probability. */ - uint32_t fa_interval; + uint32_t fa_interval; /** - * max number of faults to inject. 0 means unlimited. After max_faults - * is reached, no faults will be injected for fault_id. + * max number of faults to inject. 0 means unlimited. After max_faults is reached, no faults + * will be injected for fault_id. */ - uint64_t fa_max_faults; + uint64_t fa_max_faults; /** counter of injected faults */ - uint64_t fa_num_faults; + uint64_t fa_num_faults; /** number of times this injection point has been evaluated */ - uint64_t fa_num_hits; + uint64_t fa_num_hits; /** argument string. Interpretation of content is up to the user */ - char *fa_argument; + char *fa_argument; /** spin lock to protect this struct */ - pthread_spinlock_t fa_lock; + pthread_spinlock_t fa_lock; + /** the error code to inject. Can be retrieved by d_fault_attr_err_code() */ + int32_t fa_err_code; /** - * the error code to inject. Can be retrieved by d_fault_attr_err_code() + * state for nrand48. this allows each injection point has its own independent random number + * sequence. */ - int32_t fa_err_code; - /** - * state for nrand48. 
this allows each injection point has its own - * independent random number sequence. - */ - unsigned short fa_rand_state[3]; + unsigned short fa_rand_state[3]; /** * the frequency faults should be injected, calculated by: * @@ -89,55 +79,69 @@ struct d_fault_attr_t { * e.g. fa_probability_x = 123, fa_probability_y = 1000 * means faults will be injected randomly with frequency 12.3% */ - uint32_t fa_probability_x; - uint32_t fa_probability_y; + uint32_t fa_probability_x; + uint32_t fa_probability_y; }; /** - * Initialize the fault injection framework, injection attributes are read from - * the config file + * Initialize the fault injection framework, injection attributes are read from the config file * * \return DER_SUCCESS on success, negative value on error */ -int d_fault_inject_init(void); +int +d_fault_inject_init(void); /** * Finalize the fault injection framework * * \return DER_SUCCESS on success, negative value on error */ -int d_fault_inject_fini(void); +int +d_fault_inject_fini(void); /** * Start injecting faults. * * \return DER_SUCCESS on success, -DER_NOSYS if not supported */ -int d_fault_inject_enable(void); +int +d_fault_inject_enable(void); /** * Stop injecting faults. * * \return DER_SUCCESS on success, -DER_NOSYS if not supported */ -int d_fault_inject_disable(void); +int +d_fault_inject_disable(void); -bool d_fault_inject_is_enabled(void); +bool +d_fault_inject_is_enabled(void); -bool d_should_fail(struct d_fault_attr_t *fault_attr_ptr); +/** + * Enable/disable per thread. Sets if faults are enabled on the calling thread. + */ +void +d_fault_inject_thread_enable(bool enabled); /** - * use this macro to determine if a fault should be injected at a specific call - * site + * Enable/disable per thread for threads which haven't called d_fault_inject_thread_enable(). + * Default value here can be set via 'thread_default' in the input file. 
*/ -#define D_SHOULD_FAIL(fault_attr) \ - ({ \ - bool __rb; \ - __rb = d_fault_inject && d_should_fail(fault_attr); \ - if (__rb) \ - D_WARN("fault_id %d, injecting fault.\n", \ - fault_attr->fa_id); \ - __rb; \ +void +d_fault_inject_thread_default_enable(bool enabled); + +bool +d_should_fail(struct d_fault_attr_t *fault_attr_ptr); + +/** use this macro to determine if a fault should be injected at a specific call site */ +#define D_SHOULD_FAIL(fault_attr) \ + ({ \ + bool __rb; \ + __rb = d_fault_inject && d_should_fail(fault_attr); \ + if (__rb) \ + D_WARN("fault_id %d, injecting fault.\n", fault_attr->fa_id); \ + __rb; \ }) /** From 54f9b968197d32807e58e631f4e56e88956e40cd Mon Sep 17 00:00:00 2001 From: Ashley Pittman Date: Thu, 29 Jun 2023 16:15:14 +0100 Subject: [PATCH 03/16] DAOS-13781 dfuse: Remove a pointless assignment. (#12468) Signed-off-by: Ashley Pittman --- src/client/dfuse/dfuse_core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/client/dfuse/dfuse_core.c b/src/client/dfuse/dfuse_core.c index 85295782a9e..b65690cf3e2 100644 --- a/src/client/dfuse/dfuse_core.c +++ b/src/client/dfuse/dfuse_core.c @@ -723,7 +723,6 @@ dfuse_cont_open_by_label(struct dfuse_info *dfuse_info, struct dfuse_pool *dfp, */ DFUSE_TRA_INFO(dfc, "Using default caching values"); dfuse_set_default_cont_cache_values(dfc); - rc = 0; } else if (rc != 0) { D_GOTO(err_close, rc); } From e2d661ef2fdf996046750dc2da6109b3a28fe64f Mon Sep 17 00:00:00 2001 From: Alexander Oganezov Date: Thu, 29 Jun 2023 08:40:22 -0700 Subject: [PATCH 04/16] DAOS-13721 cart: Fix coverity cid 1426658 (#12490) - Fix coverity cid 1426658 Thread deadlock - Issuing 'get uri cache' command at the wrong time could end up in a deadlock due to latest changes to lockings Signed-off-by: Alexander A Oganezov --- src/cart/crt_ctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cart/crt_ctl.c b/src/cart/crt_ctl.c index 3500bec4a91..3ab6f1c0df4 100644 --- a/src/cart/crt_ctl.c +++ 
b/src/cart/crt_ctl.c @@ -117,11 +117,11 @@ crt_hdlr_ctl_get_uri_cache(crt_rpc_t *rpc_req) out_args->cguc_grp_cache.ca_count = uri_cache.idx; /* actual count */ rc = 0; out: + D_RWLOCK_UNLOCK(&grp_priv->gp_rwlock); out_args->cguc_rc = rc; rc = crt_reply_send(rpc_req); D_ASSERTF(rc == 0, "crt_reply_send() failed. rc: %d\n", rc); D_DEBUG(DB_TRACE, "sent reply to get uri cache request\n"); - D_RWLOCK_UNLOCK(&grp_priv->gp_rwlock); D_FREE(uri_cache.grp_cache); } From 7106dc583845e24bc218405284481e0a3fa6f382 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Thu, 29 Jun 2023 11:56:39 -0400 Subject: [PATCH 05/16] DAOS-12132 test: Enable setting up slurm for tests using partitions (#12070) Currently slurm is configured in CI if a specific tag is used. To allow runs using different tags to setup slurm as required for selected tests, use the inclusion of a partition in the test yaml to setup slurm. Rename DAOS_APP_DIR to DAOS_TEST_APP_DIR. Fix yaml use of DAOS_TEST_APP_DIR to reflect updated path definition. Adding --slurm_install launch.py argument to enable installing slurm RPMs when needed by the test. 
Signed-off-by: Phil Henderson --- src/tests/ftest/launch.py | 113 +++- src/tests/ftest/scripts/main.sh | 20 +- src/tests/ftest/slurm_setup.py | 764 +++++++++++++++++--------- src/tests/ftest/soak/smoke.yaml | 4 +- src/tests/ftest/soak/stress.yaml | 4 +- src/tests/ftest/util/package_utils.py | 64 +++ utils/cq/words.dict | 2 + 7 files changed, 663 insertions(+), 308 deletions(-) create mode 100644 src/tests/ftest/util/package_utils.py diff --git a/src/tests/ftest/launch.py b/src/tests/ftest/launch.py index 719295155db..e8503f83a30 100755 --- a/src/tests/ftest/launch.py +++ b/src/tests/ftest/launch.py @@ -10,6 +10,7 @@ from collections import OrderedDict, defaultdict from tempfile import TemporaryDirectory import errno +import getpass import json import logging import os @@ -28,6 +29,7 @@ # from util.distro_utils import detect # pylint: disable=import-error,no-name-in-module from process_core_files import CoreFileProcessing, CoreFileException +from slurm_setup import SlurmSetup, SlurmSetupException # Update the path to support utils files that import other utils files sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "util")) @@ -47,7 +49,6 @@ BULLSEYE_SRC = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test.cov") BULLSEYE_FILE = os.path.join(os.sep, "tmp", "test.cov") -DEFAULT_DAOS_APP_DIR = os.path.join(os.sep, "scratch") DEFAULT_DAOS_TEST_LOG_DIR = os.path.join(os.sep, "var", "tmp", "daos_testing") DEFAULT_DAOS_TEST_USER_DIR = os.path.join(os.sep, "var", "tmp", "daos_testing", "user") DEFAULT_DAOS_TEST_SHARED_DIR = os.path.expanduser(os.path.join("~", "daos_test")) @@ -586,15 +587,19 @@ class Launch(): RESULTS_DIRS = ( "daos_configs", "daos_logs", "cart_logs", "daos_dumps", "valgrind_logs", "stacktraces") - def __init__(self, name, mode): + def __init__(self, name, mode, slurm_install, slurm_setup): """Initialize a Launch object. Args: name (str): launch job name mode (str): execution mode, e.g. 
"normal", "manual", or "ci" + slurm_install (bool): whether or not to install slurm RPMs if needed + slurm_setup (bool): whether or not to enable configuring slurm if needed """ self.name = name self.mode = mode + self.slurm_install = slurm_install + self.slurm_setup = slurm_setup self.avocado = AvocadoInfo() self.class_name = f"FTEST_launch.launch-{self.name.lower().replace('.', '-')}" @@ -604,6 +609,7 @@ def __init__(self, name, mode): self.tag_filters = [] self.repeat = 1 self.local_host = get_local_host() + self.user = getpass.getuser() # Results tracking settings self.job_results_dir = None @@ -619,7 +625,6 @@ def __init__(self, name, mode): # Options for creating slurm partitions self.slurm_control_node = NodeSet() self.slurm_partition_hosts = NodeSet() - self.slurm_add_partition = False def _start_test(self, class_name, test_name, log_file): """Start a new test result. @@ -945,7 +950,6 @@ def run(self, args): message = f"Invalid '--slurm_control_node={args.slurm_control_node}' argument" return self.get_exit_status(1, message, "Setup", sys.exc_info()) self.slurm_partition_hosts.add(args.test_clients or args.test_servers) - self.slurm_add_partition = args.slurm_setup # Execute the tests status = self.run_tests( @@ -1062,8 +1066,6 @@ def _set_test_environment(self, servers, clients, list_tests, provider, insecure # Set the default location for daos log files written during testing # if not already defined. 
- if "DAOS_APP_DIR" not in os.environ: - os.environ["DAOS_APP_DIR"] = DEFAULT_DAOS_APP_DIR if "DAOS_TEST_LOG_DIR" not in os.environ: os.environ["DAOS_TEST_LOG_DIR"] = DEFAULT_DAOS_TEST_LOG_DIR if "DAOS_TEST_USER_DIR" not in os.environ: @@ -1073,6 +1075,9 @@ def _set_test_environment(self, servers, clients, list_tests, provider, insecure os.environ["DAOS_TEST_SHARED_DIR"] = os.path.join(base_dir, "tmp") else: os.environ["DAOS_TEST_SHARED_DIR"] = DEFAULT_DAOS_TEST_SHARED_DIR + if "DAOS_TEST_APP_DIR" not in os.environ: + os.environ["DAOS_TEST_APP_DIR"] = os.path.join( + os.environ["DAOS_TEST_SHARED_DIR"], "daos_test", "apps") os.environ["D_LOG_FILE"] = os.path.join(os.environ["DAOS_TEST_LOG_DIR"], "daos.log") os.environ["D_LOG_FILE_APPEND_PID"] = "1" @@ -1808,8 +1813,11 @@ def run_tests(self, sparse, fail_fast, stop_daos, archive, rename, jenkinslog, c # Display the location of the avocado logs logger.info("Avocado job results directory: %s", self.job_results_dir) + # Configure slurm if any tests use partitions + return_code |= self.setup_slurm() + # Configure hosts to collect code coverage - self.setup_bullseye() + return_code |= self.setup_bullseye() # Run each test for as many repetitions as requested for repeat in range(1, self.repeat + 1): @@ -1861,7 +1869,7 @@ def run_tests(self, sparse, fail_fast, stop_daos, archive, rename, jenkinslog, c logger.removeHandler(test_file_handler) # Collect code coverage files after all test have completed - self.finalize_bullseye() + return_code |= self.finalize_bullseye() # Summarize the run return self._summarize_run(return_code) @@ -1933,6 +1941,80 @@ def finalize_bullseye(self): os.rename(old_file, new_file) return status + def setup_slurm(self): + """Set up slurm on the hosts if any tests are using partitions. 
+ + Returns: + int: status code: 0 = success, 128 = failure + """ + status = 0 + logger.info("Setting up slurm partitions if required by tests") + if not any(test.yaml_info["client_partition"] for test in self.tests): + logger.debug(" No tests using client partitions detected - skipping slurm setup") + return status + + if not self.slurm_setup: + logger.debug(" The 'slurm_setup' argument is not set - skipping slurm setup") + return status + + status |= self.setup_application_directory() + + slurm_setup = SlurmSetup(logger, self.slurm_partition_hosts, self.slurm_control_node, True) + try: + if self.slurm_install: + slurm_setup.install() + slurm_setup.update_config(self.user, 'daos_client') + slurm_setup.start_munge(self.user) + slurm_setup.start_slurm(self.user, True) + except SlurmSetupException: + message = "Error setting up slurm" + self._fail_test(self.result.tests[-1], "Run", message, sys.exc_info()) + status |= 128 + except Exception: # pylint: disable=broad-except + message = "Unknown error setting up slurm" + self._fail_test(self.result.tests[-1], "Run", message, sys.exc_info()) + status |= 128 + + return status + + def setup_application_directory(self): + """Set up the application directory. 
+ + Returns: + int: status code: 0 = success, 128 = failure + """ + app_dir = os.environ.get('DAOS_TEST_APP_DIR') + app_src = os.environ.get('DAOS_TEST_APP_SRC') + + logger.debug("Setting up the '%s' application directory", app_dir) + if not os.path.exists(app_dir): + # Create the apps directory if it does not already exist + try: + logger.debug(' Creating the application directory') + os.makedirs(app_dir) + except OSError: + message = 'Error creating the application directory' + self._fail_test(self.result.tests[-1], 'Run', message, sys.exc_info()) + return 128 + else: + logger.debug(' Using the existing application directory') + + if app_src and os.path.exists(app_src): + logger.debug(" Copying applications from the '%s' directory", app_src) + run_local(logger, f"ls -al '{app_src}'") + for app in os.listdir(app_src): + try: + run_local( + logger, f"cp -r '{os.path.join(app_src, app)}' '{app_dir}'", check=True) + except RunException: + message = 'Error copying files to the application directory' + self._fail_test(self.result.tests[-1], 'Run', message, sys.exc_info()) + return 128 + + logger.debug(" Applications in '%s':", app_dir) + run_local(logger, f"ls -al '{app_dir}'") + return 0 + @staticmethod def display_disk_space(path): """Display disk space of provided path destination. 
@@ -1999,18 +2081,18 @@ def _setup_host_information(self, test): partition = test.yaml_info["client_partition"] logger.debug("Determining if the %s client partition exists", partition) exists = show_partition(logger, self.slurm_control_node, partition).passed - if not exists and not self.slurm_add_partition: + if not exists and not self.slurm_setup: message = f"Error missing {partition} partition" self._fail_test(self.result.tests[-1], "Prepare", message, None) return 128 - if self.slurm_add_partition and exists: + if self.slurm_setup and exists: logger.info( "Removing existing %s partition to ensure correct configuration", partition) if not delete_partition(logger, self.slurm_control_node, partition).passed: message = f"Error removing existing {partition} partition" self._fail_test(self.result.tests[-1], "Prepare", message, None) return 128 - if self.slurm_add_partition: + if self.slurm_setup: hosts = self.slurm_partition_hosts.difference(test.yaml_info["test_servers"]) logger.debug( "Partition hosts from '%s', excluding test servers '%s': %s", @@ -3130,6 +3212,10 @@ def main(): type=str, help="slurm control node where scontrol commands will be issued to check for the existence " "of any slurm partitions required by the tests") + parser.add_argument( + "-si", "--slurm_install", + action="store_true", + help="enable installing slurm RPMs if required by the tests") parser.add_argument( "--scm_mount", action="store", @@ -3141,7 +3227,7 @@ def main(): parser.add_argument( "-ss", "--slurm_setup", action="store_true", - help="setup any slurm partitions required by the tests") + help="enable setting up slurm partitions if required by the tests") parser.add_argument( "--scm_size", action="store", @@ -3216,11 +3302,12 @@ def main(): args.sparse = True if not args.logs_threshold: args.logs_threshold = DEFAULT_LOGS_THRESHOLD + args.slurm_install = True args.slurm_setup = True args.user_create = True # Setup the Launch object - launch = Launch(args.name, args.mode) + launch = 
Launch(args.name, args.mode, args.slurm_install, args.slurm_setup) # Perform the steps defined by the arguments specified try: diff --git a/src/tests/ftest/scripts/main.sh b/src/tests/ftest/scripts/main.sh index 46fdae198d4..3bfb69309ca 100644 --- a/src/tests/ftest/scripts/main.sh +++ b/src/tests/ftest/scripts/main.sh @@ -157,23 +157,6 @@ if ${SETUP_ONLY:-false}; then exit 0 fi -export DAOS_APP_DIR=${DAOS_APP_DIR:-$DAOS_TEST_SHARED_DIR} - -# check if slurm needs to be configured for soak -if [[ "${TEST_TAG_ARG}" =~ soak && "${STAGE_NAME}" =~ Hardware ]]; then - if ! ./slurm_setup.py -d -c "$FIRST_NODE" -n "${TEST_NODES}" -s -i; then - exit "${PIPESTATUS[0]}" - fi - - if ! mkdir -p "${DAOS_APP_DIR}/soak/apps"; then - exit "${PIPESTATUS[0]}" - fi - - if ! cp -r /scratch/soak/apps/* "${DAOS_APP_DIR}/soak/apps/"; then - exit "${PIPESTATUS[0]}" - fi -fi - # need to increase the number of oopen files (on EL8 at least) ulimit -n 4096 @@ -188,6 +171,8 @@ export WITH_VALGRIND export STAGE_NAME export TEST_RPMS export DAOS_BASE +export DAOS_TEST_APP_SRC=${DAOS_TEST_APP_SRC:-"/scratch/daos_test/apps"} +export DAOS_TEST_APP_DIR=${DAOS_TEST_APP_DIR:-"${DAOS_TEST_SHARED_DIR}/daos_test/apps"} launch_node_args="-ts ${TEST_NODES}" if [ "${STAGE_NAME}" == "Functional Hardware 24" ]; then @@ -199,6 +184,7 @@ if [ "${STAGE_NAME}" == "Functional Hardware 24" ]; then client_nodes=$(IFS=','; echo "${test_node_list[*]:8}") launch_node_args="-ts ${server_nodes} -tc ${client_nodes}" fi + # shellcheck disable=SC2086,SC2090 if ! 
./launch.py --mode ci ${launch_node_args} ${LAUNCH_OPT_ARGS} ${TEST_TAG_ARR[*]}; then rc=${PIPESTATUS[0]} diff --git a/src/tests/ftest/slurm_setup.py b/src/tests/ftest/slurm_setup.py index 2b0266604b7..8cfa0869cbd 100755 --- a/src/tests/ftest/slurm_setup.py +++ b/src/tests/ftest/slurm_setup.py @@ -10,279 +10,487 @@ import argparse import getpass import logging +import os import re import socket import sys from ClusterShell.NodeSet import NodeSet -from util.logger_utils import get_console_handler -from util.run_utils import get_clush_command, run_remote +# Update the path to support utils files that import other utils files +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "util")) +# pylint: disable=import-outside-toplevel +from logger_utils import get_console_handler # noqa: E402 +from package_utils import install_packages, remove_packages # noqa: E402 +from run_utils import get_clush_command, run_remote, command_as_user # noqa: E402 # Set up a logger for the console messages logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) logger.addHandler(get_console_handler("%(message)s", logging.DEBUG)) -SLURM_CONF = "/etc/slurm/slurm.conf" -EPILOG_FILE = "/var/tmp/epilog_soak.sh" - -PACKAGE_LIST = ["slurm", "slurm-example-configs", - "slurm-slurmctld", "slurm-slurmd"] - -COPY_LIST = ["cp /etc/slurm/slurm.conf.example /etc/slurm/slurm.conf", - "cp /etc/slurm/cgroup.conf.example /etc/slurm/cgroup.conf", - "cp /etc/slurm/slurmdbd.conf.example /etc/slurm/slurmdbd.conf"] - -MUNGE_STARTUP = [ - "chown munge. 
{0}".format("/etc/munge/munge.key"), - "systemctl restart munge", - "systemctl enable munge"] - -SLURMCTLD_STARTUP = [ - "systemctl restart slurmctld", - "systemctl enable slurmctld"] - -SLURMCTLD_STARTUP_DEBUG = [ - "cat /var/log/slurmctld.log", - "grep -v \"^#\\w\" /etc/slurm/slurm.conf"] - -SLURMD_STARTUP = [ - "systemctl restart slurmd", - "systemctl enable slurmd"] - -SLURMD_STARTUP_DEBUG = [ - "cat /var/log/slurmd.log", - "grep -v \"^#\\w\" /etc/slurm/slurm.conf"] - - -def create_epilog_script(args): - """Create epilog script to run after each job. - - Args: - args (Namespace): command line arguments - - Returns: - int: 0 if command passes; 1 otherwise - - """ - sudo = "sudo" if args.sudo else "" - with open(EPILOG_FILE, 'w') as script_file: - script_file.write("#!/bin/bash\n#\n") - script_file.write("/usr/bin/bash -c 'pkill --signal 9 dfuse'\n") - script_file.write("/usr/bin/bash -c 'for dir in $(find /tmp/daos_dfuse);" - "do fusermount3 -uz $dir;rm -rf $dir; done'\n") - script_file.write("exit 0\n") - command = f"{sudo} chmod 755 {EPILOG_FILE}" - return execute_cluster_cmds(args.control, [command]) - - -def update_config_cmdlist(args): - """Create the command lines to update slurmd.conf file. - - Args: - args (Namespace): command line arguments - - Returns: - cmd_list: list of cmdlines to update config file - - """ - all_nodes = NodeSet("{},{}".format(str(args.control), str(args.nodes))) - if create_epilog_script(args) > 1: - logger.error("%s could not be updated. 
Check if file exists", EPILOG_FILE) - sys.exit(1) - cmd_list = [f"sed -i -e 's/ClusterName=cluster/ClusterName=ci_cluster/g' {SLURM_CONF}", - f"sed -i -e 's/SlurmUser=slurm/SlurmUser={args.user}/g' {SLURM_CONF}", - f"sed -i -e 's/NodeName/#NodeName/g' {SLURM_CONF}", - f"sed -i -e 's#EpilogSlurmctld=#EpilogSlurmctld={EPILOG_FILE}#g' {SLURM_CONF}"] - sudo = "sudo" if args.sudo else "" - # Copy the slurm*example.conf files to /etc/slurm/ - if execute_cluster_cmds(all_nodes, COPY_LIST, args.sudo) > 0: - sys.exit(1) - match = False - # grep SLURM_CONF to determine format of the the file - for ctl_host in ["SlurmctldHost", "ControlMachine"]: - command = r"grep {} {}".format(ctl_host, SLURM_CONF) - if run_remote(logger, all_nodes, command).passed: - ctl_str = "sed -i -e 's/{0}=linux0/{0}={1}/g' {2}".format( - ctl_host, args.control, SLURM_CONF) - cmd_list.insert(0, ctl_str) - match = True - break - if not match: - logger.error("% could not be updated. Check conf file format", SLURM_CONF) - sys.exit(1) - - # This info needs to be gathered from every node that can run a slurm job - command = r"lscpu | grep -E '(Socket|Core|Thread)\(s\)'" - result = run_remote(logger, all_nodes, command) - for data in result.output: - info = { - match[0]: match[1] - for match in re.findall(r"(Socket|Core|Thread).*:\s+(\d+)", "\n".join(data.stdout)) - if len(match) > 1} - - if "Socket" not in info or "Core" not in info or "Thread" not in info: - # Did not find value for socket|core|thread so do not - # include in config file - pass - cmd_list.append("echo \"NodeName={0} Sockets={1} CoresPerSocket={2} " - "ThreadsPerCore={3}\" |{4} tee -a {5}".format( - data.hosts, info["Socket"], info["Core"], info["Thread"], sudo, - SLURM_CONF)) - - # - cmd_list.append("echo \"PartitionName={} Nodes={} Default=YES " - "MaxTime=INFINITE State=UP\" |{} tee -a {}".format( - args.partition, args.nodes, sudo, SLURM_CONF)) - - return execute_cluster_cmds(all_nodes, cmd_list, args.sudo) - - -def 
execute_cluster_cmds(nodes, cmdlist, sudo=False): - """Execute the list of cmds on hostlist nodes. - - Args: - nodes (NodeSet): nodes on which to execute the commands - cmdlist ([type]): list of cmdlines to execute - sudo (str, optional): Execute cmd with sudo privileges. Defaults to false. - - Returns: - ret_code: returns 0 if all commands passed on all hosts; 1 otherwise - - """ - for cmd in cmdlist: - if sudo: - cmd = "sudo {}".format(cmd) - if not run_remote(logger, nodes, cmd, timeout=600).passed: - # Do not bother executing any remaining commands if this one failed - return 1 - return 0 - - -def configuring_packages(args, action): - """Install required slurm and munge packages. - - Args: - args (Namespace): command line arguments - action (str): 'install' or 'remove' - - """ - # Install packages on control and compute nodes - all_nodes = NodeSet("{},{}".format(str(args.control), str(args.nodes))) - logger.info("%s slurm packages on %s: %s", action, all_nodes, ", ".join(PACKAGE_LIST)) - command = ["dnf", action, "-y"] + PACKAGE_LIST - return execute_cluster_cmds(all_nodes, [" ".join(command)], args.sudo) - - -def start_munge(args): - """Start munge service on all nodes. - - Args: - args (Namespace): command line arguments - - """ - sudo = "sudo" if args.sudo else "" - all_nodes = NodeSet("{},{}".format(str(args.control), str(args.nodes))) - # exclude the control node - nodes = NodeSet(str(args.nodes)) - nodes.difference_update(str(args.control)) - - # copy key to all nodes FROM slurmctl node; - # change the protections/ownership on the munge dir on all nodes - cmd_list = [ - "{0} chmod -R 777 /etc/munge; {0} chown {1}. /etc/munge".format( - sudo, args.user)] - if execute_cluster_cmds(all_nodes, cmd_list) > 0: - return 1 - - # Check if file exists on slurm control node - # change the protections/ownership on the munge key before copying - cmd_list = ["set -Eeu", - "rc=0", - "if [ ! 
-f /etc/munge/munge.key ]", - "then {} create-munge-key".format(sudo), - "fi", - "{} chmod 777 /etc/munge/munge.key".format(sudo), - "{} chown {}. /etc/munge/munge.key".format(sudo, args.user)] - - if execute_cluster_cmds(args.control, ["; ".join(cmd_list)]) > 0: - return 1 - # remove any existing key from other nodes - cmd_list = ["{} rm -f /etc/munge/munge.key".format(sudo)] - if execute_cluster_cmds(nodes, ["; ".join(cmd_list)]) > 0: - return 1 - - # copy munge.key to all hosts - command = get_clush_command( - nodes, args="--copy /etc/munge/munge.key --dest /etc/munge/munge.key") - if execute_cluster_cmds(args.control, [command]) > 0: - return 1 - - # set the protection back to defaults - cmd_list = [ - "{} chmod 400 /etc/munge/munge.key".format(sudo), - "{} chown munge. /etc/munge/munge.key".format(sudo), - "{} chmod 700 /etc/munge".format(sudo), - "{} chown munge. /etc/munge".format(sudo)] - if execute_cluster_cmds(all_nodes, ["; ".join(cmd_list)]) > 0: - return 1 - - # Start Munge service on all nodes - all_nodes = NodeSet("{},{}".format(str(args.control), str(args.nodes))) - return execute_cluster_cmds(all_nodes, MUNGE_STARTUP, args.sudo) - - -def start_slurm(args): - """Start the slurm services on all nodes. - - Args: - args (Namespace): command line arguments - - """ - # Setting up slurm on all nodes - all_nodes = NodeSet("{},{}".format(str(args.control), str(args.nodes))) - cmd_list = [ - "mkdir -p /var/log/slurm", - "chown {}. {}".format(args.user, "/var/log/slurm"), - "mkdir -p /var/spool/slurmd", - "mkdir -p /var/spool/slurmctld", - "mkdir -p /var/spool/slurm/d", - "mkdir -p /var/spool/slurm/ctld", - "chown {}. {}/ctld".format(args.user, "/var/spool/slurm"), - "chown {}. 
{}".format(args.user, "/var/spool/slurmctld"), - "chmod 775 {}".format("/var/spool/slurmctld"), - "rm -f /var/spool/slurmctld/clustername"] - - if execute_cluster_cmds(all_nodes, cmd_list, args.sudo) > 0: - return 1 - - # Startup the slurm control service - status = execute_cluster_cmds(args.control, SLURMCTLD_STARTUP, args.sudo) - if status > 0 or args.debug: - execute_cluster_cmds(args.control, SLURMCTLD_STARTUP_DEBUG, args.sudo) - if status > 0: - return 1 - - # Startup the slurm service - status = execute_cluster_cmds(all_nodes, SLURMD_STARTUP, args.sudo) - if status > 0 or args.debug: - execute_cluster_cmds(all_nodes, SLURMD_STARTUP_DEBUG, args.sudo) - if status > 0: - return 1 - - # ensure that the nodes are in the idle state - cmd_list = ["scontrol update nodename={} state=idle".format(args.nodes)] - status = execute_cluster_cmds(args.nodes, cmd_list, args.sudo) - if status > 0 or args.debug: - cmd_list = SLURMCTLD_STARTUP_DEBUG - execute_cluster_cmds(args.control, cmd_list, args.sudo) - cmd_list = SLURMD_STARTUP_DEBUG - execute_cluster_cmds(all_nodes, cmd_list, args.sudo) - if status > 0: - return 1 - return 0 +class SlurmSetupException(Exception): + """Exception for SlurmSetup class.""" + + +class SlurmSetup(): + """Slurm setup class.""" + + EPILOG_FILE = '/var/tmp/epilog_soak.sh' + EXAMPLE_FILES = [ + '/etc/slurm/slurm.conf.example', + '/etc/slurm/cgroup.conf.example', + '/etc/slurm/slurmdbd.conf.example'] + MUNGE_DIR = '/etc/munge' + MUNGE_KEY = '/etc/munge/munge.key' + PACKAGE_LIST = ['slurm', 'slurm-example-configs', 'slurm-slurmctld', 'slurm-slurmd'] + SLURM_CONF = '/etc/slurm/slurm.conf' + SLURM_LOG_DIR = '/var/log/slurm' + + def __init__(self, log, nodes, control_node, sudo=False): + """Initialize a SlurmSetup object. + + Args: + log (logger): object configured to log messages + nodes (NodeSet): slurm nodes + control_node (NodeSet): slurm control node + sudo (bool, optional): whether or not to use sudo with commands. Defaults to False. 
+ """ + self.log = log + self.nodes = NodeSet(nodes) + self.control = NodeSet(control_node) + self.root = 'root' if sudo else None + + @property + def all_nodes(self): + """Get all the nodes specified in this class. + + Returns: + NodeSet: all the nodes specified in this class + """ + return self.nodes.union(self.control) + + def remove(self): + """Remove slurm packages from the nodes. + + Raises: + SlurmSetupException: if there is a problem removing the packages + """ + self.log.info("Removing slurm packages") + result = remove_packages(self.log, self.all_nodes, self.PACKAGE_LIST, self.root) + if not result.passed: + raise SlurmSetupException(f"Error removing slurm packages on {result.failed_hosts}") + + def install(self): + """Install slurm packages on the nodes. + + Raises: + SlurmSetupException: if there is a problem installing the packages + """ + self.log.info("Installing slurm packages") + result = install_packages(self.log, self.all_nodes, self.PACKAGE_LIST, self.root) + if not result.passed: + raise SlurmSetupException(f"Error installing slurm packages on {result.failed_hosts}") + + def update_config(self, slurm_user, partition): + """Update the slurm config. + + Args: + slurm_user (str): user to define in the slurm config file + partition (str): name of the slurm partition to include in the configuration + + Raises: + SlurmSetupException: if there is a problem + """ + self.log.info("Updating slurm config files") + + # Create the slurm epilog script on the control node + self._create_epilog_script(self.EPILOG_FILE) + + # Copy the slurm example.conf files to all nodes + for source in self.EXAMPLE_FILES: + self._copy_file(self.all_nodes, source, os.path.splitext(source)[0]) + + # Update the config file on all hosts + self._update_slurm_config(slurm_user, partition) + + def start_munge(self, user): + """Start munge. 
+ + Args: + user (str): user account to use with munge + + Raises: + SlurmSetupException: if there is a problem starting munge + """ + self.log.info("Starting munge") + + # Create munge key only if it does not exist. + result = run_remote( + self.log, self.control, command_as_user(f'test -f {self.MUNGE_KEY}', self.root)) + if not result.passed: + # Create a munge key on the control host + self.log.debug('Creating a new munge key on %s', self.control) + result = run_remote( + self.log, self.control, command_as_user('create-munge-key', self.root)) + if not result.passed: + # Try the other possible munge key creation command: + result = run_remote( + self.log, self.control, command_as_user('mungekey -c', self.root)) + if not result.passed: + raise SlurmSetupException(f'Error creating munge key on {result.failed_hosts}') + + # Setup the munge dir file permissions on all hosts + self._update_file(self.all_nodes, self.MUNGE_DIR, '777', user) + + # Setup the munge key file permissions on the control host + self._update_file(self.control, self.MUNGE_KEY, '777', user) + + # Copy the munge key from the control node to the non-control nodes + non_control = self.nodes.difference(self.control) + self.log.debug('Copying the munge key to %s', non_control) + command = get_clush_command( + non_control, args=f"-B -S -v --copy {self.MUNGE_KEY} --dest {self.MUNGE_KEY}") + result = run_remote(self.log, self.control, command) + if not result.passed: + raise SlurmSetupException(f'Error copying munge key to {result.failed_hosts}') + + # Resetting munge dir and key permissions + self._update_file(self.all_nodes, self.MUNGE_KEY, '400', 'munge') + self._update_file(self.all_nodes, self.MUNGE_DIR, '700', 'munge') + + # Restart munge on all nodes + self._restart_systemctl(self.all_nodes, 'munge') + + def start_slurm(self, user, debug): + """Start slurm. 
+ + Args: + user (str): user account to use with slurm + debug (bool): whether or not to display slurm debug + + Raises: + SlurmSetupException: if there is a problem starting slurm + """ + self.log.info("Starting slurm") + + self._mkdir(self.all_nodes, self.SLURM_LOG_DIR) + self._update_file_ownership(self.all_nodes, self.SLURM_LOG_DIR, user) + self._mkdir(self.all_nodes, '/var/spool/slurmd') + self._mkdir(self.all_nodes, '/var/spool/slurmctld') + self._mkdir(self.all_nodes, '/var/spool/slurm/d') + self._mkdir(self.all_nodes, '/var/spool/slurm/ctld') + self._update_file_ownership(self.all_nodes, '/var/spool/slurm/ctld', user) + self._update_file(self.all_nodes, '/var/spool/slurmctld', '775', user) + self._remove_file(self.all_nodes, '/var/spool/slurmctld/clustername') + + # Restart slurmctld on the control node + self._restart_systemctl( + self.control, 'slurmctld', '/var/log/slurmctld.log', self.SLURM_CONF) + + # Restart slurmd on all nodes + self._restart_systemctl(self.all_nodes, 'slurmd', '/var/log/slurmd.log', self.SLURM_CONF) + + # Update nodes to the idle state + command = command_as_user( + f'scontrol update nodename={str(self.nodes)} state=idle', self.root) + result = run_remote(self.log, self.nodes, command) + if not result.passed or debug: + self._display_debug(self.control, '/var/log/slurmctld.log', self.SLURM_CONF) + self._display_debug(self.all_nodes, '/var/log/slurmd.log', self.SLURM_CONF) + if not result.passed: + raise SlurmSetupException(f'Error setting nodes to idle on {self.nodes}') + + def _create_epilog_script(self, script): + """Create epilog script to run after each job. + + Args: + script (str): epilog script name. 
+ + Raises: + SlurmSetupException: if there is a problem creating the epilog script + """ + self.log.debug('Creating the slurm epilog script to run after each job.') + try: + with open(script, 'w') as script_file: + script_file.write('#!/bin/bash\n#\n') + script_file.write('/usr/bin/bash -c \'pkill --signal 9 dfuse\'\n') + script_file.write( + '/usr/bin/bash -c \'for dir in $(find /tmp/daos_dfuse);' + 'do fusermount3 -uz $dir;rm -rf $dir; done\'\n') + script_file.write('exit 0\n') + except IOError as error: + self.log.debug('Error writing %s - verifying file existence:', script) + run_remote(self.log, self.control, f'ls -al {script}') + raise SlurmSetupException(f'Error writing slurm epilog script {script}') from error + + command = command_as_user(f'chmod 755 {script}', self.root) + if not run_remote(self.log, self.control, command).passed: + raise SlurmSetupException(f'Error setting slurm epilog script {script} permissions') + + def _copy_file(self, nodes, source, destination): + """Copy the source file to the destination on all the nodes. + + Args: + nodes (NodeSet): nodes on which to copy the files + source (str): file to copy + destination (str): where to copy the file + + Raises: + SlurmSetupException: if there is an error copying the file on any host + """ + self.log.debug(f'Copying the {source} file to {destination} on {str(nodes)}') + command = command_as_user(f'cp {source} {destination}', self.root) + result = run_remote(self.log, nodes, command) + if not result.passed: + raise SlurmSetupException( + f'Error copying {source} to {destination} on {str(result.failed_hosts)}') + + def _update_slurm_config(self, slurm_user, partition): + """Update the slurm config file. 
+ + Args: + slurm_user (str): user to define in the slurm config file + partition (str): name of the slurm partition to include in the configuration + + Raises: + SlurmSetupException: if there is a problem modifying slurm config file + """ + # Update the config file with the slurm cluster name + self._modify_slurm_config_file( + 'slurm cluster name', self.all_nodes, 's/ClusterName=cluster/ClusterName=ci_cluster/g', + self.root) + + # Update the config file with the slurm user + self._modify_slurm_config_file( + 'slurm user', self.all_nodes, f's/SlurmUser=slurm/SlurmUser={slurm_user}/g', + self.root) + + # Update the config file with the removal of the NodeName entry + self._modify_slurm_config_file( + 'node name', self.all_nodes, 's/NodeName/#NodeName/g', self.root) + + # Update the config file with the slurm epilog file + self._modify_slurm_config_file( + 'epilog file', self.all_nodes, f's#EpilogSlurmctld=#EpilogSlurmctld={self.EPILOG_FILE}#g', + self.root) + + # Update the config file with the slurm control node + not_updated = self.all_nodes.copy() + for control_keyword in ['SlurmctldHost', 'ControlMachine']: + command = f'grep {control_keyword} {self.SLURM_CONF}' + results = run_remote(self.log, self.all_nodes, command) + if results.passed_hosts: + not_updated.remove( + self._modify_slurm_config_file( + 'slurm control node', results.passed_hosts, + f's/{control_keyword}=linux0/{control_keyword}={str(self.control)}/g', + self.root)) + if not_updated: + raise SlurmSetupException(f'Slurm control node not updated on {not_updated}') + + # Update the config file with each node's socket/core/thread information + self._update_slurm_config_sys_info() + + # Update the config file with the partition information + self._update_slurm_config_partitions(partition) + + def _modify_slurm_config_file(self, description, hosts, replacement, user=None): + """Replace text in the slurm configuration file. 
+ + Args: + description (str): what is being modified in the slurm config file + hosts (NodeSet): hosts on which to modify the slurm config file + replacement (str): what text to replace + user (str, optional): user to use when running the sed command. Defaults to None. + + Raises: + SlurmSetupException: if there is a problem modifying slurm config file + + Returns: + NodeSet: hosts on which the command succeeded + """ + self.log.debug( + 'Updating the %s in the %s config file on %s', description, self.SLURM_CONF, hosts) + command = command_as_user(f'sed -i -e \'{replacement}\' {self.SLURM_CONF}', user) + result = run_remote(self.log, hosts, command) + if result.failed_hosts: + raise SlurmSetupException( + f'Error updating {description} in the {self.SLURM_CONF} config ' + f'file on {result.failed_hosts}') + return result.passed_hosts + + def _update_slurm_config_sys_info(self): + """Update the slurm config files with hosts socket/core/thread information. + + Raises: + SlurmSetupException: if there is a problem updating the slurm config file + """ + self.log.debug('Updating slurm config socket/core/thread information on %s', self.all_nodes) + command = r"lscpu | grep -E '(Socket|Core|Thread)\(s\)'" + result = run_remote(self.log, self.all_nodes, command) + for data in result.output: + info = { + match[0]: match[1] + for match in re.findall(r"(Socket|Core|Thread).*:\s+(\d+)", "\n".join(data.stdout)) + if len(match) > 1} + + if "Socket" in info and "Core" in info and "Thread" in info: + echo_command = (f'echo \"Nodename={data.hosts} Sockets={info["Socket"]} ' + f'CoresPerSocket={info["Core"]} ThreadsPerCore={info["Thread"]}\"') + mod_result = self._append_config_file(echo_command) + if mod_result.failed_hosts: + raise SlurmSetupException( + 'Error updating socket/core/thread information on ' + f'{mod_result.failed_hosts}') + + def _update_slurm_config_partitions(self, partition): + """Update the slurm config files with hosts partition information. 
+ + Args: + partition (str): name of the slurm partition to include in the configuration + + Raises: + SlurmSetupException: if there is a problem updating the slurm config file + """ + self.log.debug('Updating slurm config partition information on %s', self.all_nodes) + echo_command = ( + f'echo \"PartitionName={partition} Nodes={self.nodes} Default=YES MaxTime=INFINITE ' + 'State=UP\"') + mod_result = self._append_config_file(echo_command) + if mod_result.failed_hosts: + raise SlurmSetupException( + f'Error updating partition information on {mod_result.failed_hosts}') + + def _append_config_file(self, echo_command): + """Append data to the config file. + + Args: + echo_command (str): command adding contents to the config file + + Returns: + RemoteCommandResult: the result from the echo | tee command + """ + tee_command = command_as_user(f'tee -a {self.SLURM_CONF}', self.root) + return run_remote(self.log, self.all_nodes, f'{echo_command} | {tee_command}') + + def _update_file(self, nodes, file, permission, user): + """Update file permissions and ownership. + + Args: + nodes (NodeSet): nodes on which to update the file permissions/ownership + file (str): file whose permissions/ownership will be updated + permission (str): file permission to set + user (str): user to have ownership of the file + + Raises: + SlurmSetupException: if there was an error updating the file permissions/ownership + """ + self._update_file_permissions(nodes, file, permission) + self._update_file_ownership(nodes, file, user) + + def _update_file_permissions(self, nodes, file, permission): + """Update the file permissions. 
+ + Args: + nodes (NodeSet): nodes on which to update the file permissions + file (str): file whose permissions will be updated + permission (str): file permission to set + user (str): user to use with chown command + + Raises: + SlurmSetupException: if there was an error updating the file permissions + """ + self.log.debug('Updating file permissions for %s on %s', self.MUNGE_DIR, nodes) + result = run_remote( + self.log, nodes, command_as_user(f'chmod -R {permission} {file}', self.root)) + if not result.passed: + raise SlurmSetupException( + f'Error updating permissions to {permission} for {file} on {result.failed_hosts}') + + def _update_file_ownership(self, nodes, file, user): + """Update the file ownership. + + Args: + nodes (NodeSet): nodes on which to update the file ownership + file (str): file whose ownership will be updated + user (str): user to have ownership of the file + + Raises: + SlurmSetupException: if there was an error updating the file ownership + """ + result = run_remote(self.log, nodes, command_as_user(f'chown {user}. {file}', self.root)) + if not result.passed: + raise SlurmSetupException( + f'Error updating ownership to {user} for {file} on {result.failed_hosts}') + + def _remove_file(self, nodes, file): + """Remove a file. + + Args: + nodes (NodeSet): nodes on which to remove the file + file (str): file to remove + + Raises: + SlurmSetupException: if there was an error removing the file + """ + self.log.debug('Removing %s on %s', file, nodes) + result = run_remote(self.log, nodes, command_as_user(f'rm -fr {file}', self.root)) + if not result.passed: + raise SlurmSetupException(f'Error removing {file} on {result.failed_hosts}') + + def _restart_systemctl(self, nodes, service, debug_log=None, debug_config=None): + """Restart the systemctl service. 
+ + Args: + nodes (NodeSet): nodes on which to restart the systemctl service + service (str): systemctl service to restart/enable + debug_log (str, optional): log file to display if there is a problem restarting + debug_config (str, optional): config file to display if there is a problem restarting + + Raises: + SlurmSetupException: if there is a problem restarting the systemctl service + """ + self.log.debug('Restarting %s on %s', service, nodes) + for action in ('restart', 'enable'): + command = command_as_user(f'systemctl {action} {service}', self.root) + result = run_remote(self.log, self.all_nodes, command) + if not result.passed: + self._display_debug(result.failed_hosts, debug_log, debug_config) + raise SlurmSetupException(f'Error restarting {service} on {result.failed_hosts}') + + def _display_debug(self, nodes, debug_log=None, debug_config=None): + """Display debug information. + + Args: + nodes (NodeSet): nodes on which to display the debug information + debug_log (str, optional): log file to display. Defaults to None. + debug_config (str, optional): config file to display. Defaults to None. + """ + if debug_log: + self.log.debug('DEBUG: %s contents:', debug_log) + command = command_as_user(f'cat {debug_log}', self.root) + run_remote(self.log, nodes, command) + if debug_config: + self.log.debug('DEBUG: %s contents:', debug_config) + command = command_as_user(f'grep -v \"^#\\w\" {debug_config}', self.root) + run_remote(self.log, nodes, command) + + def _mkdir(self, nodes, directory): + """Create a directory. 
+ + Args: + nodes (NodeSet): nodes on which to create the directory + directory (str): directory to create + + Raises: + SlurmSetupException: if there was an error creating the directory + """ + self.log.debug('Creating %s on %s', directory, nodes) + result = run_remote(self.log, nodes, command_as_user(f'mkdir -p {directory}', self.root)) + if not result.passed: + raise SlurmSetupException(f'Error creating {directory} on {result.failed_hosts}') def main(): @@ -330,36 +538,44 @@ def main(): logger.error("slurm_nodes: Specify at least one slurm node") sys.exit(1) - # Convert control node and slurm node list into NodeSets - args.control = NodeSet(args.control) - args.nodes = NodeSet(args.nodes) + slurm_setup = SlurmSetup(logger, args.nodes, args.control, args.sudo) # Remove packages if specified with --remove and then exit if args.remove: - ret_code = configuring_packages(args, "remove") - if ret_code > 0: + try: + slurm_setup.remove() + sys.exit(0) + except SlurmSetupException as error: + logger.error(str(error)) sys.exit(1) - sys.exit(0) # Install packages if specified with --install and continue with setup if args.install: - ret_code = configuring_packages(args, "install") - if ret_code > 0: + try: + slurm_setup.install() + except SlurmSetupException as error: + logger.error(str(error)) sys.exit(1) # Edit the slurm conf files - ret_code = update_config_cmdlist(args) - if ret_code > 0: + try: + slurm_setup.update_config(args.user, args.partition) + except SlurmSetupException as error: + logger.error(str(error)) sys.exit(1) # Munge Setup - ret_code = start_munge(args) - if ret_code > 0: + try: + slurm_setup.start_munge(args.user) + except SlurmSetupException as error: + logger.error(str(error)) sys.exit(1) # Slurm Startup - ret_code = start_slurm(args) - if ret_code > 0: + try: + slurm_setup.start_slurm(args.user, args.debug) + except SlurmSetupException as error: + logger.error(str(error)) sys.exit(1) sys.exit(0) diff --git a/src/tests/ftest/soak/smoke.yaml 
b/src/tests/ftest/soak/smoke.yaml index 0cee0ef4ccd..85e73ab55ff 100644 --- a/src/tests/ftest/soak/smoke.yaml +++ b/src/tests/ftest/soak/smoke.yaml @@ -164,7 +164,7 @@ vpic_smoke: - 1 taskspernode: - 1 - cmdline: "${DAOS_APP_DIR}/soak/apps/vpic-install/bin/harris.Linux" + cmdline: "${DAOS_TEST_APP_DIR}/vpic-install/bin/harris.Linux" api: - POSIX - POSIX-LIBIOIL @@ -183,7 +183,7 @@ lammps_smoke: - 1 taskspernode: - 1 - cmdline: "${DAOS_APP_DIR}/soak/apps/lammps/src/lmp_mpi -i ${DAOS_APP_DIR}/soak/apps/lammps/bench/in.lj.smoke" + cmdline: "${DAOS_TEST_APP_DIR}/lammps/src/lmp_mpi -i ${DAOS_TEST_APP_DIR}/lammps/bench/in.lj.smoke" api: - POSIX - POSIX-LIBIOIL diff --git a/src/tests/ftest/soak/stress.yaml b/src/tests/ftest/soak/stress.yaml index 736bfe46936..ea86426f6b0 100644 --- a/src/tests/ftest/soak/stress.yaml +++ b/src/tests/ftest/soak/stress.yaml @@ -185,7 +185,7 @@ vpic_stress: - 1 taskspernode: - 32 - cmdline: "${DAOS_APP_DIR}/soak/apps/vpic-install/bin/harris.Linux" + cmdline: "${DAOS_TEST_APP_DIR}/vpic-install/bin/harris.Linux" api: - POSIX - POSIX-LIBIOIL @@ -202,7 +202,7 @@ lammps_stress: - 8 taskspernode: - 32 - cmdline: "${DAOS_APP_DIR}/soak/apps/lammps/src/lmp_mpi -i ${DAOS_APP_DIR}/soak/apps/lammps/bench/in.lj" + cmdline: "${DAOS_TEST_APP_DIR}/lammps/src/lmp_mpi -i ${DAOS_TEST_APP_DIR}/lammps/bench/in.lj" api: - POSIX - POSIX-LIBIOIL diff --git a/src/tests/ftest/util/package_utils.py b/src/tests/ftest/util/package_utils.py new file mode 100644 index 00000000000..bbc5f549ecb --- /dev/null +++ b/src/tests/ftest/util/package_utils.py @@ -0,0 +1,64 @@ +""" +(C) Copyright 2023 Intel Corporation. + +SPDX-License-Identifier: BSD-2-Clause-Patent +""" + +from run_utils import run_remote, command_as_user + + +def find_packages(log, hosts, pattern, user=None): + """Get the installed packages on each specified host. 
+ + Args: + log (logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to search for installed packages + pattern (str): grep pattern to use to search for installed packages + user (str, optional): user account to use to run the search command. Defaults to None. + + Returns: + dict: a dictionary of host keys with a list of installed RPM values + """ + installed = {} + command = command_as_user(f"rpm -qa | grep -E {pattern} | sort -n", user) + result = run_remote(log, hosts, command) + for data in result.output: + if data.passed: + installed[str(data.hosts)] = data.stdout or [] + return installed + + +def install_packages(log, hosts, packages, user=None, timeout=600): + """Install the packages on the hosts. + + Args: + log (logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to install the packages + packages (list): a list of packages to install + user (str, optional): user to use when installing the packages. Defaults to None. + timeout (int, optional): timeout for the dnf install command. Defaults to 600. + + Returns: + RemoteCommandResult: the 'dnf install' command results + """ + log.info('Installing packages on %s: %s', hosts, ', '.join(packages)) + command = command_as_user(' '.join(['dnf', 'install', '-y'] + packages), user) + return run_remote(log, hosts, command, timeout=timeout) + + +def remove_packages(log, hosts, packages, user=None, timeout=600): + """Remove the packages on the hosts. + + Args: + log (logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to remove the packages + packages (list): a list of packages to remove + user (str, optional): user to use when removing the packages. Defaults to None. + timeout (int, optional): timeout for the dnf remove command. Defaults to 600. 
+ + Returns: + RemoteCommandResult: the 'dnf remove' command results + """ + log.info('Removing packages on %s: %s', hosts, ', '.join(packages)) + command = command_as_user(' '.join(['dnf', 'remove', '-y'] + packages), user) + return run_remote(log, hosts, command, timeout=timeout) diff --git a/utils/cq/words.dict b/utils/cq/words.dict index 07b9e37cef1..f5dba89a4ee 100644 --- a/utils/cq/words.dict +++ b/utils/cq/words.dict @@ -388,7 +388,9 @@ shlex simul sinfo slurm +slurmd slurmctl +slurmctld spdk squeue src From 3d06890dfeebf3c56e605ac30c567c73e8a28e3e Mon Sep 17 00:00:00 2001 From: Kris Jacque Date: Thu, 29 Jun 2023 10:11:38 -0600 Subject: [PATCH 06/16] DAOS-13496 test: Add logging to MockInvoker (#12384) Added logging to the MockInvoker to make debugging easier in the event that unit tests that use it fail. Signed-off-by: Kris Jacque --- src/control/lib/control/mocks.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/control/lib/control/mocks.go b/src/control/lib/control/mocks.go index 1ec98b972bc..60f6c6b0626 100644 --- a/src/control/lib/control/mocks.go +++ b/src/control/lib/control/mocks.go @@ -125,10 +125,12 @@ func (mi *MockInvoker) InvokeUnaryRPCAsync(ctx context.Context, uReq UnaryReques ur := mi.cfg.UnaryResponse mi.invokeCountMutex.RLock() if len(mi.cfg.UnaryResponseSet) > mi.invokeCount { + mi.log.Debugf("using configured UnaryResponseSet[%d]", mi.invokeCount) ur = mi.cfg.UnaryResponseSet[mi.invokeCount] } mi.invokeCountMutex.RUnlock() if ur == nil { + mi.log.Debugf("using dummy UnaryResponse") // If the config didn't define a response, just dummy one up for // tests that don't care. 
ur = &UnaryResponse{ @@ -140,6 +142,8 @@ func (mi *MockInvoker) InvokeUnaryRPCAsync(ctx context.Context, uReq UnaryReques }, }, } + } else { + mi.log.Debugf("using configured UnaryResponse") } var invokeCount int @@ -148,6 +152,7 @@ func (mi *MockInvoker) InvokeUnaryRPCAsync(ctx context.Context, uReq UnaryReques invokeCount = mi.invokeCount mi.invokeCountMutex.Unlock() go func(invokeCount int) { + mi.log.Debugf("returning mock responses, invokeCount=%d", invokeCount) delayIdx := invokeCount - 1 for idx, hr := range ur.Responses { var delay time.Duration @@ -156,13 +161,16 @@ func (mi *MockInvoker) InvokeUnaryRPCAsync(ctx context.Context, uReq UnaryReques delay = mi.cfg.UnaryResponseDelays[delayIdx][idx] } if delay > 0 { + mi.log.Debugf("delaying mock response for %s", delay) time.Sleep(delay) } select { case <-ctx.Done(): + mi.log.Debugf("context canceled on iteration %d (error=%s)", idx, ctx.Err().Error()) return case responses <- hr: + mi.log.Debug("sending mock response") } } close(responses) From 0f206b16239144164f76dd4acb28db668b319b06 Mon Sep 17 00:00:00 2001 From: Jeff Olivier Date: Thu, 29 Jun 2023 14:25:59 -0600 Subject: [PATCH 07/16] DAOS-13813 ci: Move GHA build to Fedora 38 (#12516) Fix a couple of newer compiler warnings on Fedora 38 and move GHA to Fedora 38 to prevent new ones. Move developer build to ucx 1.14.1. The RPM update can be done independently or not at all. We only build the RPM to build DAOS. We use MOFED drivers when we actually use UCX at runtime. 
Signed-off-by: Jeff Olivier --- .github/workflows/ci2.yml | 2 +- .github/workflows/landing-builds.yml | 4 ++-- site_scons/components/__init__.py | 6 ++++++ site_scons/prereq_tools/base.py | 16 +++++++++++++++- src/bio/bio_wal.c | 2 +- src/cart/crt_internal_types.h | 4 ++-- src/client/java/daos-java/pom.xml | 6 +++--- src/client/java/hadoop-daos/pom.xml | 6 +++--- src/object/tests/srv_checksum_tests.c | 6 +++--- utils/build.config | 2 +- utils/docker/Dockerfile.el.8 | 1 + utils/scripts/install-el8.sh | 2 +- 12 files changed, 39 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci2.yml b/.github/workflows/ci2.yml index 3dcc59131df..87162f1da55 100644 --- a/.github/workflows/ci2.yml +++ b/.github/workflows/ci2.yml @@ -82,7 +82,7 @@ jobs: with: rockylinux/rockylinux:9 - distro: fedora base: el.8 - with: fedora:36 + with: fedora:38 - distro: leap.15 base: leap.15 with: opensuse/leap:15.4 diff --git a/.github/workflows/landing-builds.yml b/.github/workflows/landing-builds.yml index afd1c74f233..ec442e7a74c 100644 --- a/.github/workflows/landing-builds.yml +++ b/.github/workflows/landing-builds.yml @@ -46,7 +46,7 @@ jobs: with: rockylinux/rockylinux:9 - distro: fedora base: el.8 - with: fedora:36 + with: fedora:38 - distro: leap.15 base: leap.15 with: opensuse/leap:15.4 @@ -156,7 +156,7 @@ jobs: with: rockylinux/rockylinux:9 - distro: fedora base: el.8 - with: fedora:36 + with: fedora:38 - distro: leap.15 base: leap.15 with: opensuse/leap:15.4 diff --git a/site_scons/components/__init__.py b/site_scons/components/__init__.py index 205cc9c3563..1fe43214676 100644 --- a/site_scons/components/__init__.py +++ b/site_scons/components/__init__.py @@ -159,6 +159,7 @@ def define_mercury(reqs): ['make', 'install'], ['mkdir', '-p', '$UCX_PREFIX/lib64/pkgconfig'], ['cp', 'ucx.pc', '$UCX_PREFIX/lib64/pkgconfig']], + build_env={'CFLAGS': '-Wno-error'}, package='ucx-devel' if inst(reqs, 'ucx') else None) mercury_build = ['cmake', @@ -228,6 +229,11 @@ def 
define_common(reqs): reqs.define('hwloc', libs=['hwloc'], headers=['hwloc.h'], package='hwloc-devel') + if ARM_PLATFORM: + reqs.define('ipmctl', skip_arch=True) + else: + reqs.define('ipmctl', headers=['nvm_management.h'], package='libipmctl-devel') + def define_ompi(reqs): """OMPI and related components""" diff --git a/site_scons/prereq_tools/base.py b/site_scons/prereq_tools/base.py index 491e132deea..a42822ad083 100644 --- a/site_scons/prereq_tools/base.py +++ b/site_scons/prereq_tools/base.py @@ -528,7 +528,7 @@ def run_build(self, opts): common_reqs = ['argobots', 'ucx', 'ofi', 'hwloc', 'mercury', 'boost', 'uuid', 'crypto', 'protobufc', 'lz4', 'isal', 'isal_crypto'] client_reqs = ['fuse', 'json-c', 'capstone'] - server_reqs = ['pmdk', 'spdk'] + server_reqs = ['pmdk', 'spdk', 'ipmctl'] test_reqs = ['cmocka'] reqs = [] @@ -721,6 +721,7 @@ def define(self, name, **kw): extra_include_path -- Subdirectories to add to dependent component path out_of_src_build -- Build from a different directory if set to True build_env -- Environment variables to set for build + skip_arch -- not required on this architecture """ use_installed = False if 'all' in self.installed or name in self.installed: @@ -951,6 +952,7 @@ class _Component(): out_of_src_build -- Build from a different directory if set to True patch_rpath -- Add appropriate relative rpaths to binaries build_env -- Environment variable(s) to add to build environment + skip_arch -- not required on this platform """ def __init__(self, @@ -992,6 +994,7 @@ def __init__(self, self.include_path.extend(kw.get("extra_include_path", [])) self.out_of_src_build = kw.get("out_of_src_build", False) self.patch_path = self.prereqs.get_build_dir() + self.skip_arch = kw.get("skip_arch", False) @staticmethod def _sanitize_patch_path(path): @@ -1128,6 +1131,10 @@ def has_missing_targets(self, env): if self.targets_found: return False + if self.skip_arch: + self.targets_found = True + return False + if self.__check_only: # Temporarily 
turn off dry-run. env.SetOption('no_exec', False) @@ -1216,6 +1223,9 @@ def is_installed(self, needed_libs): def configure(self): """Setup paths for a required component""" + if self.skip_arch: + return + if not self.retriever: self.prebuilt_path = "/usr" else: @@ -1234,6 +1244,10 @@ def configure(self): def set_environment(self, env, needed_libs): """Modify the specified construction environment to build with the external component""" + + if self.skip_arch: + return + lib_paths = [] # Make sure CheckProg() looks in the component's bin/ dir diff --git a/src/bio/bio_wal.c b/src/bio/bio_wal.c index 521b509e48b..33426e065ad 100644 --- a/src/bio/bio_wal.c +++ b/src/bio/bio_wal.c @@ -1670,7 +1670,7 @@ bio_wal_replay(struct bio_meta_context *mc, struct bio_wal_rp_stats *wrs, uint64_t tx_id, start_id, unmap_start, unmap_end; int rc; uint64_t total_bytes = 0, rpl_entries = 0, total_tx = 0; - uint64_t s_us; + uint64_t s_us = 0; D_ALLOC(buf, max_blks * blk_bytes); if (buf == NULL) diff --git a/src/cart/crt_internal_types.h b/src/cart/crt_internal_types.h index d21c05de112..1e4eaa28cfa 100644 --- a/src/cart/crt_internal_types.h +++ b/src/cart/crt_internal_types.h @@ -311,7 +311,7 @@ struct crt_opc_map { struct crt_opc_map_L2 *com_map; }; - -void crt_na_config_fini(bool primary, int provider); +void +crt_na_config_fini(bool primary, crt_provider_t provider); #endif /* __CRT_INTERNAL_TYPES_H__ */ diff --git a/src/client/java/daos-java/pom.xml b/src/client/java/daos-java/pom.xml index 4d53b2ce7cb..494a3eecbb0 100644 --- a/src/client/java/daos-java/pom.xml +++ b/src/client/java/daos-java/pom.xml @@ -295,7 +295,7 @@ org.javassist:javassist - ${artifactId}-${version}-protobuf3-netty4-shaded + ${project.artifactId}-${project.version}-protobuf3-netty4-shaded @@ -314,7 +314,7 @@ - target/${artifactId}-${version}-protobuf3-netty4-shaded.jar + target/${project.artifactId}-${project.version}-protobuf3-netty4-shaded.jar jar protobuf3-netty4-shaded @@ -484,7 +484,7 @@ - 
target/${artifactId}-${version}-shaded.jar + target/${project.artifactId}-${project.version}-shaded.jar jar all-shaded diff --git a/src/client/java/hadoop-daos/pom.xml b/src/client/java/hadoop-daos/pom.xml index da83dac821e..34ecdf445ac 100644 --- a/src/client/java/hadoop-daos/pom.xml +++ b/src/client/java/hadoop-daos/pom.xml @@ -190,7 +190,7 @@ com.google.protobuf:protobuf-java - ${artifactId}-${version}-protobuf3-netty4-shaded + ${project.artifactId}-${project.version}-protobuf3-netty4-shaded @@ -209,7 +209,7 @@ - target/${artifactId}-${version}-protobuf3-netty4-shaded.jar + target/${project.artifactId}-${project.version}-protobuf3-netty4-shaded.jar jar protobuf3-netty4-shaded @@ -333,7 +333,7 @@ - target/${artifactId}-${version}-shaded.jar + target/${project.artifactId}-${project.version}-shaded.jar jar all-shaded diff --git a/src/object/tests/srv_checksum_tests.c b/src/object/tests/srv_checksum_tests.c index 751ee44b3f1..97c9d19fa26 100644 --- a/src/object/tests/srv_checksum_tests.c +++ b/src/object/tests/srv_checksum_tests.c @@ -289,9 +289,9 @@ fetch_csum_verify_bsgl_with_args(struct vos_fetch_test_context *ctx) #define ASSERT_CSUM(ctx, csum) \ assert_memory_equal(csum, ctx.iod_csum->ic_data->cs_csum, \ sizeof(csum) - 1) -#define ASSERT_CSUM_EMPTY(ctx, idx) \ - assert_string_equal("", ctx.iod_csum->ic_data->cs_csum + \ - (idx * ctx.iod_csum->ic_data->cs_len)) +#define ASSERT_CSUM_EMPTY(ctx, idx) \ + assert_int_equal( \ + 0, *(ctx.iod_csum->ic_data->cs_csum + (idx * ctx.iod_csum->ic_data->cs_len))) #define ASSERT_CSUM_IDX(ctx, csum, idx) \ assert_memory_equal(csum, ctx.iod_csum->ic_data->cs_csum + \ (idx * ctx.iod_csum->ic_data->cs_len), \ diff --git a/utils/build.config b/utils/build.config index c81f4630ceb..8b05f5d55cc 100644 --- a/utils/build.config +++ b/utils/build.config @@ -10,7 +10,7 @@ SPDK = v22.01.2 OFI = v1.18.0 MERCURY = v2.3.0 PROTOBUFC = v1.3.3 -UCX=v1.13.0 +UCX=v1.14.1 [patch_versions] 
spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff diff --git a/utils/docker/Dockerfile.el.8 b/utils/docker/Dockerfile.el.8 index 951fddc33df..4b63665594d 100644 --- a/utils/docker/Dockerfile.el.8 +++ b/utils/docker/Dockerfile.el.8 @@ -156,6 +156,7 @@ ARG DAOS_JAVA_BUILD=$DAOS_BUILD RUN [ "$DAOS_JAVA_BUILD" != "yes" ] || { \ mkdir /home/daos/.m2 && \ cp /home/daos/daos/utils/scripts/helpers/maven-settings.xml.in /home/daos/.m2/settings.xml && \ + export JAVA_HOME=$(daos-java/find_java_home.sh) && \ mvn clean install -T 1C \ -B -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn \ -DskipITs -Dgpg.skip -Ddaos.install.path=/opt/daos; \ diff --git a/utils/scripts/install-el8.sh b/utils/scripts/install-el8.sh index 807c3bab733..e9aa8181c87 100755 --- a/utils/scripts/install-el8.sh +++ b/utils/scripts/install-el8.sh @@ -10,7 +10,7 @@ set -e -arch=$(uname -i) +arch=$(uname -m) dnf --nodocs install \ boost-python3-devel \ From 05af214658726cd1700e53c9039e04c5c2580589 Mon Sep 17 00:00:00 2001 From: Ashley Pittman Date: Fri, 30 Jun 2023 11:55:02 +0100 Subject: [PATCH 08/16] DAOS-623 test: Check for use of DER_UNKNOWN in logs. (#12560) Add a check for and remove use of DER_UNKNOWN in logs. This results when a value is used with DF_RC or similar and is not a daos error number. 
Signed-off-by: Ashley Pittman --- src/object/cli_obj.c | 5 ++--- src/object/srv_enum.c | 2 +- src/tests/ftest/cart/util/cart_logtest.py | 3 +++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 4bbb7e4e654..fa44513e9fb 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -6205,9 +6205,8 @@ obj_list_shards_get(struct obj_auxi_args *obj_auxi, unsigned int map_ver, grp_idx, *shard, *shard_cnt); out: - D_DEBUG(DB_IO, DF_OID" list on shard %u leader %s: "DF_RC"\n", - DP_OID(obj->cob_md.omd_id), *shard, - obj_auxi->to_leader ? "yes" : "no", DP_RC(rc)); + D_DEBUG(DB_IO, DF_OID " list on shard %u leader %s: %d\n", DP_OID(obj->cob_md.omd_id), + *shard, obj_auxi->to_leader ? "yes" : "no", rc); return rc; } diff --git a/src/object/srv_enum.c b/src/object/srv_enum.c index 242ea818c10..c569e1a8dcf 100644 --- a/src/object/srv_enum.c +++ b/src/object/srv_enum.c @@ -788,6 +788,6 @@ ds_obj_enum_pack(vos_iter_param_t *param, vos_iter_type_t type, bool recursive, rc = iter_cb(param, type, recursive, anchors, enum_pack_cb, NULL, arg, dth); - D_DEBUG(DB_IO, "enum type %d rc "DF_RC"\n", type, DP_RC(rc)); + D_DEBUG(DB_IO, "enum type %d rc %d\n", type, rc); return rc; } diff --git a/src/tests/ftest/cart/util/cart_logtest.py b/src/tests/ftest/cart/util/cart_logtest.py index e8010a4ba5f..1b235a4b013 100755 --- a/src/tests/ftest/cart/util/cart_logtest.py +++ b/src/tests/ftest/cart/util/cart_logtest.py @@ -357,6 +357,9 @@ def _check_pid_from_log_file(self, pid, abort_on_warning, leak_wf, show_memleaks self.save_log_line(line) try: msg = ''.join(line._fields[2:]) + + if 'DER_UNKNOWN' in msg: + show_line(line, 'NORMAL', 'Use of DER_UNKNOWN') # Warn if a line references the name of the function it was in, # but skip short function names or _internal suffixes. 
if line.function in msg and len(line.function) > 6 and \ From 0c992c86ab6672efca7e97d41a4f7b8178683a91 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Fri, 30 Jun 2023 14:44:45 +0100 Subject: [PATCH 09/16] DAOS-13214 engine: Setup modules in DRPC SetUp handler (#12504) Setup all modules from within the dRPC setup handler to ensure this is performed before the ready state is set in the control-plane. Signed-off-by: Tom Nabarro --- src/engine/init.c | 5 ----- src/engine/srv_internal.h | 1 - src/include/daos_srv/daos_engine.h | 8 +++++--- src/mgmt/srv_drpc.c | 12 +++++++++++- src/mgmt/srv_internal.h | 2 +- src/mgmt/srv_target.c | 2 +- src/mgmt/tests/mocks.c | 9 +++++++-- 7 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/engine/init.c b/src/engine/init.c index 23f2aff0c65..a376488e62b 100644 --- a/src/engine/init.c +++ b/src/engine/init.c @@ -802,11 +802,6 @@ server_init(int argc, char *argv[]) server_init_state_wait(DSS_INIT_STATE_SET_UP); - rc = dss_module_setup_all(); - if (rc != 0) - goto exit_init_state; - D_INFO("Modules successfully set up\n"); - rc = crt_register_event_cb(dss_crt_event_cb, NULL); if (rc) D_GOTO(exit_init_state, rc); diff --git a/src/engine/srv_internal.h b/src/engine/srv_internal.h index e708855f3ac..5a8fb7aa167 100644 --- a/src/engine/srv_internal.h +++ b/src/engine/srv_internal.h @@ -139,7 +139,6 @@ int dss_module_load(const char *modname); int dss_module_init_all(uint64_t *mod_fac); int dss_module_unload(const char *modname); void dss_module_unload_all(void); -int dss_module_setup_all(void); int dss_module_cleanup_all(void); /* srv.c */ diff --git a/src/include/daos_srv/daos_engine.h b/src/include/daos_srv/daos_engine.h index 90d35300b40..7175a6df1b1 100644 --- a/src/include/daos_srv/daos_engine.h +++ b/src/include/daos_srv/daos_engine.h @@ -796,9 +796,11 @@ enum dss_media_error_type { void dss_init_state_set(enum dss_init_state state); -/* Notify control-plane of a bio error. 
*/ -int -ds_notify_bio_error(int media_err_type, int tgt_id); +/** Call module setup from drpc setup call handler. */ +int dss_module_setup_all(void); + +/** Notify control-plane of a bio error. */ +int ds_notify_bio_error(int media_err_type, int tgt_id); int ds_get_pool_svc_ranks(uuid_t pool_uuid, d_rank_list_t **svc_ranks); int ds_pool_find_bylabel(d_const_string_t label, uuid_t pool_uuid, diff --git a/src/mgmt/srv_drpc.c b/src/mgmt/srv_drpc.c index bf0e36ee9e2..94388c7ae4a 100644 --- a/src/mgmt/srv_drpc.c +++ b/src/mgmt/srv_drpc.c @@ -2411,11 +2411,21 @@ void ds_mgmt_drpc_set_up(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) { Mgmt__DaosResp resp = MGMT__DAOS_RESP__INIT; + int rc; D_INFO("Received request to setup engine\n"); - dss_init_state_set(DSS_INIT_STATE_SET_UP); + rc = dss_module_setup_all(); + if (rc != 0) { + D_ERROR("Module setup failed: %d\n", rc); + goto err; + } + D_INFO("Modules successfully set up\n"); + + dss_init_state_set(DSS_INIT_STATE_SET_UP); +err: + resp.status = rc; pack_daos_response(&resp, drpc_resp); } diff --git a/src/mgmt/srv_internal.h b/src/mgmt/srv_internal.h index b22341fb35b..9cfc0b5a0ab 100644 --- a/src/mgmt/srv_internal.h +++ b/src/mgmt/srv_internal.h @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2022 Intel Corporation. + * (C) Copyright 2016-2023 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c index 4321178d3ae..49e017df269 100644 --- a/src/mgmt/srv_target.c +++ b/src/mgmt/srv_target.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2022 Intel Corporation. + * (C) Copyright 2016-2023 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ diff --git a/src/mgmt/tests/mocks.c b/src/mgmt/tests/mocks.c index 3d4124dfd7a..4b104f19195 100644 --- a/src/mgmt/tests/mocks.c +++ b/src/mgmt/tests/mocks.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2019-2022 Intel Corporation. + * (C) Copyright 2019-2023 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -478,6 +478,12 @@ dss_init_state_set(enum dss_init_state state) { } +int +dss_module_setup_all() +{ + return 0; +} + size_t ds_rsvc_get_md_cap(void) { @@ -608,4 +614,3 @@ mock_ds_mgmt_dev_set_faulty_setup(void) ds_mgmt_dev_set_faulty_return = 0; uuid_clear(ds_mgmt_dev_set_faulty_uuid); } - From 60b5a498feb5cdeb6a2b31d7ffc8339dfcb1c12d Mon Sep 17 00:00:00 2001 From: Ken Cain Date: Fri, 30 Jun 2023 09:45:42 -0400 Subject: [PATCH 10/16] DAOS-13808 container: check before dereferencing filter parts (#12512) Fix for Coverity CID 1451828. Signed-off-by: Kenneth Cain --- src/container/srv_container.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/container/srv_container.c b/src/container/srv_container.c index e7dd13bb362..32f57ad94f2 100644 --- a/src/container/srv_container.c +++ b/src/container/srv_container.c @@ -4216,6 +4216,13 @@ cont_filter_match(struct rdb_tx *tx, struct cont *cont, daos_pool_cont_filter_t uint32_t combine_op = filt->pcf_combine_func; int rc = 0; + /* defensive, partially redundant with pool_cont_filter_is_valid() from top-level handler */ + if ((filt->pcf_nparts > 0) && (filt->pcf_parts == NULL)) { + D_ERROR(DF_CONT": filter has %u parts but pcf_parts is NULL\n", + DP_CONT(cont->c_svc->cs_pool_uuid, cont->c_uuid), filt->pcf_nparts); + return -DER_INVAL; + } + /* logical OR combining: start with false result, transition to true on first match */ if ((filt->pcf_parts != NULL) && (combine_op == PCF_COMBINE_LOGICAL_OR)) whole_match = false; From 54b6e0fa3eeb94c5ccdae21f1e5dddbcf14d7238 Mon Sep 17 00:00:00 2001 From: Cedric Koch-Hofer <94527853+knard-intel@users.noreply.github.com> Date: Fri, 30 Jun 2023 16:58:33 +0200 Subject: [PATCH 11/16] DAOS-12946 test: fix telemetry_pool_metrics with ior error (#12478) When the ior command is failing, it is not possible to properly test the metrics as we do not know how much data have been transferred. 
Thus we cancel the test when such error is occurring, as the purpose of this test is not to check the ior command. Signed-off-by: Cedric Koch-Hofer --- .../ftest/telemetry/telemetry_pool_metrics.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/tests/ftest/telemetry/telemetry_pool_metrics.py b/src/tests/ftest/telemetry/telemetry_pool_metrics.py index 86cc1f21400..edd7c437bf3 100644 --- a/src/tests/ftest/telemetry/telemetry_pool_metrics.py +++ b/src/tests/ftest/telemetry/telemetry_pool_metrics.py @@ -3,7 +3,6 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ -from avocado.core.exceptions import TestFail from ior_test_base import IorTestBase from telemetry_test_base import TestWithTelemetry @@ -152,14 +151,12 @@ def test_telemetry_pool_metrics(self): metrics_init = self.get_metrics(metric_names) # Run ior command. - try: - self.update_ior_cmd_with_pool(False) - self.ior_cmd.dfs_oclass.update(self.dfs_oclass) - self.ior_cmd.dfs_chunk.update(self.ior_cmd.transfer_size.value) - self.run_ior_with_pool( - timeout=200, create_pool=False, create_cont=False) - except TestFail: - self.log.info("#ior command failed!") + self.update_ior_cmd_with_pool(False) + self.ior_cmd.dfs_oclass.update(self.dfs_oclass) + self.ior_cmd.dfs_chunk.update(self.ior_cmd.transfer_size.value) + # NOTE DAOS-12946: Not catching ior failures is intended. Indeed, to properly test the + # metrics we have to exactly know how much data have been transferred. + self.run_ior_with_pool(timeout=200, create_pool=False, create_cont=False) # collect second set of pool metric data after read/write metrics_end = self.get_metrics(metric_names) From b7860a48f267b0609f87398f0a43130b0b3a4926 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Fri, 30 Jun 2023 17:54:50 +0100 Subject: [PATCH 12/16] DAOS-13214 control: Disallow dRPCs if engine is not ready (#12503) Move IsReady() check into CallDrpc() function and adjust rank setup routine as appropriate. 
This eliminates the possibility of calling into an engine from the control-plane before SetRank() and Setup() have both returned successfully. Signed-off-by: Tom Nabarro --- src/control/lib/daos/status.go | 4 +++ src/control/server/ctl_ranks_rpc.go | 5 ---- src/control/server/ctl_ranks_rpc_test.go | 4 +-- src/control/server/ctl_smd_rpc.go | 24 ++++++++++------- src/control/server/ctl_storage_rpc.go | 2 +- src/control/server/ctl_storage_rpc_test.go | 2 +- src/control/server/harness.go | 16 ++--------- src/control/server/harness_test.go | 5 ++-- src/control/server/instance.go | 6 ++--- src/control/server/instance_drpc.go | 19 ++++++++++--- src/control/server/instance_drpc_test.go | 31 +++++++++++++++++----- src/control/server/mgmt_pool.go | 2 +- src/control/server/mgmt_system.go | 2 +- 13 files changed, 72 insertions(+), 50 deletions(-) diff --git a/src/control/lib/daos/status.go b/src/control/lib/daos/status.go index e858ea29302..96f7cf1b313 100644 --- a/src/control/lib/daos/status.go +++ b/src/control/lib/daos/status.go @@ -26,6 +26,10 @@ func (ds Status) Error() string { return fmt.Sprintf("%s(%d): %s", dErrStr, ds, dErrDesc) } +func (ds Status) Int32() int32 { + return int32(ds) +} + const ( // Success indicates no error Success Status = 0 diff --git a/src/control/server/ctl_ranks_rpc.go b/src/control/server/ctl_ranks_rpc.go index 0ddc09ea145..40e902a3fb2 100644 --- a/src/control/server/ctl_ranks_rpc.go +++ b/src/control/server/ctl_ranks_rpc.go @@ -434,11 +434,6 @@ func (svc *ControlService) SetEngineLogMasks(ctx context.Context, req *ctlpb.Set idx, ei.Index()) } - if !ei.IsReady() { - resp.Errors[idx] = "not ready" - continue - } - if err := updateSetLogMasksReq(svc.srvCfg.Engines[idx], &eReq); err != nil { resp.Errors[idx] = err.Error() continue diff --git a/src/control/server/ctl_ranks_rpc_test.go b/src/control/server/ctl_ranks_rpc_test.go index e43c2b41a4c..417cce2f27c 100644 --- a/src/control/server/ctl_ranks_rpc_test.go +++ 
b/src/control/server/ctl_ranks_rpc_test.go @@ -991,8 +991,8 @@ func TestServer_CtlSvc_SetEngineLogMasks(t *testing.T) { instancesStopped: true, expResp: &ctlpb.SetLogMasksResp{ Errors: []string{ - "not ready", - "not ready", + FaultDataPlaneNotStarted.Error(), + FaultDataPlaneNotStarted.Error(), }, }, }, diff --git a/src/control/server/ctl_smd_rpc.go b/src/control/server/ctl_smd_rpc.go index 74e348e6f93..72e59727218 100644 --- a/src/control/server/ctl_smd_rpc.go +++ b/src/control/server/ctl_smd_rpc.go @@ -238,7 +238,7 @@ type devID struct { uuid string } -type engineDevMap map[*Engine][]devID +type engineDevMap map[Engine][]devID // Map requested device IDs provided in comma-separated string to the engine that controls the given // device. Device can be identified either by UUID or transport (PCI) address. @@ -258,14 +258,14 @@ func (svc *ControlService) mapIDsToEngine(ctx context.Context, ids string, useTr edm := make(engineDevMap) for _, rr := range resp.Ranks { - eis, err := svc.harness.FilterInstancesByRankSet(fmt.Sprintf("%d", rr.Rank)) + engines, err := svc.harness.FilterInstancesByRankSet(fmt.Sprintf("%d", rr.Rank)) if err != nil { return nil, err } - if len(eis) == 0 { + if len(engines) == 0 { return nil, errors.Errorf("failed to retrieve instance for rank %d", rr.Rank) } - eisPtr := &eis[0] + engine := engines[0] for _, dev := range rr.Devices { if dev == nil { return nil, errors.New("nil device in smd query resp") @@ -282,7 +282,7 @@ func (svc *ControlService) mapIDsToEngine(ctx context.Context, ids string, useTr if trAddrs[dds.TrAddr] || uuidMatch { // If UUID matches, add by TrAddr rather than UUID which // should avoid duplicate UUID entries for the same TrAddr. 
- edm[eisPtr] = append(edm[eisPtr], devID{trAddr: dds.TrAddr}) + edm[engine] = append(edm[engine], devID{trAddr: dds.TrAddr}) delete(trAddrs, dds.TrAddr) delete(devUUIDs, dds.Uuid) continue @@ -291,7 +291,7 @@ func (svc *ControlService) mapIDsToEngine(ctx context.Context, ids string, useTr if uuidMatch { // Only add UUID entry if TrAddr is not available for a device. - edm[eisPtr] = append(edm[eisPtr], devID{uuid: dds.Uuid}) + edm[engine] = append(edm[engine], devID{uuid: dds.Uuid}) delete(devUUIDs, dds.Uuid) } } @@ -306,8 +306,14 @@ func (svc *ControlService) mapIDsToEngine(ctx context.Context, ids string, useTr return edm, nil } -func sendManageReq(c context.Context, e *Engine, m drpc.Method, b proto.Message) (*ctlpb.SmdManageResp_Result, error) { - dResp, err := (*e).CallDrpc(c, m, b) +func sendManageReq(c context.Context, e Engine, m drpc.Method, b proto.Message) (*ctlpb.SmdManageResp_Result, error) { + if !e.IsReady() { + return &ctlpb.SmdManageResp_Result{ + Status: daos.Unreachable.Int32(), + }, nil + } + + dResp, err := e.CallDrpc(c, m, b) if err != nil { return nil, errors.Wrap(err, "call drpc") } @@ -372,7 +378,7 @@ func (svc *ControlService) SmdManage(ctx context.Context, req *ctlpb.SmdManageRe for engine, devs := range engineDevMap { devResults := []*ctlpb.SmdManageResp_Result{} - rank, err := (*engine).GetRank() + rank, err := engine.GetRank() if err != nil { return nil, errors.Wrap(err, "retrieving engine rank") } diff --git a/src/control/server/ctl_storage_rpc.go b/src/control/server/ctl_storage_rpc.go index 0169f89ac4c..3cdec1b8a51 100644 --- a/src/control/server/ctl_storage_rpc.go +++ b/src/control/server/ctl_storage_rpc.go @@ -511,7 +511,7 @@ func checkEnginesReady(instances []Engine) error { if !inst.IsReady() { var err error = FaultDataPlaneNotStarted if inst.IsStarted() { - err = errInstanceNotReady + err = errEngineNotReady } return errors.Wrapf(err, "instance %d", inst.Index()) diff --git a/src/control/server/ctl_storage_rpc_test.go 
b/src/control/server/ctl_storage_rpc_test.go index cee5ceb30a7..f2a402cfd5e 100644 --- a/src/control/server/ctl_storage_rpc_test.go +++ b/src/control/server/ctl_storage_rpc_test.go @@ -1440,7 +1440,7 @@ func TestServer_CtlSvc_StorageScan_PostEngineStart(t *testing.T) { {Message: newBioHealthResp(2)}, }, }, - expErr: errInstanceNotReady, + expErr: errEngineNotReady, }, // Sometimes when more than a few ssds are assigned to engine without many targets, // some of the smd entries for the latter ssds are in state "NEW" rather than diff --git a/src/control/server/harness.go b/src/control/server/harness.go index b3fc4be7b96..d6a4460591a 100644 --- a/src/control/server/harness.go +++ b/src/control/server/harness.go @@ -166,7 +166,7 @@ func (h *EngineHarness) CallDrpc(ctx context.Context, method drpc.Method, body p } // Don't trigger callbacks for these errors which can happen when // things are still starting up. - if err == FaultHarnessNotStarted || err == errInstanceNotReady { + if err == FaultHarnessNotStarted || err == errEngineNotReady { return } @@ -186,22 +186,10 @@ func (h *EngineHarness) CallDrpc(ctx context.Context, method drpc.Method, body p // the first one that is available to service the request. // If the request fails, that error will be returned. 
for _, i := range h.Instances() { - if !i.IsReady() { - if i.IsStarted() { - if err == nil { - err = errInstanceNotReady - } - } else { - if err == nil { - err = FaultDataPlaneNotStarted - } - } - continue - } resp, err = i.CallDrpc(ctx, method, body) switch errors.Cause(err) { - case errDRPCNotReady, FaultDataPlaneNotStarted: + case errEngineNotReady, errDRPCNotReady, FaultDataPlaneNotStarted: continue default: return diff --git a/src/control/server/harness_test.go b/src/control/server/harness_test.go index 261d38013dd..19064458076 100644 --- a/src/control/server/harness_test.go +++ b/src/control/server/harness_test.go @@ -537,10 +537,11 @@ func TestServer_Harness_CallDrpc(t *testing.T) { "instance not ready": { mics: []*MockInstanceConfig{ { - Started: atm.NewBool(true), + Started: atm.NewBool(true), + CallDrpcErr: errEngineNotReady, }, }, - expErr: errInstanceNotReady, + expErr: errEngineNotReady, }, "harness not started": { mics: []*MockInstanceConfig{}, diff --git a/src/control/server/instance.go b/src/control/server/instance.go index 4e7f2640525..3837860fabd 100644 --- a/src/control/server/instance.go +++ b/src/control/server/instance.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2022 Intel Corporation. +// (C) Copyright 2019-2023 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -287,7 +287,7 @@ func (ei *EngineInstance) SetupRank(ctx context.Context, rank ranklist.Rank, map } func (ei *EngineInstance) callSetRank(ctx context.Context, rank ranklist.Rank, map_version uint32) error { - dresp, err := ei.CallDrpc(ctx, drpc.MethodSetRank, &mgmtpb.SetRankReq{Rank: rank.Uint32(), MapVersion: map_version}) + dresp, err := ei.callDrpc(ctx, drpc.MethodSetRank, &mgmtpb.SetRankReq{Rank: rank.Uint32(), MapVersion: map_version}) if err != nil { return err } @@ -355,7 +355,7 @@ func (ei *EngineInstance) GetTargetCount() int { } func (ei *EngineInstance) callSetUp(ctx context.Context) error { - dresp, err := ei.CallDrpc(ctx, drpc.MethodSetUp, nil) + dresp, err := ei.callDrpc(ctx, drpc.MethodSetUp, nil) if err != nil { return err } diff --git a/src/control/server/instance_drpc.go b/src/control/server/instance_drpc.go index b820032e415..542b636e2f0 100644 --- a/src/control/server/instance_drpc.go +++ b/src/control/server/instance_drpc.go @@ -28,8 +28,8 @@ import ( ) var ( - errDRPCNotReady = errors.New("no dRPC client set (data plane not started?)") - errInstanceNotReady = errors.New("instance not ready yet") + errDRPCNotReady = errors.New("no dRPC client set (data plane not started?)") + errEngineNotReady = errors.New("engine not ready yet") ) func (ei *EngineInstance) setDrpcClient(c drpc.DomainSocketClient) { @@ -67,8 +67,7 @@ func (ei *EngineInstance) awaitDrpcReady() chan *srvpb.NotifyReadyReq { return ei.drpcReady } -// CallDrpc makes the supplied dRPC call via this instance's dRPC client. 
-func (ei *EngineInstance) CallDrpc(ctx context.Context, method drpc.Method, body proto.Message) (*drpc.Response, error) { +func (ei *EngineInstance) callDrpc(ctx context.Context, method drpc.Method, body proto.Message) (*drpc.Response, error) { dc, err := ei.getDrpcClient() if err != nil { return nil, err @@ -87,6 +86,18 @@ func (ei *EngineInstance) CallDrpc(ctx context.Context, method drpc.Method, body return makeDrpcCall(ctx, ei.log, dc, method, body) } +// CallDrpc makes the supplied dRPC call via this instance's dRPC client. +func (ei *EngineInstance) CallDrpc(ctx context.Context, method drpc.Method, body proto.Message) (*drpc.Response, error) { + if !ei.IsStarted() { + return nil, FaultDataPlaneNotStarted + } + if !ei.IsReady() { + return nil, errEngineNotReady + } + + return ei.callDrpc(ctx, method, body) +} + // drespToMemberResult converts drpc.Response to system.MemberResult. // // MemberResult is populated with rank, state and error dependent on processing diff --git a/src/control/server/instance_drpc_test.go b/src/control/server/instance_drpc_test.go index 023626678e3..c42d9ce87f1 100644 --- a/src/control/server/instance_drpc_test.go +++ b/src/control/server/instance_drpc_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2022 Intel Corporation. +// (C) Copyright 2020-2023 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -22,6 +22,7 @@ import ( "github.com/daos-stack/daos/src/control/lib/daos" . "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" + "github.com/daos-stack/daos/src/control/server/engine" . 
"github.com/daos-stack/daos/src/control/system" ) @@ -61,13 +62,23 @@ func TestEngineInstance_NotifyDrpcReady(t *testing.T) { func TestEngineInstance_CallDrpc(t *testing.T) { for name, tc := range map[string]struct { - notReady bool - resp *drpc.Response - expErr error + notStarted bool + notReady bool + noClient bool + resp *drpc.Response + expErr error }{ + "not started": { + notStarted: true, + expErr: FaultDataPlaneNotStarted, + }, "not ready": { notReady: true, - expErr: errors.New("no dRPC client set"), + expErr: errEngineNotReady, + }, + "no client configured": { + noClient: true, + expErr: errDRPCNotReady, }, "success": { resp: &drpc.Response{}, @@ -76,8 +87,14 @@ func TestEngineInstance_CallDrpc(t *testing.T) { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) - instance := getTestEngineInstance(log) - if !tc.notReady { + + trc := engine.TestRunnerConfig{} + trc.Running.Store(!tc.notStarted) + runner := engine.NewTestRunner(&trc, engine.MockConfig()) + instance := NewEngineInstance(log, nil, nil, runner) + instance.ready.Store(!tc.notReady) + + if !tc.noClient { cfg := &mockDrpcClientConfig{ SendMsgResponse: tc.resp, } diff --git a/src/control/server/mgmt_pool.go b/src/control/server/mgmt_pool.go index 79eaa2e7789..9189a7ef116 100644 --- a/src/control/server/mgmt_pool.go +++ b/src/control/server/mgmt_pool.go @@ -455,7 +455,7 @@ func (svc *mgmtSvc) poolCreate(parent context.Context, req *mgmtpb.PoolCreateReq } switch errors.Cause(err) { - case errInstanceNotReady: + case errEngineNotReady: // If the pool create failed because there was no available instance // to service the request, signal to the client that it should try again. 
resp.Status = int32(daos.TryAgain) diff --git a/src/control/server/mgmt_system.go b/src/control/server/mgmt_system.go index ff6ab204e04..2aa0f0cc058 100644 --- a/src/control/server/mgmt_system.go +++ b/src/control/server/mgmt_system.go @@ -251,7 +251,7 @@ func (svc *mgmtSvc) doGroupUpdate(ctx context.Context, forced bool) error { svc.log.Debugf("group update request: version: %d, ranks: %s", req.MapVersion, rankSet) dResp, err := svc.harness.CallDrpc(ctx, drpc.MethodGroupUpdate, req) if err != nil { - if err == errInstanceNotReady { + if err == errEngineNotReady { return err } svc.log.Errorf("dRPC GroupUpdate call failed: %s", err) From 0e5aff935021687cb523ca6ffd77a7470ae7d290 Mon Sep 17 00:00:00 2001 From: Michael MacDonald Date: Sun, 2 Jul 2023 14:57:09 -0400 Subject: [PATCH 13/16] DAOS-13878 test: Install golang >= 1.18 as a daos-tests dep (#12554) Needed for the dfuse/daos_build.py tests. Also disables module filtering for the daos-stack-deps repository in order to allow installation of newer golang RPMs. 
Signed-off-by: Michael MacDonald --- ci/provisioning/post_provision_config_common_functions.sh | 7 +++++++ debian/changelog | 6 ++++++ debian/control | 1 + utils/rpms/daos.spec | 6 +++++- 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/ci/provisioning/post_provision_config_common_functions.sh b/ci/provisioning/post_provision_config_common_functions.sh index 662475a5678..70dc68c4945 100755 --- a/ci/provisioning/post_provision_config_common_functions.sh +++ b/ci/provisioning/post_provision_config_common_functions.sh @@ -196,6 +196,13 @@ set_local_repo() { dnf -y config-manager \ --disable daos-stack-daos-"${DISTRO_GENERIC}"-"$version"-x86_64-stable-local-artifactory fi + + if [ "$repo_server" = "artifactory" ]; then + # Disable module filtering for our deps repo + deps_repo="daos-stack-deps-${DISTRO_GENERIC}-$version-x86_64-stable-local-artifactory" + dnf config-manager --save --setopt "$deps_repo.module_hotfixes=true" "$deps_repo" + fi + dnf repolist } diff --git a/debian/changelog b/debian/changelog index 7c6f5321872..d33007a7711 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +daos (2.5.100-6) unstable; urgency=medium + [Michael MacDonald] + * Add golang-go as a tests dependency for dfuse/daos_build.py + + -- Michael MacDonald Thu, 29 Jun 2023 10:10:00 -0400 + daos (2.5.100-5) unstable; urgency=medium [ Li Wei ] * Update raft to 0.10.1-1408.g9524cdb diff --git a/debian/control b/debian/control index 7a01b3ef36e..7e213538e8e 100644 --- a/debian/control +++ b/debian/control @@ -115,6 +115,7 @@ Depends: python (>=3.8), python3, python-yaml, python3-yaml, ${shlibs:Depends}, ${misc:Depends}, daos-client (= ${binary:Version}), daos-admin (= ${binary:Version}), + golang-go (>=1.18), libcapstone-dev Description: The Distributed Asynchronous Object Storage (DAOS) is an open-source software-defined object store designed from the ground up for diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index 9db35377ef3..249b198799b 100644 --- 
a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -15,7 +15,7 @@ Name: daos Version: 2.5.100 -Release: 5%{?relval}%{?dist} +Release: 6%{?relval}%{?dist} Summary: DAOS Storage Engine License: BSD-2-Clause-Patent @@ -216,6 +216,7 @@ Requires: git Requires: dbench Requires: lbzip2 Requires: attr +Requires: golang >= 1.18 %if (0%{?suse_version} >= 1315) Requires: lua-lmod Requires: libcapstone-devel @@ -553,6 +554,9 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent # No files in a shim package %changelog +* Thu Jun 29 2023 Michael MacDonald 2.5.100-6 +- Install golang >= 1.18 as a daos-client-tests dependency + * Thu Jun 22 2023 Li Wei 2.5.100-5 - Update raft to 0.10.1-1.408.g9524cdb From 048321d97d5e96841a87f4ee418d047d3c6bdbf7 Mon Sep 17 00:00:00 2001 From: Niu Yawei Date: Mon, 3 Jul 2023 09:23:49 +0800 Subject: [PATCH 14/16] DAOS-13815 bio: fix typo in on_normal() (#12522) Keep re-trigger reintegration when reint_reaction() returns 1. Required-githooks: true Signed-off-by: Niu Yawei --- src/bio/bio_recovery.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/bio/bio_recovery.c b/src/bio/bio_recovery.c index 76fb08f991c..82b212bb1f4 100644 --- a/src/bio/bio_recovery.c +++ b/src/bio/bio_recovery.c @@ -514,6 +514,8 @@ on_normal(struct bio_blobstore *bbs) rc = ract_ops->reint_reaction(tgt_ids, tgt_cnt); if (rc < 0) D_ERROR("Reint reaction failed. "DF_RC"\n", DP_RC(rc)); + else if (rc > 0) + D_DEBUG(DB_MGMT, "Reint reaction is in-progress."); else bdev->bb_trigger_reint = false; } From d42de2c01ca608621954481c078c12000a01fedb Mon Sep 17 00:00:00 2001 From: Jeff Olivier Date: Sun, 2 Jul 2023 19:37:01 -0600 Subject: [PATCH 15/16] DAOS-13884 vos: Avoid updating HAE when entries are skipped. (#12557) Without this patch, the HAE gets updated meaning that any such entries will never be aggregated until either a full scan happens or another update occurs in the subtree. If HAE is not updated, the entries will be re-scanned. 
Required-githooks: true Signed-off-by: Jeff Olivier --- src/vos/vos_aggregate.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/vos/vos_aggregate.c b/src/vos/vos_aggregate.c index 7b8fc13c682..4f753f45fcb 100644 --- a/src/vos/vos_aggregate.c +++ b/src/vos/vos_aggregate.c @@ -162,10 +162,8 @@ struct vos_agg_param { /* Boundary for aggregatable write filter */ daos_epoch_t ap_filter_epoch; uint32_t ap_flags; - unsigned int ap_discard:1, - ap_csum_err:1, - ap_nospc_err:1, - ap_discard_obj:1; + unsigned int ap_discard : 1, ap_csum_err : 1, ap_nospc_err : 1, ap_in_progress : 1, + ap_discard_obj : 1; struct umem_instance *ap_umm; int (*ap_yield_func)(void *arg); void *ap_yield_arg; @@ -2326,7 +2324,7 @@ vos_aggregate_pre_cb(daos_handle_t ih, vos_iter_entry_t *entry, *acts |= VOS_ITER_CB_ABORT; if (rc == -DER_CSUM) { - agg_param->ap_csum_err = true; + agg_param->ap_csum_err = 1; if (vam && vam->vam_csum_errs) d_tm_inc_counter(vam->vam_csum_errs, 1); } else if (rc == -DER_NOSPACE) { @@ -2336,6 +2334,7 @@ vos_aggregate_pre_cb(daos_handle_t ih, vos_iter_entry_t *entry, * this entry to avoid orphaned tree * assertion */ + agg_param->ap_in_progress = 1; agg_param->ap_skip_akey = true; agg_param->ap_skip_dkey = true; agg_param->ap_skip_obj = true; @@ -2439,6 +2438,7 @@ vos_aggregate_post_cb(daos_handle_t ih, vos_iter_entry_t *entry, if (rc == -DER_TX_BUSY) { struct vos_agg_metrics *vam = agg_cont2metrics(cont); + agg_param->ap_in_progress = 1; rc = 0; switch (type) { default: @@ -2691,6 +2691,15 @@ vos_aggregate(daos_handle_t coh, daos_epoch_range_t *epr, rc = -DER_CSUM; /* Inform caller the csum error */ close_merge_window(&ad->ad_agg_param.ap_window, rc); /* HAE needs be updated for csum error case */ + } else if (ad->ad_agg_param.ap_in_progress) { + /* Don't update HAE when there were in-progress entries. Otherwise, + * we will never aggregate anything in those subtrees until there is + * a new write. 
+ * + * NB: We may be able to improve this by tracking the lowest epoch + * of such entries and updating the HAE to that value - 1. + */ + goto exit; } update_hae: From 03ef5e9e25ef9335de68b79fcea68c1728fe9aa3 Mon Sep 17 00:00:00 2001 From: Ashley Pittman Date: Mon, 3 Jul 2023 07:51:58 +0100 Subject: [PATCH 16/16] DAOS-13658 dfuse: Add filesystem query command. (#12367) Add a command to daos to query dfuse metrics. Use atomics to track number of pool and containers open, as well as inodes and file handles. Add a test, and use new command to verify container disconnect. Signed-off-by: Ashley Pittman --- src/client/dfuse/dfuse.h | 24 +++++++++-- src/client/dfuse/dfuse_cont.c | 4 +- src/client/dfuse/dfuse_core.c | 38 ++++++++++++----- src/client/dfuse/dfuse_main.c | 7 ++++ src/client/dfuse/dfuse_pool.c | 4 +- src/client/dfuse/ops/create.c | 8 ++-- src/client/dfuse/ops/ioctl.c | 34 +++++++++++++++- src/client/dfuse/ops/lookup.c | 6 +-- src/client/dfuse/ops/mknod.c | 4 +- src/client/dfuse/ops/open.c | 9 +++-- src/client/dfuse/ops/opendir.c | 13 +++--- src/client/dfuse/ops/readdir.c | 6 +-- src/client/dfuse/ops/symlink.c | 2 +- src/control/cmd/daos/filesystem.go | 48 ++++++++++++++++++++++ src/include/dfuse_ioctl.h | 17 ++++++-- src/utils/daos_hdlr.c | 32 +++++++++++++++ src/utils/daos_hdlr.h | 65 +++++++++++++++++------------- utils/node_local_test.py | 30 ++++++++++++++ 18 files changed, 279 insertions(+), 72 deletions(-) diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h index 6edde3aab08..d11ddcc5ce9 100644 --- a/src/client/dfuse/dfuse.h +++ b/src/client/dfuse/dfuse.h @@ -56,6 +56,11 @@ struct dfuse_info { /* Array of dfuse_eq */ struct dfuse_eq *di_eqt; ATOMIC uint64_t di_eqt_idx; + + ATOMIC uint64_t di_inode_count; + ATOMIC uint64_t di_fh_count; + ATOMIC uint64_t di_pool_count; + ATOMIC uint64_t di_container_count; }; /* legacy, allow the old name for easier migration */ @@ -277,7 +282,8 @@ dfuse_dre_drop(struct dfuse_projection_info *fs_handle, 
struct dfuse_obj_hdl *oh * Set required initial state in dfuse_obj_hdl. */ void -dfuse_open_handle_init(struct dfuse_obj_hdl *oh, struct dfuse_inode_entry *ie); +dfuse_open_handle_init(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh, + struct dfuse_inode_entry *ie); struct dfuse_inode_ops { void (*create)(fuse_req_t req, struct dfuse_inode_entry *parent, @@ -849,10 +855,22 @@ check_for_uns_ep(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie, ch daos_size_t len); void -dfuse_ie_init(struct dfuse_inode_entry *ie); +dfuse_ie_init(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie); + +#define dfuse_ie_free(_di, _ie) \ + do { \ + atomic_fetch_sub_relaxed(&(_di)->di_inode_count, 1); \ + D_FREE(_ie); \ + } while (0) + +#define dfuse_oh_free(_di, _oh) \ + do { \ + atomic_fetch_sub_relaxed(&(_di)->di_fh_count, 1); \ + D_FREE(_oh); \ + } while (0) void -dfuse_ie_close(struct dfuse_inode_entry *ie); +dfuse_ie_close(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie); /* ops/...c */ diff --git a/src/client/dfuse/dfuse_cont.c b/src/client/dfuse/dfuse_cont.c index 4243a0f0eb5..898c87a6015 100644 --- a/src/client/dfuse/dfuse_cont.c +++ b/src/client/dfuse/dfuse_cont.c @@ -78,7 +78,7 @@ dfuse_cont_lookup(fuse_req_t req, struct dfuse_inode_entry *parent, const char * DFUSE_TRA_UP(ie, parent, "inode"); - dfuse_ie_init(ie); + dfuse_ie_init(dfuse_info, ie); rc = dfs_lookup(dfc->dfs_ns, "/", O_RDWR, &ie->ie_obj, NULL, &ie->ie_stat); if (rc) { @@ -98,7 +98,7 @@ dfuse_cont_lookup(fuse_req_t req, struct dfuse_inode_entry *parent, const char * dfuse_reply_entry(dfuse_info, ie, NULL, true, req); return; close: - D_FREE(ie); + dfuse_ie_free(dfuse_info, ie); decref: d_hash_rec_decref(&dfp->dfp_cont_table, &dfc->dfs_entry); err: diff --git a/src/client/dfuse/dfuse_core.c b/src/client/dfuse/dfuse_core.c index b65690cf3e2..efab59965eb 100644 --- a/src/client/dfuse/dfuse_core.c +++ b/src/client/dfuse/dfuse_core.c @@ -201,7 +201,7 @@ ih_free(struct d_hash_table 
*htable, d_list_t *rlink) ie = container_of(rlink, struct dfuse_inode_entry, ie_htl); DFUSE_TRA_DEBUG(ie, "parent %#lx", ie->ie_parent); - dfuse_ie_close(ie); + dfuse_ie_close(htable->ht_priv, ie); } static d_hash_table_ops_t ie_hops = { @@ -263,7 +263,7 @@ ph_decref(struct d_hash_table *htable, d_list_t *link) } static void -_ph_free(struct dfuse_pool *dfp) +_ph_free(struct dfuse_info *dfuse_info, struct dfuse_pool *dfp) { int rc; @@ -282,13 +282,15 @@ _ph_free(struct dfuse_pool *dfp) if (rc != -DER_SUCCESS) DFUSE_TRA_ERROR(dfp, "Failed to destroy pool hash table: " DF_RC, DP_RC(rc)); + atomic_fetch_sub_relaxed(&dfuse_info->di_pool_count, 1); + D_FREE(dfp); } static void ph_free(struct d_hash_table *htable, d_list_t *link) { - _ph_free(container_of(link, struct dfuse_pool, dfp_entry)); + _ph_free(htable->ht_priv, container_of(link, struct dfuse_pool, dfp_entry)); } static d_hash_table_ops_t pool_hops = { @@ -365,6 +367,7 @@ _ch_free(struct dfuse_info *dfuse_info, struct dfuse_cont *dfc) DFUSE_TRA_ERROR(dfc, "daos_cont_close() failed, " DF_RC, DP_RC(rc)); } + atomic_fetch_sub_relaxed(&dfuse_info->di_container_count, 1); d_hash_rec_decref(&dfuse_info->di_pool_table, &dfc->dfs_dfp->dfp_entry); D_FREE(dfc); @@ -438,12 +441,14 @@ dfuse_pool_connect(struct dfuse_info *fs_handle, const char *label, struct dfuse D_GOTO(err_disconnect, rc = daos_der2errno(rc)); } + atomic_fetch_add_relaxed(&fs_handle->di_pool_count, 1); + rlink = d_hash_rec_find_insert(&fs_handle->di_pool_table, &dfp->dfp_pool, sizeof(dfp->dfp_pool), &dfp->dfp_entry); if (rlink != &dfp->dfp_entry) { DFUSE_TRA_DEBUG(dfp, "Found existing pool, reusing"); - _ph_free(dfp); + _ph_free(fs_handle, dfp); dfp = container_of(rlink, struct dfuse_pool, dfp_entry); } @@ -865,6 +870,8 @@ dfuse_cont_open(struct dfuse_info *dfuse_info, struct dfuse_pool *dfp, uuid_t *c /* Take a reference on the pool */ d_hash_rec_addref(&dfuse_info->di_pool_table, &dfp->dfp_entry); + 
atomic_fetch_add_relaxed(&dfuse_info->di_container_count, 1); + /* Finally insert into the hash table. This may return an existing * container if there is a race to insert, so if that happens * just use that one. @@ -1015,6 +1022,11 @@ dfuse_fs_init(struct dfuse_info *fs_handle) if (fs_handle->di_eqt == NULL) D_GOTO(err, rc = -DER_NOMEM); + atomic_init(&fs_handle->di_inode_count, 0); + atomic_init(&fs_handle->di_fh_count, 0); + atomic_init(&fs_handle->di_pool_count, 0); + atomic_init(&fs_handle->di_container_count, 0); + rc = d_hash_table_create_inplace(D_HASH_FT_LRU | D_HASH_FT_EPHEMERAL, 3, fs_handle, &pool_hops, &fs_handle->di_pool_table); if (rc != 0) @@ -1083,7 +1095,8 @@ dfuse_fs_init(struct dfuse_info *fs_handle) } void -dfuse_open_handle_init(struct dfuse_obj_hdl *oh, struct dfuse_inode_entry *ie) +dfuse_open_handle_init(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh, + struct dfuse_inode_entry *ie) { oh->doh_dfs = ie->ie_dfs->dfs_ns; oh->doh_ie = ie; @@ -1092,20 +1105,22 @@ dfuse_open_handle_init(struct dfuse_obj_hdl *oh, struct dfuse_inode_entry *ie) atomic_init(&oh->doh_il_calls, 0); atomic_init(&oh->doh_readdir_number, 0); atomic_init(&oh->doh_write_count, 0); + atomic_fetch_add_relaxed(&dfuse_info->di_fh_count, 1); } void -dfuse_ie_init(struct dfuse_inode_entry *ie) +dfuse_ie_init(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie) { atomic_init(&ie->ie_ref, 1); atomic_init(&ie->ie_open_count, 0); atomic_init(&ie->ie_open_write_count, 0); atomic_init(&ie->ie_il_count, 0); atomic_init(&ie->ie_readdir_number, 0); + atomic_fetch_add_relaxed(&dfuse_info->di_inode_count, 1); } void -dfuse_ie_close(struct dfuse_inode_entry *ie) +dfuse_ie_close(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie) { int rc; uint32_t ref; @@ -1138,7 +1153,7 @@ dfuse_ie_close(struct dfuse_inode_entry *ie) d_hash_rec_decref(&dfp->dfp_cont_table, &dfc->dfs_entry); } - D_FREE(ie); + dfuse_ie_free(dfuse_info, ie); } static void @@ -1270,13 +1285,13 @@ 
dfuse_fs_start(struct dfuse_info *fs_handle, struct dfuse_cont *dfs) ie->ie_dfs = dfs; ie->ie_root = true; ie->ie_parent = 1; - dfuse_ie_init(ie); + dfuse_ie_init(fs_handle, ie); if (dfs->dfs_ops == &dfuse_dfs_ops) { rc = dfs_lookup(dfs->dfs_ns, "/", O_RDWR, &ie->ie_obj, NULL, &ie->ie_stat); if (rc) { DFUSE_TRA_ERROR(ie, "dfs_lookup() failed: %d (%s)", rc, strerror(rc)); - D_GOTO(err, rc = daos_errno2der(rc)); + D_GOTO(err_ie, rc = daos_errno2der(rc)); } } else { ie->ie_stat.st_uid = geteuid(); @@ -1337,10 +1352,11 @@ dfuse_fs_start(struct dfuse_info *fs_handle, struct dfuse_cont *dfs) err_ie_remove: dfs_release(ie->ie_obj); d_hash_rec_delete_at(&fs_handle->dpi_iet, &ie->ie_htl); +err_ie: + dfuse_ie_free(fs_handle, ie); err: DFUSE_TRA_ERROR(fs_handle, "Failed to start dfuse, rc: " DF_RC, DP_RC(rc)); fuse_opt_free_args(&args); - D_FREE(ie); return rc; } diff --git a/src/client/dfuse/dfuse_main.c b/src/client/dfuse/dfuse_main.c index 4f79e597cc4..8da5e9356a4 100644 --- a/src/client/dfuse/dfuse_main.c +++ b/src/client/dfuse/dfuse_main.c @@ -682,6 +682,13 @@ main(int argc, char **argv) if (rc == -DER_SUCCESS) rc = rc2; out_fini: + if (dfuse_info) { + D_ASSERT(atomic_load_relaxed(&dfuse_info->di_inode_count) == 0); + D_ASSERT(atomic_load_relaxed(&dfuse_info->di_fh_count) == 0); + D_ASSERT(atomic_load_relaxed(&dfuse_info->di_pool_count) == 0); + D_ASSERT(atomic_load_relaxed(&dfuse_info->di_container_count) == 0); + } + DFUSE_TRA_DOWN(dfuse_info); daos_fini(); out_debug: diff --git a/src/client/dfuse/dfuse_pool.c b/src/client/dfuse/dfuse_pool.c index 9d728b876aa..71072fad96b 100644 --- a/src/client/dfuse/dfuse_pool.c +++ b/src/client/dfuse/dfuse_pool.c @@ -83,7 +83,7 @@ dfuse_pool_lookup(fuse_req_t req, struct dfuse_inode_entry *parent, const char * DFUSE_TRA_UP(ie, parent, "inode"); - dfuse_ie_init(ie); + dfuse_ie_init(dfuse_info, ie); ie->ie_parent = parent->ie_stat.st_ino; strncpy(ie->ie_name, name, NAME_MAX); @@ -134,7 +134,7 @@ dfuse_pool_lookup(fuse_req_t req, 
struct dfuse_inode_entry *parent, const char * return; decref: d_hash_rec_decref(&dfuse_info->di_pool_table, &dfp->dfp_entry); - D_FREE(ie); + dfuse_ie_free(dfuse_info, ie); daos_prop_free(prop); err: if (rc == ENOENT) { diff --git a/src/client/dfuse/ops/create.c b/src/client/dfuse/ops/create.c index 7f88c2700f6..4959e4dcabe 100644 --- a/src/client/dfuse/ops/create.c +++ b/src/client/dfuse/ops/create.c @@ -156,8 +156,8 @@ dfuse_cb_create(fuse_req_t req, struct dfuse_inode_entry *parent, ie->ie_stat.st_uid = ctx->uid; ie->ie_stat.st_gid = ctx->gid; - dfuse_ie_init(ie); - dfuse_open_handle_init(oh, ie); + dfuse_ie_init(fs_handle, ie); + dfuse_open_handle_init(fs_handle, oh, ie); oh->doh_linear_read = false; @@ -220,6 +220,6 @@ dfuse_cb_create(fuse_req_t req, struct dfuse_inode_entry *parent, dfs_release(oh->doh_obj); err: DFUSE_REPLY_ERR_RAW(parent, req, rc); - D_FREE(oh); - D_FREE(ie); + dfuse_oh_free(fs_handle, oh); + dfuse_ie_free(fs_handle, ie); } diff --git a/src/client/dfuse/ops/ioctl.c b/src/client/dfuse/ops/ioctl.c index 5438e52eb66..1388756282b 100644 --- a/src/client/dfuse/ops/ioctl.c +++ b/src/client/dfuse/ops/ioctl.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2022 Intel Corporation. + * (C) Copyright 2016-2023 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -297,6 +297,33 @@ handle_dooh_ioctl(struct dfuse_obj_hdl *oh, size_t size, fuse_req_t req) DFUSE_REPLY_ERR_RAW(oh, req, rc); } +static void +handle_cont_query_ioctl(fuse_req_t req, const void *in_buf, size_t in_bufsz) +{ + struct dfuse_info *dfuse_info = fuse_req_userdata(req); + struct dfuse_mem_query query = {}; + const struct dfuse_mem_query *in_query = in_buf; + int rc; + + if (in_bufsz != sizeof(query)) + D_GOTO(err, rc = EIO); + + /* Not supported yet, future-proofing for DAOS-12751 */ + if (in_query->ino != 0) + D_GOTO(err, rc = EIO); + + query.inode_count = atomic_load_relaxed(&dfuse_info->di_inode_count); + query.fh_count = atomic_load_relaxed(&dfuse_info->di_fh_count); + query.pool_count = atomic_load_relaxed(&dfuse_info->di_pool_count); + query.container_count = atomic_load_relaxed(&dfuse_info->di_container_count); + + DFUSE_REPLY_IOCTL_SIZE(dfuse_info, req, &query, sizeof(query)); + return; + +err: + DFUSE_REPLY_ERR_RAW(dfuse_info, req, rc); +} + #ifdef FUSE_IOCTL_USE_INT void dfuse_cb_ioctl(fuse_req_t req, fuse_ino_t ino, int cmd, void *arg, struct fuse_file_info *fi, unsigned int flags, @@ -336,6 +363,11 @@ void dfuse_cb_ioctl(fuse_req_t req, fuse_ino_t ino, unsigned int cmd, void *arg, DFUSE_TRA_DEBUG(oh, "ioctl cmd=%#x", cmd); + if (cmd == DFUSE_IOCTL_COUNT_QUERY) { + handle_cont_query_ioctl(req, in_buf, in_bufsz); + return; + } + if (cmd == DFUSE_IOCTL_IL) { if (out_bufsz < sizeof(struct dfuse_il_reply)) D_GOTO(out_err, rc = EIO); diff --git a/src/client/dfuse/ops/lookup.c b/src/client/dfuse/ops/lookup.c index 07f3e48df3b..511c978ebb2 100644 --- a/src/client/dfuse/ops/lookup.c +++ b/src/client/dfuse/ops/lookup.c @@ -118,7 +118,7 @@ dfuse_reply_entry(struct dfuse_projection_info *fs_handle, strncpy(inode->ie_name, ie->ie_name, NAME_MAX + 1); } atomic_fetch_sub_relaxed(&ie->ie_ref, 1); - dfuse_ie_close(ie); + dfuse_ie_close(fs_handle, ie); ie = inode; } @@ -258,7 +258,7 @@ 
dfuse_cb_lookup(fuse_req_t req, struct dfuse_inode_entry *parent, DFUSE_TRA_UP(ie, parent, "inode"); - dfuse_ie_init(ie); + dfuse_ie_init(fs_handle, ie); ie->ie_parent = parent->ie_stat.st_ino; ie->ie_dfs = parent->ie_dfs; @@ -295,7 +295,7 @@ dfuse_cb_lookup(fuse_req_t req, struct dfuse_inode_entry *parent, out_release: dfs_release(ie->ie_obj); out_free: - D_FREE(ie); + dfuse_ie_free(fs_handle, ie); out: if (rc == ENOENT && parent->ie_dfs->dfc_ndentry_timeout > 0) { struct fuse_entry_param entry = {}; diff --git a/src/client/dfuse/ops/mknod.c b/src/client/dfuse/ops/mknod.c index b9a213a922d..79394bf41d6 100644 --- a/src/client/dfuse/ops/mknod.c +++ b/src/client/dfuse/ops/mknod.c @@ -29,7 +29,7 @@ dfuse_cb_mknod(fuse_req_t req, struct dfuse_inode_entry *parent, const char *nam if (rc != 0) D_GOTO(err, rc); - dfuse_ie_init(ie); + dfuse_ie_init(fs_handle, ie); ie->ie_stat.st_uid = ctx->uid; ie->ie_stat.st_gid = ctx->gid; @@ -56,5 +56,5 @@ dfuse_cb_mknod(fuse_req_t req, struct dfuse_inode_entry *parent, const char *nam return; err: DFUSE_REPLY_ERR_RAW(parent, req, rc); - D_FREE(ie); + dfuse_ie_free(fs_handle, ie); } diff --git a/src/client/dfuse/ops/open.c b/src/client/dfuse/ops/open.c index bc74bc69f3d..94304923d86 100644 --- a/src/client/dfuse/ops/open.c +++ b/src/client/dfuse/ops/open.c @@ -30,7 +30,7 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) DFUSE_TRA_UP(oh, ie, "open handle"); - dfuse_open_handle_init(oh, ie); + dfuse_open_handle_init(fs_handle, oh, ie); /* Upgrade fd permissions from O_WRONLY to O_RDWR if wb caching is * enabled so the kernel can do read-modify-write @@ -92,14 +92,15 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) return; err: d_hash_rec_decref(&fs_handle->dpi_iet, rlink); - D_FREE(oh); + dfuse_oh_free(fs_handle, oh); DFUSE_REPLY_ERR_RAW(ie, req, rc); } void dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { - struct dfuse_obj_hdl *oh = (struct dfuse_obj_hdl 
*)fi->fh; + struct dfuse_info *dfuse_info = fuse_req_userdata(req); + struct dfuse_obj_hdl *oh = (struct dfuse_obj_hdl *)fi->fh; int rc; uint32_t il_calls; @@ -152,5 +153,5 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) DFUSE_REPLY_ZERO(oh, req); else DFUSE_REPLY_ERR_RAW(oh, req, rc); - D_FREE(oh); + dfuse_oh_free(dfuse_info, oh); } diff --git a/src/client/dfuse/ops/opendir.c b/src/client/dfuse/ops/opendir.c index d76fbf28800..9c03d3fdc7d 100644 --- a/src/client/dfuse/ops/opendir.c +++ b/src/client/dfuse/ops/opendir.c @@ -10,7 +10,8 @@ void dfuse_cb_opendir(fuse_req_t req, struct dfuse_inode_entry *ie, struct fuse_file_info *fi) { - struct dfuse_obj_hdl *oh = NULL; + struct dfuse_info *dfuse_info = fuse_req_userdata(req); + struct dfuse_obj_hdl *oh; struct fuse_file_info fi_out = {0}; int rc; @@ -20,7 +21,7 @@ dfuse_cb_opendir(fuse_req_t req, struct dfuse_inode_entry *ie, struct fuse_file_ DFUSE_TRA_UP(oh, ie, "open handle"); - dfuse_open_handle_init(oh, ie); + dfuse_open_handle_init(dfuse_info, oh, ie); fi_out.fh = (uint64_t)oh; @@ -48,8 +49,8 @@ dfuse_cb_opendir(fuse_req_t req, struct dfuse_inode_entry *ie, struct fuse_file_ void dfuse_cb_releasedir(fuse_req_t req, struct dfuse_inode_entry *ino, struct fuse_file_info *fi) { - struct dfuse_projection_info *fs_handle = fuse_req_userdata(req); - struct dfuse_obj_hdl *oh = (struct dfuse_obj_hdl *)fi->fh; + struct dfuse_info *dfuse_info = fuse_req_userdata(req); + struct dfuse_obj_hdl *oh = (struct dfuse_obj_hdl *)fi->fh; /* Perform the opposite of what the ioctl call does, always change the open handle count * but the inode only tracks number of open handles with non-zero ioctl counts @@ -69,6 +70,6 @@ dfuse_cb_releasedir(fuse_req_t req, struct dfuse_inode_entry *ino, struct fuse_f } DFUSE_REPLY_ZERO(oh, req); - dfuse_dre_drop(fs_handle, oh); - D_FREE(oh); + dfuse_dre_drop(dfuse_info, oh); + dfuse_oh_free(dfuse_info, oh); }; diff --git a/src/client/dfuse/ops/readdir.c 
b/src/client/dfuse/ops/readdir.c index 9602c2330df..bd1a9a537c9 100644 --- a/src/client/dfuse/ops/readdir.c +++ b/src/client/dfuse/ops/readdir.c @@ -188,7 +188,7 @@ create_entry(struct dfuse_projection_info *fs_handle, struct dfuse_inode_entry * DFUSE_TRA_UP(ie, parent, "inode"); - dfuse_ie_init(ie); + dfuse_ie_init(fs_handle, ie); ie->ie_obj = obj; ie->ie_stat = *stbuf; @@ -244,13 +244,13 @@ create_entry(struct dfuse_projection_info *fs_handle, struct dfuse_inode_entry * strncpy(inode->ie_name, ie->ie_name, NAME_MAX + 1); atomic_fetch_sub_relaxed(&ie->ie_ref, 1); - dfuse_ie_close(ie); + dfuse_ie_close(fs_handle, ie); ie = inode; } *rlinkp = rlink; if (rc != 0) - dfuse_ie_close(ie); + dfuse_ie_close(fs_handle, ie); out: return rc; } diff --git a/src/client/dfuse/ops/symlink.c b/src/client/dfuse/ops/symlink.c index d7ada52bbbc..6a90a637c76 100644 --- a/src/client/dfuse/ops/symlink.c +++ b/src/client/dfuse/ops/symlink.c @@ -22,7 +22,7 @@ dfuse_cb_symlink(fuse_req_t req, const char *link, struct dfuse_inode_entry *par DFUSE_TRA_UP(ie, parent, "inode"); - dfuse_ie_init(ie); + dfuse_ie_init(fs_handle, ie); ie->ie_stat.st_uid = ctx->uid; ie->ie_stat.st_gid = ctx->gid; diff --git a/src/control/cmd/daos/filesystem.go b/src/control/cmd/daos/filesystem.go index 6e683574152..902d07e91c0 100644 --- a/src/control/cmd/daos/filesystem.go +++ b/src/control/cmd/daos/filesystem.go @@ -37,6 +37,7 @@ type fsCmd struct { ResetAttr fsResetAttrCmd `command:"reset-attr" description:"reset fs attributes"` ResetChunkSize fsResetChunkSizeCmd `command:"reset-chunk-size" description:"reset fs chunk size"` ResetObjClass fsResetOclassCmd `command:"reset-oclass" description:"reset fs obj class"` + DfuseQuery fsDfuseQueryCmd `command:"query" description:"Query dfuse for memory usage"` } type fsCopyCmd struct { @@ -428,3 +429,50 @@ func (cmd *fsFixRootCmd) Execute(_ []string) error { return nil } + +type fsDfuseQueryCmd struct { + daosCmd + + Args struct { + Path string `positional-arg-name:"path" 
description:"DFuse path to query" required:"1"` + } `positional-args:"yes"` +} + +func (cmd *fsDfuseQueryCmd) Execute(_ []string) error { + ap, deallocCmdArgs, err := allocCmdArgs(cmd.Logger) + if err != nil { + return err + } + + ap.path = C.CString(cmd.Args.Path) + defer freeString(ap.path) + defer deallocCmdArgs() + + rc := C.dfuse_count_query(ap) + if err := daosError(rc); err != nil { + return errors.Wrapf(err, "failed to query %s", cmd.Args.Path) + } + + if cmd.jsonOutputEnabled() { + jsonAttrs := &struct { + NumInodes uint64 `json:"inodes"` + NumFileHandles uint64 `json:"open_files"` + NumPools uint64 `json:"pools"` + NumContainers uint64 `json:"containers"` + }{ + NumInodes: uint64(ap.dfuse_mem.inode_count), + NumFileHandles: uint64(ap.dfuse_mem.fh_count), + NumPools: uint64(ap.dfuse_mem.pool_count), + NumContainers: uint64(ap.dfuse_mem.container_count), + } + return cmd.outputJSON(jsonAttrs, nil) + } + + cmd.Infof("DFuse descriptor usage.") + cmd.Infof(" Pools: %d", ap.dfuse_mem.pool_count) + cmd.Infof(" Containers: %d", ap.dfuse_mem.container_count) + cmd.Infof(" Inodes: %d", ap.dfuse_mem.inode_count) + cmd.Infof(" Open files: %d", ap.dfuse_mem.fh_count) + + return nil +} diff --git a/src/include/dfuse_ioctl.h b/src/include/dfuse_ioctl.h index d5d5bec56cf..e642eb7f12f 100644 --- a/src/include/dfuse_ioctl.h +++ b/src/include/dfuse_ioctl.h @@ -15,9 +15,7 @@ #define DFUSE_IOCTL_REPLY_CORE (DFUSE_IOCTL_REPLY_BASE) -/* (DFUSE_IOCTL_REPLY_BASE + 1) is reserved by an older version of - * IOCTL_REPLY_SIZE - */ +/* (DFUSE_IOCTL_REPLY_BASE + 1) is reserved by an older version of IOCTL_REPLY_SIZE */ #define DFUSE_IOCTL_REPLY_POH (DFUSE_IOCTL_REPLY_BASE + 2) #define DFUSE_IOCTL_REPLY_COH (DFUSE_IOCTL_REPLY_BASE + 3) @@ -28,6 +26,7 @@ #define DFUSE_IOCTL_REPLY_PFILE (DFUSE_IOCTL_REPLY_BASE + 8) #define DFUSE_IOCTL_R_DFUSE_USER (DFUSE_IOCTL_REPLY_BASE + 9) +#define DFUSE_COUNT_QUERY_CMD (DFUSE_IOCTL_REPLY_BASE + 10) /** Metadada caching is enabled for this file */ 
#define DFUSE_IOCTL_FLAGS_MCACHE (0x1) @@ -60,6 +59,15 @@ struct dfuse_user_reply { gid_t gid; }; +struct dfuse_mem_query { + uint64_t inode_count; + uint64_t fh_count; + uint64_t pool_count; + uint64_t container_count; + ino_t ino; + bool found; +}; + /* Defines the IOCTL command to get the object ID for a open file */ #define DFUSE_IOCTL_IL ((int)_IOR(DFUSE_IOCTL_TYPE, DFUSE_IOCTL_REPLY_CORE, struct dfuse_il_reply)) @@ -76,4 +84,7 @@ struct dfuse_user_reply { #define DFUSE_IOCTL_DFUSE_USER \ ((int)_IOR(DFUSE_IOCTL_TYPE, DFUSE_IOCTL_R_DFUSE_USER, struct dfuse_user_reply)) +#define DFUSE_IOCTL_COUNT_QUERY \ + ((int)_IOWR(DFUSE_IOCTL_TYPE, DFUSE_COUNT_QUERY_CMD, struct dfuse_mem_query)) + #endif /* __DFUSE_IOCTL_H__ */ diff --git a/src/utils/daos_hdlr.c b/src/utils/daos_hdlr.c index f0cdeaf4361..cae356928f7 100644 --- a/src/utils/daos_hdlr.c +++ b/src/utils/daos_hdlr.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -2437,3 +2438,34 @@ cont_clone_hdlr(struct cmd_args_s *ap) } return rc; } + +int +dfuse_count_query(struct cmd_args_s *ap) +{ + struct dfuse_mem_query query = {}; + int rc = -DER_SUCCESS; + int fd; + + fd = open(ap->path, O_RDONLY | O_NOFOLLOW); + if (fd < 0) { + rc = errno; + DH_PERROR_SYS(ap, rc, "Failed to open path"); + return daos_errno2der(rc); + } + + rc = ioctl(fd, DFUSE_IOCTL_COUNT_QUERY, &query); + if (rc < 0) { + rc = daos_errno2der(errno); + DH_PERROR_DER(ap, rc, "ioctl failed"); + goto close; + } + + ap->dfuse_mem.inode_count = query.inode_count; + ap->dfuse_mem.fh_count = query.fh_count; + ap->dfuse_mem.pool_count = query.pool_count; + ap->dfuse_mem.container_count = query.container_count; + +close: + close(fd); + return rc; +} diff --git a/src/utils/daos_hdlr.h b/src/utils/daos_hdlr.h index 9124095ae7a..2dbb3a5ea45 100644 --- a/src/utils/daos_hdlr.h +++ b/src/utils/daos_hdlr.h @@ -9,6 +9,8 @@ #include +#include + #define OID_ARR_SIZE 8 enum fs_op { @@ -159,19 +161,11 @@ struct cmd_args_s { char *group; 
/* --group name */ bool verbose; /* --verbose mode */ char *entry; /* --entry for ACL */ - char *principal; /* --principal for ACL */ -}; - -#define ARGS_VERIFY_PATH_CREATE(ap, label, rcexpr) \ - do { \ - if (((ap)->type == DAOS_PROP_CO_LAYOUT_UNKNOWN)) { \ - fprintf(stderr, "create by --path : must also " \ - "specify --type\n"); \ - D_GOTO(label, (rcexpr)); \ - } \ - } while (0) + char *principal; /* --principal for ACL */ -typedef int (*command_hdlr_t)(struct cmd_args_s *ap); + /* DFuse related */ + struct dfuse_mem_query dfuse_mem; /* --memquery */ +}; int pool_autotest_hdlr(struct cmd_args_s *ap); /* TODO: implement these pool op functions @@ -179,25 +173,38 @@ int pool_autotest_hdlr(struct cmd_args_s *ap); */ /* general datamover operations */ -void dm_cont_free_usr_attrs(int n, char ***_names, void ***_buffers, size_t **_sizes); -int dm_cont_get_usr_attrs(struct cmd_args_s *ap, daos_handle_t coh, int *_n, char ***_names, - void ***_buffers, size_t **_sizes); -int dm_cont_get_all_props(struct cmd_args_s *ap, daos_handle_t coh, daos_prop_t **_props, - bool get_oid, bool get_label, bool get_roots); -int dm_copy_usr_attrs(struct cmd_args_s *ap, daos_handle_t src_coh, daos_handle_t dst_coh); +void +dm_cont_free_usr_attrs(int n, char ***_names, void ***_buffers, size_t **_sizes); +int +dm_cont_get_usr_attrs(struct cmd_args_s *ap, daos_handle_t coh, int *_n, char ***_names, + void ***_buffers, size_t **_sizes); +int +dm_cont_get_all_props(struct cmd_args_s *ap, daos_handle_t coh, daos_prop_t **_props, bool get_oid, + bool get_label, bool get_roots); +int +dm_copy_usr_attrs(struct cmd_args_s *ap, daos_handle_t src_coh, daos_handle_t dst_coh); /* DAOS filesystem operations */ -int fs_copy_hdlr(struct cmd_args_s *ap); -int fs_dfs_hdlr(struct cmd_args_s *ap); -int fs_dfs_get_attr_hdlr(struct cmd_args_s *ap, dfs_obj_info_t *attrs); -int parse_filename_dfs(const char *path, char **_obj_name, char **_cont_name); -int fs_fix_entry_hdlr(struct cmd_args_s *ap, bool 
fix_entry); -int fs_recreate_sb_hdlr(struct cmd_args_s *ap); -int fs_relink_root_hdlr(struct cmd_args_s *ap); +int +fs_copy_hdlr(struct cmd_args_s *ap); +int +fs_dfs_hdlr(struct cmd_args_s *ap); +int +fs_dfs_get_attr_hdlr(struct cmd_args_s *ap, dfs_obj_info_t *attrs); +int +parse_filename_dfs(const char *path, char **_obj_name, char **_cont_name); +int +fs_fix_entry_hdlr(struct cmd_args_s *ap, bool fix_entry); +int +fs_recreate_sb_hdlr(struct cmd_args_s *ap); +int +fs_relink_root_hdlr(struct cmd_args_s *ap); /* Container operations */ -int cont_check_hdlr(struct cmd_args_s *ap); -int cont_clone_hdlr(struct cmd_args_s *ap); +int +cont_check_hdlr(struct cmd_args_s *ap); +int +cont_clone_hdlr(struct cmd_args_s *ap); /* TODO implement the following container op functions * all with signatures similar to this: @@ -207,4 +214,8 @@ int cont_clone_hdlr(struct cmd_args_s *ap); * int cont_rollback_hdlr() */ +/* Dfuse operations, mostly handled through ioctls */ +int +dfuse_count_query(struct cmd_args_s *ap); + #endif /* __DAOS_HDLR_H__ */ diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 28620350452..7ef0c56b664 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -1428,6 +1428,27 @@ def il_cmd(self, cmd, check_read=True, check_write=True, check_fstat=True): assert ret.returncode == 0, ret return ret + def check_usage(self, inodes=None, open_files=None, pools=None, containers=None): + """Query and verify the dfuse statistics. + + Returns the raw numbers in a dict. 
+ """ + rc = run_daos_cmd(self.conf, ['filesystem', 'query', self.dir], use_json=True) + print(rc) + assert rc.returncode == 0 + + print(dir(rc.json)) + + if inodes: + assert rc.json['response']['inodes'] == inodes, rc + if open_files: + assert rc.json['response']['open_files'] == open_files, rc + if pools: + assert rc.json['response']['pools'] == pools, rc + if containers: + assert rc.json['response']['containers'] == containers, rc + return rc.json['response'] + def assert_file_size_fd(fd, size): """Verify the file size is as expected""" @@ -2563,6 +2584,12 @@ def test_uns_create(self): print(stbuf) assert stbuf.st_ino < 100 print(os.listdir(path)) + rc = run_daos_cmd(self.conf, ['filesystem', 'query', self.dfuse.dir]) + print(rc) + assert rc.returncode == 0 + rc = run_daos_cmd(self.conf, ['filesystem', 'query', self.dfuse.dir], use_json=True) + print(rc) + assert rc.returncode == 0 @needs_dfuse def test_uns_link(self): @@ -2596,8 +2623,11 @@ def test_uns_link(self): print(stbuf) assert stbuf.st_ino < 100 print(os.listdir(path)) + self.dfuse.check_usage(inodes=2, open_files=1, containers=2, pools=1) cmd = ['cont', 'destroy', '--path', path] rc = run_daos_cmd(self.conf, cmd) + assert rc.returncode == 0 + rc = self.dfuse.check_usage(inodes=1, open_files=1, containers=1, pools=1) @needs_dfuse def test_rename_clobber(self):