From 28f7c2ca167d72c34211def591f96822923dd1d9 Mon Sep 17 00:00:00 2001 From: Kris Jacque Date: Wed, 5 Jul 2023 06:58:59 -0600 Subject: [PATCH] DAOS-13696 control: Eliminate possible data races (#12442) - Return copies of the GetAttachInfoResp instead of the direct pointer. - cache.Item implementations should assume the caller is responsible for locking/unlocking them, since the method is exported. - cache.Cache.Keys() calls should use a read lock. - Document daos_agent refresh signal in admin guide. Signed-off-by: Kris Jacque --- docs/admin/troubleshooting.md | 9 +++++++ src/control/cmd/daos_agent/infocache.go | 29 +++++++++++++++----- src/control/lib/cache/cache.go | 35 +++++++++++++++++++------ 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/docs/admin/troubleshooting.md b/docs/admin/troubleshooting.md index 1f49bf1eb4d..f7e0414be4e 100644 --- a/docs/admin/troubleshooting.md +++ b/docs/admin/troubleshooting.md @@ -463,6 +463,15 @@ fabric_iface_port: 31316 # engine 1 fabric_iface_port: 31416 ``` +### daos_agent cache of engine URIs is stale + +The `daos_agent` cache may become invalid if `daos_engine` processes restart with different +configurations or IP addresses, or if the DAOS system is reformatted. +If this happens, the `daos` tool (as well as other I/O or `libdaos` operations) may return +`-DER_BAD_TARGET` (-1035) errors. + +To resolve the issue, a privileged user may send a `SIGUSR2` signal to the `daos_agent` process to +force an immediate cache refresh. ## Diagnostic and Recovery Tools diff --git a/src/control/cmd/daos_agent/infocache.go b/src/control/cmd/daos_agent/infocache.go index 0eec423bd35..0dbdf4fc645 100644 --- a/src/control/cmd/daos_agent/infocache.go +++ b/src/control/cmd/daos_agent/infocache.go @@ -139,9 +139,6 @@ func (ci *cachedAttachInfo) Refresh(ctx context.Context) error { return errors.New("cachedAttachInfo is nil") } - ci.Lock() - defer ci.Unlock() - req := &control.GetAttachInfoReq{System: ci.system, AllRanks: true} resp, err := ci.fetch(ctx, ci.rpcClient, req) if err != nil { @@ -185,9 +182,6 @@ func (cfi *cachedFabricInfo) Refresh(ctx context.Context) error { return errors.New("cachedFabricInfo is nil") } - cfi.Lock() - defer cfi.Unlock() - results, err := cfi.fetch(ctx) if err != nil { return errors.Wrap(err, "refreshing cached fabric info") @@ -328,7 +322,28 @@ func (c *InfoCache) GetAttachInfo(ctx context.Context, sys string) (*control.Get return nil, errors.Errorf("unexpected attach info data type %T", item) } - return cai.lastResponse, nil + return copyGetAttachInfoResp(cai.lastResponse), nil +} + +func copyGetAttachInfoResp(orig *control.GetAttachInfoResp) *control.GetAttachInfoResp { + if orig == nil { + return nil + } + + cp := new(control.GetAttachInfoResp) + *cp = *orig + + // Copy slices instead of using original pointers + cp.MSRanks = make([]uint32, len(orig.MSRanks)) + _ = copy(cp.MSRanks, orig.MSRanks) + cp.ServiceRanks = make([]*control.PrimaryServiceRank, len(orig.ServiceRanks)) + _ = copy(cp.ServiceRanks, orig.ServiceRanks) + + if orig.ClientNetHint.EnvVars != nil { + cp.ClientNetHint.EnvVars = make([]string, len(orig.ClientNetHint.EnvVars)) + _ = copy(cp.ClientNetHint.EnvVars, orig.ClientNetHint.EnvVars) + } + return cp } func (c *InfoCache) getAttachInfoRemote(ctx context.Context, sys string) (*control.GetAttachInfoResp, error) { diff --git a/src/control/lib/cache/cache.go b/src/control/lib/cache/cache.go index 4ffd98e3fb7..c31d82b0a61 100644 --- a/src/control/lib/cache/cache.go +++ b/src/control/lib/cache/cache.go @@ -94,6 +94,13 @@ func (ic *ItemCache) Keys() []string { return nil } + ic.mutex.RLock() + defer ic.mutex.RUnlock() + + return ic.keys() +} + +func (ic *ItemCache) keys() []string { keys := []string{} for k := range ic.items { keys = append(keys, k) @@ -144,13 +151,14 @@ func (ic *ItemCache) GetOrCreate(ctx context.Context, key string, missFn ItemCre ic.set(item) } + item.Lock() if item.NeedsRefresh() { if err := item.Refresh(ctx); err != nil { + item.Unlock() return nil, noopRelease, errors.Wrapf(err, "fetch data for %q", key) } ic.log.Debugf("refreshed item %q", key) } - item.Lock() return item, item.Unlock, nil } @@ -174,13 +182,14 @@ func (ic *ItemCache) Get(ctx context.Context, key string) (Item, func(), error) return nil, noopRelease, err } + item.Lock() if item.NeedsRefresh() { if err := item.Refresh(ctx); err != nil { + item.Unlock() return nil, noopRelease, errors.Wrapf(err, "fetch data for %q", key) } ic.log.Debugf("refreshed item %q", key) } - item.Lock() return item, item.Unlock, nil } @@ -203,18 +212,28 @@ func (ic *ItemCache) Refresh(ctx context.Context, keys ...string) error { defer ic.mutex.Unlock() if len(keys) == 0 { - keys = ic.Keys() + keys = ic.keys() } for _, key := range keys { - item, err := ic.get(key) - if err != nil { + if err := ic.refreshItem(ctx, key); err != nil { return err } + } + return nil +} - if err := item.Refresh(ctx); err != nil { - return errors.Wrapf(err, "failed to refresh cached item %q", item.Key()) - } +func (ic *ItemCache) refreshItem(ctx context.Context, key string) error { + item, err := ic.get(key) + if err != nil { + return err } + + item.Lock() + defer item.Unlock() + if err := item.Refresh(ctx); err != nil { + return errors.Wrapf(err, "failed to refresh cached item %q", item.Key()) + } + return nil }