Skip to content

Commit

Permalink
DAOS-13696 control: Eliminate possible data races (#12442)
Browse files Browse the repository at this point in the history
- Return copies of the GetAttachInfoResp instead of the direct
  pointer.
- cache.Item implementations should assume the caller is responsible
  for locking/unlocking them, since the method is exported.
- cache.Cache.Keys() calls should use a read lock.
- Document daos_agent refresh signal in admin guide.

Signed-off-by: Kris Jacque <kris.jacque@intel.com>
  • Loading branch information
kjacque committed Jul 5, 2023
1 parent fbd7c53 commit 28f7c2c
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 15 deletions.
9 changes: 9 additions & 0 deletions docs/admin/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,15 @@ fabric_iface_port: 31316
# engine 1
fabric_iface_port: 31416
```
### daos_agent cache of engine URIs is stale

The `daos_agent` cache may become invalid if `daos_engine` processes restart with different
configurations or IP addresses, or if the DAOS system is reformatted.
If this happens, the `daos` tool (as well as other I/O or `libdaos` operations) may return
`-DER_BAD_TARGET` (-1035) errors.

To resolve the issue, a privileged user may send a `SIGUSR2` signal to the `daos_agent` process to
force an immediate cache refresh.

## Diagnostic and Recovery Tools

Expand Down
29 changes: 22 additions & 7 deletions src/control/cmd/daos_agent/infocache.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,6 @@ func (ci *cachedAttachInfo) Refresh(ctx context.Context) error {
return errors.New("cachedAttachInfo is nil")
}

ci.Lock()
defer ci.Unlock()

req := &control.GetAttachInfoReq{System: ci.system, AllRanks: true}
resp, err := ci.fetch(ctx, ci.rpcClient, req)
if err != nil {
Expand Down Expand Up @@ -185,9 +182,6 @@ func (cfi *cachedFabricInfo) Refresh(ctx context.Context) error {
return errors.New("cachedFabricInfo is nil")
}

cfi.Lock()
defer cfi.Unlock()

results, err := cfi.fetch(ctx)
if err != nil {
return errors.Wrap(err, "refreshing cached fabric info")
Expand Down Expand Up @@ -328,7 +322,28 @@ func (c *InfoCache) GetAttachInfo(ctx context.Context, sys string) (*control.Get
return nil, errors.Errorf("unexpected attach info data type %T", item)
}

return cai.lastResponse, nil
return copyGetAttachInfoResp(cai.lastResponse), nil
}

func copyGetAttachInfoResp(orig *control.GetAttachInfoResp) *control.GetAttachInfoResp {
if orig == nil {
return nil
}

cp := new(control.GetAttachInfoResp)
*cp = *orig

// Copy slices instead of using original pointers
cp.MSRanks = make([]uint32, len(orig.MSRanks))
_ = copy(cp.MSRanks, orig.MSRanks)
cp.ServiceRanks = make([]*control.PrimaryServiceRank, len(orig.ServiceRanks))
_ = copy(cp.ServiceRanks, orig.ServiceRanks)

if orig.ClientNetHint.EnvVars != nil {
cp.ClientNetHint.EnvVars = make([]string, len(orig.ClientNetHint.EnvVars))
_ = copy(cp.ClientNetHint.EnvVars, orig.ClientNetHint.EnvVars)
}
return cp
}

func (c *InfoCache) getAttachInfoRemote(ctx context.Context, sys string) (*control.GetAttachInfoResp, error) {
Expand Down
35 changes: 27 additions & 8 deletions src/control/lib/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ func (ic *ItemCache) Keys() []string {
return nil
}

ic.mutex.RLock()
defer ic.mutex.RUnlock()

return ic.keys()
}

func (ic *ItemCache) keys() []string {
keys := []string{}
for k := range ic.items {
keys = append(keys, k)
Expand Down Expand Up @@ -144,13 +151,14 @@ func (ic *ItemCache) GetOrCreate(ctx context.Context, key string, missFn ItemCre
ic.set(item)
}

item.Lock()
if item.NeedsRefresh() {
if err := item.Refresh(ctx); err != nil {
item.Unlock()
return nil, noopRelease, errors.Wrapf(err, "fetch data for %q", key)
}
ic.log.Debugf("refreshed item %q", key)
}
item.Lock()

return item, item.Unlock, nil
}
Expand All @@ -174,13 +182,14 @@ func (ic *ItemCache) Get(ctx context.Context, key string) (Item, func(), error)
return nil, noopRelease, err
}

item.Lock()
if item.NeedsRefresh() {
if err := item.Refresh(ctx); err != nil {
item.Unlock()
return nil, noopRelease, errors.Wrapf(err, "fetch data for %q", key)
}
ic.log.Debugf("refreshed item %q", key)
}
item.Lock()

return item, item.Unlock, nil
}
Expand All @@ -203,18 +212,28 @@ func (ic *ItemCache) Refresh(ctx context.Context, keys ...string) error {
defer ic.mutex.Unlock()

if len(keys) == 0 {
keys = ic.Keys()
keys = ic.keys()
}

for _, key := range keys {
item, err := ic.get(key)
if err != nil {
if err := ic.refreshItem(ctx, key); err != nil {
return err
}
}
return nil
}

if err := item.Refresh(ctx); err != nil {
return errors.Wrapf(err, "failed to refresh cached item %q", item.Key())
}
func (ic *ItemCache) refreshItem(ctx context.Context, key string) error {
item, err := ic.get(key)
if err != nil {
return err
}

item.Lock()
defer item.Unlock()
if err := item.Refresh(ctx); err != nil {
return errors.Wrapf(err, "failed to refresh cached item %q", item.Key())
}

return nil
}

0 comments on commit 28f7c2c

Please sign in to comment.