Skip to content

Commit

Permalink
DAOS-13997 control: Allow labels for fault domain levels (#15173)
Browse files Browse the repository at this point in the history
This patch allows the fault domain levels defined in the server
config file to be assigned human-readable labels, for example:
/cluster=wolf/rack=123/node=wolf-45

- Continue allowing fully unlabeled fault paths.
- Record the domain labels of the first rank to join as a management
  property. Labels cannot be changed once set.
- All ranks joining the system must have the same labels.
- If any domain levels have a label, all of them must. For example,
  /cluster=wolf/wolf-45 would not be allowed.

Signed-off-by: Kris Jacque <kris.jacque@intel.com>
  • Loading branch information
kjacque authored Sep 27, 2024
1 parent c500d95 commit a1d734a
Show file tree
Hide file tree
Showing 9 changed files with 775 additions and 51 deletions.
1 change: 1 addition & 0 deletions src/control/fault/code/codes.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ const (
ServerNoCompatibilityInsecure
ServerPoolHasContainers
ServerHugepagesDisabled
ServerBadFaultDomainLabels
)

// server config fault codes
Expand Down
13 changes: 8 additions & 5 deletions src/control/server/config/faults.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,6 @@ var (
"no DAOS IO Engines specified in configuration",
"specify at least one IO Engine configuration ('engines' list parameter) and restart the control server",
)
FaultConfigFaultDomainInvalid = serverConfigFault(
code.ServerConfigFaultDomainInvalid,
"invalid fault domain",
"specify a valid fault domain ('fault_path' parameter) or callback script ('fault_cb' parameter) and restart the control server",
)
FaultConfigFaultCallbackNotFound = serverConfigFault(
code.ServerConfigFaultCallbackNotFound,
"fault domain callback script not found",
Expand Down Expand Up @@ -113,6 +108,14 @@ var (
)
)

// FaultConfigFaultDomainInvalid creates a server config fault with code
// ServerConfigFaultDomainInvalid. The message of err is appended to the
// description to explain why the fault domain was rejected. A nil err yields
// the bare "invalid fault domain" description rather than panicking on a nil
// interface method call.
func FaultConfigFaultDomainInvalid(err error) *fault.Fault {
	desc := "invalid fault domain"
	if err != nil {
		desc = fmt.Sprintf("invalid fault domain: %s", err.Error())
	}

	return serverConfigFault(
		code.ServerConfigFaultDomainInvalid,
		desc,
		"specify a valid fault domain ('fault_path' parameter) or callback script ('fault_cb' parameter) and restart the control server",
	)
}

func FaultConfigDuplicateFabric(curIdx, seenIdx int) *fault.Fault {
return serverConfigFault(
code.ServerConfigDuplicateFabric,
Expand Down
9 changes: 6 additions & 3 deletions src/control/server/faultdomain.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2020-2023 Intel Corporation.
// (C) Copyright 2020-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -54,8 +54,11 @@ func getFaultDomain(cfg *config.Server) (*system.FaultDomain, error) {

func newFaultDomainFromConfig(domainStr string) (*system.FaultDomain, error) {
fd, err := system.NewFaultDomainFromString(domainStr)
if err != nil || fd.NumLevels() == 0 {
return nil, config.FaultConfigFaultDomainInvalid
if err != nil {
return nil, config.FaultConfigFaultDomainInvalid(err)
}
if fd.NumLevels() == 0 {
return nil, config.FaultConfigFaultDomainInvalid(errors.New("at least one domain level is required"))
}
// TODO DAOS-6353: remove when multiple layers supported
if fd.NumLevels() > 2 {
Expand Down
12 changes: 6 additions & 6 deletions src/control/server/faultdomain_test.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2020-2022 Intel Corporation.
// (C) Copyright 2020-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -52,7 +52,7 @@ func TestServer_getDefaultFaultDomain(t *testing.T) {
getHostname: func() (string, error) {
return "/////////", nil
},
expErr: config.FaultConfigFaultDomainInvalid,
expErr: config.FaultConfigFaultDomainInvalid(errors.New("domain name \"\": empty string is an invalid fault domain")),
},
} {
t.Run(name, func(t *testing.T) {
Expand Down Expand Up @@ -107,13 +107,13 @@ func TestServer_getFaultDomain(t *testing.T) {
cfg: &config.Server{
FaultPath: "junk",
},
expErr: config.FaultConfigFaultDomainInvalid,
expErr: config.FaultConfigFaultDomainInvalid(errors.New("fault path must start with root (/)")),
},
"root-only path is not valid": {
cfg: &config.Server{
FaultPath: "/",
},
expErr: config.FaultConfigFaultDomainInvalid,
expErr: config.FaultConfigFaultDomainInvalid(errors.New("at least one domain level is required")),
},
"too many layers": { // TODO DAOS-6353: change when arbitrary layers supported
cfg: &config.Server{
Expand Down Expand Up @@ -284,11 +284,11 @@ func TestServer_getFaultDomainFromCallback(t *testing.T) {
},
"script returned invalid fault domain": {
scriptPath: invalidScriptPath,
expErr: config.FaultConfigFaultDomainInvalid,
expErr: config.FaultConfigFaultDomainInvalid(errors.New("fault path must start with root (/)")),
},
"script returned root fault domain": {
scriptPath: rootScriptPath,
expErr: config.FaultConfigFaultDomainInvalid,
expErr: config.FaultConfigFaultDomainInvalid(errors.New("at least one domain level is required")),
},
"script returned fault domain with too many layers": { // TODO DAOS-6353: change when multiple layers supported
scriptPath: multiLayerScriptPath,
Expand Down
9 changes: 9 additions & 0 deletions src/control/server/faults.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,15 @@ func FaultNoCompatibilityInsecure(self, other build.Version) *fault.Fault {
)
}

// FaultBadFaultDomainLabels creates a server fault with code
// ServerBadFaultDomainLabels for a join request whose fault domain labels
// differ from the system labels. The description lists both label sets
// (comma-separated), the server address, and the fault path from the request.
func FaultBadFaultDomainLabels(faultPath, addr string, reqLabels, systemLabels []string) *fault.Fault {
	const labelSep = ", "

	requested := strings.Join(reqLabels, labelSep)
	expected := strings.Join(systemLabels, labelSep)

	description := fmt.Sprintf("labels in join request [%s] don't match system labels [%s] for server %s (fault path: %s)",
		requested, expected, addr, faultPath)
	resolution := "update the 'fault_path' or executable specified in 'fault_cb' in the affected server's configuration file to match the system labels"

	return serverFault(code.ServerBadFaultDomainLabels, description, resolution)
}

func serverFault(code code.Code, desc, res string) *fault.Fault {
return &fault.Fault{
Domain: "server",
Expand Down
69 changes: 67 additions & 2 deletions src/control/server/mgmt_system.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,17 @@ import (
"github.com/daos-stack/daos/src/control/lib/hostlist"
"github.com/daos-stack/daos/src/control/lib/ranklist"
"github.com/daos-stack/daos/src/control/logging"
"github.com/daos-stack/daos/src/control/server/config"
"github.com/daos-stack/daos/src/control/system"
"github.com/daos-stack/daos/src/control/system/checker"
"github.com/daos-stack/daos/src/control/system/raft"
)

// Management property names and encoding details used by this file.
const (
	// fabricProviderProp is the property name "fabric_providers".
	fabricProviderProp = "fabric_providers"
	// groupUpdatePauseProp is the property name "group_update_paused".
	groupUpdatePauseProp = "group_update_paused"
	// domainLabelsProp is the property name under which the fault domain
	// labels are stored.
	domainLabelsProp = "domain_labels"

	// domainLabelsSep separates the individual labels inside the stored
	// domainLabelsProp value. It is invalid in a label name.
	domainLabelsSep = "="
)

// GetAttachInfo handles a request to retrieve a map of ranks to fabric URIs, in addition
// to client network autoconfiguration hints.
Expand Down Expand Up @@ -182,9 +186,9 @@ func (svc *mgmtSvc) join(ctx context.Context, req *mgmtpb.JoinReq, peerAddr *net
return nil, errors.Wrapf(err, "invalid uuid %q", req.Uuid)
}

fd, err := system.NewFaultDomainFromString(req.SrvFaultDomain)
fd, err := svc.verifyFaultDomain(req)
if err != nil {
return nil, errors.Wrapf(err, "invalid server fault domain %q", req.SrvFaultDomain)
return nil, err
}

if err := svc.checkReqFabricProvider(req, peerAddr, svc.events); err != nil {
Expand Down Expand Up @@ -255,6 +259,67 @@ func (svc *mgmtSvc) join(ctx context.Context, req *mgmtpb.JoinReq, peerAddr *net
return resp, nil
}

// verifyFaultDomain parses the fault domain string from a join request and
// checks its labels against the labels recorded for the system.
//
// If no labels have been recorded yet, the request's labels are stored and the
// parsed domain is accepted. Otherwise the request's labels must match the
// stored labels exactly (same count, same values in the same order), or a
// FaultBadFaultDomainLabels fault is returned. An unparseable domain produces a
// FaultConfigFaultDomainInvalid fault, and an empty domain is rejected.
func (svc *mgmtSvc) verifyFaultDomain(req *mgmtpb.JoinReq) (*system.FaultDomain, error) {
	domain, parseErr := system.NewFaultDomainFromString(req.SrvFaultDomain)
	if parseErr != nil {
		return nil, config.FaultConfigFaultDomainInvalid(parseErr)
	}
	if domain.Empty() {
		return nil, errors.New("no fault domain in join request")
	}

	// An unlabeled domain is represented by one empty label per level, so that
	// a stored "unlabeled" value can be told apart from a value never set.
	reqLabels := domain.Labels
	if !domain.HasLabels() {
		reqLabels = make([]string, domain.NumLevels())
	}

	stored, getErr := svc.getDomainLabels()
	switch {
	case system.IsErrSystemAttrNotFound(getErr):
		// Nothing recorded yet: this request defines the system labels.
		svc.log.Debugf("setting fault domain labels for the first time: %+v", reqLabels)
		if setErr := svc.setDomainLabels(reqLabels); setErr != nil {
			return nil, errors.Wrap(setErr, "failed to set fault domain labels")
		}
		return domain, nil
	case getErr != nil:
		return nil, errors.Wrap(getErr, "failed to get current fault domain labels")
	}

	// Stored labels whose first entry is empty denote an unlabeled system; show
	// that as an empty list in logs and errors instead of a run of blanks.
	var shown []string
	if stored[0] != "" {
		shown = stored
	}

	svc.log.Tracef("system labels: [%s], request labels: [%s]", strings.Join(shown, ", "), strings.Join(reqLabels, ", "))

	// A length difference is a mismatch on its own; only compare element by
	// element when the lengths agree, stopping at the first difference.
	mismatch := len(stored) != len(reqLabels)
	for i := 0; !mismatch && i < len(stored); i++ {
		mismatch = reqLabels[i] != stored[i]
	}
	if mismatch {
		return nil, FaultBadFaultDomainLabels(req.SrvFaultDomain, req.Uri, domain.Labels, shown)
	}

	return domain, nil
}

// getDomainLabels reads the domainLabelsProp management property from the
// system database and splits it on domainLabelsSep into individual labels.
// Any error from the property lookup is returned unchanged.
func (svc *mgmtSvc) getDomainLabels() ([]string, error) {
	stored, err := system.GetMgmtProperty(svc.sysdb, domainLabelsProp)
	if err == nil {
		return strings.Split(stored, domainLabelsSep), nil
	}
	return nil, err
}

// setDomainLabels joins the labels with domainLabelsSep and writes the result
// to the domainLabelsProp management property in the system database.
func (svc *mgmtSvc) setDomainLabels(labels []string) error {
	return system.SetMgmtProperty(svc.sysdb, domainLabelsProp, strings.Join(labels, domainLabelsSep))
}

// allRanksJoined checks whether all ranks that the system knows about, and that are not admin
// excluded, are joined.
//
Expand Down
151 changes: 150 additions & 1 deletion src/control/server/mgmt_system_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (

"github.com/daos-stack/daos/src/control/build"
"github.com/daos-stack/daos/src/control/common"
"github.com/daos-stack/daos/src/control/common/proto/mgmt"
mgmtpb "github.com/daos-stack/daos/src/control/common/proto/mgmt"
sharedpb "github.com/daos-stack/daos/src/control/common/proto/shared"
"github.com/daos-stack/daos/src/control/common/test"
Expand Down Expand Up @@ -1953,7 +1954,7 @@ func TestServer_MgmtSvc_Join(t *testing.T) {
req: &mgmtpb.JoinReq{
SrvFaultDomain: "bad fault domain",
},
expErr: errors.New("bad fault domain"),
expErr: errors.New("invalid fault domain"),
},
"dupe host same rank diff uuid": {
req: &mgmtpb.JoinReq{
Expand Down Expand Up @@ -2345,6 +2346,154 @@ func TestServer_MgmtSvc_doGroupUpdate(t *testing.T) {
}
}

// TestMgmtSvc_verifyFaultDomain is a table-driven test of
// mgmtSvc.verifyFaultDomain. Each case optionally builds a custom service,
// optionally seeds the stored domain labels, calls verifyFaultDomain with the
// case's join request, and then checks the returned error, the returned fault
// domain, and the domain labels stored afterwards.
func TestMgmtSvc_verifyFaultDomain(t *testing.T) {
testURI := "tcp://localhost:10001"
for name, tc := range map[string]struct {
// getSvc builds the service under test; when nil, a default service is
// created inside the subtest.
getSvc func(*testing.T, logging.Logger) *mgmtSvc
// curLabels, when non-nil, is stored via setDomainLabels before the call.
curLabels []string
// req is the join request passed to verifyFaultDomain.
req *mgmtpb.JoinReq
// expFaultDomain is compared against the returned fault domain.
expFaultDomain *system.FaultDomain
// expErr is compared against the returned error.
expErr error
// expLabels is compared against the labels read back after the call; when
// nil, the stored labels are not checked.
expLabels []string
}{
"no fault domain": {
req: &mgmtpb.JoinReq{},
expErr: errors.New("no fault domain"),
},
"invalid fault domain": {
req: &mgmtpb.JoinReq{SrvFaultDomain: "junk"},
expErr: errors.New("invalid fault domain"),
},
"failed to get system domain labels": {
getSvc: func(t *testing.T, log logging.Logger) *mgmtSvc {
svc := newTestMgmtSvcMulti(t, log, maxEngines, false)
// not a replica
svc.sysdb = raft.MockDatabaseWithCfg(t, log, &raft.DatabaseConfig{
SystemName: build.DefaultSystemName,
})

return svc
},
req: &mgmt.JoinReq{SrvFaultDomain: "/rack=r1/node=n2"},
expErr: &system.ErrNotReplica{},
},
"failed to set system domain labels": {
getSvc: func(t *testing.T, log logging.Logger) *mgmtSvc {
svc := newTestMgmtSvcMulti(t, log, maxEngines, true)
svc.sysdb = raft.MockDatabaseWithCfg(t, log, &raft.DatabaseConfig{
SystemName: build.DefaultSystemName,
Replicas: []*net.TCPAddr{common.LocalhostCtrlAddr()},
})
// Resign leadership before the call; this case expects the attempt to
// store the labels to fail with ErrNotLeader.
if err := svc.sysdb.ResignLeadership(errors.New("test")); err != nil {
t.Fatal(err)
}

return svc
},
req: &mgmt.JoinReq{SrvFaultDomain: "/rack=r1/node=n2"},
expErr: &system.ErrNotLeader{},
},
// The "first success" cases start with no stored labels (curLabels is nil)
// and expect the request's labels to be stored.
"first success with labels": {
req: &mgmt.JoinReq{SrvFaultDomain: "/rack=r1/node=n2"},
expFaultDomain: system.MustCreateFaultDomainFromString("/rack=r1/node=n2"),
expLabels: []string{"rack", "node"},
},
"first success with no labels": {
req: &mgmt.JoinReq{SrvFaultDomain: "/r1/n2"},
expFaultDomain: system.MustCreateFaultDomainFromString("/r1/n2"),
expLabels: []string{"", ""},
},
"success with labels": {
curLabels: []string{"rack", "node"},
req: &mgmt.JoinReq{SrvFaultDomain: "/rack=r1/node=n2"},
expFaultDomain: system.MustCreateFaultDomainFromString("/rack=r1/node=n2"),
expLabels: []string{"rack", "node"},
},
"success with no labels": {
curLabels: []string{"", ""},
req: &mgmt.JoinReq{SrvFaultDomain: "/r1/n2"},
expFaultDomain: system.MustCreateFaultDomainFromString("/r1/n2"),
expLabels: []string{"", ""},
},
// The remaining cases expect a label mismatch fault and expect the stored
// labels to be unchanged from curLabels.
"labeled request with unlabeled system": {
curLabels: []string{"", ""},
req: &mgmt.JoinReq{
SrvFaultDomain: "/rack=r1/node=n2",
Uri: testURI,
},
expErr: FaultBadFaultDomainLabels("/rack=r1/node=n2", testURI, []string{"rack", "node"}, nil),
expLabels: []string{"", ""},
},
"unlabeled request with labeled system": {
curLabels: []string{"rack", "node"},
req: &mgmt.JoinReq{
SrvFaultDomain: "/r1/n2",
Uri: testURI,
},
expErr: FaultBadFaultDomainLabels("/r1/n2", testURI, nil, []string{"rack", "node"}),
expLabels: []string{"rack", "node"},
},
"mismatched labels": {
curLabels: []string{"rack", "node"},
req: &mgmt.JoinReq{
SrvFaultDomain: "/rack=r1/host=n2",
Uri: testURI,
},
expErr: FaultBadFaultDomainLabels("/rack=r1/host=n2", testURI, []string{"rack", "host"}, []string{"rack", "node"}),
expLabels: []string{"rack", "node"},
},
"mismatched length": {
curLabels: []string{"rack"},
req: &mgmt.JoinReq{
SrvFaultDomain: "/rack=r1/node=n2",
Uri: testURI,
},
expErr: FaultBadFaultDomainLabels("/rack=r1/node=n2", testURI, []string{"rack", "node"}, []string{"rack"}),
expLabels: []string{"rack"},
},
} {
t.Run(name, func(t *testing.T) {
log, buf := logging.NewTestLogger(t.Name())
defer test.ShowBufferOnFailure(t, buf)

// Fall back to a default service with two stopped members when the case
// does not supply its own constructor.
if tc.getSvc == nil {
tc.getSvc = func(t *testing.T, l logging.Logger) *mgmtSvc {
svc := mgmtSystemTestSetup(t, l,
system.Members{
mockMember(t, 1, 1, "stopped"),
mockMember(t, 2, 2, "stopped"),
},
[]*control.HostResponse{})
return svc
}
}
svc := tc.getSvc(t, log)
// Seed the stored labels only when the case provides them.
if tc.curLabels != nil {
if err := svc.setDomainLabels(tc.curLabels); err != nil {
t.Fatal(err)
}
}

fd, err := svc.verifyFaultDomain(tc.req)

test.CmpErr(t, tc.expErr, err)
test.AssertTrue(t, fd.Equals(tc.expFaultDomain), fmt.Sprintf("want %q, got %q", tc.expFaultDomain, fd))

// A nil expLabels means the case makes no claim about stored labels.
if tc.expLabels == nil {
return
}

newLabels, labelErr := svc.getDomainLabels()
// expLabels is non-nil here: an empty slice expects the lookup to report
// that the attribute was not found; otherwise the lookup must succeed.
if len(tc.expLabels) == 0 {
test.AssertTrue(t, system.IsErrSystemAttrNotFound(labelErr), "")
} else if labelErr != nil {
t.Fatal(labelErr)
}
test.CmpAny(t, "", tc.expLabels, newLabels)
})
}
}

func TestMgmtSvc_updateFabricProviders(t *testing.T) {
for name, tc := range map[string]struct {
getSvc func(*testing.T, logging.Logger) *mgmtSvc
Expand Down
Loading

0 comments on commit a1d734a

Please sign in to comment.