Skip to content

Commit

Permalink
DAOS-16791 control: Add include_fabric_ifaces to agent config (#15470)
Browse files Browse the repository at this point in the history
Provide an inverse to the existing exclude_fabric_ifaces
directive. In some cases, a given environment will only have
a small number of valid interfaces, so it is simpler to
specify that rather than having to exclude all of the
invalid interfaces.

Signed-off-by: Michael MacDonald <mjmac@google.com>
  • Loading branch information
mjmac authored Nov 12, 2024
1 parent d820b71 commit 8a8366d
Show file tree
Hide file tree
Showing 6 changed files with 154 additions and 30 deletions.
38 changes: 28 additions & 10 deletions src/control/cmd/daos_agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,39 @@ type Config struct {
DisableAutoEvict bool `yaml:"disable_auto_evict,omitempty"`
EvictOnStart bool `yaml:"enable_evict_on_start,omitempty"`
ExcludeFabricIfaces common.StringSet `yaml:"exclude_fabric_ifaces,omitempty"`
IncludeFabricIfaces common.StringSet `yaml:"include_fabric_ifaces,omitempty"`
FabricInterfaces []*NUMAFabricConfig `yaml:"fabric_ifaces,omitempty"`
ProviderIdx uint // TODO SRS-31: Enable with multiprovider functionality
TelemetryPort int `yaml:"telemetry_port,omitempty"`
TelemetryEnabled bool `yaml:"telemetry_enabled,omitempty"`
TelemetryRetain time.Duration `yaml:"telemetry_retain,omitempty"`
}

// Validate checks the configuration for internal consistency and returns
// the first problem found, or nil when every check passes.
//
// The checks, in order: the receiver must be non-nil, the system name must
// be accepted by daos.SystemNameIsValid, telemetry_retain and
// telemetry_enabled each require a telemetry_port, and the exclude/include
// fabric interface sets may not both have entries.
func (c *Config) Validate() error {
	if c == nil {
		return errors.New("config is nil")
	}

	// Both telemetry checks hinge on whether a port was configured.
	noTelemetryPort := c.TelemetryPort == 0

	switch {
	case !daos.SystemNameIsValid(c.SystemName):
		return fmt.Errorf("invalid system name: %s", c.SystemName)
	case c.TelemetryRetain > 0 && noTelemetryPort:
		return errors.New("telemetry_retain requires telemetry_port")
	case c.TelemetryEnabled && noTelemetryPort:
		return errors.New("telemetry_enabled requires telemetry_port")
	case len(c.ExcludeFabricIfaces) > 0 && len(c.IncludeFabricIfaces) > 0:
		return errors.New("cannot specify both exclude_fabric_ifaces and include_fabric_ifaces")
	}

	return nil
}

// TelemetryExportEnabled returns true if client telemetry export is enabled.
func (c *Config) TelemetryExportEnabled() bool {
return c.TelemetryPort > 0
Expand Down Expand Up @@ -95,16 +121,8 @@ func LoadConfig(cfgPath string) (*Config, error) {
return nil, errors.Wrapf(err, "parsing config: %s", cfgPath)
}

if !daos.SystemNameIsValid(cfg.SystemName) {
return nil, fmt.Errorf("invalid system name: %s", cfg.SystemName)
}

if cfg.TelemetryRetain > 0 && cfg.TelemetryPort == 0 {
return nil, errors.New("telemetry_retain requires telemetry_port")
}

if cfg.TelemetryEnabled && cfg.TelemetryPort == 0 {
return nil, errors.New("telemetry_enabled requires telemetry_port")
if err := cfg.Validate(); err != nil {
return nil, errors.Wrap(err, "agent config validation failed")
}

return cfg, nil
Expand Down
16 changes: 16 additions & 0 deletions src/control/cmd/daos_agent/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,18 @@ transport_config:
allow_insecure: true
`)

badFilterCfg := test.CreateTestFile(t, dir, `
name: shire
access_points: ["one:10001", "two:10001"]
port: 4242
runtime_dir: /tmp/runtime
log_file: /home/frodo/logfile
transport_config:
allow_insecure: true
include_fabric_ifaces: ["ib0"]
exclude_fabric_ifaces: ["ib3"]
`)

for name, tc := range map[string]struct {
path string
expResult *Config
Expand Down Expand Up @@ -128,6 +140,10 @@ transport_config:
path: badLogMaskCfg,
expErr: errors.New("not a valid log level"),
},
"bad filter config": {
path: badFilterCfg,
expErr: errors.New("cannot specify both exclude_fabric_ifaces and include_fabric_ifaces"),
},
"all options": {
path: optCfg,
expResult: &Config{
Expand Down
53 changes: 42 additions & 11 deletions src/control/cmd/daos_agent/fabric.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,16 +85,47 @@ func (nfm NUMAFabricMap) MaxNUMANode() int {
return max
}

// filterMode selects how a device filter interprets membership in its set.
type filterMode int

const (
	// filterModeExclude indicates that devices in the set should be excluded.
	// It is the zero value of filterMode.
	filterModeExclude filterMode = iota
	// filterModeInclude indicates that only devices in the set should be included.
	filterModeInclude
)

// deviceFilter pairs a set of device names with the mode that determines
// whether membership in that set causes a device to be ignored or kept.
type deviceFilter struct {
// deviceSet holds the device names the filter is evaluated against.
deviceSet common.StringSet
// mode selects exclude or include semantics for deviceSet.
mode filterMode
}

// ShouldIgnore reports whether the named device should be skipped.
//
// A nil filter, or a filter whose device set is nil, never ignores anything.
// In exclude mode a device is ignored when it is in the set; in any other
// mode it is ignored when it is absent from the set.
func (df *deviceFilter) ShouldIgnore(devName string) bool {
	if df == nil || df.deviceSet == nil {
		return false
	}

	listed := df.deviceSet.Has(devName)
	switch df.mode {
	case filterModeExclude:
		return listed
	default:
		return !listed
	}
}

// newDeviceFilter returns a deviceFilter that evaluates device names against
// deviceSet using the given mode. The set is stored as passed, not copied.
func newDeviceFilter(deviceSet common.StringSet, mode filterMode) *deviceFilter {
	df := new(deviceFilter)
	df.deviceSet = deviceSet
	df.mode = mode
	return df
}

// NUMAFabric represents a set of fabric interfaces organized by NUMA node.
type NUMAFabric struct {
// log receives trace output, e.g. when a device filter is installed or a
// device is skipped during selection.
log logging.Logger
// NOTE(review): mutex presumably guards the selection state below; the
// locking call sites are not visible here, so confirm before relying on it.
mutex sync.RWMutex

// numaMap holds the known fabric interfaces keyed by NUMA node.
numaMap NUMAFabricMap

currentNumaDevIdx map[int]int // current device idx to use on each NUMA node
currentNUMANode int // current NUMA node to search
ignoreIfaces common.StringSet
currentNumaDevIdx map[int]int // current device idx to use on each NUMA node
currentNUMANode int // current NUMA node to search
ifaceFilter *deviceFilter // filter consulted via ShouldIgnore to skip interfaces during device selection

// getAddrInterface looks up an addrFI by interface name; it is a field so
// that tests can substitute a mock implementation.
getAddrInterface func(name string) (addrFI, error)
}
Expand All @@ -112,12 +143,12 @@ func (n *NUMAFabric) Add(numaNode int, fi *FabricInterface) error {
return nil
}

// WithIgnoredDevices adds a set of fabric interface names that should be ignored when
// selecting a device.
func (n *NUMAFabric) WithIgnoredDevices(ifaces common.StringSet) *NUMAFabric {
n.ignoreIfaces = ifaces
if len(ifaces) > 0 {
n.log.Tracef("ignoring fabric devices: %s", n.ignoreIfaces)
// WithDeviceFilter installs the filter that is consulted when selecting a
// device and returns the receiver so that calls can be chained. A nil filter
// is ignored, leaving any previously installed filter in place.
func (n *NUMAFabric) WithDeviceFilter(filter *deviceFilter) *NUMAFabric {
	if filter == nil {
		return n
	}

	n.ifaceFilter = filter
	n.log.Tracef("fabric device filter: %+v", n.ifaceFilter)
	return n
}
Expand Down Expand Up @@ -234,8 +265,8 @@ func (n *NUMAFabric) getDeviceFromNUMA(numaNode int, netDevClass hardware.NetDev
for checked := 0; checked < n.getNumDevices(numaNode); checked++ {
fabricIF := n.getNextDevice(numaNode)

if n.ignoreIfaces.Has(fabricIF.Name) {
n.log.Tracef("device %s: ignored (ignore list %s)", fabricIF, n.ignoreIfaces)
if n.ifaceFilter.ShouldIgnore(fabricIF.Name) {
n.log.Tracef("device %s: ignored (filter: %+v)", fabricIF, n.ifaceFilter)
continue
}

Expand Down
61 changes: 54 additions & 7 deletions src/control/cmd/daos_agent/fabric_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,8 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) {
for name, tc := range map[string]struct {
nf *NUMAFabric
params *FabricIfaceParams
ignore []string
include []string
exclude []string
expErr error
expResults []*FabricInterface
}{
Expand Down Expand Up @@ -723,7 +724,7 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) {
},
},
},
"ignore interface": {
"include interface": {
nf: &NUMAFabric{
numaMap: map[int][]*FabricInterface{
0: {
Expand All @@ -749,7 +750,7 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) {
Provider: "ofi+sockets",
DevClass: hardware.Ether,
},
ignore: []string{"t1"},
include: []string{"t2"},
expResults: []*FabricInterface{
{
Name: "t2",
Expand All @@ -763,7 +764,7 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) {
},
},
},
"ignore all interfaces": {
"exclude interface": {
nf: &NUMAFabric{
numaMap: map[int][]*FabricInterface{
0: {
Expand All @@ -773,6 +774,8 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) {
DeviceClass: hardware.Ether,
Providers: testFabricProviderSet("ofi+sockets"),
})[0],
},
1: {
fabricInterfacesFromHardware(&hardware.FabricInterface{
NetInterfaces: common.NewStringSet("t2"),
Name: "t2",
Expand All @@ -787,8 +790,46 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) {
Provider: "ofi+sockets",
DevClass: hardware.Ether,
},
ignore: []string{"t1", "t2"},
expErr: errors.New("no suitable fabric interface"),
exclude: []string{"t1"},
expResults: []*FabricInterface{
{
Name: "t2",
Domain: "t2",
NetDevClass: hardware.Ether,
},
{
Name: "t2",
Domain: "t2",
NetDevClass: hardware.Ether,
},
},
},
"exclude all interfaces": {
nf: &NUMAFabric{
numaMap: map[int][]*FabricInterface{
0: {
fabricInterfacesFromHardware(&hardware.FabricInterface{
NetInterfaces: common.NewStringSet("t1"),
Name: "t1",
DeviceClass: hardware.Ether,
Providers: testFabricProviderSet("ofi+sockets"),
})[0],
fabricInterfacesFromHardware(&hardware.FabricInterface{
NetInterfaces: common.NewStringSet("t2"),
Name: "t2",
DeviceClass: hardware.Ether,
Providers: testFabricProviderSet("ofi+sockets"),
})[0],
},
},
},
params: &FabricIfaceParams{
NUMANode: 0,
Provider: "ofi+sockets",
DevClass: hardware.Ether,
},
exclude: []string{"t1", "t2"},
expErr: errors.New("no suitable fabric interface"),
},
} {
t.Run(name, func(t *testing.T) {
Expand All @@ -800,7 +841,13 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) {
tc.nf.getAddrInterface = getMockNetInterfaceSuccess
}

tc.nf = tc.nf.WithIgnoredDevices(common.NewStringSet(tc.ignore...))
mode := filterModeExclude
devSet := common.NewStringSet(tc.exclude...)
if len(tc.include) > 0 {
mode = filterModeInclude
devSet = common.NewStringSet(tc.include...)
}
tc.nf = tc.nf.WithDeviceFilter(newDeviceFilter(devSet, mode))
}

numDevices := 0
Expand Down
9 changes: 8 additions & 1 deletion src/control/cmd/daos_agent/infocache.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,20 @@ func NewInfoCache(ctx context.Context, log logging.Logger, client control.UnaryI
return ic
}

// fabricDeviceFilter builds the fabric interface filter described by the
// agent config.
//
// A non-empty exclude_fabric_ifaces set yields an exclude-mode filter.
// Otherwise, a non-empty include_fabric_ifaces set yields an include-mode
// filter. When neither set has entries the result is an exclude-mode filter
// over the (empty) exclude set, which ignores nothing. An include-mode filter
// is deliberately never built from an empty set: a non-nil set with no
// entries would cause every interface to be rejected, even though Validate
// treats a zero-length set as "not specified".
//
// A nil cfg yields a nil filter, which both WithDeviceFilter and ShouldIgnore
// accept.
func fabricDeviceFilter(cfg *Config) *deviceFilter {
	if cfg == nil {
		return nil
	}

	// Exclude keeps precedence, matching the original ordering; include is
	// only honored when it actually names at least one interface.
	if len(cfg.ExcludeFabricIfaces) == 0 && len(cfg.IncludeFabricIfaces) > 0 {
		return newDeviceFilter(cfg.IncludeFabricIfaces, filterModeInclude)
	}

	return newDeviceFilter(cfg.ExcludeFabricIfaces, filterModeExclude)
}

// getFabricScanFn returns a fabricScanFn closure that runs scanner.Scan for
// the requested providers and builds a NUMAFabric from the scan results via
// NUMAFabricFromScan, with the fabric interface filtering from cfg attached.
// A scan error is returned unchanged together with a nil NUMAFabric.
func getFabricScanFn(log logging.Logger, cfg *Config, scanner *hardware.FabricScanner) fabricScanFn {
return func(ctx context.Context, provs ...string) (*NUMAFabric, error) {
fis, err := scanner.Scan(ctx, provs...)
if err != nil {
return nil, err
}
return NUMAFabricFromScan(ctx, log, fis).WithIgnoredDevices(cfg.ExcludeFabricIfaces), nil
return NUMAFabricFromScan(ctx, log, fis).WithDeviceFilter(fabricDeviceFilter(cfg)), nil
}
}

Expand Down
7 changes: 6 additions & 1 deletion utils/config/daos_agent.yml
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,15 @@
#cache_expiration: 30

## Ignore a subset of fabric interfaces when selecting an interface for client
## applications.
## applications. (Mutually exclusive with include_fabric_ifaces.)
#
#exclude_fabric_ifaces: ["lo", "eth1"]

## Conversely, only consider a specific set of fabric interfaces when selecting
## an interface for client applications. (Mutually exclusive with exclude_fabric_ifaces.)
#
#include_fabric_ifaces: ["eth0"]

# Manually define the fabric interfaces and domains to be used by the agent,
# organized by NUMA node.
# If not defined, the agent will automatically detect all fabric interfaces and
Expand Down

0 comments on commit 8a8366d

Please sign in to comment.