Skip to content

Commit

Permalink
DAOS-13292 control: Use cart API to detect fabric (#13989)
Browse files Browse the repository at this point in the history
- Add a lib/hardware package to collect fabric interface
  information through CART API.
- Remove custom OFI and UCX packages and dependencies.
- Update Go githook to ignore deleted files.

* Compensate for DAOS-15588

On systems without InfiniBand, requesting fabric info for verbs produces a
Mercury error. Every other provider, including UCX verbs, returns no error
and simply yields no results. We'll simulate that behavior for verbs here
until the underlying bug is fixed.

Signed-off-by: Kris Jacque <kris.jacque@intel.com>
  • Loading branch information
kjacque authored and mjmac committed Apr 30, 2024
1 parent 47d5a35 commit 0227079
Show file tree
Hide file tree
Showing 21 changed files with 645 additions and 1,567 deletions.
3 changes: 1 addition & 2 deletions src/control/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def is_firmware_mgmt_build(benv):

def get_build_tags(benv):
"Get custom go build tags."
tags = ["ucx", "spdk"]
tags = ["spdk"]
if is_firmware_mgmt_build(benv):
tags.append("firmware")
if not is_release_build(benv):
Expand Down Expand Up @@ -124,7 +124,6 @@ def scons():

denv.Tool('go_builder')

denv.require('ofi', 'ucx')
# Sets CGO_LDFLAGS for rpath options
denv.d_add_rpaths("..", True, True)
denv.AppendENVPath("CGO_CFLAGS", denv.subst("$_CPPINCFLAGS"), sep=" ")
Expand Down
8 changes: 0 additions & 8 deletions src/control/cmd/daos_agent/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,6 @@ func (cmd *startCmd) Execute(_ []string) error {
}
cmd.Debugf("created dRPC server: %s", time.Since(createDrpcStart))

hwprovInitStart := time.Now()
hwprovFini, err := hwprov.Init(cmd.Logger)
if err != nil {
return err
}
defer hwprovFini()
cmd.Debugf("initialized hardware providers: %s", time.Since(hwprovInitStart))

cacheStart := time.Now()
cache := NewInfoCache(ctx, cmd.Logger, cmd.ctlInvoker, cmd.cfg)
if cmd.attachInfoCacheDisabled() {
Expand Down
2 changes: 2 additions & 0 deletions src/control/lib/daos/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ const (
BadTarget Status = -C.DER_BAD_TARGET
// GroupVersionMismatch indicates that group versions didn't match
GroupVersionMismatch Status = -C.DER_GRPVER
// MercuryFatalError indicates a fatal (non-retryable) Mercury error
MercuryFatalError Status = -C.DER_HG_FATAL
// NoService indicates the pool service is not up and didn't process the pool request
NoService Status = -C.DER_NO_SERVICE
)
Expand Down
58 changes: 58 additions & 0 deletions src/control/lib/hardware/cart/bindings.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
//
// (C) Copyright 2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

package cart

/*
#cgo LDFLAGS: -lcart
#include <cart/types.h>
#include <cart/api.h>
*/
import "C"

import (
"unsafe"

"github.com/pkg/errors"

"github.com/daos-stack/daos/src/control/lib/daos"
"github.com/daos-stack/daos/src/control/logging"
)

// getProtocolInfo asks CART for its fabric protocol info and converts each entry
// of the returned C linked list into a crtFabricDevice.
//
// A non-empty provider is passed to CART as a C string; an empty provider leaves
// the C string pointer nil. Any status other than daos.Success is returned as a
// wrapped error with a nil slice.
func getProtocolInfo(log logging.Logger, provider string) ([]*crtFabricDevice, error) {
	var (
		cHead     *C.struct_crt_protocol_info
		cProvider *C.char
	)

	if provider == "" {
		log.Debug("getting all fabric protocol info from CART")
	} else {
		log.Debugf("getting fabric protocol info from CART for %q", provider)
		cProvider = C.CString(provider)
		// C.CString allocates with the C allocator, so release it on return.
		defer C.free(unsafe.Pointer(cProvider))
	}

	status := daos.Status(C.crt_protocol_info_get(cProvider, &cHead))
	if status != daos.Success {
		return nil, errors.Wrap(status, "crt_hg_get_protocol_info")
	}
	// The list is owned by CART; hand it back once the Go copies are made.
	defer C.crt_protocol_info_free(cHead)

	devices := []*crtFabricDevice{}
	for node := cHead; node != nil; node = node.next {
		devices = append(devices, cToCrtProtocolInfo(node))
	}

	log.Debugf("CART protocol info discovered:\n%+v", devices)
	return devices, nil
}

// cToCrtProtocolInfo copies the class, protocol and device name strings of a
// single C protocol info entry into a newly allocated crtFabricDevice.
func cToCrtProtocolInfo(cInfo *C.struct_crt_protocol_info) *crtFabricDevice {
	dev := new(crtFabricDevice)
	dev.Class = C.GoString(cInfo.class_name)
	dev.Protocol = C.GoString(cInfo.protocol_name)
	dev.Device = C.GoString(cInfo.device_name)
	return dev
}
158 changes: 158 additions & 0 deletions src/control/lib/hardware/cart/cart.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
//
// (C) Copyright 2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

package cart

import (
"context"
"fmt"
"strings"

"github.com/pkg/errors"

"github.com/daos-stack/daos/src/control/lib/daos"
"github.com/daos-stack/daos/src/control/lib/hardware"
"github.com/daos-stack/daos/src/control/logging"
)

// Fabric class names, matched against crtFabricDevice.Class.
const (
// classLibFabric is the "ofi" class name (presumably libfabric/OFI, going by
// the identifier — confirm against CART).
classLibFabric = "ofi"
// classUCX is the class name that IsUCX compares a device's Class against.
classUCX = "ucx"
// classNA is the "na" class name.
// NOTE(review): not referenced in the code visible here.
classNA = "na"
)

// crtFabricDevice is a single fabric device discovered by CART.
type crtFabricDevice struct {
// Class is the fabric class name; IsUCX compares it to classUCX and
// ProviderName uses it as the part before the "+".
Class string `json:"class"`
// Protocol is the protocol name; ProviderName uses it as the part after
// the "+", and UCX devices pass it to getProviderSetFromUCXTransport.
Protocol string `json:"protocol"`
// Device is the device name as reported by CART. OSName returns it as-is
// for non-UCX devices and translates it for UCX devices.
Device string `json:"device"`
}

// IsUCX indicates whether this is a UCX device, i.e. whether its Class equals
// classUCX.
func (cfd *crtFabricDevice) IsUCX() bool {
return cfd.Class == classUCX
}

// OSName returns the OS level network device name for this device.
//
// Non-UCX devices report their Device name unchanged; UCX device names are
// translated by getOSNameFromUCXDevice.
func (cfd *crtFabricDevice) OSName() string {
	if !cfd.IsUCX() {
		return cfd.Device
	}
	return getOSNameFromUCXDevice(cfd.Device)
}

// ProviderName returns the DAOS fabric provider name for this device's protocol,
// formed as "<Class>+<Protocol>".
func (cfd *crtFabricDevice) ProviderName() string {
	return cfd.Class + "+" + cfd.Protocol
}

// getProtocolFn is the signature of the function a Provider uses to look up
// fabric devices for a provider string; getProtocolInfo is used when the
// Provider has none set.
type getProtocolFn func(log logging.Logger, provider string) ([]*crtFabricDevice, error)

// Provider provides access to the CART API.
type Provider struct {
// log is the logger handed to the protocol info lookup.
log logging.Logger
// getProtocolInfo is the lookup function; when nil, getFabricInterfaces
// falls back to the package-level getProtocolInfo.
getProtocolInfo getProtocolFn
}

// NewProvider creates a new CART Provider that logs through log. The lookup
// function is left unset, so the package default is used when interfaces are
// fetched.
func NewProvider(log logging.Logger) *Provider {
	prov := new(Provider)
	prov.log = log
	return prov
}

// GetFabricInterfaces fetches information about the system fabric interfaces via CART.
//
// The lookup runs in its own goroutine so this call can return as soon as ctx is
// done. It returns:
//   - an error if p is nil;
//   - ctx.Err() if ctx is done before the lookup finishes;
//   - otherwise the interface set and error produced by the lookup.
func (p *Provider) GetFabricInterfaces(ctx context.Context, provider string) (*hardware.FabricInterfaceSet, error) {
	if p == nil {
		return nil, errors.New("nil CART Provider")
	}

	// Buffer of one: the worker sends exactly one result. With an unbuffered
	// channel the worker would block forever on that send (leaking the
	// goroutine) whenever ctx is done first and nobody is left to receive.
	ch := make(chan *fabricResult, 1)
	go p.getFabricInterfaces(provider, ch)

	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	case result := <-ch:
		return result.fiSet, result.err
	}
}

// fabricResult carries the outcome of a fabric interface lookup from the worker
// goroutine back to GetFabricInterfaces over a channel.
type fabricResult struct {
// fiSet is the discovered interface set; left nil when err is set.
fiSet *hardware.FabricInterfaceSet
// err is the lookup error, if any.
err error
}

// providerPriorities maps a provider name to the priority it has been assigned.
type providerPriorities map[string]int

// getPriority returns the priority recorded for provName. A name that has not
// been seen before is assigned the current number of entries as its priority,
// so priorities are handed out as 0, 1, 2, ... in order of first appearance.
func (p providerPriorities) getPriority(provName string) int {
	if known, found := p[provName]; found {
		return known
	}

	next := len(p)
	p[provName] = next
	return next
}

// getFabricInterfaces looks up the fabric devices for provider and sends exactly
// one fabricResult on ch: either the resulting interface set or a wrapped error.
//
// It is run as a goroutine by GetFabricInterfaces.
func (p *Provider) getFabricInterfaces(provider string, ch chan *fabricResult) {
	// Resolve the lookup function into a local instead of lazily storing the
	// default on the Provider: this method runs in a spawned goroutine, and
	// writing the shared field here would be a data race between overlapping calls.
	getInfo := p.getProtocolInfo
	if getInfo == nil {
		getInfo = getProtocolInfo
	}

	devices, err := getInfo(p.log, provider)
	if err != nil {
		// TODO DAOS-15588: Remove this special handling for verbs once the
		// underlying Mercury bug is fixed.
		// Currently requesting verbs on a system without InfiniBand results in
		// a Mercury error, so report an empty set rather than a failure.
		if errors.Is(err, daos.MercuryFatalError) && strings.HasSuffix(provider, "verbs") {
			ch <- &fabricResult{
				fiSet: hardware.NewFabricInterfaceSet(),
			}
			return
		}

		// Only mention the provider in the message when one was requested.
		provMsg := ""
		if provider != "" {
			provMsg = fmt.Sprintf(" for provider %q", provider)
		}
		ch <- &fabricResult{
			err: errors.Wrapf(err, "fetching fabric interfaces%s", provMsg),
		}
		return
	}

	fis := hardware.NewFabricInterfaceSet()
	// Priorities are assigned per call, in the order provider names first appear.
	priorities := make(providerPriorities)
	for _, dev := range devices {
		fis.Update(crtFabricDeviceToFabricInterface(dev, priorities))
	}

	ch <- &fabricResult{
		fiSet: fis,
	}
}

// crtFabricDeviceToFabricInterface builds a hardware.FabricInterface from a CART
// device: Name is the CART device name, OSName comes from dev.OSName(), and
// Providers from getProviderSet.
func crtFabricDeviceToFabricInterface(dev *crtFabricDevice, priorities providerPriorities) *hardware.FabricInterface {
	fi := new(hardware.FabricInterface)
	fi.Name = dev.Device
	fi.OSName = dev.OSName()
	fi.Providers = getProviderSet(dev, priorities)
	return fi
}

// getProviderSet returns a set of one or more DAOS providers associated with the protocol info.
//
// For non-UCX devices the set holds a single provider named by dev.ProviderName(),
// with its priority taken from priorities.
func getProviderSet(dev *crtFabricDevice, priorities providerPriorities) *hardware.FabricProviderSet {
	if !dev.IsUCX() {
		provName := dev.ProviderName()
		prov := &hardware.FabricProvider{
			Name:     provName,
			Priority: priorities.getPriority(provName),
		}
		return hardware.NewFabricProviderSet(prov)
	}

	// UCX determines its own priorities within the provider set
	return getProviderSetFromUCXTransport(dev.Protocol)
}
Loading

0 comments on commit 0227079

Please sign in to comment.