Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-13292 control: Use cart API to detect fabric #13989

Merged
merged 12 commits into from
Apr 29, 2024
Merged
3 changes: 1 addition & 2 deletions src/control/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def is_firmware_mgmt_build(benv):

def get_build_tags(benv):
"Get custom go build tags."
tags = ["ucx", "spdk"]
tags = ["spdk"]
if is_firmware_mgmt_build(benv):
tags.append("firmware")
if not is_release_build(benv):
Expand Down Expand Up @@ -124,7 +124,6 @@ def scons():

denv.Tool('go_builder')

denv.require('ofi', 'ucx')
# Sets CGO_LDFLAGS for rpath options
denv.d_add_rpaths("..", True, True)
denv.AppendENVPath("CGO_CFLAGS", denv.subst("$_CPPINCFLAGS"), sep=" ")
Expand Down
8 changes: 0 additions & 8 deletions src/control/cmd/daos_agent/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,6 @@ func (cmd *startCmd) Execute(_ []string) error {
}
cmd.Debugf("created dRPC server: %s", time.Since(createDrpcStart))

hwprovInitStart := time.Now()
hwprovFini, err := hwprov.Init(cmd.Logger)
if err != nil {
return err
}
defer hwprovFini()
cmd.Debugf("initialized hardware providers: %s", time.Since(hwprovInitStart))

cacheStart := time.Now()
cache := NewInfoCache(ctx, cmd.Logger, cmd.ctlInvoker, cmd.cfg)
if cmd.attachInfoCacheDisabled() {
Expand Down
2 changes: 2 additions & 0 deletions src/control/lib/daos/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ const (
BadTarget Status = -C.DER_BAD_TARGET
// GroupVersionMismatch indicates that group versions didn't match
GroupVersionMismatch Status = -C.DER_GRPVER
// MercuryFatalError indicates a fatal (non-retryable) Mercury error
MercuryFatalError Status = -C.DER_HG_FATAL
// NoService indicates the pool service is not up and didn't process the pool request
NoService Status = -C.DER_NO_SERVICE
)
Expand Down
58 changes: 58 additions & 0 deletions src/control/lib/hardware/cart/bindings.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
//
// (C) Copyright 2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

package cart

/*
#cgo LDFLAGS: -lcart
#include <cart/types.h>
#include <cart/api.h>
*/
import "C"

import (
"unsafe"

"github.com/pkg/errors"

"github.com/daos-stack/daos/src/control/lib/daos"
"github.com/daos-stack/daos/src/control/logging"
)

// getProtocolInfo asks CART for the fabric protocol info matching provider and
// converts the returned C linked list into a slice of crtFabricDevice.
//
// When provider is empty, a nil protocol string is passed to
// crt_protocol_info_get, which is logged as a request for all protocol info.
// The C list is released with crt_protocol_info_free before returning; every
// string in the result has already been copied into Go memory by then.
//
// A non-success status from CART is returned as a wrapped daos.Status error.
func getProtocolInfo(log logging.Logger, provider string) ([]*crtFabricDevice, error) {
	var cInfo *C.struct_crt_protocol_info
	var cProtoStr *C.char
	if provider != "" {
		log.Debugf("getting fabric protocol info from CART for %q", provider)
		cProtoStr = C.CString(provider)
		// C.CString allocates with malloc, so the copy must be freed explicitly.
		defer C.free(unsafe.Pointer(cProtoStr))
	} else {
		log.Debug("getting all fabric protocol info from CART")
	}

	if err := daos.Status(C.crt_protocol_info_get(cProtoStr, &cInfo)); err != daos.Success {
		// Label the error with the function that was actually called.
		return nil, errors.Wrap(err, "crt_protocol_info_get")
	}
	defer C.crt_protocol_info_free(cInfo)

	infoList := make([]*crtFabricDevice, 0)

	// Walk the singly linked list until the terminating nil next pointer.
	for cur := cInfo; cur != nil; cur = cur.next {
		infoList = append(infoList, cToCrtProtocolInfo(cur))
	}

	log.Debugf("CART protocol info discovered:\n%+v", infoList)
	return infoList, nil
}

// cToCrtProtocolInfo copies the class, protocol and device names out of a
// single C crt_protocol_info entry into a newly allocated crtFabricDevice.
func cToCrtProtocolInfo(cInfo *C.struct_crt_protocol_info) *crtFabricDevice {
	dev := new(crtFabricDevice)
	dev.Class = C.GoString(cInfo.class_name)
	dev.Protocol = C.GoString(cInfo.protocol_name)
	dev.Device = C.GoString(cInfo.device_name)
	return dev
}
158 changes: 158 additions & 0 deletions src/control/lib/hardware/cart/cart.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
//
// (C) Copyright 2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

package cart

import (
"context"
"fmt"
"strings"

"github.com/pkg/errors"

"github.com/daos-stack/daos/src/control/lib/daos"
"github.com/daos-stack/daos/src/control/lib/hardware"
"github.com/daos-stack/daos/src/control/logging"
)

// Class names matched against crtFabricDevice.Class.
const (
// classLibFabric is the class name string for libfabric ("ofi").
classLibFabric = "ofi"
// classUCX is the class name string that IsUCX compares against.
classUCX = "ucx"
// classNA is not referenced in the visible code.
// NOTE(review): presumably the generic Mercury NA class — confirm.
classNA = "na"
)

// crtFabricDevice is a single fabric device discovered by CART.
type crtFabricDevice struct {
// Class is the class name; IsUCX compares it against classUCX.
Class string `json:"class"`
// Protocol is the protocol name; ProviderName joins it to Class as "<class>+<protocol>".
Protocol string `json:"protocol"`
// Device is the device name; OSName returns it unchanged for non-UCX devices.
Device string `json:"device"`
}

// IsUCX indicates whether this is a UCX device, i.e. whether its Class equals classUCX.
func (cfd *crtFabricDevice) IsUCX() bool {
return cfd.Class == classUCX
}

// OSName reports the OS-level network device name for this device.
// Non-UCX devices use the Device field as-is; UCX device names are
// translated by getOSNameFromUCXDevice.
func (cfd *crtFabricDevice) OSName() string {
	if !cfd.IsUCX() {
		return cfd.Device
	}
	return getOSNameFromUCXDevice(cfd.Device)
}

// ProviderName builds the DAOS fabric provider name for this device's
// protocol, in the form "<class>+<protocol>".
func (cfd *crtFabricDevice) ProviderName() string {
	class, proto := cfd.Class, cfd.Protocol
	return fmt.Sprintf("%s+%s", class, proto)
}

// getProtocolFn is the signature of the function a Provider calls to fetch the
// fabric devices for a provider string; getProtocolInfo is used when the
// Provider has none set.
// NOTE(review): presumably exists so tests can substitute a mock — confirm.
type getProtocolFn func(log logging.Logger, provider string) ([]*crtFabricDevice, error)

// Provider provides access to the CART API.
type Provider struct {
// log is passed to the protocol info lookup function.
log logging.Logger
// getProtocolInfo is the lookup function; NewProvider leaves it nil, in
// which case getFabricInterfaces falls back to the package-level getProtocolInfo.
getProtocolInfo getProtocolFn
}

// NewProvider returns a CART Provider that logs through log. The protocol
// info lookup function is left unset.
func NewProvider(log logging.Logger) *Provider {
	prov := new(Provider)
	prov.log = log
	return prov
}

// GetFabricInterfaces fetches information about the system fabric interfaces via CART.
//
// The lookup runs in a separate goroutine so that the call can return as soon
// as ctx is done. In that case ctx.Err() is returned and the lookup result is
// discarded. An empty provider string requests all providers.
//
// Calling this method on a nil Provider returns an error.
func (p *Provider) GetFabricInterfaces(ctx context.Context, provider string) (*hardware.FabricInterfaceSet, error) {
	if p == nil {
		return nil, errors.New("nil CART Provider")
	}

	// The channel is buffered so the worker can always deliver its single
	// result and exit, even when nobody is receiving anymore because ctx was
	// done first. With an unbuffered channel the worker would block forever.
	ch := make(chan *fabricResult, 1)
	go p.getFabricInterfaces(provider, ch)

	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	case result := <-ch:
		return result.fiSet, result.err
	}
}

// fabricResult is the value getFabricInterfaces sends over its channel to
// report the outcome of a lookup back to GetFabricInterfaces.
type fabricResult struct {
// fiSet is the discovered interface set; left nil when err is set.
fiSet *hardware.FabricInterfaceSet
// err is the lookup failure, if any.
err error
}

// providerPriorities maps a provider name to the priority assigned to it.
type providerPriorities map[string]int

// getPriority returns the priority recorded for provName. A name seen for the
// first time is assigned the current number of entries as its priority, so
// priorities are handed out in order of first appearance starting at 0.
func (p providerPriorities) getPriority(provName string) int {
	if known, found := p[provName]; found {
		return known
	}

	next := len(p)
	p[provName] = next
	return next
}

// getFabricInterfaces looks up the fabric devices for provider and sends
// exactly one fabricResult on ch: either the populated interface set or a
// wrapped error.
//
// It is started as a goroutine by GetFabricInterfaces, so it must not write
// to fields of p without synchronization.
func (p *Provider) getFabricInterfaces(provider string, ch chan *fabricResult) {
	// Resolve the lookup function into a local instead of lazily storing the
	// default on p. This runs in its own goroutine, and overlapping calls
	// (e.g. after a context timeout) would otherwise race on the field.
	getInfo := p.getProtocolInfo
	if getInfo == nil {
		getInfo = getProtocolInfo
	}

	devices, err := getInfo(p.log, provider)
	if err != nil {
		// TODO DAOS-15588: Remove this special handling for verbs once the
		// underlying Mercury bug is fixed.
		// Currently requesting verbs on a system without Infiniband results in
		// a Mercury error.
		if errors.Is(err, daos.MercuryFatalError) && strings.HasSuffix(provider, "verbs") {
			ch <- &fabricResult{
				fiSet: hardware.NewFabricInterfaceSet(),
			}
			return
		}

		provMsg := ""
		if provider != "" {
			provMsg = fmt.Sprintf(" for provider %q", provider)
		}
		ch <- &fabricResult{
			err: errors.Wrapf(err, "fetching fabric interfaces%s", provMsg),
		}
		return
	}

	fis := hardware.NewFabricInterfaceSet()
	// Priorities are assigned per call, in the order providers first appear
	// in the device list.
	priorities := make(providerPriorities)
	for _, dev := range devices {
		fis.Update(crtFabricDeviceToFabricInterface(dev, priorities))
	}

	ch <- &fabricResult{
		fiSet: fis,
	}
}

// crtFabricDeviceToFabricInterface converts a CART device into a
// hardware.FabricInterface, filling in its name, OS-level name and provider
// set. priorities is passed through to getProviderSet.
func crtFabricDeviceToFabricInterface(dev *crtFabricDevice, priorities providerPriorities) *hardware.FabricInterface {
	fi := new(hardware.FabricInterface)
	fi.Name = dev.Device
	fi.OSName = dev.OSName()
	fi.Providers = getProviderSet(dev, priorities)
	return fi
}

// getProviderSet builds the set of DAOS providers associated with dev.
// For UCX devices the set is derived from the transport (protocol) name.
// Any other device yields a single provider named by ProviderName, with its
// priority taken from priorities.
func getProviderSet(dev *crtFabricDevice, priorities providerPriorities) *hardware.FabricProviderSet {
	if !dev.IsUCX() {
		provName := dev.ProviderName()
		prov := &hardware.FabricProvider{
			Name:     provName,
			Priority: priorities.getPriority(provName),
		}
		return hardware.NewFabricProviderSet(prov)
	}

	// UCX determines its own priorities within the provider set
	return getProviderSetFromUCXTransport(dev.Protocol)
}
Loading
Loading