-
Notifications
You must be signed in to change notification settings - Fork 301
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
DAOS-13292 control: Use cart API to detect fabric (#13989)
- Add a lib/hardware package to collect fabric interface information through CART API.
- Remove custom OFI and UCX packages and dependencies.
- Update Go githook to ignore deleted files.

* Compensate for DAOS-15588

For systems without Infiniband, getting info for verbs produces a Mercury error. For all other providers, including UCX verbs, it returns no error and instead returns no results. We'll simulate that behavior here until the underlying bug is fixed.

Signed-off-by: Kris Jacque <kris.jacque@intel.com>
- Loading branch information
Showing
21 changed files
with
645 additions
and
1,567 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
// | ||
// (C) Copyright 2024 Intel Corporation. | ||
// | ||
// SPDX-License-Identifier: BSD-2-Clause-Patent | ||
// | ||
|
||
package cart | ||
|
||
/* | ||
#cgo LDFLAGS: -lcart | ||
#include <cart/types.h> | ||
#include <cart/api.h> | ||
*/ | ||
import "C" | ||
|
||
import ( | ||
"unsafe" | ||
|
||
"github.com/pkg/errors" | ||
|
||
"github.com/daos-stack/daos/src/control/lib/daos" | ||
"github.com/daos-stack/daos/src/control/logging" | ||
) | ||
|
||
func getProtocolInfo(log logging.Logger, provider string) ([]*crtFabricDevice, error) { | ||
var cInfo *C.struct_crt_protocol_info | ||
var cProtoStr *C.char | ||
if provider != "" { | ||
log.Debugf("getting fabric protocol info from CART for %q", provider) | ||
cProtoStr = C.CString(provider) | ||
defer C.free(unsafe.Pointer(cProtoStr)) | ||
} else { | ||
log.Debug("getting all fabric protocol info from CART") | ||
} | ||
|
||
if err := daos.Status(C.crt_protocol_info_get(cProtoStr, &cInfo)); err != daos.Success { | ||
return nil, errors.Wrap(err, "crt_hg_get_protocol_info") | ||
} | ||
defer C.crt_protocol_info_free(cInfo) | ||
|
||
infoList := make([]*crtFabricDevice, 0) | ||
|
||
for cur := cInfo; cur != nil; cur = cur.next { | ||
infoList = append(infoList, cToCrtProtocolInfo(cur)) | ||
} | ||
|
||
log.Debugf("CART protocol info discovered:\n%+v", infoList) | ||
return infoList, nil | ||
} | ||
|
||
func cToCrtProtocolInfo(cInfo *C.struct_crt_protocol_info) *crtFabricDevice { | ||
return &crtFabricDevice{ | ||
Class: C.GoString(cInfo.class_name), | ||
Protocol: C.GoString(cInfo.protocol_name), | ||
Device: C.GoString(cInfo.device_name), | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
// | ||
// (C) Copyright 2024 Intel Corporation. | ||
// | ||
// SPDX-License-Identifier: BSD-2-Clause-Patent | ||
// | ||
|
||
package cart | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"strings" | ||
|
||
"github.com/pkg/errors" | ||
|
||
"github.com/daos-stack/daos/src/control/lib/daos" | ||
"github.com/daos-stack/daos/src/control/lib/hardware" | ||
"github.com/daos-stack/daos/src/control/logging" | ||
) | ||
|
||
// Class names as they appear in the Class field of a crtFabricDevice.
const (
	// classLibFabric is the class string for libfabric (OFI) devices.
	classLibFabric = "ofi"
	// classUCX is the class string for UCX devices; see crtFabricDevice.IsUCX.
	classUCX = "ucx"
	// classNA is the "na" class string.
	// NOTE(review): not referenced in the visible code — confirm where it is used.
	classNA = "na"
)
|
||
// crtFabricDevice describes one fabric device entry reported by CART: the
// class it belongs to, the protocol it speaks, and the device name.
type crtFabricDevice struct {
	// Class is the CART class name (compared against classUCX by IsUCX).
	Class string `json:"class"`
	// Protocol is the protocol name within the class.
	Protocol string `json:"protocol"`
	// Device is the device name as reported by CART.
	Device string `json:"device"`
}
|
||
// isUCX indicates whether this is a UCX device. | ||
func (cfd *crtFabricDevice) IsUCX() bool { | ||
return cfd.Class == classUCX | ||
} | ||
|
||
// OSName returns the OS level network device name for this device. | ||
func (cfd *crtFabricDevice) OSName() string { | ||
if cfd.IsUCX() { | ||
return getOSNameFromUCXDevice(cfd.Device) | ||
} | ||
return cfd.Device | ||
} | ||
|
||
// ProviderName returns the DAOS fabric provider name for this device's protocol. | ||
func (cfd *crtFabricDevice) ProviderName() string { | ||
return fmt.Sprintf("%s+%s", cfd.Class, cfd.Protocol) | ||
} | ||
|
||
type getProtocolFn func(log logging.Logger, provider string) ([]*crtFabricDevice, error) | ||
|
||
// Provider provides access to the CART API. | ||
type Provider struct { | ||
log logging.Logger | ||
getProtocolInfo getProtocolFn | ||
} | ||
|
||
// NewProvider creates a new CART Provider. | ||
func NewProvider(log logging.Logger) *Provider { | ||
return &Provider{ | ||
log: log, | ||
} | ||
} | ||
|
||
// GetFabricInterfaces fetches information about the system fabric interfaces via CART. | ||
func (p *Provider) GetFabricInterfaces(ctx context.Context, provider string) (*hardware.FabricInterfaceSet, error) { | ||
if p == nil { | ||
return nil, errors.New("nil CART Provider") | ||
} | ||
|
||
ch := make(chan *fabricResult) | ||
go p.getFabricInterfaces(provider, ch) | ||
select { | ||
case <-ctx.Done(): | ||
return nil, ctx.Err() | ||
case result := <-ch: | ||
return result.fiSet, result.err | ||
} | ||
} | ||
|
||
type fabricResult struct { | ||
fiSet *hardware.FabricInterfaceSet | ||
err error | ||
} | ||
|
||
// providerPriorities maps a provider name to the priority assigned to it.
type providerPriorities map[string]int

// getPriority returns the priority recorded for provName. A name seen for the
// first time is assigned the current number of entries as its priority (so
// priorities count up from 0 in first-seen order) and is stored in the map.
func (p providerPriorities) getPriority(provName string) int {
	if known, found := p[provName]; found {
		return known
	}

	next := len(p)
	p[provName] = next
	return next
}
|
||
func (p *Provider) getFabricInterfaces(provider string, ch chan *fabricResult) { | ||
if p.getProtocolInfo == nil { | ||
p.getProtocolInfo = getProtocolInfo | ||
} | ||
|
||
devices, err := p.getProtocolInfo(p.log, provider) | ||
if err != nil { | ||
// TODO DAOS-15588: Remove this special handling for verbs once the | ||
// underlying Mercury bug is fixed. | ||
// Currently requesting verbs on a system without Infiniband results in | ||
// a Mercury error. | ||
if errors.Is(err, daos.MercuryFatalError) && strings.HasSuffix(provider, "verbs") { | ||
ch <- &fabricResult{ | ||
fiSet: hardware.NewFabricInterfaceSet(), | ||
} | ||
return | ||
} | ||
|
||
provMsg := "" | ||
if provider != "" { | ||
provMsg = fmt.Sprintf(" for provider %q", provider) | ||
} | ||
ch <- &fabricResult{ | ||
err: errors.Wrapf(err, "fetching fabric interfaces%s", provMsg), | ||
} | ||
return | ||
} | ||
|
||
fis := hardware.NewFabricInterfaceSet() | ||
priorities := make(providerPriorities) | ||
for _, dev := range devices { | ||
fis.Update(crtFabricDeviceToFabricInterface(dev, priorities)) | ||
} | ||
|
||
ch <- &fabricResult{ | ||
fiSet: fis, | ||
} | ||
} | ||
|
||
func crtFabricDeviceToFabricInterface(dev *crtFabricDevice, priorities providerPriorities) *hardware.FabricInterface { | ||
return &hardware.FabricInterface{ | ||
Name: dev.Device, | ||
OSName: dev.OSName(), | ||
Providers: getProviderSet(dev, priorities), | ||
} | ||
} | ||
|
||
// getProviderSet returns a set of one or more DAOS providers associated with the protocol info. | ||
func getProviderSet(dev *crtFabricDevice, priorities providerPriorities) *hardware.FabricProviderSet { | ||
if dev.IsUCX() { | ||
// UCX determines its own priorities within the provider set | ||
return getProviderSetFromUCXTransport(dev.Protocol) | ||
} | ||
|
||
name := dev.ProviderName() | ||
return hardware.NewFabricProviderSet(&hardware.FabricProvider{ | ||
Name: name, | ||
Priority: priorities.getPriority(name), | ||
}) | ||
} |
Oops, something went wrong.