Skip to content

Commit

Permalink
return a slice of tags and update tags to match the gpu collector
Browse files: browse the repository at this point in the history
  • Loading branch information
wiyu committed Dec 4, 2024
1 parent 1edf062 commit 3ba8d1a
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 15 deletions.
9 changes: 4 additions & 5 deletions pkg/process/checks/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,7 @@ func fmtProcesses(
lookupIdProbe *LookupIdProbe,
zombiesIgnored bool,
serviceExtractor *parser.ServiceExtractor,
deviceUUIDByPid map[int32]string,
deviceUUIDByPid map[int32][]string,
) map[string][]*model.Process {
procsByCtr := make(map[string][]*model.Process)

Expand Down Expand Up @@ -532,10 +532,9 @@ func fmtProcesses(
ProcessContext: serviceExtractor.GetServiceContext(fp.Pid),
}

log.Infof("Getting getGPUID from fmtProcesses in process check for PID: %d, map:%v", fp.Pid, deviceUUIDByPid)
if gpuUUID, ok := deviceUUIDByPid[fp.Pid]; ok {
log.Infof("Found GPU UUID %s for PID: %d", gpuUUID, fp.Pid)
proc.Tags = []string{"gpu_device:" + gpuUUID}
if gpuTags, ok := deviceUUIDByPid[fp.Pid]; ok {
log.Infof("Found GPU tags %s for PID: %d", gpuTags, fp.Pid)
proc.Tags = gpuTags
}

if connRates != nil {
Expand Down
29 changes: 19 additions & 10 deletions pkg/process/procutil/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
package procutil

import (
"errors"

"github.com/DataDog/datadog-agent/pkg/util/log"
"github.com/NVIDIA/go-nvml/pkg/nvml"

Expand All @@ -18,7 +16,7 @@ import (
type NVMLProbe struct {
nvml nvml.Interface

DeviceUUIDByPid map[int32]string
DeviceUUIDByPid map[int32][]string
}

// NewGpuProbe creates a new GPU probe
Expand All @@ -33,7 +31,7 @@ func NewGpuProbe(config pkgconfigmodel.Reader) *NVMLProbe {
log.Info("Created NVML probe")
return &NVMLProbe{
nvml: nvmlLib,
DeviceUUIDByPid: make(map[int32]string),
DeviceUUIDByPid: make(map[int32][]string),
}
}

Expand All @@ -52,7 +50,7 @@ func (p *NVMLProbe) Scan() {
return
}

deviceUUIDByPid := make(map[int32]string)
deviceUUIDByPid := make(map[int32][]string)
for di := 0; di < count; di++ {
device, ret := p.nvml.DeviceGetHandleByIndex(di)
log.Infof("Finished DeviceGetHandleByIndex device: %d, ret: %s", device, ret)
Expand All @@ -61,8 +59,8 @@ func (p *NVMLProbe) Scan() {
return
}

deviceUUID, err := device.GetUUID()
if !errors.Is(err, nvml.SUCCESS) {
gpuUUID, err := device.GetUUID()
if ret == nvml.SUCCESS {
log.Warn("Failed to get GPU UUID %v", err)
}

Expand All @@ -74,13 +72,24 @@ func (p *NVMLProbe) Scan() {
}
log.Infof("Found %d processes on device %d\n", len(processInfos), di)

deviceName, ret := device.GetName()
if ret != nvml.SUCCESS {
deviceName = "unknown"
log.Warnf("failed to get device name: %s", nvml.ErrorString(ret))
}

for _, processInfo := range processInfos {
log.Infof("Found pid %d on device %s\n", processInfo.Pid, deviceUUID)
deviceUUIDByPid[int32(processInfo.Pid)] = deviceUUID
log.Infof("Found pid %d on device %s\n", processInfo.Pid, gpuUUID)
gpuTags := []string{
"gpu_vendor:nvidia",
"gpu_uuid:" + gpuUUID,
"gpu_model:" + deviceName,
}
deviceUUIDByPid[int32(processInfo.Pid)] = gpuTags
}
}
p.DeviceUUIDByPid = deviceUUIDByPid
log.Info("Scan completed")
log.Infof("Scan completed %v", p.DeviceUUIDByPid)
}

// Close closes the probe
Expand Down

0 comments on commit 3ba8d1a

Please sign in to comment.