Add the GetValuesSince binding
Signed-off-by: Vadym Fedorov <vfedorov@nvidia.com>
nvvfedorov committed Jan 17, 2024
1 parent 2aab8ae commit 509e719
Showing 6 changed files with 179 additions and 57 deletions.
3 changes: 3 additions & 0 deletions Makefile
@@ -43,3 +43,6 @@ clean:
rm -f samples/processInfo/processInfo
rm -f samples/restApi/restApi
rm -f samples/topology/topology

lint:
golangci-lint run ./...
8 changes: 8 additions & 0 deletions pkg/dcgm/const.go
@@ -1002,3 +1002,11 @@ var (
const (
DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001)
)

const (
DCGM_GROUP_ALL_GPUS uint = 0x7fffffff
DCGM_GROUP_ALL_NVSWITCHES uint = 0x7ffffffe
DCGM_GROUP_ALL_INSTANCES uint = 0x7ffffffd
DCGM_GROUP_ALL_COMPUTE_INSTANCES uint = 0x7ffffffc
DCGM_GROUP_ALL_ENTITIES uint = 0x7ffffffb
)
31 changes: 31 additions & 0 deletions pkg/dcgm/enum.go
@@ -0,0 +1,31 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package dcgm

type FieldEntityGroup uint32

const (
DCGM_FE_NONE FieldEntityGroup = iota // Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL
DCGM_FE_GPU // Field is associated with a GPU entity
DCGM_FE_VGPU // Field is associated with a VGPU entity
DCGM_FE_SWITCH // Field is associated with a Switch entity
DCGM_FE_GPU_I // Field is associated with a GPU Instance entity
DCGM_FE_GPU_CI // Field is associated with a GPU Compute Instance entity
DCGM_FE_LINK // Field is associated with an NVLink
DCGM_FE_CPU // Field is associated with a CPU node
DCGM_FE_CPU_CORE // Field is associated with a CPU core
)
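These constants mirror DCGM's dcgm_field_entity_group_t values and become the first key of the FieldValues map introduced in field_values.go below. A minimal, illustrative sketch (not part of this commit) of pulling per-GPU samples out of a result; entityID and fieldID stand for whichever entity and field were queried:

// gpuSamples returns the samples recorded for one field on one GPU entity, or
// nil when that entity group, entity, or field is absent. Illustrative only;
// values is the map returned by the new GetValuesSince binding.
func gpuSamples(values FieldValues, entityID uint, fieldID int) []FieldValueEntity {
	return values[DCGM_FE_GPU][entityID][fieldID]
}

Because indexing a missing key of a Go map yields the zero value, the chained lookup above safely returns nil rather than panicking when no GPU data was collected.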
114 changes: 72 additions & 42 deletions pkg/dcgm/field_values.go
@@ -29,66 +29,96 @@ extern int go_dcgmFieldValueEntityEnumeration(dcgm_field_entity_group_t entityGr
import "C"
import (
"fmt"
"sync"
"time"
"unsafe"
)

// FieldValueEntity holds a single field value sample and its timestamp
type FieldValueEntity struct {
ts int64
value [4096]byte
}

// AsInt64 returns the field value interpreted as an int64
func (v FieldValueEntity) AsInt64() int64 {
ptr := (*C.int64_t)(unsafe.Pointer(&v.value[0]))
return int64(*ptr)
}

// FieldValues represents field value records keyed as [entityGroupID][entityID][fieldID][]FieldValueEntity
type FieldValues map[FieldEntityGroup]map[uint]map[int][]FieldValueEntity

type callback struct {
mu sync.Mutex
Values FieldValues
}

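// processValues appends every successfully read sample (status >= 0) to cb.Values,
// keyed by entity group, entity ID, and field ID, creating the nested maps on first use.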
func (cb *callback) processValues(entityGroupID FieldEntityGroup, entityID uint, values []C.dcgmFieldValue_v1) {
if cb.Values == nil {
cb.Values = FieldValues{}
}

if _, exists := cb.Values[entityGroupID]; !exists {
cb.Values[entityGroupID] = map[uint]map[int][]FieldValueEntity{}
}

if _, exists := cb.Values[entityGroupID][entityID]; !exists {
cb.Values[entityGroupID][entityID] = map[int][]FieldValueEntity{}
}

cb.mu.Lock()

for _, v := range values {
if v.status > -1 {
cb.Values[entityGroupID][entityID][int(v.fieldId)] = append(cb.Values[entityGroupID][entityID][int(v.fieldId)],
FieldValueEntity{
ts: int64(v.ts),
value: v.value,
})
}
}

cb.mu.Unlock()
}

//export go_dcgmFieldValueEntityEnumeration
func go_dcgmFieldValueEntityEnumeration(
entityGroupID C.dcgm_field_entity_group_t,
entityID C.dcgm_field_eid_t,
values *C.dcgmFieldValue_v1,
numValues C.int,
userData unsafe.Pointer) C.int {
valuesSlice := (*[1 << 30]C.dcgmFieldValue_v1)(unsafe.Pointer(values))[0:numValues]
processor := (*callback)(unsafe.Pointer(userData))
processor.processValues(FieldEntityGroup(entityGroupID), uint(entityID), valuesSlice)
return 0
}

// GetValuesSince reads all field values that have been updated since the given timestamp.
//
// GPUGroupID identifies a group of GPUs; see CreateGroup for details on creating one. Alternatively, pass
// DCGM_GROUP_ALL_GPUS to operate on all GPUs or DCGM_GROUP_ALL_NVSWITCHES to operate on all NvSwitches.
// fieldGroup is the group of fields for which values are returned.
// since is the time to request values from; the zero value requests all available data.
func GetValuesSince(GPUGroupID uint, fieldGroup FieldHandle, since time.Time) (FieldValues, time.Time, error) {
var nextSinceTimestamp C.longlong

cbResult := &callback{}

result := C.dcgmGetValuesSince_v2(handle.handle,
C.dcgmGpuGrp_t(GPUGroupID),
C.dcgmFieldGrp_t(fieldGroup.handle),
C.longlong(since.UnixMicro()),
&nextSinceTimestamp,
(C.dcgmFieldValueEnumeration_f)(unsafe.Pointer(C.fieldValueEntityCallback)),
unsafe.Pointer(cbResult))
if result != C.DCGM_ST_OK {
return nil, time.Time{}, fmt.Errorf("dcgmGetValuesSince_v2 failed with error code %d", int(result))
}

return cbResult.Values, time.UnixMicro(int64(nextSinceTimestamp)), nil
}

func UNUSED(x ...interface{}) {}
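Taken together, the pieces above suggest a simple polling pattern: create a field group, let the hostengine refresh its caches, call GetValuesSince, and feed the returned time into the next call. The sketch below is illustrative and not part of this commit; the group name, the choice of DCGM_FI_DEV_XID_ERRORS, and the reliance on UpdateAllFields mirror the test further down and are assumptions rather than requirements:

// pollXIDErrors is an illustrative, in-package sketch of using the new binding.
// It builds a one-field group, reads everything recorded since the given time
// for all GPUs, prints each sample, and returns the timestamp for the next poll.
func pollXIDErrors(since time.Time) (time.Time, error) {
	deviceFields := []Short{DCGM_FI_DEV_XID_ERRORS}

	// The field group name is arbitrary; it only needs to be unique.
	fieldsGroup, err := FieldGroupCreate("example-xid-group", deviceFields)
	if err != nil {
		return time.Time{}, err
	}
	defer func() { _ = FieldGroupDestroy(fieldsGroup) }()

	// Make sure the latest samples are in the cache before reading them.
	if err := UpdateAllFields(); err != nil {
		return time.Time{}, err
	}

	values, nextTime, err := GetValuesSince(DCGM_GROUP_ALL_GPUS, fieldsGroup, since)
	if err != nil {
		return time.Time{}, err
	}

	for entityGroup, entities := range values {
		for entityID, fields := range entities {
			for fieldID, samples := range fields {
				for _, s := range samples {
					fmt.Printf("group=%d entity=%d field=%d value=%d\n",
						entityGroup, entityID, fieldID, s.AsInt64())
				}
			}
		}
	}

	return nextTime, nil
}

A caller would start with a zero time.Time to fetch all available data and then pass the returned nextTime on each subsequent call.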
3 changes: 1 addition & 2 deletions pkg/dcgm/field_values_cb.c
@@ -7,6 +7,5 @@ int fieldValueEntityCallback(dcgm_field_entity_group_t entityGroupId,
dcgmFieldValue_v1 *values,
int numValues,
void *userData) {
return go_dcgmFieldValueEntityEnumeration(entityGroupId, entityId, values, numValues, userData);
}
77 changes: 64 additions & 13 deletions pkg/dcgm/field_values_test.go
@@ -17,31 +17,82 @@
package dcgm

import (
"fmt"
"math/rand"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestGetValuesSince(t *testing.T) {
teardownTest := setupTest(t)
defer teardownTest(t)
runOnlyWithLiveGPUs(t)

const gpu uint = 0

// Create a group of fields
const (
xid int = iota
)

deviceFields := make([]Short, 1)
deviceFields[xid] = DCGM_FI_DEV_XID_ERRORS

fieldGroupName := fmt.Sprintf("fieldGroupName%d", rand.Uint64())
fieldsGroup, err := FieldGroupCreate(fieldGroupName, deviceFields)
assert.NoError(t, err)
defer func() {
_ = FieldGroupDestroy(fieldsGroup)
}()

t.Run("When there is no data return error", func(t *testing.T) {
values, nextTime, err := GetValuesSince(DCGM_GROUP_ALL_GPUS, fieldsGroup, time.Time{})
require.Error(t, err)
assert.Empty(t, nextTime)
assert.Len(t, values, 0)
})

t.Run("When there are a few entries", func(t *testing.T) {
expectedNumberOfErrors := int64(43)
expectedInjectedValuesCount := 0
t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_XID_ERRORS", gpu)
err = InjectFieldValue(gpu,
DCGM_FI_DEV_XID_ERRORS,
DCGM_FT_INT64,
0,
time.Now().Add(-time.Duration(5)*time.Second).UnixMicro(),
expectedNumberOfErrors,
)
require.NoError(t, err)
expectedInjectedValuesCount++
for i := 4; i > 0; i-- {
err = InjectFieldValue(gpu,
DCGM_FI_DEV_XID_ERRORS,
DCGM_FT_INT64,
0,
time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(),
int64(i),
)
require.NoError(t, err)
expectedInjectedValuesCount++
}
// Force an update of the fields so that we can fetch initial values.
err = UpdateAllFields()
assert.NoError(t, err)
values, nextTime, err := GetValuesSince(DCGM_GROUP_ALL_GPUS, fieldsGroup, time.Time{})
assert.NoError(t, err)
assert.Greater(t, nextTime, time.Time{})
require.Contains(t, values, DCGM_FE_GPU)
require.Contains(t, values[DCGM_FE_GPU], gpu)
require.Contains(t, values[DCGM_FE_GPU][0], DCGM_FI_DEV_XID_ERRORS)
require.Len(t, values[DCGM_FE_GPU][0][DCGM_FI_DEV_XID_ERRORS], expectedInjectedValuesCount)
assert.Equal(t, int64(43), values[DCGM_FE_GPU][0][DCGM_FI_DEV_XID_ERRORS][0].AsInt64())
assert.Equal(t, int64(4), values[DCGM_FE_GPU][0][DCGM_FI_DEV_XID_ERRORS][1].AsInt64())
assert.Equal(t, int64(3), values[DCGM_FE_GPU][0][DCGM_FI_DEV_XID_ERRORS][2].AsInt64())
assert.Equal(t, int64(2), values[DCGM_FE_GPU][0][DCGM_FI_DEV_XID_ERRORS][3].AsInt64())
assert.Equal(t, int64(1), values[DCGM_FE_GPU][0][DCGM_FI_DEV_XID_ERRORS][4].AsInt64())
})
}
