From 509e71935e2a6158cfa92b19392203a1095c5939 Mon Sep 17 00:00:00 2001 From: Vadym Fedorov Date: Wed, 17 Jan 2024 13:33:56 -0600 Subject: [PATCH] The GetValuesSince binding added Signed-off-by: Vadym Fedorov --- Makefile | 3 + pkg/dcgm/const.go | 8 +++ pkg/dcgm/enum.go | 31 +++++++++ pkg/dcgm/field_values.go | 114 +++++++++++++++++++++------------- pkg/dcgm/field_values_cb.c | 3 +- pkg/dcgm/field_values_test.go | 77 +++++++++++++++++++---- 6 files changed, 179 insertions(+), 57 deletions(-) create mode 100644 pkg/dcgm/enum.go diff --git a/Makefile b/Makefile index c86cfbc..f6c3424 100644 --- a/Makefile +++ b/Makefile @@ -43,3 +43,6 @@ clean: rm -f samples/processInfo/processInfo rm -f samples/restApi/restApi rm -f samples/topology/topology + +lint: + golangci-lint run ./... \ No newline at end of file diff --git a/pkg/dcgm/const.go b/pkg/dcgm/const.go index 96b63f5..b8ce0d1 100644 --- a/pkg/dcgm/const.go +++ b/pkg/dcgm/const.go @@ -1002,3 +1002,11 @@ var ( const ( DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001) ) + +const ( + DCGM_GROUP_ALL_GPUS uint = 0x7fffffff + DCGM_GROUP_ALL_NVSWITCHES uint = 0x7ffffffe + DCGM_GROUP_ALL_INSTANCES uint = 0x7ffffffd + DCGM_GROUP_ALL_COMPUTE_INSTANCES uint = 0x7ffffffc + DCGM_GROUP_ALL_ENTITIES uint = 0x7ffffffb +) diff --git a/pkg/dcgm/enum.go b/pkg/dcgm/enum.go new file mode 100644 index 0000000..53ec05a --- /dev/null +++ b/pkg/dcgm/enum.go @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgm + +type FieldEntityGroup uint32 + +const ( + DCGM_FE_NONE FieldEntityGroup = iota // Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL + DCGM_FE_GPU // Field is associated with a GPU entity + DCGM_FE_VGPU // Field is associated with a VGPU entity + DCGM_FE_SWITCH // Field is associated with a Switch entity + DCGM_FE_GPU_I // Field is associated with a GPU Instance entity + DCGM_FE_GPU_CI // Field is associated with a GPU Compute Instance entity + DCGM_FE_LINK // Field is associated with an NVLink + DCGM_FE_CPU // Field is associated with a CPU node + DCGM_FE_CPU_CORE // Field is associated with a CPU +) diff --git a/pkg/dcgm/field_values.go b/pkg/dcgm/field_values.go index f916c47..9c5124b 100644 --- a/pkg/dcgm/field_values.go +++ b/pkg/dcgm/field_values.go @@ -29,66 +29,96 @@ extern int go_dcgmFieldValueEntityEnumeration(dcgm_field_entity_group_t entityGr import "C" import ( "fmt" - "math/rand" + "sync" + "time" "unsafe" ) +// FieldValueEntity represents a field value entity +type FieldValueEntity struct { + ts int64 + value [4096]byte +} + +// AsInt64 returns field value as int64 +func (v FieldValueEntity) AsInt64() int64 { + ptr := (*C.int64_t)(unsafe.Pointer(&v.value[0])) + return int64(*ptr) +} + +// FieldValueRecords represents field value records output in the format: [entityGroupId][entityId][fieldId][]FieldValue +type FieldValues map[FieldEntityGroup]map[uint]map[int][]FieldValueEntity + +type callback struct { + mu sync.Mutex + Values FieldValues +} + +func (cb *callback) processValues(entityGroupID FieldEntityGroup, entityID uint, values []C.dcgmFieldValue_v1) { + if cb.Values == nil { + cb.Values = FieldValues{} + } + + if _, exists := cb.Values[entityGroupID]; !exists { + cb.Values[entityGroupID] = map[uint]map[int][]FieldValueEntity{} + } + + if _, exists := cb.Values[entityGroupID][entityID]; !exists { + cb.Values[entityGroupID][entityID] = map[int][]FieldValueEntity{} + } + + cb.mu.Lock() + + for _, v := range values { + if v.status > -1 { + cb.Values[entityGroupID][entityID][int(v.fieldId)] = append(cb.Values[entityGroupID][entityID][int(v.fieldId)], + FieldValueEntity{ + ts: int64(v.ts), + value: v.value, + }) + } + } + + cb.mu.Unlock() +} + //export go_dcgmFieldValueEntityEnumeration func go_dcgmFieldValueEntityEnumeration( - entityGroupId C.dcgm_field_entity_group_t, - entityId C.dcgm_field_eid_t, + entityGroupID C.dcgm_field_entity_group_t, + entityID C.dcgm_field_eid_t, values *C.dcgmFieldValue_v1, numValues C.int, userData unsafe.Pointer) C.int { + valuesSlice := (*[1 << 30]C.dcgmFieldValue_v1)(unsafe.Pointer(values))[0:numValues] + processor := (*callback)(unsafe.Pointer(userData)) + processor.processValues(FieldEntityGroup(entityGroupID), uint(entityID), valuesSlice) return 0 } -func GetValuesSince() { +// GetValuesSince reads all field values that have updated since a given timestamp +// +// GPUGroupID Identifier for a group of GPUs. Look at CreateGroup for details on creating the group. Alternatively, pass in the group id as +// DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or +// DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches. +// fieldGroup Group of Fields to return data +// since is a time to request values since. zero value (0) = request all data +func GetValuesSince(GPUGroupID uint, fieldGroup FieldHandle, since time.Time) (FieldValues, time.Time, error) { var ( - groupID C.dcgmGpuGrp_t - fieldGroupID C.dcgmFieldGrp_t - sinceTimestamp C.longlong nextSinceTimestamp C.longlong - userData unsafe.Pointer - ) - groupID = C.DCGM_GROUP_ALL_GPUS - - // Create a fieldGroup - const ( - xid int = iota ) - deviceFields := make([]Short, 1) - deviceFields[xid] = C.DCGM_FI_DEV_XID_ERRORS - - fieldGroupName := fmt.Sprintf("fieldGroupName%d", rand.Uint64()) - fieldsGroup, err := FieldGroupCreate(fieldGroupName, deviceFields) - if err != nil { - // ToDo: Return error - return - } - defer func() { - _ = FieldGroupDestroy(fieldsGroup) - }() - fieldGroupID = fieldsGroup.handle - sinceTimestamp = 0 - UNUSED(userData) + cbResult := &callback{} result := C.dcgmGetValuesSince_v2(handle.handle, - groupID, - fieldGroupID, - sinceTimestamp, + C.dcgmGpuGrp_t(GPUGroupID), + C.dcgmFieldGrp_t(fieldGroup.handle), + C.longlong(since.Unix()), &nextSinceTimestamp, (C.dcgmFieldValueEnumeration_f)(unsafe.Pointer(C.fieldValueEntityCallback)), - nil) - UNUSED(result) - // if result != C.DCGM_ST_OK { - // //return fmt.Errorf("dcgmGetValuesSince_v2 failed with error code %d", int(result)) - // } - - // // Process the data as needed + unsafe.Pointer(cbResult)) + if result != C.DCGM_ST_OK { + return nil, time.Time{}, fmt.Errorf("dcgmGetValuesSince_v2 failed with error code %d", int(result)) + } - // return nil + return cbResult.Values, time.Unix(int64(nextSinceTimestamp), 0), nil } - -func UNUSED(x ...interface{}) {} diff --git a/pkg/dcgm/field_values_cb.c b/pkg/dcgm/field_values_cb.c index 6901b03..e5d0b19 100644 --- a/pkg/dcgm/field_values_cb.c +++ b/pkg/dcgm/field_values_cb.c @@ -7,6 +7,5 @@ int fieldValueEntityCallback(dcgm_field_entity_group_t entityGroupId, dcgmFieldValue_v1 *values, int numValues, void *userData) { - go_dcgmFieldValueEntityEnumeration(entityGroupId, entityId, values, numValues, userData); -return 0; // Assuming success + return go_dcgmFieldValueEntityEnumeration(entityGroupId, entityId, values, numValues, userData); } diff --git a/pkg/dcgm/field_values_test.go b/pkg/dcgm/field_values_test.go index 826f7d6..3d201d7 100644 --- a/pkg/dcgm/field_values_test.go +++ b/pkg/dcgm/field_values_test.go @@ -17,10 +17,13 @@ package dcgm import ( + "fmt" + "math/rand" "testing" "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestGetValuesSince(t *testing.T) { @@ -28,20 +31,68 @@ func TestGetValuesSince(t *testing.T) { defer teardownTest(t) runOnlyWithLiveGPUs(t) - const gpu = 0 + const gpu uint = 0 - // Inject errors - t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_XID_ERRORS", gpu) - err := InjectFieldValue(gpu, - DCGM_FI_DEV_XID_ERRORS, - DCGM_FT_INT64, - 0, - time.Now().Add(60*time.Second).UnixMicro(), - int64(1), + // Create a group of fields + const ( + xid int = iota ) + + deviceFields := make([]Short, 1) + deviceFields[xid] = DCGM_FI_DEV_XID_ERRORS + + fieldGroupName := fmt.Sprintf("fieldGroupName%d", rand.Uint64()) + fieldsGroup, err := FieldGroupCreate(fieldGroupName, deviceFields) assert.NoError(t, err) - // Force an update of the fields so that we can fetch initial values. - err = UpdateAllFields() - assert.NoError(t, err) - GetValuesSince() + defer func() { + _ = FieldGroupDestroy(fieldsGroup) + }() + + t.Run("When there is no data return error", func(t *testing.T) { + values, nextTime, err := GetValuesSince(DCGM_GROUP_ALL_GPUS, fieldsGroup, time.Time{}) + require.Error(t, err) + assert.Empty(t, nextTime) + assert.Len(t, values, 0) + }) + + t.Run("When there are a few entries", func(t *testing.T) { + expectedNumberOfErrors := int64(43) + expectedInjectedValuesCount := 0 + t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_XID_ERRORS", gpu) + err = InjectFieldValue(gpu, + DCGM_FI_DEV_XID_ERRORS, + DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(5)*time.Second).UnixMicro(), + expectedNumberOfErrors, + ) + require.NoError(t, err) + expectedInjectedValuesCount++ + for i := 4; i > 0; i-- { + err = InjectFieldValue(gpu, + DCGM_FI_DEV_XID_ERRORS, + DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), + int64(i), + ) + require.NoError(t, err) + expectedInjectedValuesCount++ + } + // Force an update of the fields so that we can fetch initial values. + err = UpdateAllFields() + assert.NoError(t, err) + values, nextTime, err := GetValuesSince(DCGM_GROUP_ALL_GPUS, fieldsGroup, time.Time{}) + assert.NoError(t, err) + assert.Greater(t, nextTime, time.Time{}) + require.Contains(t, values, DCGM_FE_GPU) + require.Contains(t, values[DCGM_FE_GPU], gpu) + require.Contains(t, values[DCGM_FE_GPU][0], DCGM_FI_DEV_XID_ERRORS) + require.Len(t, values[DCGM_FE_GPU][0][DCGM_FI_DEV_XID_ERRORS], expectedInjectedValuesCount) + assert.Equal(t, int64(43), values[DCGM_FE_GPU][0][DCGM_FI_DEV_XID_ERRORS][0].AsInt64()) + assert.Equal(t, int64(4), values[DCGM_FE_GPU][0][DCGM_FI_DEV_XID_ERRORS][1].AsInt64()) + assert.Equal(t, int64(3), values[DCGM_FE_GPU][0][DCGM_FI_DEV_XID_ERRORS][2].AsInt64()) + assert.Equal(t, int64(2), values[DCGM_FE_GPU][0][DCGM_FI_DEV_XID_ERRORS][3].AsInt64()) + assert.Equal(t, int64(1), values[DCGM_FE_GPU][0][DCGM_FI_DEV_XID_ERRORS][4].AsInt64()) + }) }