From 6d38e3b3805a0e18b99ef02b3bce0301fa3eaa45 Mon Sep 17 00:00:00 2001 From: Rahul Ganesh Date: Wed, 23 Aug 2023 10:08:13 -0700 Subject: [PATCH 1/3] Retrieve resource pool memory usage and validate the memory availaibility for vsphere machine configs for create/upgrade as a preflight check Signed-off-by: Rahul Ganesh --- pkg/executables/govc.go | 74 ++++++++++++ pkg/executables/govc_test.go | 79 +++++++++++++ pkg/providers/vsphere/mocks/client.go | 20 ++++ pkg/providers/vsphere/vsphere.go | 156 +++++++++++++++++++++++++- pkg/providers/vsphere/vsphere_test.go | 153 ++++++++++++++++++++++--- 5 files changed, 467 insertions(+), 15 deletions(-) diff --git a/pkg/executables/govc.go b/pkg/executables/govc.go index c60aff0d9380..96035be71a38 100644 --- a/pkg/executables/govc.go +++ b/pkg/executables/govc.go @@ -11,6 +11,7 @@ import ( "net/http" "os" "path/filepath" + "regexp" "strconv" "strings" "time" @@ -39,6 +40,7 @@ const ( DeployOptsFile = "deploy-opts.json" disk1 = "Hard disk 1" disk2 = "Hard disk 2" + MemoryAvailable = "Memory_Available" ) var requiredEnvs = []string{govcUsernameKey, govcPasswordKey, govcURLKey, govcInsecure, govcDatacenterKey} @@ -1143,3 +1145,75 @@ func (g *Govc) SetGroupRoleOnObject(ctx context.Context, principal string, role return nil } + +type resourcePoolInfo struct { + ResourcePoolIdentifier *resourcePool +} + +type resourcePool struct { + memoryUsage string + memoryLimit string +} + +// GetResourcePoolInfo returns the pool info for the provided resource pool. +func (g *Govc) GetResourcePoolInfo(ctx context.Context, datacenter, resourcepool string, args ...string) (map[string]int, error) { + params := []string{"pool.info", "-dc", datacenter, resourcepool} + params = append(params, args...) + response, err := g.exec(ctx, params...) + if err != nil { + return nil, fmt.Errorf("getting resource pool information: %v", err) + } + + scanner := bufio.NewScanner(strings.NewReader(response.String())) + var resourcePoolInfoResponse resourcePoolInfo + resourcePoolInfoResponse.ResourcePoolIdentifier = new(resourcePool) + for scanner.Scan() { + metaData := scanner.Text() + if strings.Contains(metaData, "Mem Usage") { + resourcePoolInfoResponse.ResourcePoolIdentifier.memoryUsage = strings.Split(metaData, ":")[1] + } + if strings.Contains(metaData, "Mem Limit") { + resourcePoolInfoResponse.ResourcePoolIdentifier.memoryLimit = strings.Split(metaData, ":")[1] + } + } + + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("failure reading memory allocation for resource pool") + } + + poolInfo, err := getPoolInfo(resourcePoolInfoResponse.ResourcePoolIdentifier) + if err != nil { + return nil, err + } + return poolInfo, nil +} + +// helper function that parses the resource pool response and returns memory requirements. +func getPoolInfo(rp *resourcePool) (map[string]int, error) { + memoryUsed, err := getValueFromString(rp.memoryUsage) + if err != nil { + return nil, fmt.Errorf("unable to obtain memory usage for resource pool %s: %v", rp.memoryUsage, err) + } + memoryLimit, err := getValueFromString(rp.memoryLimit) + if err != nil { + return nil, fmt.Errorf("unable to obtain memory limit for resource pool %s: %v", rp.memoryLimit, err) + } + poolInfo := make(map[string]int) + if memoryLimit != -1 { + poolInfo[MemoryAvailable] = memoryLimit - memoryUsed + } else { + poolInfo[MemoryAvailable] = memoryLimit + } + return poolInfo, nil +} + +func getValueFromString(str string) (int, error) { + splitResponse := strings.Split(strings.TrimSpace(str), " ") + nonNumericRegex := regexp.MustCompile(`[^0-9- ]+`) + cleanedString := nonNumericRegex.ReplaceAllString(splitResponse[0], "") + numValue, err := strconv.Atoi(cleanedString) + if err != nil { + return 0, err + } + return numValue, nil +} diff --git a/pkg/executables/govc_test.go b/pkg/executables/govc_test.go index 424fdde9834c..f3c03109167e 100644 --- a/pkg/executables/govc_test.go +++ b/pkg/executables/govc_test.go @@ -1643,3 +1643,82 @@ func TestGovcGetHardDiskSizeError(t *testing.T) { }) } } + +func TestGovcGetResourcePoolInfo(t *testing.T) { + datacenter := "SDDC-Datacenter" + resourcePool := "*/Resources/Test-ResourcePool" + govcErr := errors.New("error PoolInfo()") + ctx := context.Background() + _, g, executable, env := setup(t) + + tests := []struct { + testName string + response string + govcErr error + wantErr error + wantMemInfo map[string]int + }{ + { + testName: "pool_info_memory_limit_set", + response: `Name: Test-ResourcePool + Path: /SDDC-Datacenter/host/Cluster-1/Resources/Test-ResourcePool + Mem Usage: 100MB (11.3%) + Mem Shares: normal + Mem Reservation: 0MB (expandable=true) + Mem Limit: 1000MB`, + govcErr: nil, + wantErr: nil, + wantMemInfo: map[string]int{executables.MemoryAvailable: 900}, + }, + { + testName: "pool_info_memory_limit_unset", + response: `Name: Test-ResourcePool + Path: /SDDC-Datacenter/host/Cluster-1/Resources/Test-ResourcePool + Mem Usage: 100MB (11.3%) + Mem Shares: normal + Mem Reservation: 0MB (expandable=true) + Mem Limit: -1MB`, + govcErr: nil, + wantErr: nil, + wantMemInfo: map[string]int{executables.MemoryAvailable: -1}, + }, + { + testName: "pool_info_memory_usage_corrupt", + response: `Name: Test-ResourcePool + Mem Usage:corrupt-val + Mem Limit:-1MB`, + govcErr: nil, + wantErr: fmt.Errorf("unable to obtain memory usage for resource pool corrupt-val: strconv.Atoi: parsing \"-\": invalid syntax"), + wantMemInfo: nil, + }, + { + testName: "pool_info_memory_limit_corrupt", + response: `Name: Test-ResourcePool + Mem Usage:100 + Mem Limit:corrupt-val`, + govcErr: nil, + wantErr: fmt.Errorf("unable to obtain memory limit for resource pool corrupt-val: strconv.Atoi: parsing \"-\": invalid syntax"), + wantMemInfo: nil, + }, + { + testName: "pool_info_error", + response: "", + govcErr: govcErr, + wantErr: fmt.Errorf("getting resource pool information: %v", govcErr), + wantMemInfo: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.testName, func(t *testing.T) { + gt := NewWithT(t) + responseBytes := bytes.NewBuffer([]byte(tt.response)) + executable.EXPECT().ExecuteWithEnv(ctx, env, "pool.info", "-dc", datacenter, resourcePool).Return(*responseBytes, tt.govcErr) + poolMemInfo, err := g.GetResourcePoolInfo(ctx, datacenter, resourcePool) + if tt.wantErr != nil { + gt.Expect(err.Error()).To(Equal(tt.wantErr.Error())) + } + gt.Expect(poolMemInfo).To(Equal(tt.wantMemInfo)) + }) + } +} diff --git a/pkg/providers/vsphere/mocks/client.go b/pkg/providers/vsphere/mocks/client.go index 886647f29002..ac5ff8d60e7b 100644 --- a/pkg/providers/vsphere/mocks/client.go +++ b/pkg/providers/vsphere/mocks/client.go @@ -258,6 +258,26 @@ func (mr *MockProviderGovcClientMockRecorder) GetLibraryElementContentVersion(ar return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetLibraryElementContentVersion", reflect.TypeOf((*MockProviderGovcClient)(nil).GetLibraryElementContentVersion), arg0, arg1) } +// GetResourcePoolInfo mocks base method. +func (m *MockProviderGovcClient) GetResourcePoolInfo(arg0 context.Context, arg1, arg2 string, arg3 ...string) (map[string]int, error) { + m.ctrl.T.Helper() + varargs := []interface{}{arg0, arg1, arg2} + for _, a := range arg3 { + varargs = append(varargs, a) + } + ret := m.ctrl.Call(m, "GetResourcePoolInfo", varargs...) + ret0, _ := ret[0].(map[string]int) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetResourcePoolInfo indicates an expected call of GetResourcePoolInfo. +func (mr *MockProviderGovcClientMockRecorder) GetResourcePoolInfo(arg0, arg1, arg2 interface{}, arg3 ...interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + varargs := append([]interface{}{arg0, arg1, arg2}, arg3...) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetResourcePoolInfo", reflect.TypeOf((*MockProviderGovcClient)(nil).GetResourcePoolInfo), varargs...) +} + // GetTags mocks base method. func (m *MockProviderGovcClient) GetTags(arg0 context.Context, arg1 string) ([]string, error) { m.ctrl.T.Helper() diff --git a/pkg/providers/vsphere/vsphere.go b/pkg/providers/vsphere/vsphere.go index ec848a93f541..9e4066fdfd01 100644 --- a/pkg/providers/vsphere/vsphere.go +++ b/pkg/providers/vsphere/vsphere.go @@ -52,6 +52,7 @@ const ( backOffPeriod = 5 * time.Second disk1 = "Hard disk 1" disk2 = "Hard disk 2" + MemoryAvailable = "Memory_Available" ethtoolDaemonSetName = "vsphere-disable-udp-offload" ) @@ -122,6 +123,7 @@ type ProviderGovcClient interface { CreateRole(ctx context.Context, name string, privileges []string) error SetGroupRoleOnObject(ctx context.Context, principal string, role string, object string, domain string) error GetHardDiskSize(ctx context.Context, vm, datacenter string) (map[string]float64, error) + GetResourcePoolInfo(ctx context.Context, datacenter, resourcepool string, args ...string) (map[string]int, error) } type ProviderKubectlClient interface { @@ -338,7 +340,9 @@ func (p *vsphereProvider) SetupAndValidateCreateCluster(ctx context.Context, clu if err := p.validateDatastoreUsageForCreate(ctx, vSphereClusterSpec); err != nil { return fmt.Errorf("validating vsphere machine configs datastore usage: %v", err) } - + if err := p.validateMemoryUsageForCreate(ctx, vSphereClusterSpec); err != nil { + return fmt.Errorf("validating vsphere machine configs resource pool memory usage: %v", err) + } if err := p.generateSSHKeysIfNotSet(clusterSpec.VSphereMachineConfigs); err != nil { return fmt.Errorf("failed setup and validations: %v", err) } @@ -419,6 +423,10 @@ func (p *vsphereProvider) SetupAndValidateUpgradeCluster(ctx context.Context, cl return fmt.Errorf("validating vsphere machine configs datastore usage: %v", err) } + if err := p.validateMemoryUsageForUpgrade(ctx, vSphereClusterSpec, cluster); err != nil { + return fmt.Errorf("validating vsphere machine configs resource pool memory usage: %v", err) + } + if !p.skippedValidations[validations.VSphereUserPriv] { if err := p.validator.validateVsphereUserPrivs(ctx, vSphereClusterSpec); err != nil { return fmt.Errorf("validating vsphere user privileges: %v", err) @@ -590,6 +598,152 @@ func (p *vsphereProvider) validateDatastoreUsageForCreate(ctx context.Context, v return nil } +type memoryUsage struct { + availableMemoryMiB int + needMemoryMiB int +} + +func (p *vsphereProvider) getPrevMachineConfigMemoryUsage(ctx context.Context, mc *v1alpha1.VSphereMachineConfig, cluster *types.Cluster, count int) (memoryMiB int, err error) { + em, err := p.providerKubectlClient.GetEksaVSphereMachineConfig(ctx, mc.Name, cluster.KubeconfigFile, mc.GetNamespace()) + if err != nil { + return 0, err + } + if em != nil && em.Spec.ResourcePool == mc.Spec.ResourcePool { + return em.Spec.MemoryMiB * em.Spec.NumCPUs * count, nil + } + return 0, nil +} + +func (p *vsphereProvider) getMachineConfigMemoryRequirements(ctx context.Context, dc string, mc *v1alpha1.VSphereMachineConfig, count int) (available int, need int, err error) { + poolInfo, err := p.providerGovcClient.GetResourcePoolInfo(ctx, dc, mc.Spec.ResourcePool) + if err != nil { + return 0, 0, err + } + needMemoryMiB := mc.Spec.MemoryMiB * mc.Spec.NumCPUs * count + return poolInfo[MemoryAvailable], needMemoryMiB, nil +} + +func (p *vsphereProvider) calculateResourcePoolMemoryUsage(ctx context.Context, dc string, mc *v1alpha1.VSphereMachineConfig, cluster *types.Cluster, mu map[string]*memoryUsage, prevCount, newCount int) error { + availableMemoryMiB, needMemoryMiB, err := p.getMachineConfigMemoryRequirements(ctx, dc, mc, newCount) + if err != nil { + return err + } + + // the last old machine is deleted only after the desired number of new machines are rolled out, so reduce 1 count to accommodate for the last old machine which is not deleted until the end of rollout + prevUsage, err := p.getPrevMachineConfigMemoryUsage(ctx, mc, cluster, prevCount-1) + if err != nil { + return err + } + // update the available memory with previous usage only when memory limit is set + if availableMemoryMiB != -1 { + availableMemoryMiB += prevUsage + } + updateMemoryUsageMap(mc, needMemoryMiB, availableMemoryMiB, prevUsage, mu) + return nil +} + +func updateMemoryUsageMap(mc *v1alpha1.VSphereMachineConfig, needMiB, availableMiB, prevUsage int, mu map[string]*memoryUsage) { + if _, ok := mu[mc.Spec.ResourcePool]; ok && mu[mc.Spec.ResourcePool].availableMemoryMiB != -1 { + mu[mc.Spec.ResourcePool].needMemoryMiB += needMiB + mu[mc.Spec.ResourcePool].availableMemoryMiB += prevUsage + } else { + mu[mc.Spec.ResourcePool] = &memoryUsage{ + availableMemoryMiB: availableMiB, + needMemoryMiB: needMiB, + } + } +} + +func (p *vsphereProvider) validateMemoryUsageForCreate(ctx context.Context, clusterSpec *Spec) error { + memoryUsage := make(map[string]*memoryUsage) + datacenter := clusterSpec.VSphereDatacenter.Spec.Datacenter + cpMachineConfig := clusterSpec.controlPlaneMachineConfig() + controlPlaneAvailableMiB, controlPlaneNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, cpMachineConfig, clusterSpec.Cluster.Spec.ControlPlaneConfiguration.Count) + if err != nil { + return fmt.Errorf("calculating memory usage for control plane: %v", err) + } + updateMemoryUsageMap(cpMachineConfig, controlPlaneNeedMiB, controlPlaneAvailableMiB, 0, memoryUsage) + for _, workerNodeGroupConfiguration := range clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations { + workerMachineConfig := clusterSpec.workerMachineConfig(workerNodeGroupConfiguration) + workerAvailableMiB, workerNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, workerMachineConfig, *workerNodeGroupConfiguration.Count) + if err != nil { + return fmt.Errorf("calculating memory usage for worker node groups: %v", err) + } + updateMemoryUsageMap(workerMachineConfig, workerNeedMiB, workerAvailableMiB, 0, memoryUsage) + } + etcdMachineConfig := clusterSpec.etcdMachineConfig() + if etcdMachineConfig != nil { + etcdAvailableMiB, etcdNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, etcdMachineConfig, clusterSpec.Cluster.Spec.ExternalEtcdConfiguration.Count) + if err != nil { + return err + } + updateMemoryUsageMap(etcdMachineConfig, etcdNeedMiB, etcdAvailableMiB, 0, memoryUsage) + } + for resourcePool, usage := range memoryUsage { + if usage.availableMemoryMiB != -1 { + if usage.needMemoryMiB > usage.availableMemoryMiB { + return fmt.Errorf("not enough memory avaialable in resource pool %v for given memoryMiB and count for respective machine groups", resourcePool) + } + } + } + logger.V(5).Info("Memory availability for machine configs in requested resource pool validated") + return nil +} + +func (p *vsphereProvider) validateMemoryUsageForUpgrade(ctx context.Context, currentClusterSpec *Spec, cluster *types.Cluster) error { + memoryUsage := make(map[string]*memoryUsage) + prevEksaCluster, err := p.providerKubectlClient.GetEksaCluster(ctx, cluster, currentClusterSpec.Cluster.GetName()) + if err != nil { + return err + } + + datacenter := currentClusterSpec.VSphereDatacenter.Spec.Datacenter + cpMachineConfig := currentClusterSpec.controlPlaneMachineConfig() + if err := p.calculateResourcePoolMemoryUsage(ctx, datacenter, cpMachineConfig, cluster, memoryUsage, prevEksaCluster.Spec.ControlPlaneConfiguration.Count, currentClusterSpec.Cluster.Spec.ControlPlaneConfiguration.Count); err != nil { + return fmt.Errorf("calculating memory usage for control plane: %v", err) + } + + prevMachineConfigRefs := machineRefSliceToMap(prevEksaCluster.MachineConfigRefs()) + if err := p.getWorkerNodeGroupMemoryUsage(ctx, datacenter, currentClusterSpec, cluster, memoryUsage, prevMachineConfigRefs); err != nil { + return fmt.Errorf("calculating memory usage for worker node groups: %v", err) + } + + etcdMachineConfig := currentClusterSpec.etcdMachineConfig() + if etcdMachineConfig != nil { + etcdAvailableMiB, etcdNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, etcdMachineConfig, currentClusterSpec.Cluster.Spec.ExternalEtcdConfiguration.Count) + if err != nil { + return err + } + // older etcd machines are not deleted until the end of rollout so do not account for the previous usage + updateMemoryUsageMap(etcdMachineConfig, etcdNeedMiB, etcdAvailableMiB, 0, memoryUsage) + } + + for resourcePool, usage := range memoryUsage { + if usage.availableMemoryMiB != -1 && usage.needMemoryMiB > usage.availableMemoryMiB { + return fmt.Errorf("not enough memory avaialable in resource pool %v for given memoryMiB and count for respective machine groups", resourcePool) + } + } + logger.V(5).Info("Memory availability for machine configs in requested resource pool validated") + return nil +} + +func (p *vsphereProvider) getWorkerNodeGroupMemoryUsage(ctx context.Context, datacenter string, currentClusterSpec *Spec, cluster *types.Cluster, memoryUsage map[string]*memoryUsage, prevMachineConfigRefs map[string]v1alpha1.Ref) error { + for _, workerNodeGroupConfiguration := range currentClusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations { + prevCount := 0 + workerMachineConfig := currentClusterSpec.workerMachineConfig(workerNodeGroupConfiguration) + if _, ok := prevMachineConfigRefs[workerNodeGroupConfiguration.MachineGroupRef.Name]; ok { + prevCount = *workerNodeGroupConfiguration.Count + } else { + // set count to 1 when no previous machine config was found for the current worker node group in the spec to avoid a negative count in calculation + prevCount = 1 + } + if err := p.calculateResourcePoolMemoryUsage(ctx, datacenter, workerMachineConfig, cluster, memoryUsage, prevCount, *workerNodeGroupConfiguration.Count); err != nil { + return fmt.Errorf("calculating memory usage: %v", err) + } + } + return nil +} + func (p *vsphereProvider) UpdateSecrets(ctx context.Context, cluster *types.Cluster, _ *cluster.Spec) error { var contents bytes.Buffer err := p.createSecret(ctx, cluster, &contents) diff --git a/pkg/providers/vsphere/vsphere_test.go b/pkg/providers/vsphere/vsphere_test.go index 2d0488e2b5e0..97d090f3c2e4 100644 --- a/pkg/providers/vsphere/vsphere_test.go +++ b/pkg/providers/vsphere/vsphere_test.go @@ -155,6 +155,10 @@ func (pc *DummyProviderGovcClient) GetHardDiskSize(ctx context.Context, vm, data return map[string]float64{"Hard disk 1": 23068672}, nil } +func (pc *DummyProviderGovcClient) GetResourcePoolInfo(ctx context.Context, datacenter, resourcePool string, args ...string) (map[string]int, error) { + return map[string]int{"Memory_Available": -1}, nil +} + func (pc *DummyProviderGovcClient) GetTags(ctx context.Context, path string) (tags []string, err error) { return []string{eksd119ReleaseTag, eksd121ReleaseTag, pc.osTag}, nil } @@ -1469,8 +1473,8 @@ func TestSetupAndValidateUpgradeClusterMissingPrivError(t *testing.T) { provider.providerKubectlClient = kubectl setupContext(t) - kubectl.EXPECT().GetEksaCluster(ctx, cluster, clusterSpec.Cluster.GetName()).Return(clusterSpec.Cluster.DeepCopy(), nil).Times(1) - kubectl.EXPECT().GetEksaVSphereMachineConfig(ctx, gomock.Any(), cluster.KubeconfigFile, clusterSpec.Cluster.GetNamespace()).Times(3) + kubectl.EXPECT().GetEksaCluster(ctx, cluster, clusterSpec.Cluster.GetName()).Return(clusterSpec.Cluster.DeepCopy(), nil).Times(2) + kubectl.EXPECT().GetEksaVSphereMachineConfig(ctx, gomock.Any(), cluster.KubeconfigFile, clusterSpec.Cluster.GetNamespace()).Times(5) vscb := mocks.NewMockVSphereClientBuilder(mockCtrl) vscb.EXPECT().Build(ctx, gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), clusterSpec.VSphereDatacenter.Spec.Datacenter).Return(nil, fmt.Errorf("error")) @@ -1735,8 +1739,8 @@ func TestSetupAndValidateUpgradeCluster(t *testing.T) { provider.providerKubectlClient = kubectl setupContext(t) - kubectl.EXPECT().GetEksaCluster(ctx, cluster, clusterSpec.Cluster.GetName()).Return(clusterSpec.Cluster.DeepCopy(), nil).Times(2) - kubectl.EXPECT().GetEksaVSphereMachineConfig(ctx, gomock.Any(), cluster.KubeconfigFile, clusterSpec.Cluster.GetNamespace()).Times(3) + kubectl.EXPECT().GetEksaCluster(ctx, cluster, clusterSpec.Cluster.GetName()).Return(clusterSpec.Cluster.DeepCopy(), nil).Times(3) + kubectl.EXPECT().GetEksaVSphereMachineConfig(ctx, gomock.Any(), cluster.KubeconfigFile, clusterSpec.Cluster.GetNamespace()).Times(5) err := provider.SetupAndValidateUpgradeCluster(ctx, cluster, clusterSpec, clusterSpec) if err != nil { t.Fatalf("unexpected failure %v", err) @@ -1799,8 +1803,8 @@ func TestSetupAndValidateUpgradeClusterCPSshNotExists(t *testing.T) { provider.providerKubectlClient = kubectl cluster := &types.Cluster{} - kubectl.EXPECT().GetEksaCluster(ctx, cluster, clusterSpec.Cluster.GetName()).Return(clusterSpec.Cluster.DeepCopy(), nil).Times(2) - kubectl.EXPECT().GetEksaVSphereMachineConfig(ctx, gomock.Any(), cluster.KubeconfigFile, clusterSpec.Cluster.GetNamespace()).Times(3) + kubectl.EXPECT().GetEksaCluster(ctx, cluster, clusterSpec.Cluster.GetName()).Return(clusterSpec.Cluster.DeepCopy(), nil).Times(3) + kubectl.EXPECT().GetEksaVSphereMachineConfig(ctx, gomock.Any(), cluster.KubeconfigFile, clusterSpec.Cluster.GetNamespace()).Times(5) err := provider.SetupAndValidateUpgradeCluster(ctx, cluster, clusterSpec, clusterSpec) if err != nil { t.Fatalf("unexpected failure %v", err) @@ -1820,8 +1824,8 @@ func TestSetupAndValidateUpgradeClusterWorkerSshNotExists(t *testing.T) { provider.providerKubectlClient = kubectl cluster := &types.Cluster{} - kubectl.EXPECT().GetEksaCluster(ctx, cluster, clusterSpec.Cluster.GetName()).Return(clusterSpec.Cluster.DeepCopy(), nil).Times(2) - kubectl.EXPECT().GetEksaVSphereMachineConfig(ctx, gomock.Any(), cluster.KubeconfigFile, clusterSpec.Cluster.GetNamespace()).Times(3) + kubectl.EXPECT().GetEksaCluster(ctx, cluster, clusterSpec.Cluster.GetName()).Return(clusterSpec.Cluster.DeepCopy(), nil).Times(3) + kubectl.EXPECT().GetEksaVSphereMachineConfig(ctx, gomock.Any(), cluster.KubeconfigFile, clusterSpec.Cluster.GetNamespace()).Times(5) err := provider.SetupAndValidateUpgradeCluster(ctx, cluster, clusterSpec, clusterSpec) if err != nil { @@ -1842,8 +1846,8 @@ func TestSetupAndValidateUpgradeClusterEtcdSshNotExists(t *testing.T) { provider.providerKubectlClient = kubectl cluster := &types.Cluster{} - kubectl.EXPECT().GetEksaCluster(ctx, cluster, clusterSpec.Cluster.GetName()).Return(clusterSpec.Cluster.DeepCopy(), nil).Times(2) - kubectl.EXPECT().GetEksaVSphereMachineConfig(ctx, gomock.Any(), cluster.KubeconfigFile, clusterSpec.Cluster.GetNamespace()).Times(3) + kubectl.EXPECT().GetEksaCluster(ctx, cluster, clusterSpec.Cluster.GetName()).Return(clusterSpec.Cluster.DeepCopy(), nil).Times(3) + kubectl.EXPECT().GetEksaVSphereMachineConfig(ctx, gomock.Any(), cluster.KubeconfigFile, clusterSpec.Cluster.GetNamespace()).Times(5) err := provider.SetupAndValidateUpgradeCluster(ctx, cluster, clusterSpec, clusterSpec) if err != nil { @@ -1862,13 +1866,15 @@ func TestSetupAndValidateUpgradeClusterSameMachineConfigforCPandEtcd(t *testing. mockCtrl := gomock.NewController(t) kubectl := mocks.NewMockProviderKubectlClient(mockCtrl) provider.providerKubectlClient = kubectl - cluster := &types.Cluster{} - kubectl.EXPECT().GetEksaCluster(ctx, cluster, clusterSpec.Cluster.GetName()).Return(clusterSpec.Cluster.DeepCopy(), nil).Times(2) + kubectl.EXPECT().GetEksaCluster(ctx, cluster, clusterSpec.Cluster.GetName()).Return(clusterSpec.Cluster.DeepCopy(), nil).Times(3) for _, mc := range clusterSpec.VSphereMachineConfigs { - kubectl.EXPECT().GetEksaVSphereMachineConfig(ctx, gomock.Any(), cluster.KubeconfigFile, clusterSpec.Cluster.GetNamespace()).Return(mc, nil) + kubectl.EXPECT().GetEksaVSphereMachineConfig(ctx, gomock.Any(), cluster.KubeconfigFile, clusterSpec.Cluster.GetNamespace()).Return(mc, nil).AnyTimes() } - + // resourcePoolResponse := map[string]int{ + // "Memory_Available": -1, + // } + // provider.govc.EXPECT().GetResourcePoolInfo(tt.ctx, tt.clusterSpec.VSphereDatacenter.Spec.Datacenter, tt.clusterSpec.VSphereMachineConfigs[controlPlaneMachineConfigName].Spec.ResourcePool).Return(resourcePoolResponse, nil) err := provider.SetupAndValidateUpgradeCluster(ctx, cluster, clusterSpec, clusterSpec) if err != nil { t.Fatalf("unexpected failure %v", err) @@ -2338,6 +2344,10 @@ func TestSetupAndValidateCreateClusterFullCloneDiskGiBLessThan20TemplateDiskSize tt.govc.EXPECT().GetWorkloadAvailableSpace(tt.ctx, tt.clusterSpec.VSphereMachineConfigs[controlPlaneMachineConfigName].Spec.Datastore).Return(100.0, nil) tt.ipValidator.EXPECT().ValidateControlPlaneIPUniqueness(tt.cluster) + resourcePoolResponse := map[string]int{ + "Memory_Available": -1, + } + tt.govc.EXPECT().GetResourcePoolInfo(tt.ctx, tt.clusterSpec.VSphereDatacenter.Spec.Datacenter, tt.clusterSpec.VSphereMachineConfigs[controlPlaneMachineConfigName].Spec.ResourcePool).Return(resourcePoolResponse, nil) err := tt.provider.SetupAndValidateCreateCluster(context.Background(), tt.clusterSpec) assert.NoError(t, err, "No error expected for provider.SetupAndValidateCreateCluster()") @@ -3472,6 +3482,121 @@ func TestValidateMachineConfigsDatastoreUsageUpgradeError(t *testing.T) { thenErrorExpected(t, fmt.Sprintf("not enough space in datastore %s for given diskGiB and count for respective machine groups", tt.clusterSpec.VSphereMachineConfigs[tt.clusterSpec.Cluster.Spec.ControlPlaneConfiguration.MachineGroupRef.Name].Spec.Datastore), err) } +func TestValidateMachineConfigsMemoryUsageCreateSuccess(t *testing.T) { + tt := newProviderTest(t) + machineConfigs := tt.clusterSpec.VSphereMachineConfigs + datacenter := tt.clusterSpec.VSphereDatacenter.Spec.Datacenter + machineConfigs[tt.clusterSpec.Cluster.Spec.ExternalEtcdConfiguration.MachineGroupRef.Name].Spec.ResourcePool = "test-resourcepool" + for _, config := range machineConfigs { + tt.govc.EXPECT().GetResourcePoolInfo(tt.ctx, datacenter, config.Spec.ResourcePool).Return(map[string]int{MemoryAvailable: -1}, nil) + } + vSpec := NewSpec(tt.clusterSpec) + err := tt.provider.validateMemoryUsageForCreate(tt.ctx, vSpec) + if err != nil { + t.Fatalf("unexpected failure %v", err) + } +} + +func TestValidateMachineConfigsMemoryUsageCreateError(t *testing.T) { + tt := newProviderTest(t) + machineConfigs := tt.clusterSpec.VSphereMachineConfigs + datacenter := tt.clusterSpec.VSphereDatacenter.Spec.Datacenter + for _, config := range machineConfigs { + tt.govc.EXPECT().GetResourcePoolInfo(tt.ctx, datacenter, config.Spec.ResourcePool).Return(map[string]int{MemoryAvailable: 100000}, nil) + } + vSpec := NewSpec(tt.clusterSpec) + err := tt.provider.validateMemoryUsageForCreate(tt.ctx, vSpec) + resourcePool := machineConfigs[tt.clusterSpec.Cluster.Spec.ExternalEtcdConfiguration.MachineGroupRef.Name].Spec.ResourcePool + thenErrorExpected(t, fmt.Sprintf("not enough memory avaialable in resource pool %v for given memoryMiB and count for respective machine groups", resourcePool), err) +} + +func TestSetupAndValidateCreateClusterMemoryUsageError(t *testing.T) { + tt := newProviderTest(t) + tt.setExpectationForSetup() + tt.setExpectationForVCenterValidation() + tt.setExpectationsForDefaultDiskAndCloneModeGovcCalls() + tt.setExpectationsForMachineConfigsVCenterValidation() + datacenter := tt.clusterSpec.VSphereDatacenter.Spec.Datacenter + cpMachineConfig := tt.machineConfigs[tt.clusterSpec.Cluster.Spec.ControlPlaneConfiguration.MachineGroupRef.Name] + for _, mc := range tt.machineConfigs { + tt.govc.EXPECT().SearchTemplate(tt.ctx, tt.datacenterConfig.Spec.Datacenter, mc.Spec.Template).Return(mc.Spec.Template, nil).AnyTimes() + } + tt.govc.EXPECT().GetTags(tt.ctx, cpMachineConfig.Spec.Template).Return([]string{eksd119ReleaseTag, ubuntuOSTag}, nil) + tt.govc.EXPECT().ListTags(tt.ctx) + tt.govc.EXPECT().GetWorkloadAvailableSpace(tt.ctx, cpMachineConfig.Spec.Datastore).Return(1000.0, nil).AnyTimes() + tt.govc.EXPECT().GetResourcePoolInfo(tt.ctx, datacenter, cpMachineConfig.Spec.ResourcePool).Return(nil, fmt.Errorf("error")) + err := tt.provider.SetupAndValidateCreateCluster(tt.ctx, tt.clusterSpec) + thenErrorExpected(t, "validating vsphere machine configs resource pool memory usage: calculating memory usage for control plane: error", err) +} + +func TestValidateMachineConfigsMemoryUsageUpgradeSuccess(t *testing.T) { + tt := newProviderTest(t) + cluster := &types.Cluster{ + Name: "test", + } + tt.kubectl.EXPECT().GetEksaCluster(tt.ctx, cluster, tt.clusterSpec.Cluster.GetName()).Return(tt.clusterSpec.Cluster.DeepCopy(), nil) + vSpec := NewSpec(tt.clusterSpec) + vSpec.Cluster.Spec.ControlPlaneConfiguration.Count += 2 + // change the worker node group to test there is no negative count scenario + wnMachineConfig := getMachineConfig(vSpec.Spec, "test-wn") + newMachineConfigName := "new-test-wn" + newWorkerMachineConfig := wnMachineConfig.DeepCopy() + newWorkerMachineConfig.Name = newMachineConfigName + vSpec.VSphereMachineConfigs[newMachineConfigName] = newWorkerMachineConfig + vSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].MachineGroupRef.Name = newMachineConfigName + machineConfigs := tt.clusterSpec.VSphereMachineConfigs + datacenter := tt.clusterSpec.VSphereDatacenter.Spec.Datacenter + for _, config := range machineConfigs { + tt.kubectl.EXPECT().GetEksaVSphereMachineConfig(tt.ctx, config.Name, cluster.KubeconfigFile, config.Namespace).Return(config, nil).AnyTimes() + tt.govc.EXPECT().GetResourcePoolInfo(tt.ctx, datacenter, config.Spec.ResourcePool).Return(map[string]int{MemoryAvailable: -1}, nil).AnyTimes() + } + err := tt.provider.validateMemoryUsageForUpgrade(tt.ctx, vSpec, cluster) + if err != nil { + t.Fatalf("unexpected failure %v", err) + } +} + +func TestValidateMachineConfigsMemoryUsageUpgradeError(t *testing.T) { + tt := newProviderTest(t) + cluster := &types.Cluster{ + Name: "test", + } + tt.kubectl.EXPECT().GetEksaCluster(tt.ctx, cluster, tt.clusterSpec.Cluster.GetName()).Return(tt.clusterSpec.Cluster.DeepCopy(), nil) + machineConfigs := tt.clusterSpec.VSphereMachineConfigs + datacenter := tt.clusterSpec.VSphereDatacenter.Spec.Datacenter + for _, config := range machineConfigs { + tt.kubectl.EXPECT().GetEksaVSphereMachineConfig(tt.ctx, config.Name, cluster.KubeconfigFile, config.Namespace).AnyTimes() + tt.govc.EXPECT().GetResourcePoolInfo(tt.ctx, datacenter, config.Spec.ResourcePool).Return(map[string]int{MemoryAvailable: 10000}, nil) + } + vSpec := NewSpec(tt.clusterSpec) + vSpec.Cluster.Spec.ControlPlaneConfiguration.Count += 2 + err := tt.provider.validateMemoryUsageForUpgrade(tt.ctx, vSpec, cluster) + resourcePool := machineConfigs[tt.clusterSpec.Cluster.Spec.ExternalEtcdConfiguration.MachineGroupRef.Name].Spec.ResourcePool + thenErrorExpected(t, fmt.Sprintf("not enough memory avaialable in resource pool %v for given memoryMiB and count for respective machine groups", resourcePool), err) +} + +func TestSetupAndValidateUpgradeClusterMemoryUsageError(t *testing.T) { + tt := newProviderTest(t) + cluster := &types.Cluster{ + Name: "test", + } + tt.setExpectationForSetup() + tt.setExpectationForVCenterValidation() + tt.setExpectationsForDefaultDiskAndCloneModeGovcCalls() + tt.setExpectationsForMachineConfigsVCenterValidation() + tt.kubectl.EXPECT().GetEksaCluster(tt.ctx, cluster, tt.clusterSpec.Cluster.GetName()).Return(tt.clusterSpec.Cluster.DeepCopy(), nil).Times(2) + tt.kubectl.EXPECT().GetEksaVSphereMachineConfig(tt.ctx, gomock.Any(), cluster.KubeconfigFile, tt.clusterSpec.Cluster.GetNamespace()).AnyTimes() + cpMachineConfig := tt.machineConfigs[tt.clusterSpec.Cluster.Spec.ControlPlaneConfiguration.MachineGroupRef.Name] + tt.govc.EXPECT().SearchTemplate(tt.ctx, tt.datacenterConfig.Spec.Datacenter, cpMachineConfig.Spec.Template).Return(cpMachineConfig.Spec.Template, nil).AnyTimes() + tt.govc.EXPECT().GetTags(tt.ctx, cpMachineConfig.Spec.Template).Return([]string{eksd119ReleaseTag, ubuntuOSTag}, nil) + tt.govc.EXPECT().ListTags(tt.ctx) + tt.govc.EXPECT().GetWorkloadAvailableSpace(tt.ctx, cpMachineConfig.Spec.Datastore).Return(1000.0, nil).AnyTimes() + datacenter := tt.clusterSpec.VSphereDatacenter.Spec.Datacenter + tt.govc.EXPECT().GetResourcePoolInfo(tt.ctx, datacenter, cpMachineConfig.Spec.ResourcePool).Return(nil, fmt.Errorf("error")) + err := tt.provider.SetupAndValidateUpgradeCluster(tt.ctx, cluster, tt.clusterSpec, tt.clusterSpec) + thenErrorExpected(t, "validating vsphere machine configs resource pool memory usage: calculating memory usage for control plane: error", err) +} + func TestValidateMachineConfigsNameUniquenessSuccess(t *testing.T) { tt := newProviderTest(t) cluster := &types.Cluster{ From 01e748daf54d9b38b3dd3f7655031d6dc917cd08 Mon Sep 17 00:00:00 2001 From: Rahul Ganesh Date: Sun, 17 Sep 2023 22:19:27 -0700 Subject: [PATCH 2/3] remove commented lines and add functional comments Signed-off-by: Rahul Ganesh --- pkg/executables/govc.go | 3 ++- pkg/providers/vsphere/vsphere_test.go | 4 ---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pkg/executables/govc.go b/pkg/executables/govc.go index 96035be71a38..6f3c653ae224 100644 --- a/pkg/executables/govc.go +++ b/pkg/executables/govc.go @@ -1188,7 +1188,7 @@ func (g *Govc) GetResourcePoolInfo(ctx context.Context, datacenter, resourcepool return poolInfo, nil } -// helper function that parses the resource pool response and returns memory requirements. +// getPoolInfo parses resource pool response and returns memory requirements. func getPoolInfo(rp *resourcePool) (map[string]int, error) { memoryUsed, err := getValueFromString(rp.memoryUsage) if err != nil { @@ -1207,6 +1207,7 @@ func getPoolInfo(rp *resourcePool) (map[string]int, error) { return poolInfo, nil } +// getValueFromString cleans the input string and returns the extracted numerical value. func getValueFromString(str string) (int, error) { splitResponse := strings.Split(strings.TrimSpace(str), " ") nonNumericRegex := regexp.MustCompile(`[^0-9- ]+`) diff --git a/pkg/providers/vsphere/vsphere_test.go b/pkg/providers/vsphere/vsphere_test.go index 97d090f3c2e4..73c6cacd618b 100644 --- a/pkg/providers/vsphere/vsphere_test.go +++ b/pkg/providers/vsphere/vsphere_test.go @@ -1871,10 +1871,6 @@ func TestSetupAndValidateUpgradeClusterSameMachineConfigforCPandEtcd(t *testing. for _, mc := range clusterSpec.VSphereMachineConfigs { kubectl.EXPECT().GetEksaVSphereMachineConfig(ctx, gomock.Any(), cluster.KubeconfigFile, clusterSpec.Cluster.GetNamespace()).Return(mc, nil).AnyTimes() } - // resourcePoolResponse := map[string]int{ - // "Memory_Available": -1, - // } - // provider.govc.EXPECT().GetResourcePoolInfo(tt.ctx, tt.clusterSpec.VSphereDatacenter.Spec.Datacenter, tt.clusterSpec.VSphereMachineConfigs[controlPlaneMachineConfigName].Spec.ResourcePool).Return(resourcePoolResponse, nil) err := provider.SetupAndValidateUpgradeCluster(ctx, cluster, clusterSpec, clusterSpec) if err != nil { t.Fatalf("unexpected failure %v", err) From a30e86a9bd38c55995a9f7213d2cf5c9b07919dd Mon Sep 17 00:00:00 2001 From: Rahul Ganesh Date: Wed, 20 Sep 2023 14:26:11 -0700 Subject: [PATCH 3/3] Add a helper function that returns Machine Config with count. Use the helper function in validation to simplify the algorithm Signed-off-by: Rahul Ganesh --- pkg/providers/vsphere/spec.go | 30 ++++++ pkg/providers/vsphere/vsphere.go | 149 +++++++++----------------- pkg/providers/vsphere/vsphere_test.go | 16 +-- 3 files changed, 88 insertions(+), 107 deletions(-) diff --git a/pkg/providers/vsphere/spec.go b/pkg/providers/vsphere/spec.go index abc3114a2883..3e24135cb347 100644 --- a/pkg/providers/vsphere/spec.go +++ b/pkg/providers/vsphere/spec.go @@ -37,6 +37,36 @@ func (s *Spec) machineConfigs() []*anywherev1.VSphereMachineConfig { return machineConfigs } +// MachineConfigCount represents a machineConfig with it's associated count. +type MachineConfigCount struct { + *anywherev1.VSphereMachineConfig + Count int +} + +func (s *Spec) machineConfigsWithCount() []MachineConfigCount { + machineConfigs := make([]MachineConfigCount, 0, len(s.VSphereMachineConfigs)) + cpMachineConfig := MachineConfigCount{ + VSphereMachineConfig: s.controlPlaneMachineConfig(), + Count: s.Cluster.Spec.ControlPlaneConfiguration.Count, + } + machineConfigs = append(machineConfigs, cpMachineConfig) + if s.etcdMachineConfig() != nil { + etcdMachineConfig := MachineConfigCount{ + VSphereMachineConfig: s.etcdMachineConfig(), + Count: s.Cluster.Spec.ExternalEtcdConfiguration.Count, + } + machineConfigs = append(machineConfigs, etcdMachineConfig) + } + for _, wc := range s.Cluster.Spec.WorkerNodeGroupConfigurations { + workerNodeGroupConfig := MachineConfigCount{ + VSphereMachineConfig: s.workerMachineConfig(wc), + Count: *wc.Count, + } + machineConfigs = append(machineConfigs, workerNodeGroupConfig) + } + return machineConfigs +} + func etcdMachineConfig(s *cluster.Spec) *anywherev1.VSphereMachineConfig { if s.Cluster.Spec.ExternalEtcdConfiguration == nil || s.Cluster.Spec.ExternalEtcdConfiguration.MachineGroupRef == nil { return nil diff --git a/pkg/providers/vsphere/vsphere.go b/pkg/providers/vsphere/vsphere.go index 9e4066fdfd01..05ae41d77dc4 100644 --- a/pkg/providers/vsphere/vsphere.go +++ b/pkg/providers/vsphere/vsphere.go @@ -340,7 +340,7 @@ func (p *vsphereProvider) SetupAndValidateCreateCluster(ctx context.Context, clu if err := p.validateDatastoreUsageForCreate(ctx, vSphereClusterSpec); err != nil { return fmt.Errorf("validating vsphere machine configs datastore usage: %v", err) } - if err := p.validateMemoryUsageForCreate(ctx, vSphereClusterSpec); err != nil { + if err := p.validateMemoryUsage(ctx, vSphereClusterSpec, nil); err != nil { return fmt.Errorf("validating vsphere machine configs resource pool memory usage: %v", err) } if err := p.generateSSHKeysIfNotSet(clusterSpec.VSphereMachineConfigs); err != nil { @@ -423,7 +423,7 @@ func (p *vsphereProvider) SetupAndValidateUpgradeCluster(ctx context.Context, cl return fmt.Errorf("validating vsphere machine configs datastore usage: %v", err) } - if err := p.validateMemoryUsageForUpgrade(ctx, vSphereClusterSpec, cluster); err != nil { + if err := p.validateMemoryUsage(ctx, vSphereClusterSpec, cluster); err != nil { return fmt.Errorf("validating vsphere machine configs resource pool memory usage: %v", err) } @@ -598,147 +598,98 @@ func (p *vsphereProvider) validateDatastoreUsageForCreate(ctx context.Context, v return nil } -type memoryUsage struct { - availableMemoryMiB int - needMemoryMiB int -} - -func (p *vsphereProvider) getPrevMachineConfigMemoryUsage(ctx context.Context, mc *v1alpha1.VSphereMachineConfig, cluster *types.Cluster, count int) (memoryMiB int, err error) { +// getPrevMachineConfigMemoryUsage returns the memoryMiB freed up from the given machineConfig based on the count. +func (p *vsphereProvider) getPrevMachineConfigMemoryUsage(ctx context.Context, mc *v1alpha1.VSphereMachineConfig, cluster *types.Cluster, machineConfigCount int) (memoryMiB int, err error) { em, err := p.providerKubectlClient.GetEksaVSphereMachineConfig(ctx, mc.Name, cluster.KubeconfigFile, mc.GetNamespace()) if err != nil { return 0, err } if em != nil && em.Spec.ResourcePool == mc.Spec.ResourcePool { - return em.Spec.MemoryMiB * em.Spec.NumCPUs * count, nil + return em.Spec.MemoryMiB * machineConfigCount, nil } return 0, nil } -func (p *vsphereProvider) getMachineConfigMemoryRequirements(ctx context.Context, dc string, mc *v1alpha1.VSphereMachineConfig, count int) (available int, need int, err error) { - poolInfo, err := p.providerGovcClient.GetResourcePoolInfo(ctx, dc, mc.Spec.ResourcePool) +// getMachineConfigMemoryAvailability accepts a machine config and returns available memory in the config's resource pool along with needed memory for the machine config. +func (p *vsphereProvider) getMachineConfigMemoryAvailability(ctx context.Context, datacenter string, mc *v1alpha1.VSphereMachineConfig, machineConfigCount int) (availableMemoryMiB, needMemoryMiB int, err error) { + poolInfo, err := p.providerGovcClient.GetResourcePoolInfo(ctx, datacenter, mc.Spec.ResourcePool) if err != nil { return 0, 0, err } - needMemoryMiB := mc.Spec.MemoryMiB * mc.Spec.NumCPUs * count + needMemoryMiB = mc.Spec.MemoryMiB * machineConfigCount return poolInfo[MemoryAvailable], needMemoryMiB, nil } -func (p *vsphereProvider) calculateResourcePoolMemoryUsage(ctx context.Context, dc string, mc *v1alpha1.VSphereMachineConfig, cluster *types.Cluster, mu map[string]*memoryUsage, prevCount, newCount int) error { - availableMemoryMiB, needMemoryMiB, err := p.getMachineConfigMemoryRequirements(ctx, dc, mc, newCount) - if err != nil { - return err - } - - // the last old machine is deleted only after the desired number of new machines are rolled out, so reduce 1 count to accommodate for the last old machine which is not deleted until the end of rollout - prevUsage, err := p.getPrevMachineConfigMemoryUsage(ctx, mc, cluster, prevCount-1) - if err != nil { - return err +// updateMemoryUsageMap updates the memory availability for the machine config's resource pool. +func updateMemoryUsageMap(mc *v1alpha1.VSphereMachineConfig, needMiB, availableMiB int, mu map[string]int) { + if _, ok := mu[mc.Spec.ResourcePool]; !ok { + mu[mc.Spec.ResourcePool] = availableMiB } - // update the available memory with previous usage only when memory limit is set - if availableMemoryMiB != -1 { - availableMemoryMiB += prevUsage + // needMiB can be ignored when the resource pool memory limit is unset + if availableMiB != -1 { + mu[mc.Spec.ResourcePool] -= needMiB } - updateMemoryUsageMap(mc, needMemoryMiB, availableMemoryMiB, prevUsage, mu) - return nil } -func updateMemoryUsageMap(mc *v1alpha1.VSphereMachineConfig, needMiB, availableMiB, prevUsage int, mu map[string]*memoryUsage) { - if _, ok := mu[mc.Spec.ResourcePool]; ok && mu[mc.Spec.ResourcePool].availableMemoryMiB != -1 { - mu[mc.Spec.ResourcePool].needMemoryMiB += needMiB - mu[mc.Spec.ResourcePool].availableMemoryMiB += prevUsage - } else { - mu[mc.Spec.ResourcePool] = &memoryUsage{ - availableMemoryMiB: availableMiB, - needMemoryMiB: needMiB, - } +func addPrevMachineConfigMemoryUsage(mc *v1alpha1.VSphereMachineConfig, prevUsage int, memoryUsage map[string]int) { + // when the memory limit for the respective resource pool is unset, skip accounting for previous usage and validating the needed memory + if _, ok := memoryUsage[mc.Spec.ResourcePool]; ok && memoryUsage[mc.Spec.ResourcePool] != -1 { + memoryUsage[mc.Spec.ResourcePool] += prevUsage } } -func (p *vsphereProvider) validateMemoryUsageForCreate(ctx context.Context, clusterSpec *Spec) error { - memoryUsage := make(map[string]*memoryUsage) +func (p *vsphereProvider) validateMemoryUsage(ctx context.Context, clusterSpec *Spec, cluster *types.Cluster) error { + memoryUsage := make(map[string]int) datacenter := clusterSpec.VSphereDatacenter.Spec.Datacenter - cpMachineConfig := clusterSpec.controlPlaneMachineConfig() - controlPlaneAvailableMiB, controlPlaneNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, cpMachineConfig, clusterSpec.Cluster.Spec.ControlPlaneConfiguration.Count) - if err != nil { - return fmt.Errorf("calculating memory usage for control plane: %v", err) - } - updateMemoryUsageMap(cpMachineConfig, controlPlaneNeedMiB, controlPlaneAvailableMiB, 0, memoryUsage) - for _, workerNodeGroupConfiguration := range clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations { - workerMachineConfig := clusterSpec.workerMachineConfig(workerNodeGroupConfiguration) - workerAvailableMiB, workerNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, workerMachineConfig, *workerNodeGroupConfiguration.Count) + for _, mc := range clusterSpec.machineConfigsWithCount() { + availableMemoryMiB, needMemoryMiB, err := p.getMachineConfigMemoryAvailability(ctx, datacenter, mc.VSphereMachineConfig, mc.Count) if err != nil { - return fmt.Errorf("calculating memory usage for worker node groups: %v", err) + return fmt.Errorf("calculating memory usage for machine config %v: %v", mc.VSphereMachineConfig.ObjectMeta.Name, err) } - updateMemoryUsageMap(workerMachineConfig, workerNeedMiB, workerAvailableMiB, 0, memoryUsage) + updateMemoryUsageMap(mc.VSphereMachineConfig, needMemoryMiB, availableMemoryMiB, memoryUsage) } - etcdMachineConfig := clusterSpec.etcdMachineConfig() - if etcdMachineConfig != nil { - etcdAvailableMiB, etcdNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, etcdMachineConfig, clusterSpec.Cluster.Spec.ExternalEtcdConfiguration.Count) + // account for previous cluster resources that are freed up during upgrade. + if cluster != nil { + err := p.updatePrevClusterMemoryUsage(ctx, clusterSpec, cluster, memoryUsage) if err != nil { return err } - updateMemoryUsageMap(etcdMachineConfig, etcdNeedMiB, etcdAvailableMiB, 0, memoryUsage) } - for resourcePool, usage := range memoryUsage { - if usage.availableMemoryMiB != -1 { - if usage.needMemoryMiB > usage.availableMemoryMiB { - return fmt.Errorf("not enough memory avaialable in resource pool %v for given memoryMiB and count for respective machine groups", resourcePool) - } + for resourcePool, remaniningMiB := range memoryUsage { + if remaniningMiB != -1 && remaniningMiB < 0 { + return fmt.Errorf("not enough memory avaialable in resource pool %v for given memoryMiB and count for respective machine groups", resourcePool) } } logger.V(5).Info("Memory availability for machine configs in requested resource pool validated") return nil } -func (p *vsphereProvider) validateMemoryUsageForUpgrade(ctx context.Context, currentClusterSpec *Spec, cluster *types.Cluster) error { - memoryUsage := make(map[string]*memoryUsage) - prevEksaCluster, err := p.providerKubectlClient.GetEksaCluster(ctx, cluster, currentClusterSpec.Cluster.GetName()) +// updatePrevClusterMemoryUsage calculates memory freed up from previous CP and worker nodes during upgrade and adds up the memory usage for the specific resource pool. +func (p *vsphereProvider) updatePrevClusterMemoryUsage(ctx context.Context, clusterSpec *Spec, cluster *types.Cluster, memoryUsage map[string]int) error { + prevEksaCluster, err := p.providerKubectlClient.GetEksaCluster(ctx, cluster, clusterSpec.Cluster.GetName()) if err != nil { return err } - - datacenter := currentClusterSpec.VSphereDatacenter.Spec.Datacenter - cpMachineConfig := currentClusterSpec.controlPlaneMachineConfig() - if err := p.calculateResourcePoolMemoryUsage(ctx, datacenter, cpMachineConfig, cluster, memoryUsage, prevEksaCluster.Spec.ControlPlaneConfiguration.Count, currentClusterSpec.Cluster.Spec.ControlPlaneConfiguration.Count); err != nil { - return fmt.Errorf("calculating memory usage for control plane: %v", err) - } - prevMachineConfigRefs := machineRefSliceToMap(prevEksaCluster.MachineConfigRefs()) - if err := p.getWorkerNodeGroupMemoryUsage(ctx, datacenter, currentClusterSpec, cluster, memoryUsage, prevMachineConfigRefs); err != nil { - return fmt.Errorf("calculating memory usage for worker node groups: %v", err) - } - - etcdMachineConfig := currentClusterSpec.etcdMachineConfig() - if etcdMachineConfig != nil { - etcdAvailableMiB, etcdNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, etcdMachineConfig, currentClusterSpec.Cluster.Spec.ExternalEtcdConfiguration.Count) + if _, ok := prevMachineConfigRefs[clusterSpec.Cluster.Spec.ControlPlaneConfiguration.MachineGroupRef.Name]; ok { + cpMachineConfig := clusterSpec.controlPlaneMachineConfig() + // The last CP machine is deleted only after the desired number of new worker machines are rolled out, so don't add it's memory + prevCPusage, err := p.getPrevMachineConfigMemoryUsage(ctx, cpMachineConfig, cluster, prevEksaCluster.Spec.ControlPlaneConfiguration.Count-1) if err != nil { - return err - } - // older etcd machines are not deleted until the end of rollout so do not account for the previous usage - updateMemoryUsageMap(etcdMachineConfig, etcdNeedMiB, etcdAvailableMiB, 0, memoryUsage) - } - - for resourcePool, usage := range memoryUsage { - if usage.availableMemoryMiB != -1 && usage.needMemoryMiB > usage.availableMemoryMiB { - return fmt.Errorf("not enough memory avaialable in resource pool %v for given memoryMiB and count for respective machine groups", resourcePool) + return fmt.Errorf("calculating previous memory usage for control plane: %v", err) } + addPrevMachineConfigMemoryUsage(cpMachineConfig, prevCPusage, memoryUsage) } - logger.V(5).Info("Memory availability for machine configs in requested resource pool validated") - return nil -} - -func (p *vsphereProvider) getWorkerNodeGroupMemoryUsage(ctx context.Context, datacenter string, currentClusterSpec *Spec, cluster *types.Cluster, memoryUsage map[string]*memoryUsage, prevMachineConfigRefs map[string]v1alpha1.Ref) error { - for _, workerNodeGroupConfiguration := range currentClusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations { - prevCount := 0 - workerMachineConfig := currentClusterSpec.workerMachineConfig(workerNodeGroupConfiguration) + for _, workerNodeGroupConfiguration := range clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations { + workerMachineConfig := clusterSpec.workerMachineConfig(workerNodeGroupConfiguration) if _, ok := prevMachineConfigRefs[workerNodeGroupConfiguration.MachineGroupRef.Name]; ok { - prevCount = *workerNodeGroupConfiguration.Count - } else { - // set count to 1 when no previous machine config was found for the current worker node group in the spec to avoid a negative count in calculation - prevCount = 1 - } - if err := p.calculateResourcePoolMemoryUsage(ctx, datacenter, workerMachineConfig, cluster, memoryUsage, prevCount, *workerNodeGroupConfiguration.Count); err != nil { - return fmt.Errorf("calculating memory usage: %v", err) + prevCount := *workerNodeGroupConfiguration.Count + // The last worker machine is deleted only after the desired number of new worker machines are rolled out, so don't add it's memory + prevWorkerUsage, err := p.getPrevMachineConfigMemoryUsage(ctx, workerMachineConfig, cluster, prevCount-1) + if err != nil { + return fmt.Errorf("calculating previous memory usage for worker node group - %v: %v", workerMachineConfig.Name, err) + } + addPrevMachineConfigMemoryUsage(workerMachineConfig, prevWorkerUsage, memoryUsage) } } return nil diff --git a/pkg/providers/vsphere/vsphere_test.go b/pkg/providers/vsphere/vsphere_test.go index 73c6cacd618b..fa2f5e2df6f4 100644 --- a/pkg/providers/vsphere/vsphere_test.go +++ b/pkg/providers/vsphere/vsphere_test.go @@ -3487,7 +3487,7 @@ func TestValidateMachineConfigsMemoryUsageCreateSuccess(t *testing.T) { tt.govc.EXPECT().GetResourcePoolInfo(tt.ctx, datacenter, config.Spec.ResourcePool).Return(map[string]int{MemoryAvailable: -1}, nil) } vSpec := NewSpec(tt.clusterSpec) - err := tt.provider.validateMemoryUsageForCreate(tt.ctx, vSpec) + err := tt.provider.validateMemoryUsage(tt.ctx, vSpec, nil) if err != nil { t.Fatalf("unexpected failure %v", err) } @@ -3498,10 +3498,10 @@ func TestValidateMachineConfigsMemoryUsageCreateError(t *testing.T) { machineConfigs := tt.clusterSpec.VSphereMachineConfigs datacenter := tt.clusterSpec.VSphereDatacenter.Spec.Datacenter for _, config := range machineConfigs { - tt.govc.EXPECT().GetResourcePoolInfo(tt.ctx, datacenter, config.Spec.ResourcePool).Return(map[string]int{MemoryAvailable: 100000}, nil) + tt.govc.EXPECT().GetResourcePoolInfo(tt.ctx, datacenter, config.Spec.ResourcePool).Return(map[string]int{MemoryAvailable: 10000}, nil) } vSpec := NewSpec(tt.clusterSpec) - err := tt.provider.validateMemoryUsageForCreate(tt.ctx, vSpec) + err := tt.provider.validateMemoryUsage(tt.ctx, vSpec, nil) resourcePool := machineConfigs[tt.clusterSpec.Cluster.Spec.ExternalEtcdConfiguration.MachineGroupRef.Name].Spec.ResourcePool thenErrorExpected(t, fmt.Sprintf("not enough memory avaialable in resource pool %v for given memoryMiB and count for respective machine groups", resourcePool), err) } @@ -3522,7 +3522,7 @@ func TestSetupAndValidateCreateClusterMemoryUsageError(t *testing.T) { tt.govc.EXPECT().GetWorkloadAvailableSpace(tt.ctx, cpMachineConfig.Spec.Datastore).Return(1000.0, nil).AnyTimes() tt.govc.EXPECT().GetResourcePoolInfo(tt.ctx, datacenter, cpMachineConfig.Spec.ResourcePool).Return(nil, fmt.Errorf("error")) err := tt.provider.SetupAndValidateCreateCluster(tt.ctx, tt.clusterSpec) - thenErrorExpected(t, "validating vsphere machine configs resource pool memory usage: calculating memory usage for control plane: error", err) + thenErrorExpected(t, "validating vsphere machine configs resource pool memory usage: calculating memory usage for machine config test-cp: error", err) } func TestValidateMachineConfigsMemoryUsageUpgradeSuccess(t *testing.T) { @@ -3546,7 +3546,7 @@ func TestValidateMachineConfigsMemoryUsageUpgradeSuccess(t *testing.T) { tt.kubectl.EXPECT().GetEksaVSphereMachineConfig(tt.ctx, config.Name, cluster.KubeconfigFile, config.Namespace).Return(config, nil).AnyTimes() tt.govc.EXPECT().GetResourcePoolInfo(tt.ctx, datacenter, config.Spec.ResourcePool).Return(map[string]int{MemoryAvailable: -1}, nil).AnyTimes() } - err := tt.provider.validateMemoryUsageForUpgrade(tt.ctx, vSpec, cluster) + err := tt.provider.validateMemoryUsage(tt.ctx, vSpec, cluster) if err != nil { t.Fatalf("unexpected failure %v", err) } @@ -3566,7 +3566,7 @@ func TestValidateMachineConfigsMemoryUsageUpgradeError(t *testing.T) { } vSpec := NewSpec(tt.clusterSpec) vSpec.Cluster.Spec.ControlPlaneConfiguration.Count += 2 - err := tt.provider.validateMemoryUsageForUpgrade(tt.ctx, vSpec, cluster) + err := tt.provider.validateMemoryUsage(tt.ctx, vSpec, cluster) resourcePool := machineConfigs[tt.clusterSpec.Cluster.Spec.ExternalEtcdConfiguration.MachineGroupRef.Name].Spec.ResourcePool thenErrorExpected(t, fmt.Sprintf("not enough memory avaialable in resource pool %v for given memoryMiB and count for respective machine groups", resourcePool), err) } @@ -3580,7 +3580,7 @@ func TestSetupAndValidateUpgradeClusterMemoryUsageError(t *testing.T) { tt.setExpectationForVCenterValidation() tt.setExpectationsForDefaultDiskAndCloneModeGovcCalls() tt.setExpectationsForMachineConfigsVCenterValidation() - tt.kubectl.EXPECT().GetEksaCluster(tt.ctx, cluster, tt.clusterSpec.Cluster.GetName()).Return(tt.clusterSpec.Cluster.DeepCopy(), nil).Times(2) + tt.kubectl.EXPECT().GetEksaCluster(tt.ctx, cluster, tt.clusterSpec.Cluster.GetName()).Return(tt.clusterSpec.Cluster.DeepCopy(), nil).Times(1) tt.kubectl.EXPECT().GetEksaVSphereMachineConfig(tt.ctx, gomock.Any(), cluster.KubeconfigFile, tt.clusterSpec.Cluster.GetNamespace()).AnyTimes() cpMachineConfig := tt.machineConfigs[tt.clusterSpec.Cluster.Spec.ControlPlaneConfiguration.MachineGroupRef.Name] tt.govc.EXPECT().SearchTemplate(tt.ctx, tt.datacenterConfig.Spec.Datacenter, cpMachineConfig.Spec.Template).Return(cpMachineConfig.Spec.Template, nil).AnyTimes() @@ -3590,7 +3590,7 @@ func TestSetupAndValidateUpgradeClusterMemoryUsageError(t *testing.T) { datacenter := tt.clusterSpec.VSphereDatacenter.Spec.Datacenter tt.govc.EXPECT().GetResourcePoolInfo(tt.ctx, datacenter, cpMachineConfig.Spec.ResourcePool).Return(nil, fmt.Errorf("error")) err := tt.provider.SetupAndValidateUpgradeCluster(tt.ctx, cluster, tt.clusterSpec, tt.clusterSpec) - thenErrorExpected(t, "validating vsphere machine configs resource pool memory usage: calculating memory usage for control plane: error", err) + thenErrorExpected(t, "validating vsphere machine configs resource pool memory usage: calculating memory usage for machine config test-cp: error", err) } func TestValidateMachineConfigsNameUniquenessSuccess(t *testing.T) {