Skip to content

Commit

Permalink
Validate Memory Availability for machine configs for Vsphere provider…
Browse files Browse the repository at this point in the history
… as a pre-flight check for create and upgrade operations.

Signed-off-by: Rahul Ganesh <rahulgab@amazon.com>
  • Loading branch information
g-gaston authored and Rahul Ganesh committed Sep 26, 2023
1 parent 2e62e98 commit ce65db3
Show file tree
Hide file tree
Showing 5 changed files with 467 additions and 15 deletions.
74 changes: 74 additions & 0 deletions pkg/executables/govc.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"net/http"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -39,6 +40,7 @@ const (
DeployOptsFile = "deploy-opts.json"
disk1 = "Hard disk 1"
disk2 = "Hard disk 2"
MemoryAvailable = "Memory_Available"
)

var requiredEnvs = []string{govcUsernameKey, govcPasswordKey, govcURLKey, govcInsecure, govcDatacenterKey}
Expand Down Expand Up @@ -1143,3 +1145,75 @@ func (g *Govc) SetGroupRoleOnObject(ctx context.Context, principal string, role

return nil
}

type resourcePoolInfo struct {
ResourcePoolIdentifier *resourcePool
}

type resourcePool struct {
memoryUsage string
memoryLimit string
}

// GetResourcePoolInfo returns the pool info for the provided resource pool.
func (g *Govc) GetResourcePoolInfo(ctx context.Context, datacenter, resourcepool string, args ...string) (map[string]int, error) {
params := []string{"pool.info", "-dc", datacenter, resourcepool}
params = append(params, args...)
response, err := g.exec(ctx, params...)
if err != nil {
return nil, fmt.Errorf("getting resource pool information: %v", err)
}

scanner := bufio.NewScanner(strings.NewReader(response.String()))
var resourcePoolInfoResponse resourcePoolInfo
resourcePoolInfoResponse.ResourcePoolIdentifier = new(resourcePool)
for scanner.Scan() {
metaData := scanner.Text()
if strings.Contains(metaData, "Mem Usage") {
resourcePoolInfoResponse.ResourcePoolIdentifier.memoryUsage = strings.Split(metaData, ":")[1]
}
if strings.Contains(metaData, "Mem Limit") {
resourcePoolInfoResponse.ResourcePoolIdentifier.memoryLimit = strings.Split(metaData, ":")[1]
}
}

if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("failure reading memory allocation for resource pool")
}

poolInfo, err := getPoolInfo(resourcePoolInfoResponse.ResourcePoolIdentifier)
if err != nil {
return nil, err
}
return poolInfo, nil
}

// helper function that parses the resource pool response and returns memory requirements.
func getPoolInfo(rp *resourcePool) (map[string]int, error) {
memoryUsed, err := getValueFromString(rp.memoryUsage)
if err != nil {
return nil, fmt.Errorf("unable to obtain memory usage for resource pool %s: %v", rp.memoryUsage, err)
}
memoryLimit, err := getValueFromString(rp.memoryLimit)
if err != nil {
return nil, fmt.Errorf("unable to obtain memory limit for resource pool %s: %v", rp.memoryLimit, err)
}
poolInfo := make(map[string]int)
if memoryLimit != -1 {
poolInfo[MemoryAvailable] = memoryLimit - memoryUsed
} else {
poolInfo[MemoryAvailable] = memoryLimit
}
return poolInfo, nil
}

func getValueFromString(str string) (int, error) {
splitResponse := strings.Split(strings.TrimSpace(str), " ")
nonNumericRegex := regexp.MustCompile(`[^0-9- ]+`)
cleanedString := nonNumericRegex.ReplaceAllString(splitResponse[0], "")
numValue, err := strconv.Atoi(cleanedString)
if err != nil {
return 0, err
}
return numValue, nil
}
79 changes: 79 additions & 0 deletions pkg/executables/govc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1643,3 +1643,82 @@ func TestGovcGetHardDiskSizeError(t *testing.T) {
})
}
}

func TestGovcGetResourcePoolInfo(t *testing.T) {
datacenter := "SDDC-Datacenter"
resourcePool := "*/Resources/Test-ResourcePool"
govcErr := errors.New("error PoolInfo()")
ctx := context.Background()
_, g, executable, env := setup(t)

tests := []struct {
testName string
response string
govcErr error
wantErr error
wantMemInfo map[string]int
}{
{
testName: "pool_info_memory_limit_set",
response: `Name: Test-ResourcePool
Path: /SDDC-Datacenter/host/Cluster-1/Resources/Test-ResourcePool
Mem Usage: 100MB (11.3%)
Mem Shares: normal
Mem Reservation: 0MB (expandable=true)
Mem Limit: 1000MB`,
govcErr: nil,
wantErr: nil,
wantMemInfo: map[string]int{executables.MemoryAvailable: 900},
},
{
testName: "pool_info_memory_limit_unset",
response: `Name: Test-ResourcePool
Path: /SDDC-Datacenter/host/Cluster-1/Resources/Test-ResourcePool
Mem Usage: 100MB (11.3%)
Mem Shares: normal
Mem Reservation: 0MB (expandable=true)
Mem Limit: -1MB`,
govcErr: nil,
wantErr: nil,
wantMemInfo: map[string]int{executables.MemoryAvailable: -1},
},
{
testName: "pool_info_memory_usage_corrupt",
response: `Name: Test-ResourcePool
Mem Usage:corrupt-val
Mem Limit:-1MB`,
govcErr: nil,
wantErr: fmt.Errorf("unable to obtain memory usage for resource pool corrupt-val: strconv.Atoi: parsing \"-\": invalid syntax"),
wantMemInfo: nil,
},
{
testName: "pool_info_memory_limit_corrupt",
response: `Name: Test-ResourcePool
Mem Usage:100
Mem Limit:corrupt-val`,
govcErr: nil,
wantErr: fmt.Errorf("unable to obtain memory limit for resource pool corrupt-val: strconv.Atoi: parsing \"-\": invalid syntax"),
wantMemInfo: nil,
},
{
testName: "pool_info_error",
response: "",
govcErr: govcErr,
wantErr: fmt.Errorf("getting resource pool information: %v", govcErr),
wantMemInfo: nil,
},
}

for _, tt := range tests {
t.Run(tt.testName, func(t *testing.T) {
gt := NewWithT(t)
responseBytes := bytes.NewBuffer([]byte(tt.response))
executable.EXPECT().ExecuteWithEnv(ctx, env, "pool.info", "-dc", datacenter, resourcePool).Return(*responseBytes, tt.govcErr)
poolMemInfo, err := g.GetResourcePoolInfo(ctx, datacenter, resourcePool)
if tt.wantErr != nil {
gt.Expect(err.Error()).To(Equal(tt.wantErr.Error()))
}
gt.Expect(poolMemInfo).To(Equal(tt.wantMemInfo))
})
}
}
20 changes: 20 additions & 0 deletions pkg/providers/vsphere/mocks/client.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

156 changes: 155 additions & 1 deletion pkg/providers/vsphere/vsphere.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ const (
backOffPeriod = 5 * time.Second
disk1 = "Hard disk 1"
disk2 = "Hard disk 2"
MemoryAvailable = "Memory_Available"
ethtoolDaemonSetName = "vsphere-disable-udp-offload"
)

Expand Down Expand Up @@ -122,6 +123,7 @@ type ProviderGovcClient interface {
CreateRole(ctx context.Context, name string, privileges []string) error
SetGroupRoleOnObject(ctx context.Context, principal string, role string, object string, domain string) error
GetHardDiskSize(ctx context.Context, vm, datacenter string) (map[string]float64, error)
GetResourcePoolInfo(ctx context.Context, datacenter, resourcepool string, args ...string) (map[string]int, error)
}

type ProviderKubectlClient interface {
Expand Down Expand Up @@ -338,7 +340,9 @@ func (p *vsphereProvider) SetupAndValidateCreateCluster(ctx context.Context, clu
if err := p.validateDatastoreUsageForCreate(ctx, vSphereClusterSpec); err != nil {
return fmt.Errorf("validating vsphere machine configs datastore usage: %v", err)
}

if err := p.validateMemoryUsageForCreate(ctx, vSphereClusterSpec); err != nil {
return fmt.Errorf("validating vsphere machine configs resource pool memory usage: %v", err)
}
if err := p.generateSSHKeysIfNotSet(clusterSpec.VSphereMachineConfigs); err != nil {
return fmt.Errorf("failed setup and validations: %v", err)
}
Expand Down Expand Up @@ -419,6 +423,10 @@ func (p *vsphereProvider) SetupAndValidateUpgradeCluster(ctx context.Context, cl
return fmt.Errorf("validating vsphere machine configs datastore usage: %v", err)
}

if err := p.validateMemoryUsageForUpgrade(ctx, vSphereClusterSpec, cluster); err != nil {
return fmt.Errorf("validating vsphere machine configs resource pool memory usage: %v", err)
}

if !p.skippedValidations[validations.VSphereUserPriv] {
if err := p.validator.validateVsphereUserPrivs(ctx, vSphereClusterSpec); err != nil {
return fmt.Errorf("validating vsphere user privileges: %v", err)
Expand Down Expand Up @@ -592,6 +600,152 @@ func (p *vsphereProvider) validateDatastoreUsageForCreate(ctx context.Context, v
return nil
}

type memoryUsage struct {
availableMemoryMiB int
needMemoryMiB int
}

func (p *vsphereProvider) getPrevMachineConfigMemoryUsage(ctx context.Context, mc *v1alpha1.VSphereMachineConfig, cluster *types.Cluster, count int) (memoryMiB int, err error) {
em, err := p.providerKubectlClient.GetEksaVSphereMachineConfig(ctx, mc.Name, cluster.KubeconfigFile, mc.GetNamespace())
if err != nil {
return 0, err
}
if em != nil && em.Spec.ResourcePool == mc.Spec.ResourcePool {
return em.Spec.MemoryMiB * em.Spec.NumCPUs * count, nil
}
return 0, nil
}

func (p *vsphereProvider) getMachineConfigMemoryRequirements(ctx context.Context, dc string, mc *v1alpha1.VSphereMachineConfig, count int) (available int, need int, err error) {
poolInfo, err := p.providerGovcClient.GetResourcePoolInfo(ctx, dc, mc.Spec.ResourcePool)
if err != nil {
return 0, 0, err
}
needMemoryMiB := mc.Spec.MemoryMiB * mc.Spec.NumCPUs * count
return poolInfo[MemoryAvailable], needMemoryMiB, nil
}

func (p *vsphereProvider) calculateResourcePoolMemoryUsage(ctx context.Context, dc string, mc *v1alpha1.VSphereMachineConfig, cluster *types.Cluster, mu map[string]*memoryUsage, prevCount, newCount int) error {
availableMemoryMiB, needMemoryMiB, err := p.getMachineConfigMemoryRequirements(ctx, dc, mc, newCount)
if err != nil {
return err
}

// the last old machine is deleted only after the desired number of new machines are rolled out, so reduce 1 count to accommodate for the last old machine which is not deleted until the end of rollout
prevUsage, err := p.getPrevMachineConfigMemoryUsage(ctx, mc, cluster, prevCount-1)
if err != nil {
return err
}
// update the available memory with previous usage only when memory limit is set
if availableMemoryMiB != -1 {
availableMemoryMiB += prevUsage
}
updateMemoryUsageMap(mc, needMemoryMiB, availableMemoryMiB, prevUsage, mu)
return nil
}

func updateMemoryUsageMap(mc *v1alpha1.VSphereMachineConfig, needMiB, availableMiB, prevUsage int, mu map[string]*memoryUsage) {
if _, ok := mu[mc.Spec.ResourcePool]; ok && mu[mc.Spec.ResourcePool].availableMemoryMiB != -1 {
mu[mc.Spec.ResourcePool].needMemoryMiB += needMiB
mu[mc.Spec.ResourcePool].availableMemoryMiB += prevUsage
} else {
mu[mc.Spec.ResourcePool] = &memoryUsage{
availableMemoryMiB: availableMiB,
needMemoryMiB: needMiB,
}
}
}

func (p *vsphereProvider) validateMemoryUsageForCreate(ctx context.Context, clusterSpec *Spec) error {
memoryUsage := make(map[string]*memoryUsage)
datacenter := clusterSpec.VSphereDatacenter.Spec.Datacenter
cpMachineConfig := clusterSpec.controlPlaneMachineConfig()
controlPlaneAvailableMiB, controlPlaneNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, cpMachineConfig, clusterSpec.Cluster.Spec.ControlPlaneConfiguration.Count)
if err != nil {
return fmt.Errorf("calculating memory usage for control plane: %v", err)
}
updateMemoryUsageMap(cpMachineConfig, controlPlaneNeedMiB, controlPlaneAvailableMiB, 0, memoryUsage)
for _, workerNodeGroupConfiguration := range clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations {
workerMachineConfig := clusterSpec.workerMachineConfig(workerNodeGroupConfiguration)
workerAvailableMiB, workerNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, workerMachineConfig, *workerNodeGroupConfiguration.Count)
if err != nil {
return fmt.Errorf("calculating memory usage for worker node groups: %v", err)
}
updateMemoryUsageMap(workerMachineConfig, workerNeedMiB, workerAvailableMiB, 0, memoryUsage)
}
etcdMachineConfig := clusterSpec.etcdMachineConfig()
if etcdMachineConfig != nil {
etcdAvailableMiB, etcdNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, etcdMachineConfig, clusterSpec.Cluster.Spec.ExternalEtcdConfiguration.Count)
if err != nil {
return err
}
updateMemoryUsageMap(etcdMachineConfig, etcdNeedMiB, etcdAvailableMiB, 0, memoryUsage)
}
for resourcePool, usage := range memoryUsage {
if usage.availableMemoryMiB != -1 {
if usage.needMemoryMiB > usage.availableMemoryMiB {
return fmt.Errorf("not enough memory avaialable in resource pool %v for given memoryMiB and count for respective machine groups", resourcePool)
}
}
}
logger.V(5).Info("Memory availability for machine configs in requested resource pool validated")
return nil
}

func (p *vsphereProvider) validateMemoryUsageForUpgrade(ctx context.Context, currentClusterSpec *Spec, cluster *types.Cluster) error {
memoryUsage := make(map[string]*memoryUsage)
prevEksaCluster, err := p.providerKubectlClient.GetEksaCluster(ctx, cluster, currentClusterSpec.Cluster.GetName())
if err != nil {
return err
}

datacenter := currentClusterSpec.VSphereDatacenter.Spec.Datacenter
cpMachineConfig := currentClusterSpec.controlPlaneMachineConfig()
if err := p.calculateResourcePoolMemoryUsage(ctx, datacenter, cpMachineConfig, cluster, memoryUsage, prevEksaCluster.Spec.ControlPlaneConfiguration.Count, currentClusterSpec.Cluster.Spec.ControlPlaneConfiguration.Count); err != nil {
return fmt.Errorf("calculating memory usage for control plane: %v", err)
}

prevMachineConfigRefs := machineRefSliceToMap(prevEksaCluster.MachineConfigRefs())
if err := p.getWorkerNodeGroupMemoryUsage(ctx, datacenter, currentClusterSpec, cluster, memoryUsage, prevMachineConfigRefs); err != nil {
return fmt.Errorf("calculating memory usage for worker node groups: %v", err)
}

etcdMachineConfig := currentClusterSpec.etcdMachineConfig()
if etcdMachineConfig != nil {
etcdAvailableMiB, etcdNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, etcdMachineConfig, currentClusterSpec.Cluster.Spec.ExternalEtcdConfiguration.Count)
if err != nil {
return err
}
// older etcd machines are not deleted until the end of rollout so do not account for the previous usage
updateMemoryUsageMap(etcdMachineConfig, etcdNeedMiB, etcdAvailableMiB, 0, memoryUsage)
}

for resourcePool, usage := range memoryUsage {
if usage.availableMemoryMiB != -1 && usage.needMemoryMiB > usage.availableMemoryMiB {
return fmt.Errorf("not enough memory avaialable in resource pool %v for given memoryMiB and count for respective machine groups", resourcePool)
}
}
logger.V(5).Info("Memory availability for machine configs in requested resource pool validated")
return nil
}

func (p *vsphereProvider) getWorkerNodeGroupMemoryUsage(ctx context.Context, datacenter string, currentClusterSpec *Spec, cluster *types.Cluster, memoryUsage map[string]*memoryUsage, prevMachineConfigRefs map[string]v1alpha1.Ref) error {
for _, workerNodeGroupConfiguration := range currentClusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations {
prevCount := 0
workerMachineConfig := currentClusterSpec.workerMachineConfig(workerNodeGroupConfiguration)
if _, ok := prevMachineConfigRefs[workerNodeGroupConfiguration.MachineGroupRef.Name]; ok {
prevCount = *workerNodeGroupConfiguration.Count
} else {
// set count to 1 when no previous machine config was found for the current worker node group in the spec to avoid a negative count in calculation
prevCount = 1
}
if err := p.calculateResourcePoolMemoryUsage(ctx, datacenter, workerMachineConfig, cluster, memoryUsage, prevCount, *workerNodeGroupConfiguration.Count); err != nil {
return fmt.Errorf("calculating memory usage: %v", err)
}
}
return nil
}

func (p *vsphereProvider) UpdateSecrets(ctx context.Context, cluster *types.Cluster, _ *cluster.Spec) error {
var contents bytes.Buffer
err := p.createSecret(ctx, cluster, &contents)
Expand Down
Loading

0 comments on commit ce65db3

Please sign in to comment.