Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validate Memory Usage for given resource pool for Vsphere provider #6680

Merged
merged 3 commits into from
Oct 5, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions pkg/executables/govc.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"net/http"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -39,6 +40,7 @@ const (
DeployOptsFile = "deploy-opts.json"
disk1 = "Hard disk 1"
disk2 = "Hard disk 2"
MemoryAvailable = "Memory_Available"
)

var requiredEnvs = []string{govcUsernameKey, govcPasswordKey, govcURLKey, govcInsecure, govcDatacenterKey}
Expand Down Expand Up @@ -1143,3 +1145,76 @@ func (g *Govc) SetGroupRoleOnObject(ctx context.Context, principal string, role

return nil
}

type resourcePoolInfo struct {
ResourcePoolIdentifier *resourcePool
}

type resourcePool struct {
memoryUsage string
memoryLimit string
}

// GetResourcePoolInfo returns the pool info for the provided resource pool.
func (g *Govc) GetResourcePoolInfo(ctx context.Context, datacenter, resourcepool string, args ...string) (map[string]int, error) {
params := []string{"pool.info", "-dc", datacenter, resourcepool}
params = append(params, args...)
response, err := g.exec(ctx, params...)
if err != nil {
return nil, fmt.Errorf("getting resource pool information: %v", err)
}

scanner := bufio.NewScanner(strings.NewReader(response.String()))
var resourcePoolInfoResponse resourcePoolInfo
resourcePoolInfoResponse.ResourcePoolIdentifier = new(resourcePool)
for scanner.Scan() {
metaData := scanner.Text()
if strings.Contains(metaData, "Mem Usage") {
resourcePoolInfoResponse.ResourcePoolIdentifier.memoryUsage = strings.Split(metaData, ":")[1]
}
if strings.Contains(metaData, "Mem Limit") {
resourcePoolInfoResponse.ResourcePoolIdentifier.memoryLimit = strings.Split(metaData, ":")[1]
}
}

if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("failure reading memory allocation for resource pool")
}

poolInfo, err := getPoolInfo(resourcePoolInfoResponse.ResourcePoolIdentifier)
if err != nil {
return nil, err
}
return poolInfo, nil
}

// getPoolInfo parses resource pool response and returns memory requirements.
func getPoolInfo(rp *resourcePool) (map[string]int, error) {
memoryUsed, err := getValueFromString(rp.memoryUsage)
if err != nil {
return nil, fmt.Errorf("unable to obtain memory usage for resource pool %s: %v", rp.memoryUsage, err)
}
memoryLimit, err := getValueFromString(rp.memoryLimit)
if err != nil {
return nil, fmt.Errorf("unable to obtain memory limit for resource pool %s: %v", rp.memoryLimit, err)
}
poolInfo := make(map[string]int)
if memoryLimit != -1 {
poolInfo[MemoryAvailable] = memoryLimit - memoryUsed
} else {
poolInfo[MemoryAvailable] = memoryLimit
}
return poolInfo, nil
}

// getValueFromString cleans the input string and returns the extracted numerical value.
func getValueFromString(str string) (int, error) {
splitResponse := strings.Split(strings.TrimSpace(str), " ")
nonNumericRegex := regexp.MustCompile(`[^0-9- ]+`)
cleanedString := nonNumericRegex.ReplaceAllString(splitResponse[0], "")
numValue, err := strconv.Atoi(cleanedString)
if err != nil {
return 0, err
}
return numValue, nil
}
79 changes: 79 additions & 0 deletions pkg/executables/govc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1643,3 +1643,82 @@ func TestGovcGetHardDiskSizeError(t *testing.T) {
})
}
}

func TestGovcGetResourcePoolInfo(t *testing.T) {
datacenter := "SDDC-Datacenter"
resourcePool := "*/Resources/Test-ResourcePool"
govcErr := errors.New("error PoolInfo()")
ctx := context.Background()
_, g, executable, env := setup(t)

tests := []struct {
testName string
response string
govcErr error
wantErr error
wantMemInfo map[string]int
}{
{
testName: "pool_info_memory_limit_set",
response: `Name: Test-ResourcePool
Path: /SDDC-Datacenter/host/Cluster-1/Resources/Test-ResourcePool
Mem Usage: 100MB (11.3%)
Mem Shares: normal
Mem Reservation: 0MB (expandable=true)
Mem Limit: 1000MB`,
govcErr: nil,
wantErr: nil,
wantMemInfo: map[string]int{executables.MemoryAvailable: 900},
},
{
testName: "pool_info_memory_limit_unset",
response: `Name: Test-ResourcePool
Path: /SDDC-Datacenter/host/Cluster-1/Resources/Test-ResourcePool
Mem Usage: 100MB (11.3%)
Mem Shares: normal
Mem Reservation: 0MB (expandable=true)
Mem Limit: -1MB`,
govcErr: nil,
wantErr: nil,
wantMemInfo: map[string]int{executables.MemoryAvailable: -1},
},
{
testName: "pool_info_memory_usage_corrupt",
response: `Name: Test-ResourcePool
Mem Usage:corrupt-val
Mem Limit:-1MB`,
govcErr: nil,
wantErr: fmt.Errorf("unable to obtain memory usage for resource pool corrupt-val: strconv.Atoi: parsing \"-\": invalid syntax"),
wantMemInfo: nil,
},
{
testName: "pool_info_memory_limit_corrupt",
response: `Name: Test-ResourcePool
Mem Usage:100
Mem Limit:corrupt-val`,
govcErr: nil,
wantErr: fmt.Errorf("unable to obtain memory limit for resource pool corrupt-val: strconv.Atoi: parsing \"-\": invalid syntax"),
wantMemInfo: nil,
},
{
testName: "pool_info_error",
response: "",
govcErr: govcErr,
wantErr: fmt.Errorf("getting resource pool information: %v", govcErr),
wantMemInfo: nil,
},
}

for _, tt := range tests {
t.Run(tt.testName, func(t *testing.T) {
gt := NewWithT(t)
responseBytes := bytes.NewBuffer([]byte(tt.response))
executable.EXPECT().ExecuteWithEnv(ctx, env, "pool.info", "-dc", datacenter, resourcePool).Return(*responseBytes, tt.govcErr)
poolMemInfo, err := g.GetResourcePoolInfo(ctx, datacenter, resourcePool)
if tt.wantErr != nil {
gt.Expect(err.Error()).To(Equal(tt.wantErr.Error()))
}
gt.Expect(poolMemInfo).To(Equal(tt.wantMemInfo))
})
}
}
20 changes: 20 additions & 0 deletions pkg/providers/vsphere/mocks/client.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

156 changes: 155 additions & 1 deletion pkg/providers/vsphere/vsphere.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ const (
backOffPeriod = 5 * time.Second
disk1 = "Hard disk 1"
disk2 = "Hard disk 2"
MemoryAvailable = "Memory_Available"
ethtoolDaemonSetName = "vsphere-disable-udp-offload"
)

Expand Down Expand Up @@ -122,6 +123,7 @@ type ProviderGovcClient interface {
CreateRole(ctx context.Context, name string, privileges []string) error
SetGroupRoleOnObject(ctx context.Context, principal string, role string, object string, domain string) error
GetHardDiskSize(ctx context.Context, vm, datacenter string) (map[string]float64, error)
GetResourcePoolInfo(ctx context.Context, datacenter, resourcepool string, args ...string) (map[string]int, error)
}

type ProviderKubectlClient interface {
Expand Down Expand Up @@ -338,7 +340,9 @@ func (p *vsphereProvider) SetupAndValidateCreateCluster(ctx context.Context, clu
if err := p.validateDatastoreUsageForCreate(ctx, vSphereClusterSpec); err != nil {
return fmt.Errorf("validating vsphere machine configs datastore usage: %v", err)
}

if err := p.validateMemoryUsageForCreate(ctx, vSphereClusterSpec); err != nil {
return fmt.Errorf("validating vsphere machine configs resource pool memory usage: %v", err)
}
if err := p.generateSSHKeysIfNotSet(clusterSpec.VSphereMachineConfigs); err != nil {
return fmt.Errorf("failed setup and validations: %v", err)
}
Expand Down Expand Up @@ -419,6 +423,10 @@ func (p *vsphereProvider) SetupAndValidateUpgradeCluster(ctx context.Context, cl
return fmt.Errorf("validating vsphere machine configs datastore usage: %v", err)
}

if err := p.validateMemoryUsageForUpgrade(ctx, vSphereClusterSpec, cluster); err != nil {
return fmt.Errorf("validating vsphere machine configs resource pool memory usage: %v", err)
}

if !p.skippedValidations[validations.VSphereUserPriv] {
if err := p.validator.validateVsphereUserPrivs(ctx, vSphereClusterSpec); err != nil {
return fmt.Errorf("validating vsphere user privileges: %v", err)
Expand Down Expand Up @@ -590,6 +598,152 @@ func (p *vsphereProvider) validateDatastoreUsageForCreate(ctx context.Context, v
return nil
}

type memoryUsage struct {
availableMemoryMiB int
needMemoryMiB int
}

func (p *vsphereProvider) getPrevMachineConfigMemoryUsage(ctx context.Context, mc *v1alpha1.VSphereMachineConfig, cluster *types.Cluster, count int) (memoryMiB int, err error) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

count is quite an ambiguous parameter here, what is this a count of?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

renaming it to machineConifgCount

em, err := p.providerKubectlClient.GetEksaVSphereMachineConfig(ctx, mc.Name, cluster.KubeconfigFile, mc.GetNamespace())
if err != nil {
return 0, err
}
if em != nil && em.Spec.ResourcePool == mc.Spec.ResourcePool {
return em.Spec.MemoryMiB * em.Spec.NumCPUs * count, nil
}
return 0, nil
}

func (p *vsphereProvider) getMachineConfigMemoryRequirements(ctx context.Context, dc string, mc *v1alpha1.VSphereMachineConfig, count int) (available int, need int, err error) {
chrisdoherty4 marked this conversation as resolved.
Show resolved Hide resolved
poolInfo, err := p.providerGovcClient.GetResourcePoolInfo(ctx, dc, mc.Spec.ResourcePool)
if err != nil {
return 0, 0, err
}
needMemoryMiB := mc.Spec.MemoryMiB * mc.Spec.NumCPUs * count
chrisdoherty4 marked this conversation as resolved.
Show resolved Hide resolved
return poolInfo[MemoryAvailable], needMemoryMiB, nil
}

func (p *vsphereProvider) calculateResourcePoolMemoryUsage(ctx context.Context, dc string, mc *v1alpha1.VSphereMachineConfig, cluster *types.Cluster, mu map[string]*memoryUsage, prevCount, newCount int) error {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The keys for mu are really unclear. You can either create a dedicated data structure and give it an appropriate description or clearly document the param on this method. Same for counts.

Mind expanding dc as well given its of type string, its not clear what that means.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will expand the parameter names

availableMemoryMiB, needMemoryMiB, err := p.getMachineConfigMemoryRequirements(ctx, dc, mc, newCount)
if err != nil {
return err
}

// the last old machine is deleted only after the desired number of new machines are rolled out, so reduce 1 count to accommodate for the last old machine which is not deleted until the end of rollout
prevUsage, err := p.getPrevMachineConfigMemoryUsage(ctx, mc, cluster, prevCount-1)
if err != nil {
return err
}
// update the available memory with previous usage only when memory limit is set
if availableMemoryMiB != -1 {
availableMemoryMiB += prevUsage
}
updateMemoryUsageMap(mc, needMemoryMiB, availableMemoryMiB, prevUsage, mu)
return nil
}

func updateMemoryUsageMap(mc *v1alpha1.VSphereMachineConfig, needMiB, availableMiB, prevUsage int, mu map[string]*memoryUsage) {
if _, ok := mu[mc.Spec.ResourcePool]; ok && mu[mc.Spec.ResourcePool].availableMemoryMiB != -1 {
mu[mc.Spec.ResourcePool].needMemoryMiB += needMiB
mu[mc.Spec.ResourcePool].availableMemoryMiB += prevUsage
} else {
mu[mc.Spec.ResourcePool] = &memoryUsage{
availableMemoryMiB: availableMiB,
needMemoryMiB: needMiB,
}
}
}

func (p *vsphereProvider) validateMemoryUsageForCreate(ctx context.Context, clusterSpec *Spec) error {
memoryUsage := make(map[string]*memoryUsage)
datacenter := clusterSpec.VSphereDatacenter.Spec.Datacenter
cpMachineConfig := clusterSpec.controlPlaneMachineConfig()
controlPlaneAvailableMiB, controlPlaneNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, cpMachineConfig, clusterSpec.Cluster.Spec.ControlPlaneConfiguration.Count)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A method called getMachineConfigMemoryRequirements that returns 'available memory' is a little confusing. This may just be a naming thing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i have added an comment and changed the naming a bit

if err != nil {
return fmt.Errorf("calculating memory usage for control plane: %v", err)
}
updateMemoryUsageMap(cpMachineConfig, controlPlaneNeedMiB, controlPlaneAvailableMiB, 0, memoryUsage)
for _, workerNodeGroupConfiguration := range clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations {
workerMachineConfig := clusterSpec.workerMachineConfig(workerNodeGroupConfiguration)
workerAvailableMiB, workerNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, workerMachineConfig, *workerNodeGroupConfiguration.Count)
if err != nil {
return fmt.Errorf("calculating memory usage for worker node groups: %v", err)
}
updateMemoryUsageMap(workerMachineConfig, workerNeedMiB, workerAvailableMiB, 0, memoryUsage)
}
etcdMachineConfig := clusterSpec.etcdMachineConfig()
if etcdMachineConfig != nil {
etcdAvailableMiB, etcdNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, etcdMachineConfig, clusterSpec.Cluster.Spec.ExternalEtcdConfiguration.Count)
if err != nil {
return err
}
updateMemoryUsageMap(etcdMachineConfig, etcdNeedMiB, etcdAvailableMiB, 0, memoryUsage)
}
for resourcePool, usage := range memoryUsage {
if usage.availableMemoryMiB != -1 {
if usage.needMemoryMiB > usage.availableMemoryMiB {
return fmt.Errorf("not enough memory avaialable in resource pool %v for given memoryMiB and count for respective machine groups", resourcePool)
}
}
}
logger.V(5).Info("Memory availability for machine configs in requested resource pool validated")
return nil
}

func (p *vsphereProvider) validateMemoryUsageForUpgrade(ctx context.Context, currentClusterSpec *Spec, cluster *types.Cluster) error {
memoryUsage := make(map[string]*memoryUsage)
prevEksaCluster, err := p.providerKubectlClient.GetEksaCluster(ctx, cluster, currentClusterSpec.Cluster.GetName())
if err != nil {
return err
}

datacenter := currentClusterSpec.VSphereDatacenter.Spec.Datacenter
cpMachineConfig := currentClusterSpec.controlPlaneMachineConfig()
if err := p.calculateResourcePoolMemoryUsage(ctx, datacenter, cpMachineConfig, cluster, memoryUsage, prevEksaCluster.Spec.ControlPlaneConfiguration.Count, currentClusterSpec.Cluster.Spec.ControlPlaneConfiguration.Count); err != nil {
return fmt.Errorf("calculating memory usage for control plane: %v", err)
}

prevMachineConfigRefs := machineRefSliceToMap(prevEksaCluster.MachineConfigRefs())
if err := p.getWorkerNodeGroupMemoryUsage(ctx, datacenter, currentClusterSpec, cluster, memoryUsage, prevMachineConfigRefs); err != nil {
return fmt.Errorf("calculating memory usage for worker node groups: %v", err)
}

etcdMachineConfig := currentClusterSpec.etcdMachineConfig()
if etcdMachineConfig != nil {
etcdAvailableMiB, etcdNeedMiB, err := p.getMachineConfigMemoryRequirements(ctx, datacenter, etcdMachineConfig, currentClusterSpec.Cluster.Spec.ExternalEtcdConfiguration.Count)
if err != nil {
return err
}
// older etcd machines are not deleted until the end of rollout so do not account for the previous usage
updateMemoryUsageMap(etcdMachineConfig, etcdNeedMiB, etcdAvailableMiB, 0, memoryUsage)
}

for resourcePool, usage := range memoryUsage {
if usage.availableMemoryMiB != -1 && usage.needMemoryMiB > usage.availableMemoryMiB {
return fmt.Errorf("not enough memory avaialable in resource pool %v for given memoryMiB and count for respective machine groups", resourcePool)
}
}
logger.V(5).Info("Memory availability for machine configs in requested resource pool validated")
return nil
}

func (p *vsphereProvider) getWorkerNodeGroupMemoryUsage(ctx context.Context, datacenter string, currentClusterSpec *Spec, cluster *types.Cluster, memoryUsage map[string]*memoryUsage, prevMachineConfigRefs map[string]v1alpha1.Ref) error {
for _, workerNodeGroupConfiguration := range currentClusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations {
prevCount := 0
workerMachineConfig := currentClusterSpec.workerMachineConfig(workerNodeGroupConfiguration)
if _, ok := prevMachineConfigRefs[workerNodeGroupConfiguration.MachineGroupRef.Name]; ok {
prevCount = *workerNodeGroupConfiguration.Count
} else {
// set count to 1 when no previous machine config was found for the current worker node group in the spec to avoid a negative count in calculation
prevCount = 1
}
if err := p.calculateResourcePoolMemoryUsage(ctx, datacenter, workerMachineConfig, cluster, memoryUsage, prevCount, *workerNodeGroupConfiguration.Count); err != nil {
return fmt.Errorf("calculating memory usage: %v", err)
}
}
return nil
}

func (p *vsphereProvider) UpdateSecrets(ctx context.Context, cluster *types.Cluster, _ *cluster.Spec) error {
var contents bytes.Buffer
err := p.createSecret(ctx, cluster, &contents)
Expand Down
Loading