Skip to content

Commit

Permalink
Add poweroff hardware cleanup step after Tinkerbell E2E tests (#8140)
Browse files Browse the repository at this point in the history
* power down hardware in clean up step after tests

fix linting errors

* address PR comments

* change cleanup machines to cleanup resources

* moved context timeout inside poweroff hardware; leftover machines -> resources
  • Loading branch information
cxbrowne1207 authored May 29, 2024
1 parent 7abe87c commit 249f20d
Show file tree
Hide file tree
Showing 22 changed files with 149 additions and 101 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ phases:
-v 4
--skip ${SKIPPED_TESTS}
--bundles-override=${BUNDLES_OVERRIDE}
--cleanup-vms=true
--cleanup-resources=true
--test-report-folder=reports
--branch-name=${BRANCH_NAME}
--baremetal-branch=${BAREMETAL_BRANCH}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ phases:
-v 4
--skip ${SKIPPED_TESTS}
--bundles-override=${BUNDLES_OVERRIDE}
--cleanup-vms=true
--cleanup-resources=true
--test-report-folder=reports
reports:
e2e-reports:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ phases:
-v 4
--skip ${SKIPPED_TESTS}
--bundles-override=${BUNDLES_OVERRIDE}
--cleanup-vms=true
--cleanup-resources=true
--test-report-folder=reports
--branch-name=${BRANCH_NAME}
--baremetal-branch=${BAREMETAL_BRANCH}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ phases:
-v 4
--skip ${SKIPPED_TESTS}
--bundles-override=${BUNDLES_OVERRIDE}
--cleanup-vms=true
--cleanup-resources=true
--test-report-folder=reports
--branch-name=${BRANCH_NAME}
--baremetal-branch=${BAREMETAL_BRANCH}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ phases:
-v 4
--skip ${SKIPPED_TESTS}
--bundles-override=${BUNDLES_OVERRIDE}
--cleanup-vms=true
--cleanup-resources=true
--test-report-folder=reports
--branch-name=${BRANCH_NAME}
--baremetal-branch=${BAREMETAL_BRANCH}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ phases:
-v 4
--skip ${SKIPPED_TESTS}
--bundles-override=${BUNDLES_OVERRIDE}
--cleanup-vms=true
--cleanup-resources=true
--test-report-folder=reports
--branch-name=${BRANCH_NAME}
--baremetal-branch=${BAREMETAL_BRANCH}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ phases:
-v 4
--skip ${SKIPPED_TESTS}
--bundles-override=${BUNDLES_OVERRIDE}
--cleanup-vms=true
--cleanup-resources=true
--test-report-folder=reports
--branch-name=${BRANCH_NAME}
--baremetal-branch=${BAREMETAL_BRANCH}
Expand Down
8 changes: 4 additions & 4 deletions cmd/integration_test/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ const (
maxConcurrentTestsFlagName = "max-concurrent-tests"
skipFlagName = "skip"
bundlesOverrideFlagName = "bundles-override"
cleanupVmsFlagName = "cleanup-vms"
cleanupResourcesFlagName = "cleanup-resources"
testReportFolderFlagName = "test-report-folder"
branchNameFlagName = "branch-name"
instanceConfigFlagName = "instance-config"
Expand Down Expand Up @@ -66,7 +66,7 @@ func init() {
runE2ECmd.Flags().IntP(maxConcurrentTestsFlagName, "p", 1, "Maximum number of parallel tests that can be run at a time")
runE2ECmd.Flags().StringSlice(skipFlagName, nil, "List of tests to skip")
runE2ECmd.Flags().Bool(bundlesOverrideFlagName, false, "Flag to indicate if the tests should run with a bundles override")
runE2ECmd.Flags().Bool(cleanupVmsFlagName, false, "Flag to indicate if VSphere VMs should be cleaned up automatically as tests complete")
runE2ECmd.Flags().Bool(cleanupResourcesFlagName, false, "Flag to indicate if test resources should be cleaned up automatically as tests complete")
runE2ECmd.Flags().String(testReportFolderFlagName, "", "Folder destination for JUnit tests reports")
runE2ECmd.Flags().String(branchNameFlagName, "main", "EKS-A origin branch from where the tests are being run")
runE2ECmd.Flags().String(baremetalBranchFlagName, "main", "Branch for baremetal tests to run on")
Expand All @@ -88,7 +88,7 @@ func runE2E(ctx context.Context) error {
maxConcurrentTests := viper.GetInt(maxConcurrentTestsFlagName)
testsToSkip := viper.GetStringSlice(skipFlagName)
bundlesOverride := viper.GetBool(bundlesOverrideFlagName)
cleanupVms := viper.GetBool(cleanupVmsFlagName)
cleanupResources := viper.GetBool(cleanupResourcesFlagName)
testReportFolder := viper.GetString(testReportFolderFlagName)
branchName := viper.GetString(branchNameFlagName)
baremetalBranchName := viper.GetString(baremetalBranchFlagName)
Expand All @@ -102,7 +102,7 @@ func runE2E(ctx context.Context) error {
Regex: testRegex,
TestsToSkip: testsToSkip,
BundlesOverride: bundlesOverride,
CleanupVms: cleanupVms,
CleanupResources: cleanupResources,
TestReportFolder: testReportFolder,
BranchName: branchName,
TestInstanceConfigFile: instanceConfigFile,
Expand Down
89 changes: 89 additions & 0 deletions internal/test/cleanup/cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,25 @@ import (
"fmt"
"os"
"strconv"
"strings"
"time"

"github.com/aws/aws-sdk-go/aws/session"
"github.com/bmc-toolbox/bmclib/v2"
"github.com/go-logr/logr"
prismgoclient "github.com/nutanix-cloud-native/prism-go-client"
v3 "github.com/nutanix-cloud-native/prism-go-client/v3"

"github.com/aws/eks-anywhere/internal/pkg/api"
"github.com/aws/eks-anywhere/internal/pkg/ec2"
"github.com/aws/eks-anywhere/internal/pkg/s3"
"github.com/aws/eks-anywhere/pkg/errors"
"github.com/aws/eks-anywhere/pkg/executables"
"github.com/aws/eks-anywhere/pkg/filewriter"
"github.com/aws/eks-anywhere/pkg/logger"
"github.com/aws/eks-anywhere/pkg/providers/cloudstack/decoder"
"github.com/aws/eks-anywhere/pkg/providers/nutanix"
"github.com/aws/eks-anywhere/pkg/providers/tinkerbell/hardware"
"github.com/aws/eks-anywhere/pkg/retrier"
"github.com/aws/eks-anywhere/pkg/validations"
)
Expand Down Expand Up @@ -189,3 +195,86 @@ func NutanixTestResources(clusterName, endpoint, port string, insecure, ignoreEr
}
return nil
}

// TinkerbellTestResources cleans up Tinkerbell machines by powering them
// down via their BMCs. The inventory CSV at inventoryCSVFilePath lists the
// hardware to act on. When ignoreErrors is true, individual BMC failures
// are logged but do not fail the cleanup.
func TinkerbellTestResources(inventoryCSVFilePath string, ignoreErrors bool) error {
	hardwarePool, err := api.NewHardwareMapFromFile(inventoryCSVFilePath)
	if err != nil {
		// Wrap with %w so callers can unwrap the underlying parse error.
		return fmt.Errorf("failed to create hardware map from inventory csv: %w", err)
	}

	// logger is logr-based and takes key/value pairs, not printf verbs, so a
	// "%+v" format string would be emitted literally. Log a structured count
	// instead of dumping the whole pool (which includes BMC credentials).
	logger.Info("Powering off hardware", "count", len(hardwarePool))
	return powerOffHardwarePool(hardwarePool, ignoreErrors)
}

// powerOffHardwarePool attempts to power off every machine in the pool,
// collecting per-machine failures and reporting them as one aggregated
// error. ignoreErrors is forwarded to each per-machine power-off call.
func powerOffHardwarePool(hardware map[string]*hardware.Machine, ignoreErrors bool) error {
	var failures []error
	for _, machine := range hardware {
		if err := powerOffHardware(machine, ignoreErrors); err != nil {
			failures = append(failures, err)
		}
	}

	if len(failures) == 0 {
		return nil
	}
	return fmt.Errorf("failed to power off %d hardware: %+v", len(failures), errors.NewAggregate(failures))
}

// powerOffHardware powers down a single machine through its BMC. The whole
// interaction (open, state query, power-off, close) is bounded by a
// 2-minute timeout. When ignoreErrors is true, every BMC failure is
// swallowed after being logged. reterror is named so the deferred Close
// handler can surface a close failure.
func powerOffHardware(h *hardware.Machine, ignoreErrors bool) (reterror error) {
	ctx, done := context.WithTimeout(context.Background(), 2*time.Minute)
	defer done()
	bmcClient := newBmclibClient(logr.Discard(), h.BMCIPAddress, h.BMCUsername, h.BMCPassword)

	if err := bmcClient.Open(ctx); err != nil {
		md := bmcClient.GetMetadata()
		// Bug fix: log FailedProviderDetail here — the previous code passed
		// md.SuccessfulOpenConns under the "failedProviderDetail" label.
		logger.Info("Warning: Failed to open connection to BMC: %v, hardware: %v, providersAttempted: %v, failedProviderDetail: %v", err, h.BMCIPAddress, md.ProvidersAttempted, md.FailedProviderDetail)
		return handlePowerOffHardwareError(err, ignoreErrors)
	}

	md := bmcClient.GetMetadata()
	logger.Info("Connected to BMC: hardware: %v, providersAttempted: %v, successfulProvider: %v", h.BMCIPAddress, md.ProvidersAttempted, md.SuccessfulOpenConns)

	// Close the BMC session on exit. Only report the close failure when the
	// power-off itself succeeded, so a real power-off error is never masked.
	defer func() {
		if err := bmcClient.Close(ctx); err != nil {
			md := bmcClient.GetMetadata()
			logger.Info("Warning: BMC close connection failed: %v, hardware: %v, providersAttempted: %v, failedProviderDetail: %v", err, h.BMCIPAddress, md.ProvidersAttempted, md.FailedProviderDetail)
			if reterror == nil {
				reterror = handlePowerOffHardwareError(err, ignoreErrors)
			}
		}
	}()

	// If the power state cannot be read, treat it as unknown and fall
	// through to an explicit power-off attempt.
	state, err := bmcClient.GetPowerState(ctx)
	if err != nil {
		state = "unknown"
	}
	if strings.Contains(strings.ToLower(state), "off") {
		// Already powered off; nothing to do.
		return nil
	}

	if _, err := bmcClient.SetPowerState(ctx, "off"); err != nil {
		md := bmcClient.GetMetadata()
		// Bug fix: pass md.FailedProviderDetail for the "failedProviderDetail"
		// label (was md.SuccessfulOpenConns).
		logger.Info("Warning: failed to power off hardware: %v, hardware: %v, providersAttempted: %v, failedProviderDetail: %v", err, h.BMCIPAddress, md.ProvidersAttempted, md.FailedProviderDetail)
		return handlePowerOffHardwareError(err, ignoreErrors)
	}

	return nil
}

// handlePowerOffHardwareError converts a power-off failure into the value
// the caller should return: nil when errors are being ignored (or there was
// no error), otherwise the error itself.
func handlePowerOffHardwareError(err error, ignoreErrors bool) error {
	if ignoreErrors {
		return nil
	}
	return err
}

// newBmclibClient builds a bmclib client for the BMC at hostIP using the
// given credentials, attaching host/username context to the logger and
// preferring the redfish driver over the other registered protocols.
func newBmclibClient(log logr.Logger, hostIP, username, password string) *bmclib.Client {
	opts := []bmclib.Option{
		bmclib.WithLogger(log.WithValues("host", hostIP, "username", username)),
	}
	c := bmclib.NewClient(hostIP, username, password, opts...)
	c.Registry.Drivers = c.Registry.PreferProtocol("redfish")

	return c
}
8 changes: 4 additions & 4 deletions internal/test/e2e/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ type ParallelRunConf struct {
Regex string
TestsToSkip []string
BundlesOverride bool
CleanupVms bool
CleanupResources bool
TestReportFolder string
BranchName string
BaremetalBranchName string
Expand Down Expand Up @@ -199,7 +199,7 @@ type instanceRunConf struct {
BundlesOverride bool
TestRunnerType TestRunnerType
TestRunnerConfig TestInfraConfig
CleanupVMs bool
CleanupResources bool
Logger logr.Logger
Session *session.Session
}
Expand Down Expand Up @@ -231,7 +231,7 @@ func RunTests(conf instanceRunConf, inventoryCatalogue map[string]*hardwareCatal
"branch_name", conf.BranchName, "ip_pool", conf.IPPool.ToString(),
"hardware_count", conf.HardwareCount, "tinkerbell_airgapped_test", conf.TinkerbellAirgappedTest,
"bundles_override", conf.BundlesOverride, "test_runner_type", conf.TestRunnerType,
"cleanup_vms", conf.CleanupVMs)
"cleanup_resources", conf.CleanupResources)

instanceId, err := testRunner.createInstance(conf)
if err != nil {
Expand Down Expand Up @@ -519,7 +519,7 @@ func newInstanceRunConf(awsSession *session.Session, conf ParallelRunConf, jobNu
BundlesOverride: conf.BundlesOverride,
TestReportFolder: conf.TestReportFolder,
BranchName: conf.BranchName,
CleanupVMs: conf.CleanupVms,
CleanupResources: conf.CleanupResources,
TestRunnerType: testRunnerType,
TestRunnerConfig: *testRunnerConfig,
Logger: conf.Logger.WithValues("jobID", jobID, "test", testRegex),
Expand Down
6 changes: 3 additions & 3 deletions internal/test/e2e/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ type E2ESession struct {
ipPool networkutils.IPPool
testEnvVars map[string]string
bundlesOverride bool
cleanupVms bool
cleanup bool
requiredFiles []string
branchName string
hardware []*api.Hardware
Expand All @@ -57,7 +57,7 @@ func newE2ESession(instanceId string, conf instanceRunConf) (*E2ESession, error)
ipPool: conf.IPPool,
testEnvVars: make(map[string]string),
bundlesOverride: conf.BundlesOverride,
cleanupVms: conf.CleanupVMs,
cleanup: conf.CleanupResources,
requiredFiles: requiredFiles,
branchName: conf.BranchName,
hardware: conf.Hardware,
Expand Down Expand Up @@ -187,7 +187,7 @@ func (e *E2ESession) setup(regex string) error {
// Adding JobId to Test Env variables
e.testEnvVars[e2etests.JobIdVar] = e.jobId
e.testEnvVars[e2etests.BundlesOverrideVar] = strconv.FormatBool(e.bundlesOverride)
e.testEnvVars[e2etests.CleanupVmsVar] = strconv.FormatBool(e.cleanupVms)
e.testEnvVars[e2etests.CleanupResourcesVar] = strconv.FormatBool(e.cleanup)

if e.branchName != "" {
e.testEnvVars[e2etests.BranchNameEnvVar] = e.branchName
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ In order to use bundle overrides, take your bundle overrides yaml file and move
You will also need to set the environment variable `T_BUNDLES_OVERRIDE=true`

### Cleaning up VMs after a test run
In order to clean up VM's after a test runs automatically, set `T_CLEANUP_VMS=true`
To automatically clean up VMs after a test run, set `T_CLEANUP_RESOURCES=true`

## VSphere tests requisites
The following env variables need to be set:
Expand Down
12 changes: 6 additions & 6 deletions test/e2e/cloudstack_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3292,12 +3292,12 @@ func TestCloudStackKubernetes126RedhatTo127UpgradeWithCheckpoint(t *testing.T) {
)

clusterOpts = append(clusterOpts, framework.WithClusterUpgrade(api.WithKubernetesVersion(v1alpha1.Kube127)), framework.ExpectFailure(true),
provider.WithProviderUpgrade(provider.Redhat9Kubernetes126Template()), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupVmsVar, "false"))
provider.WithProviderUpgrade(provider.Redhat9Kubernetes126Template()), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupResourcesVar, "false"))

commandOpts := []framework.CommandOpt{framework.WithExternalEtcdWaitTimeout("10m")}

clusterOpts2 = append(clusterOpts, framework.WithClusterUpgrade(api.WithKubernetesVersion(v1alpha1.Kube127)), framework.ExpectFailure(false),
provider.WithProviderUpgrade(provider.Redhat9Kubernetes127Template()), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupVmsVar, "true"))
provider.WithProviderUpgrade(provider.Redhat9Kubernetes127Template()), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupResourcesVar, "true"))

runUpgradeFlowWithCheckpoint(
test,
Expand All @@ -3322,12 +3322,12 @@ func TestCloudStackKubernetes127RedhatTo128UpgradeWithCheckpoint(t *testing.T) {
)

clusterOpts = append(clusterOpts, framework.WithClusterUpgrade(api.WithKubernetesVersion(v1alpha1.Kube128)), framework.ExpectFailure(true),
provider.WithProviderUpgrade(provider.Redhat9Kubernetes127Template()), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupVmsVar, "false"))
provider.WithProviderUpgrade(provider.Redhat9Kubernetes127Template()), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupResourcesVar, "false"))

commandOpts := []framework.CommandOpt{framework.WithExternalEtcdWaitTimeout("10m")}

clusterOpts2 = append(clusterOpts, framework.WithClusterUpgrade(api.WithKubernetesVersion(v1alpha1.Kube128)), framework.ExpectFailure(false),
provider.WithProviderUpgrade(provider.Redhat9Kubernetes128Template()), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupVmsVar, "true"))
provider.WithProviderUpgrade(provider.Redhat9Kubernetes128Template()), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupResourcesVar, "true"))

runUpgradeFlowWithCheckpoint(
test,
Expand All @@ -3352,12 +3352,12 @@ func TestCloudStackKubernetes129RedhatTo130UpgradeWithCheckpoint(t *testing.T) {
)

clusterOpts = append(clusterOpts, framework.WithClusterUpgrade(api.WithKubernetesVersion(v1alpha1.Kube130)), framework.ExpectFailure(true),
provider.WithProviderUpgrade(provider.Redhat9Kubernetes129Template()), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupVmsVar, "false"))
provider.WithProviderUpgrade(provider.Redhat9Kubernetes129Template()), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupResourcesVar, "false"))

commandOpts := []framework.CommandOpt{framework.WithExternalEtcdWaitTimeout("10m")}

clusterOpts2 = append(clusterOpts, framework.WithClusterUpgrade(api.WithKubernetesVersion(v1alpha1.Kube130)), framework.ExpectFailure(false),
provider.WithProviderUpgrade(provider.Redhat9Kubernetes130Template()), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupVmsVar, "true"))
provider.WithProviderUpgrade(provider.Redhat9Kubernetes130Template()), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupResourcesVar, "true"))

runUpgradeFlowWithCheckpoint(
test,
Expand Down
4 changes: 2 additions & 2 deletions test/e2e/vsphere_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3924,12 +3924,12 @@ func TestVSphereKubernetes127UbuntuTo128UpgradeWithCheckpoint(t *testing.T) {
)

clusterOpts = append(clusterOpts, framework.WithClusterUpgrade(api.WithKubernetesVersion(v1alpha1.Kube128)), framework.ExpectFailure(true),
provider.WithProviderUpgrade(provider.Ubuntu128Template(), api.WithResourcePoolforCPMachines(vsphereInvalidResourcePoolUpdateVar)), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupVmsVar, "false"))
provider.WithProviderUpgrade(provider.Ubuntu128Template(), api.WithResourcePoolforCPMachines(vsphereInvalidResourcePoolUpdateVar)), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupResourcesVar, "false"))

commandOpts := []framework.CommandOpt{framework.WithControlPlaneWaitTimeout("10m")}

clusterOpts2 = append(clusterOpts, framework.WithClusterUpgrade(api.WithKubernetesVersion(v1alpha1.Kube128)), framework.ExpectFailure(false),
provider.WithProviderUpgrade(provider.Ubuntu128Template(), api.WithResourcePoolForAllMachines(os.Getenv(vsphereResourcePoolVar))), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupVmsVar, "true"))
provider.WithProviderUpgrade(provider.Ubuntu128Template(), api.WithResourcePoolForAllMachines(os.Getenv(vsphereResourcePoolVar))), framework.WithEnvVar(features.CheckpointEnabledEnvVar, "true"), framework.WithEnvVar(framework.CleanupResourcesVar, "true"))

runUpgradeFlowWithCheckpoint(
test,
Expand Down
3 changes: 2 additions & 1 deletion test/framework/cloudstack.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,8 @@ func (c *CloudStack) ClusterConfigUpdates() []api.ClusterConfigFiller {
return []api.ClusterConfigFiller{api.ClusterToConfigFiller(f...), api.CloudStackToConfigFiller(c.fillers...)}
}

func (c *CloudStack) CleanupVMs(clusterName string) error {
// CleanupResources satisfies the test framework Provider. It delegates to
// cleanup.CloudstackTestResources to remove CloudStack resources left
// behind by the named test cluster.
// NOTE(review): the two trailing booleans are passed as false here; their
// meaning is not visible in this file — confirm against the
// cleanup.CloudstackTestResources signature before changing them.
func (c *CloudStack) CleanupResources(clusterName string) error {
	return cleanup.CloudstackTestResources(context.Background(), clusterName, false, false)
}

Expand Down
Loading

0 comments on commit 249f20d

Please sign in to comment.