Skip to content

Commit

Permalink
feat: Scale storage support on IBM Cloud HPC "/mnt/lsf"<br>* Fix for …
Browse files Browse the repository at this point in the history
…Intermittent scale mount failures<br>* Custom VPC Share support on "/mnt/lsf"<br>* Test cases upgradation<br>* Tekton COS Integrate<br>* New custom images for PAC (#171)

Co-authored-by: Nupur Goyal <nupurgoyal@Nupurs-MacBook-Pro.local>
  • Loading branch information
nupurg-ibm and Nupur Goyal authored Jun 24, 2024
1 parent b691033 commit 3fc6a52
Show file tree
Hide file tree
Showing 8 changed files with 77 additions and 14 deletions.
8 changes: 4 additions & 4 deletions modules/landing_zone_vsi/image_map.tf
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
locals {
image_region_map = {
"hpcaas-lsf10-rhel88-v7" = {
"us-east" = "r014-68a7ad8a-c513-418e-a30c-9a04ce0a144a"
"eu-de" = "r010-b392ff76-fb8c-4b0f-9fef-fba89eb3ee5b"
"us-south" = "r006-86f207dd-7029-4705-9222-0f5499387734"
"hpcaas-lsf10-rhel88-v8" = {
"us-east" = "r014-ee8b808f-e129-4d9e-965e-fed7003132e7"
"eu-de" = "r010-bfad7737-77f9-4af7-9446-4783fb582258"
"us-south" = "r006-d314bc1d-e904-4124-9055-0862e1a56579"
},
"hpcaas-lsf10-rhel88-compute-v5" = {
"us-east" = "r014-deb34fb1-edbf-464c-9af3-7efa2efcff3f"
Expand Down
2 changes: 1 addition & 1 deletion samples/configs/hpc_catalog_values.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"enable_cos_integration" : "false",
"cos_instance_name" : "__NULL__",
"enable_fip" : "true",
"management_image_name" : "hpcaas-lsf10-rhel88-v7",
"management_image_name" : "hpcaas-lsf10-rhel88-v8",
"compute_image_name" : "hpcaas-lsf10-rhel88-compute-v5",
"login_image_name" : "hpcaas-lsf10-rhel88-compute-v5",
"login_node_instance_type" : "bx2-2x8",
Expand Down
2 changes: 1 addition & 1 deletion samples/configs/hpc_schematics_values.json
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@
},
{
"name": "management_image_name",
"value": "hpcaas-lsf10-rhel88-v7",
"value": "hpcaas-lsf10-rhel88-v8",
"type": "string",
"secure": false,
"description": "Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster management nodes. By default, the solution uses a RHEL88 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering."
Expand Down
2 changes: 1 addition & 1 deletion solutions/hpc/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ variable "login_node_instance_type" {
}
variable "management_image_name" {
type = string
default = "hpcaas-lsf10-rhel88-v7"
default = "hpcaas-lsf10-rhel88-v8"
description = "Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster management nodes. By default, the solution uses a RHEL88 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering."

}
Expand Down
12 changes: 6 additions & 6 deletions tests/lsf/lsf_cluster_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -911,7 +911,7 @@ func GetOSNameOfNode(t *testing.T, sClient *ssh.Client, hostIP string, logger *u
return "", parseErr
}

// HPCCheckFileMount checks if essential LSF directories (conf, config_done, das_staging_area, gui-conf, gui-logs, log, repository-path and work) exist
// HPCCheckFileMount checks if essential LSF directories (10.1, conf, config_done, das_staging_area, data, gui-conf, gui-logs, log, repository-path and work) exist
// on remote machines identified by the provided list of IP addresses. It utilizes SSH to
// query and validate the directories. Any missing directory triggers an error, and the
// function logs the success message if all directories are found.
Expand Down Expand Up @@ -1022,15 +1022,15 @@ func verifyDirectories(t *testing.T, sClient *ssh.Client, ip string, logger *uti
// Split the output into directory names
actualDirs := strings.Fields(strings.TrimSpace(string(outputTwo)))
// Define expected directories
expectedDirs := []string{"conf", "config_done", "das_staging_area", "gui-conf", "gui-logs", "log", "repository-path", "work"}
expectedDirs := []string{"10.1", "conf", "config_done", "das_staging_area", "data", "gui-conf", "gui-logs", "log", "repository-path", "work"}

// Verify if all expected directories exist
if !utils.VerifyDataContains(t, actualDirs, expectedDirs, logger) {
return fmt.Errorf("actual directory '%v' does not match the expected directory '%v' for node IP '%s'", actualDirs, expectedDirs, ip)
}

// Log directories existence
logger.Info(t, fmt.Sprintf("Directories [conf, config_done, das_staging_area, gui-conf, gui-logs, log, repository-path and work] exist on %s", ip))
logger.Info(t, fmt.Sprintf("Directories [10.1, conf, config_done, das_staging_area, data, gui-conf, gui-logs, log, repository-path and work] exist on %s", ip))
return nil
}

Expand Down Expand Up @@ -1321,7 +1321,7 @@ func LSFRunJobsAsLDAPUser(t *testing.T, sClient *ssh.Client, jobCmd, ldapUser st
return fmt.Errorf("job execution for ID %s exceeded the specified time", jobID)
}

// HPCCheckFileMountAsLDAPUser checks if essential LSF directories (conf, config_done, das_staging_area, gui-conf, gui-logs, log, repository-path and work) exist
// HPCCheckFileMountAsLDAPUser checks if essential LSF directories (10.1, conf, config_done, das_staging_area, data, gui-conf, gui-logs, log, repository-path and work) exist
// on remote machines. It utilizes SSH to
// query and validate the directories. Any missing directory triggers an error, and the
// function logs the success message if all directories are found.
Expand Down Expand Up @@ -1408,15 +1408,15 @@ func verifyDirectoriesAsLdapUser(t *testing.T, sClient *ssh.Client, hostname str
// Split the output into directory names
actualDirs := strings.Fields(strings.TrimSpace(string(outputTwo)))
// Define expected directories
expectedDirs := []string{"conf", "config_done", "das_staging_area", "gui-conf", "gui-logs", "log", "repository-path", "work"}
expectedDirs := []string{"10.1", "conf", "config_done", "das_staging_area", "data", "gui-conf", "gui-logs", "log", "repository-path", "work"}

// Verify if all expected directories exist
if !utils.VerifyDataContains(t, actualDirs, expectedDirs, logger) {
return fmt.Errorf("actual directory '%v' does not match the expected directory '%v' for node IP '%s'", actualDirs, expectedDirs, hostname)
}

// Log directories existence
logger.Info(t, fmt.Sprintf("Directories [conf, config_done, das_staging_area, gui-conf, gui-logs, log, repository-path and work] exist on %s", hostname))
logger.Info(t, fmt.Sprintf("Directories [10.1, conf, config_done, das_staging_area, data, gui-conf, gui-logs, log, repository-path and work] exist on %s", hostname))
return nil
}

Expand Down
62 changes: 62 additions & 0 deletions tests/other_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,7 @@ func TestRunInvalidLDAPServerIP(t *testing.T) {
if err != nil {
// Check if the error message contains specific keywords indicating LDAP server IP issues
result := utils.VerifyDataContains(t, err.Error(), "The connection to the existing LDAP server 10.10.10.10 failed", testLogger)
assert.True(t, result)
if result {
testLogger.PASS(t, "Validation succeeded: Invalid LDAP server IP")
} else {
Expand Down Expand Up @@ -984,6 +985,7 @@ func TestRunInvalidLDAPUsernamePassword(t *testing.T) {
userPasswordError := utils.VerifyDataContains(t, err.Error(), "ldap_usr_pwd", testLogger)
adminPasswordError := utils.VerifyDataContains(t, err.Error(), "ldap_adm_pwd", testLogger)
result := usernameError && userPasswordError && adminPasswordError

// Assert that the result is true if all mandatory fields are missing
assert.True(t, result)
if result {
Expand Down Expand Up @@ -1115,6 +1117,7 @@ func TestRunInvalidDomainName(t *testing.T) {
if err != nil {
// Check if the error message contains specific keywords indicating domain name issues
result := utils.VerifyDataContains(t, err.Error(), "The domain name provided for compute is not a fully qualified domain name", testLogger)
assert.True(t, result)
if result {
testLogger.PASS(t, "Validation succeeded: Invalid domain name")
} else {
Expand Down Expand Up @@ -1339,3 +1342,62 @@ func TestRunExistSubnetIDVpcNameAsNull(t *testing.T) {
testLogger.FAIL(t, "Expected error did not occur on Without VPC name and with valid cluster_subnet_ids and login_subnet_id")
}
}

// TestRunInvalidSshKeysAndRemoteAllowedIP verifies that planning the HPC
// solution is rejected when the bastion/compute SSH key lists and the
// remote allowed IP list each contain only an empty string.
//
// The test passes only when `terraform init` + `terraform plan` returns an
// error whose text contains both the invalid-IP-format message and the
// missing-SSH-key message. No infrastructure is applied.
func TestRunInvalidSshKeysAndRemoteAllowedIP(t *testing.T) {
	// Parallelize the test to run concurrently with others
	t.Parallel()

	// Setup test suite
	setupTestSuite(t)

	testLogger.Info(t, "Cluster creation process initiated for "+t.Name())

	// HPC cluster prefix
	hpcClusterPrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString())

	// Retrieve cluster information from environment variables
	envVars := GetEnvVars()

	// Get the absolute path of solutions/hpc
	abs, err := filepath.Abs("solutions/hpc")
	require.NoError(t, err, "Unable to get absolute path")

	// The path is resolved relative to the working directory; strip the
	// "tests/" segment so it points at the solution directory instead.
	terrPath := strings.ReplaceAll(abs, "tests/", "")

	// Define Terraform options with deliberately invalid (empty) SSH keys
	// and remote allowed IPs
	terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
		TerraformDir: terrPath,
		Vars: map[string]interface{}{
			"cluster_prefix":     hpcClusterPrefix,
			"bastion_ssh_keys":   []string{""},
			"compute_ssh_keys":   []string{""},
			"zones":              utils.SplitAndTrim(envVars.Zone, ","),
			"remote_allowed_ips": []string{""},
			"cluster_id":         envVars.ClusterID,
			"reservation_id":     envVars.ReservationID,
		},
	})

	// Initialize and plan the Terraform configuration (nothing is applied)
	_, err = terraform.InitAndPlanE(t, terraformOptions)

	// The plan must fail; assert.Error already marks the test as failed
	// when it does not, so only the log entry is added here.
	if !assert.Error(t, err, "Expected an error during plan") {
		testLogger.FAIL(t, "Expected error did not occur on Invalid ssh keys and remote allowed IP")
		return
	}

	// Evaluate both checks unconditionally so that each expected message is
	// verified (and logged) even when the other one is missing.
	errMsg := err.Error()
	ipFormatError := utils.VerifyDataContains(t, errMsg, "The provided IP address format is not valid", testLogger)
	sshKeyError := utils.VerifyDataContains(t, errMsg, "No SSH Key found with name", testLogger)
	result := ipFormatError && sshKeyError

	// Assert that both the invalid IP and the invalid SSH key were reported
	assert.True(t, result)
	if result {
		testLogger.PASS(t, "Validation succeeded: Invalid ssh keys and remote allowed IP")
	} else {
		testLogger.FAIL(t, "Validation failed: Invalid ssh keys and remote allowed IP")
	}
}
1 change: 1 addition & 0 deletions tests/pr_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ var ignoreDestroys = []string{
"module.check_cluster_status.null_resource.remote_exec[0]",
"module.landing_zone_vsi.module.hpc.module.landing_zone_vsi.module.wait_management_candidate_vsi_booted.null_resource.remote_exec[0]",
"module.landing_zone_vsi.module.hpc.module.landing_zone_vsi.module.wait_management_vsi_booted.null_resource.remote_exec[0]",
"module.landing_zone_vsi.module.do_management_vsi_configuration.null_resource.remote_exec_script_cp_files[1]",
}

// EnvVars stores environment variable values.
Expand Down
2 changes: 1 addition & 1 deletion tests/test_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ remote_allowed_ips:
ssh_key: geretain-hpc
login_node_instance_type: bx2-2x8
login_image_name: hpcaas-lsf10-rhel88-compute-v5
management_image_name: hpcaas-lsf10-rhel88-v7
management_image_name: hpcaas-lsf10-rhel88-v8
compute_image_name: hpcaas-lsf10-rhel88-compute-v5
management_node_instance_type: bx2-2x8
management_node_count: 2
Expand Down

0 comments on commit 3fc6a52

Please sign in to comment.