From 5edb818ba92dd842867f0532a45160fdf14f6451 Mon Sep 17 00:00:00 2001 From: Himani Deshpande Date: Thu, 29 Aug 2024 18:03:10 -0400 Subject: [PATCH 1/5] Upload common_dna.json in s3 so it can be shared with all the nodes * Add headNode_private_ip --- .../recipes/init.rb | 4 +++- .../resources/fetch_config.rb | 23 +++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb index 664228d96..3e4d9ceca 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb @@ -13,7 +13,9 @@ # limitations under the License. include_recipe "aws-parallelcluster-platform::enable_chef_error_handler" - +fetch_config 'Upload Common Dna to s3' do + action :share_common_dna +end include_recipe "aws-parallelcluster-shared::setup_envars" os_type 'Validate OS type specified by the user is the same as the OS identified by Ohai' diff --git a/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb b/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb index 0f2b492ad..548aab5fd 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb @@ -9,6 +9,19 @@ default_action :run + +action :share_common_dna do + return if on_docker? + Chef::Log.debug("Upload common_dna.json in s3") + case node['cluster']['node_type'] + when 'HeadNode' + + execute_command('Update HeadNode Ip', "sed -i 's/HEAD_NODE_PRIVATE_IP/#{get_primary_ip}/g' /tmp/common-dna.json") + + upload_common_dna('upload_common_dna_to_s3', "#{node['cluster']['common_dna_s3_key']}", '/tmp/common-dna.json') + end +end + action :run do return if on_docker? Chef::Log.debug("Called fetch_config with update (#{new_resource.update})") @@ -185,4 +198,14 @@ def wait_cluster_config_file(path) timeout 5 end end + + def upload_common_dna(command_label, key, file_path, version_id = nil) + fetch_s3_object_command = "#{cookbook_virtualenv_path}/bin/aws s3api put-object" \ + " --bucket #{node['cluster']['cluster_s3_bucket']}" \ + " --key #{key}" \ + " --region #{node['cluster']['region']}" \ + " --body #{file_path}" + fetch_s3_object_command += " --version-id #{version_id}" unless version_id.nil? + execute_command(command_label, fetch_s3_object_command) + end end From 4ff7f67179d4c58ddae2748684ddc1b7f3a39ff0 Mon Sep 17 00:00:00 2001 From: Himani Deshpande Date: Wed, 4 Sep 2024 17:24:38 -0400 Subject: [PATCH 2/5] Change Update path for compute fleet update --- .../recipes/update.rb | 4 +- .../recipes/config/config_cfn_hup.rb | 53 ++++++++++++++----- .../cfn-hook-update-compute.conf.erb | 5 ++ .../cfn-hup-update-compute-action.sh.erb | 23 ++++++++ 4 files changed, 72 insertions(+), 13 deletions(-) create mode 100644 cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hook-update-compute.conf.erb create mode 100644 cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb index f69aa2453..d60186429 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb @@ -12,7 +12,9 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. include_recipe "aws-parallelcluster-shared::setup_envars" - +fetch_config 'Upload Common Dna to s3' do + action :share_common_dna +end # Fetch and load cluster configs include_recipe 'aws-parallelcluster-platform::update' diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/config_cfn_hup.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/config_cfn_hup.rb index 7a82d5777..f211f2b5b 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/config_cfn_hup.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/config_cfn_hup.rb @@ -46,16 +46,45 @@ ) end -template '/etc/cfn/hooks.d/pcluster-update.conf' do - source 'cfn_bootstrap/cfn-hook-update.conf.erb' - owner 'root' - group 'root' - mode '0400' - variables( - stack_id: node['cluster']['stack_arn'], - region: node['cluster']['region'], - cloudformation_url: cloudformation_url, - cfn_init_role: instance_role_name, - launch_template_resource_id: node['cluster']['launch_template_id'] - ) +case node['cluster']['node_type'] +when 'HeadNode', 'LoginNode' + template '/etc/cfn/hooks.d/pcluster-update.conf' do + source 'cfn_bootstrap/cfn-hook-update.conf.erb' + owner 'root' + group 'root' + mode '0400' + variables( + stack_id: node['cluster']['stack_arn'], + region: node['cluster']['region'], + cloudformation_url: cloudformation_url, + cfn_init_role: instance_role_name, + launch_template_resource_id: node['cluster']['launch_template_id'] + ) + end + +when 'ComputeFleet' + template "#{node['cluster']['scripts_dir']}/cfn-hup-update-compute-action.sh" do + source "cfn_bootstrap/cfn-hup-update-compute-action.sh.erb" + owner 'root' + group 'root' + mode '0744' + variables( + clusterS3Bucket: node['cluster']['cluster_s3_bucket'], + region: node['cluster']['region'], + clusterS3ArtifactDir: node['cluster']['cluster_config_s3_key'].chomp('/configs/cluster-config-with-implied-values.yaml'), + clusterConfigVersion: node['cluster']['cluster_config_version'], + launch_template_resource_id: node['cluster']['launch_template_id'] + ) + end + + + template '/etc/cfn/hooks.d/pcluster-update.conf' do + source 'cfn_bootstrap/cfn-hook-update-compute.conf.erb' + owner 'root' + group 'root' + mode '0400' + variables( + launch_template_resource_id: node['cluster']['launch_template_id'] + ) + end end diff --git a/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hook-update-compute.conf.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hook-update-compute.conf.erb new file mode 100644 index 000000000..1956e0059 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hook-update-compute.conf.erb @@ -0,0 +1,5 @@ +[parallelcluster-update] +triggers=post.update +path=Resources.<%= @launch_template_resource_id %>.Metadata.AWS::CloudFormation::Init +action=PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin; . /etc/parallelcluster/pcluster_cookbook_environment.sh; ./opt/parallelcluster/scripts/cfn-hup-update-compute-action.sh +runas=root diff --git a/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb new file mode 100644 index 000000000..b9b8b68d1 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb @@ -0,0 +1,23 @@ +#!/bin/bash +set -ex + + +PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin; +. /etc/parallelcluster/pcluster_cookbook_environment.sh; + +S3_BUCKET=<%= @clusterS3Bucket %> +S3_ARTIFACT_DIR=<%= @clusterS3ArtifactDir %> +CLUSTER_CONFIG_VERSION=<%= @clusterConfigVersion %> +REGION=<%= @region %> + +echo "Running S3 commands" +AWS_RETRY_MODE=standard aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/common-dna-${CLUSTER_CONFIG_VERSION}.json" --region ${REGION} /tmp/common-dna.json 2>&1 || error_exit "${!S3API_RESULT}" +AWS_RETRY_MODE=standard aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/ComputeNode/compute-dna-<%= @launch_template_resource_id %>-${CLUSTER_CONFIG_VERSION}.json" --region ${REGION} /tmp/compute-dna.json 2>&1 || error_exit "${!S3API_RESULT}" +AWS_RETRY_MODE=standard aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/extra-${CLUSTER_CONFIG_VERSION}.json" --region ${REGION} /tmp/extra.json 2>&1 || error_exit "${!S3API_RESULT}" +echo "Completed S3 commands" + +mkdir -p /etc/chef/ohai/hints +touch /etc/chef/ohai/hints/ec2.json +jq -s ".[0] * .[1] * .[2] * .[3]" /tmp/common-dna.json /tmp/compute-dna.json /tmp/stack-arn.json /tmp/extra.json > /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/common-dna.json /tmp/compute-dna.json /etc/chef/dna.json ) +cd /etc/chef +cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::update && /opt/parallelcluster/scripts/fetch_and_run -postupdate From 617de45296b5d0b698bddad2b91c9164598bab0d Mon Sep 17 00:00:00 2001 From: Himani Deshpande Date: Fri, 6 Sep 2024 17:17:50 -0400 Subject: [PATCH 3/5] Using s3 to get latest cluster config version --- .../cfn_bootstrap/cfn-hup-update-compute-action.sh.erb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb index b9b8b68d1..b4549bb64 100644 --- a/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb +++ b/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb @@ -8,7 +8,10 @@ PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin; S3_BUCKET=<%= @clusterS3Bucket %> S3_ARTIFACT_DIR=<%= @clusterS3ArtifactDir %> CLUSTER_CONFIG_VERSION=<%= @clusterConfigVersion %> +echo "Cluster config version is $CLUSTER_CONFIG_VERSION" REGION=<%= @region %> +CLUSTER_CONFIG_VERSION=$(aws s3api list-object-versions --bucket ${S3_BUCKET} --prefix "${S3_ARTIFACT_DIR}/configs/cluster-config-with-implied-values.yaml" --region ${REGION} | jq -r '.Versions[] | select(.IsLatest == true) | .VersionId' 2>&1 || error_exit "${!S3API_RESULT}") +echo "Cluster config version is $CLUSTER_CONFIG_VERSION" echo "Running S3 commands" AWS_RETRY_MODE=standard aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/common-dna-${CLUSTER_CONFIG_VERSION}.json" --region ${REGION} /tmp/common-dna.json 2>&1 || error_exit "${!S3API_RESULT}" From 2ee6f333fca280940f86241fbf0ad00bdc566c50 Mon Sep 17 00:00:00 2001 From: Himani Deshpande Date: Mon, 9 Sep 2024 17:17:59 -0400 Subject: [PATCH 4/5] add condition and loop to check for ccluster-config version for a running node --- .../recipes/config/config_cfn_hup.rb | 3 +- .../cfn-hup-update-compute-action.sh.erb | 43 ++++++++++++------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/config_cfn_hup.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/config_cfn_hup.rb index f211f2b5b..fbf2367b5 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/config_cfn_hup.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/config_cfn_hup.rb @@ -73,7 +73,8 @@ region: node['cluster']['region'], clusterS3ArtifactDir: node['cluster']['cluster_config_s3_key'].chomp('/configs/cluster-config-with-implied-values.yaml'), clusterConfigVersion: node['cluster']['cluster_config_version'], - launch_template_resource_id: node['cluster']['launch_template_id'] + launch_template_resource_id: node['cluster']['launch_template_id'], + # cluster_config_version_path: node['cluster']['shared_dir']/cluster-config-version ) end diff --git a/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb index b4549bb64..0287aea69 100644 --- a/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb +++ b/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb @@ -7,20 +7,33 @@ PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin; S3_BUCKET=<%= @clusterS3Bucket %> S3_ARTIFACT_DIR=<%= @clusterS3ArtifactDir %> -CLUSTER_CONFIG_VERSION=<%= @clusterConfigVersion %> -echo "Cluster config version is $CLUSTER_CONFIG_VERSION" +OLD_CLUSTER_CONFIG_VERSION=<%= @clusterConfigVersion %> REGION=<%= @region %> -CLUSTER_CONFIG_VERSION=$(aws s3api list-object-versions --bucket ${S3_BUCKET} --prefix "${S3_ARTIFACT_DIR}/configs/cluster-config-with-implied-values.yaml" --region ${REGION} | jq -r '.Versions[] | select(.IsLatest == true) | .VersionId' 2>&1 || error_exit "${!S3API_RESULT}") -echo "Cluster config version is $CLUSTER_CONFIG_VERSION" +#CLUSTER_CONFIG_VERSION_FILE=<%#= cluster_config_version_path %> +GET_LATEST_CLUSTER_CONFIG=true -echo "Running S3 commands" -AWS_RETRY_MODE=standard aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/common-dna-${CLUSTER_CONFIG_VERSION}.json" --region ${REGION} /tmp/common-dna.json 2>&1 || error_exit "${!S3API_RESULT}" -AWS_RETRY_MODE=standard aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/ComputeNode/compute-dna-<%= @launch_template_resource_id %>-${CLUSTER_CONFIG_VERSION}.json" --region ${REGION} /tmp/compute-dna.json 2>&1 || error_exit "${!S3API_RESULT}" -AWS_RETRY_MODE=standard aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/extra-${CLUSTER_CONFIG_VERSION}.json" --region ${REGION} /tmp/extra.json 2>&1 || error_exit "${!S3API_RESULT}" -echo "Completed S3 commands" - -mkdir -p /etc/chef/ohai/hints -touch /etc/chef/ohai/hints/ec2.json -jq -s ".[0] * .[1] * .[2] * .[3]" /tmp/common-dna.json /tmp/compute-dna.json /tmp/stack-arn.json /tmp/extra.json > /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/common-dna.json /tmp/compute-dna.json /etc/chef/dna.json ) -cd /etc/chef -cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::update && /opt/parallelcluster/scripts/fetch_and_run -postupdate +while $GET_LATEST_CLUSTER_CONFIG; do + echo "Old Cluster config version is $OLD_CLUSTER_CONFIG_VERSION" + #NEW_CLUSTER_CONFIG_VERSION=$(aws s3api list-object-versions --bucket ${S3_BUCKET} --prefix "${S3_ARTIFACT_DIR}/configs/cluster-config-with-implied-values.yaml" --region ${REGION} | jq -r '.Versions[] | select(.IsLatest == true) | .VersionId' 2>&1 || error_exit "${!S3API_RESULT}") + NEW_CLUSTER_CONFIG_VERSION=$(cat /opt/parallelcluster/shared/cluster-config-version ) + echo "New Cluster config version is $NEW_CLUSTER_CONFIG_VERSION" + + if [[ -n $NEW_CLUSTER_CONFIG_VERSION ]] && [[ $NEW_CLUSTER_CONFIG_VERSION != $OLD_CLUSTER_CONFIG_VERSION]]; then + GET_LATEST_CLUSTER_CONFIG=false + CLUSTER_CONFIG_VERSION=$NEW_CLUSTER_CONFIG_VERSION + AWS_RETRY_MODE=standard + echo "Running S3 commands" + aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/common-dna-${CLUSTER_CONFIG_VERSION}.json" --region ${REGION} /tmp/common-dna.json 2>&1 || error_exit "${!S3API_RESULT}" + aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/ComputeNode/compute-dna-<%= @launch_template_resource_id %>-${CLUSTER_CONFIG_VERSION}.json" --region ${REGION} /tmp/compute-dna.json 2>&1 || error_exit "${!S3API_RESULT}" + aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/extra-${CLUSTER_CONFIG_VERSION}.json" --region ${REGION} /tmp/extra.json 2>&1 || error_exit "${!S3API_RESULT}" + echo "Completed S3 commands" + + mkdir -p /etc/chef/ohai/hints + touch /etc/chef/ohai/hints/ec2.json + jq -s ".[0] * .[1] * .[2] * .[3]" /tmp/common-dna.json /tmp/compute-dna.json /tmp/stack-arn.json /tmp/extra.json > /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/common-dna.json /tmp/compute-dna.json /etc/chef/dna.json ) + cd /etc/chef + cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::update && /opt/parallelcluster/scripts/fetch_and_run -postupdate + endif + + sleep 60 +done From d031b054be9e93cb04548f38469edd1a21894c38 Mon Sep 17 00:00:00 2001 From: Himani Deshpande Date: Tue, 10 Sep 2024 13:11:57 -0400 Subject: [PATCH 5/5] Get Config_version from headnode --- .../cfn_bootstrap/cfn-hup-update-compute-action.sh.erb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb index 0287aea69..06e69f2f0 100644 --- a/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb +++ b/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-update-compute-action.sh.erb @@ -14,11 +14,13 @@ GET_LATEST_CLUSTER_CONFIG=true while $GET_LATEST_CLUSTER_CONFIG; do echo "Old Cluster config version is $OLD_CLUSTER_CONFIG_VERSION" + sleep 60 #NEW_CLUSTER_CONFIG_VERSION=$(aws s3api list-object-versions --bucket ${S3_BUCKET} --prefix "${S3_ARTIFACT_DIR}/configs/cluster-config-with-implied-values.yaml" --region ${REGION} | jq -r '.Versions[] | select(.IsLatest == true) | .VersionId' 2>&1 || error_exit "${!S3API_RESULT}") NEW_CLUSTER_CONFIG_VERSION=$(cat /opt/parallelcluster/shared/cluster-config-version ) +# NEW_CLUSTER_CONFIG_VERSION=$(cat /var/lib/cfn-hup/data/metadata_db.json | jq -r '.[].deployConfigFiles.files.config_version.content') echo "New Cluster config version is $NEW_CLUSTER_CONFIG_VERSION" - if [[ -n $NEW_CLUSTER_CONFIG_VERSION ]] && [[ $NEW_CLUSTER_CONFIG_VERSION != $OLD_CLUSTER_CONFIG_VERSION]]; then + if [[ -n "${NEW_CLUSTER_CONFIG_VERSION}" ]] && [[ "${NEW_CLUSTER_CONFIG_VERSION}" != "${OLD_CLUSTER_CONFIG_VERSION}" ]]; then GET_LATEST_CLUSTER_CONFIG=false CLUSTER_CONFIG_VERSION=$NEW_CLUSTER_CONFIG_VERSION AWS_RETRY_MODE=standard @@ -33,7 +35,6 @@ while $GET_LATEST_CLUSTER_CONFIG; do jq -s ".[0] * .[1] * .[2] * .[3]" /tmp/common-dna.json /tmp/compute-dna.json /tmp/stack-arn.json /tmp/extra.json > /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/common-dna.json /tmp/compute-dna.json /etc/chef/dna.json ) cd /etc/chef cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::update && /opt/parallelcluster/scripts/fetch_and_run -postupdate - endif + fi - sleep 60 done