Management cluster kindless upgrade fixes (#6838)
panktishah26 authored Oct 18, 2023
1 parent 8fc6448 commit 5bbccbe
Showing 4 changed files with 30 additions and 51 deletions.
30 changes: 15 additions & 15 deletions pkg/clustermanager/applier.go
@@ -22,10 +22,9 @@ const (
retryBackOff = time.Second
waitForFailureMessageErrorTimeout = time.Minute
defaultFieldManager = "eks-a-cli"
+defaultConditionCheckTotalCount = 20
)

-var defaultConditionCheckTotalCount = 20

// ApplierOpt allows to customize a Applier on construction.
type ApplierOpt func(*Applier)

@@ -36,17 +35,19 @@ type Applier struct {
clientFactory ClientFactory
applyClusterTimeout, waitForClusterReconcile, waitForFailureMessage time.Duration
retryBackOff time.Duration
+conditionCheckoutTotalCount int
}

// NewApplier builds an Applier.
func NewApplier(log logr.Logger, clientFactory ClientFactory, opts ...ApplierOpt) Applier {
a := &Applier{
-log: log,
-clientFactory: clientFactory,
-applyClusterTimeout: applyClusterSpecTimeout,
-waitForClusterReconcile: waitForClusterReconcileTimeout,
-waitForFailureMessage: waitForFailureMessageErrorTimeout,
-retryBackOff: retryBackOff,
+log: log,
+clientFactory: clientFactory,
+applyClusterTimeout: applyClusterSpecTimeout,
+waitForClusterReconcile: waitForClusterReconcileTimeout,
+waitForFailureMessage: waitForFailureMessageErrorTimeout,
+retryBackOff: retryBackOff,
+conditionCheckoutTotalCount: defaultConditionCheckTotalCount,
}

for _, opt := range opts {
@@ -99,15 +100,14 @@ func WithApplierRetryBackOff(backOff time.Duration) ApplierOpt {
func WithApplierWaitForFailureMessage(timeout time.Duration) ApplierOpt {
return func(a *Applier) {
a.waitForFailureMessage = timeout
-defaultConditionCheckTotalCount = int(timeout)
+a.conditionCheckoutTotalCount = int(timeout)
}
}

// Run applies the cluster's spec in the management cluster and waits
// until the changes are fully reconciled.
func (a Applier) Run(ctx context.Context, spec *cluster.Spec, managementCluster types.Cluster) error {
var client kubernetes.Client
-a.log.V(9).Info("Cluster generation before applying specs", "generation", spec.Cluster.Generation)
a.log.V(3).Info("Applying cluster spec")
err := retrier.New(
a.applyClusterTimeout,
@@ -145,7 +145,7 @@ func (a Applier) Run(ctx context.Context, spec *cluster.Spec, managementCluster
waitStartTime := time.Now()
retry := a.retrierForWait(waitStartTime)

-if err := cluster.WaitFor(ctx, a.log, client, spec.Cluster, defaultConditionCheckTotalCount, a.retrierForFailureMessage(), func(c *anywherev1.Cluster) error {
+if err := cluster.WaitFor(ctx, a.log, client, spec.Cluster, a.conditionCheckoutTotalCount, a.retrierForFailureMessage(), func(c *anywherev1.Cluster) error {
if c.Status.FailureMessage != nil && *c.Status.FailureMessage != "" {
return fmt.Errorf("cluster has an error: %s", *c.Status.FailureMessage)
}
@@ -155,27 +155,27 @@
}

a.log.V(3).Info("Waiting for control plane to be ready")
-if err := cluster.WaitForCondition(ctx, a.log, client, spec.Cluster, defaultConditionCheckTotalCount, retry, anywherev1.ControlPlaneReadyCondition); err != nil {
+if err := cluster.WaitForCondition(ctx, a.log, client, spec.Cluster, a.conditionCheckoutTotalCount, retry, anywherev1.ControlPlaneReadyCondition); err != nil {
return errors.Wrapf(err, "waiting for cluster's control plane to be ready")
}

if spec.Cluster.Spec.ClusterNetwork.CNIConfig.IsManaged() {
a.log.V(3).Info("Waiting for default CNI to be updated")
retry = a.retrierForWait(waitStartTime)
-if err := cluster.WaitForCondition(ctx, a.log, client, spec.Cluster, defaultConditionCheckTotalCount, retry, anywherev1.DefaultCNIConfiguredCondition); err != nil {
+if err := cluster.WaitForCondition(ctx, a.log, client, spec.Cluster, a.conditionCheckoutTotalCount, retry, anywherev1.DefaultCNIConfiguredCondition); err != nil {
return errors.Wrapf(err, "waiting for cluster's CNI to be configured")
}
}

a.log.V(3).Info("Waiting for worker nodes to be ready after upgrade")
retry = a.retrierForWait(waitStartTime)
-if err := cluster.WaitForCondition(ctx, a.log, client, spec.Cluster, defaultConditionCheckTotalCount, retry, anywherev1.WorkersReadyCondition); err != nil {
+if err := cluster.WaitForCondition(ctx, a.log, client, spec.Cluster, a.conditionCheckoutTotalCount, retry, anywherev1.WorkersReadyCondition); err != nil {
return errors.Wrapf(err, "waiting for cluster's workers to be ready")
}

a.log.V(3).Info("Waiting for cluster upgrade to be completed")
retry = a.retrierForWait(waitStartTime)
-if err := cluster.WaitForCondition(ctx, a.log, client, spec.Cluster, defaultConditionCheckTotalCount, retry, anywherev1.ReadyCondition); err != nil {
+if err := cluster.WaitForCondition(ctx, a.log, client, spec.Cluster, a.conditionCheckoutTotalCount, retry, anywherev1.ReadyCondition); err != nil {
return errors.Wrapf(err, "waiting for cluster to be ready")
}

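Taken together, the applier.go changes replace the mutable package-level defaultConditionCheckTotalCount with a constant default plus a per-instance conditionCheckoutTotalCount field that WithApplierWaitForFailureMessage can override, so two Appliers no longer share (and race on) a global. The sketch below shows that pattern in isolation; it is a minimal illustration with made-up names (Widget, WithCheckCount), not code from this repository.

```go
package main

import "fmt"

// defaultCheckCount plays the role of defaultConditionCheckTotalCount: an
// immutable default rather than a mutable package variable.
const defaultCheckCount = 20

// Widget stands in for Applier; the count now lives on the instance.
type Widget struct {
	checkCount int
}

// Option mirrors ApplierOpt.
type Option func(*Widget)

// WithCheckCount mirrors an option that overrides the per-instance default.
func WithCheckCount(n int) Option {
	return func(w *Widget) { w.checkCount = n }
}

// New mirrors NewApplier: set the defaults first, then apply the options.
func New(opts ...Option) Widget {
	w := &Widget{checkCount: defaultCheckCount}
	for _, opt := range opts {
		opt(w)
	}
	return *w
}

func main() {
	fmt.Println(New().checkCount)                   // 20
	fmt.Println(New(WithCheckCount(60)).checkCount) // 60
}
```

Note that the diff keeps the existing conversion, setting the count from int(timeout), i.e. the duration's raw integer value; only where that value is stored changes.
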
12 changes: 2 additions & 10 deletions pkg/workflows/management/gitops.go
@@ -38,11 +38,7 @@ func (s *pauseGitOpsReconcile) Restore(ctx context.Context, commandContext *task.

// reconcileGitOps updates all the places that have a cluster definition to follow the cluster config provided to this workflow:
// the cluster config in the git repo if GitOps is enabled. It also resumes the GitOps reconciliations.
-type reconcileGitOps struct {
-// TODO(pjshah): check whether this field is needed or not.
-// I will incorporate this in a separate PR.
-// eksaSpecDiff bool
-}
+type reconcileGitOps struct{}

// Run reconcileGitOps resumes GitOps reconciler and performs other GitOps related tasks after management cluster upgrade.
func (s *reconcileGitOps) Run(ctx context.Context, commandContext *task.CommandContext) task.Task {
@@ -68,11 +64,7 @@ func (s *reconcileGitOps) Run(ctx context.Context, commandContext *task.CommandC
commandContext.SetError(err)
return &writeClusterConfig{}
}
-// TODO(pjshah): check whether this field is needed or not.
-// I will incorporate this in a separate PR.
-// if !s.eksaSpecDiff {
-// return nil
-// }

return &writeClusterConfig{}
}

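With the commented-out eksaSpecDiff field gone, reconcileGitOps is an empty struct, the usual Go shape for a stateless workflow step that exists only to satisfy an interface. A generic, self-contained sketch of that idiom (names are illustrative, not from the repository):

```go
package main

import "fmt"

// step is a stand-in for the workflow's task interface.
type step interface {
	run() error
}

// reconcile carries no state, so an empty struct is all the type needs.
type reconcile struct{}

func (reconcile) run() error {
	fmt.Println("resuming GitOps reconciliation")
	return nil
}

func main() {
	var s step = reconcile{}
	_ = s.run()
}
```
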
23 changes: 13 additions & 10 deletions pkg/workflows/management/install_new_components.go
@@ -11,18 +11,21 @@ type installNewComponents struct{}

// Run installNewComponents performs actions needed to upgrade the management cluster.
func (s *installNewComponents) Run(ctx context.Context, commandContext *task.CommandContext) task.Task {
-if commandContext.UpgradeChangeDiff.Changed() {
-if err := commandContext.ClusterManager.ApplyBundles(ctx, commandContext.ClusterSpec, commandContext.ManagementCluster); err != nil {
-commandContext.SetError(err)
-return &workflows.CollectMgmtClusterDiagnosticsTask{}
-}
-
-if err := commandContext.ClusterManager.ApplyReleases(ctx, commandContext.ClusterSpec, commandContext.ManagementCluster); err != nil {
-commandContext.SetError(err)
-return &workflows.CollectMgmtClusterDiagnosticsTask{}
-}
+if err := commandContext.ClusterManager.ApplyBundles(ctx, commandContext.ClusterSpec, commandContext.ManagementCluster); err != nil {
+commandContext.SetError(err)
+return &workflows.CollectMgmtClusterDiagnosticsTask{}
+}

+if err := commandContext.ClusterManager.ApplyReleases(ctx, commandContext.ClusterSpec, commandContext.ManagementCluster); err != nil {
+commandContext.SetError(err)
+return &workflows.CollectMgmtClusterDiagnosticsTask{}
+}

+err := commandContext.EksdInstaller.InstallEksdManifest(ctx, commandContext.ClusterSpec, commandContext.ManagementCluster)
+if err != nil {
+commandContext.SetError(err)
+return &workflows.CollectMgmtClusterDiagnosticsTask{}
+}
return &upgradeCluster{}
}

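Two things change here: the UpgradeChangeDiff.Changed() guard is gone, so bundles and releases are applied on every upgrade, and the EKS-D manifest install moves into this task so it happens before the cluster upgrade rather than inside it. A simplified, self-contained sketch of the resulting ordering (the step functions are stubs standing in for the CommandContext calls shown in the diff):

```go
package main

import "fmt"

// Stubs standing in for ApplyBundles, ApplyReleases, InstallEksdManifest and
// the cluster upgrade itself; the real calls hang off CommandContext.
func applyBundles() error        { fmt.Println("apply bundles"); return nil }
func applyReleases() error       { fmt.Println("apply releases"); return nil }
func installEksdManifest() error { fmt.Println("install eks-d manifest"); return nil }
func upgradeCluster() error      { fmt.Println("upgrade cluster"); return nil }

func main() {
	// Post-change ordering: new components first, unconditionally; upgrade last.
	steps := []func() error{applyBundles, applyReleases, installEksdManifest, upgradeCluster}
	for _, step := range steps {
		if err := step(); err != nil {
			// The real workflow records the error and returns the
			// diagnostics-collection task instead of continuing.
			fmt.Println("collect management cluster diagnostics:", err)
			return
		}
	}
}
```
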
16 changes: 0 additions & 16 deletions pkg/workflows/management/upgrade_cluster.go
@@ -12,25 +12,9 @@ type upgradeCluster struct{}

// Run upgradeCluster performs actions needed to upgrade the management cluster.
func (s *upgradeCluster) Run(ctx context.Context, commandContext *task.CommandContext) task.Task {
-// TODO(g-gaston): move this to eks-a installer and eks-d installer
-err := commandContext.EksdInstaller.InstallEksdManifest(ctx, commandContext.ClusterSpec, commandContext.ManagementCluster)
-if err != nil {
-commandContext.SetError(err)
-return &workflows.CollectMgmtClusterDiagnosticsTask{}
-}

logger.Info("Upgrading management cluster")
if err := commandContext.ClusterUpgrader.Run(ctx, commandContext.ClusterSpec, *commandContext.ManagementCluster); err != nil {
commandContext.SetError(err)
-// TODO(@pjshah): check if we need this or not
-// Take backup of bootstrap cluster capi components
-// if commandContext.BootstrapCluster != nil {
-// logger.Info("Backing up management components from bootstrap cluster")
-// err := commandContext.ClusterManager.BackupCAPIWaitForInfrastructure(ctx, commandContext.BootstrapCluster, fmt.Sprintf("bootstrap-%s", commandContext.ManagementClusterStateDir), commandContext.ManagementCluster.Name)
-// if err != nil {
-// logger.Info("Bootstrap management component backup failed, use existing workload cluster backup", "error", err)
-// }
-// }
return &workflows.CollectMgmtClusterDiagnosticsTask{}
}

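After this change upgradeCluster only runs the cluster upgrade and, on failure, hands off to CollectMgmtClusterDiagnosticsTask; the EKS-D manifest install now lives in installNewComponents, and the commented-out bootstrap-backup code is dropped rather than kept as a TODO. The workflow engine drives tasks by calling Run and following whichever task is returned; the sketch below shows that pattern generically and is not the project's task package.

```go
package main

import (
	"errors"
	"fmt"
)

// Task mirrors the workflow shape: Run returns the next Task, or nil to stop.
type Task interface {
	Run() Task
}

// upgrade stands in for upgradeCluster.
type upgrade struct{ fail bool }

func (u upgrade) Run() Task {
	fmt.Println("upgrading management cluster")
	if u.fail {
		return collectDiagnostics{err: errors.New("upgrade failed")}
	}
	return nil
}

// collectDiagnostics stands in for CollectMgmtClusterDiagnosticsTask.
type collectDiagnostics struct{ err error }

func (c collectDiagnostics) Run() Task {
	fmt.Println("collecting diagnostics after:", c.err)
	return nil
}

func main() {
	// The runner just follows the chain of returned tasks.
	for t := Task(upgrade{fail: true}); t != nil; t = t.Run() {
	}
}
```

In the real code, Run also receives a context and the shared CommandContext; the sketch drops both to stay self-contained.
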
