Skip to content

Commit

Permalink
issue-747: Prevent switchover during replication delays
Browse files Browse the repository at this point in the history
  • Loading branch information
shunki-fujita committed Nov 19, 2024
1 parent 4ee87e5 commit d32c707
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 2 deletions.
2 changes: 1 addition & 1 deletion clustering/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ func (p *managerProcess) do(ctx context.Context) (bool, error) {
return false, nil

case StateHealthy, StateDegraded:
if ss.NeedSwitch {
if ss.NeedSwitch && !ss.PreventPodDeletion {
if err := p.switchover(ctx, ss); err != nil {
event.SwitchOverFailed.Emit(ss.Cluster, p.recorder, err)
return false, fmt.Errorf("failed to switchover: %w", err)
Expand Down
2 changes: 1 addition & 1 deletion e2e/failover_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import (
//go:embed testdata/failover.yaml
var failoverYAML string

var _ = Context("failure", Ordered, func() {
var _ = Context("failover", Ordered, func() {
if doUpgrade {
return
}
Expand Down
65 changes: 65 additions & 0 deletions e2e/prevent_delete_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ var _ = Context("PreventDelete", func() {
}
return nil
}).Should(Succeed())
time.Sleep(30 * time.Second)
})

It("should not finish rollout restart if replication delay occurs", func() {
Expand Down Expand Up @@ -206,6 +207,70 @@ var _ = Context("PreventDelete", func() {
return nil
}).Should(Succeed())

// wait for cluster to be healthy
Eventually(func() error {
cluster, err := getCluster("prevent-delete", "test")
Expect(err).NotTo(HaveOccurred())
for _, cond := range cluster.Status.Conditions {
if cond.Type != mocov1beta2.ConditionHealthy {
continue
}
if cond.Status == metav1.ConditionTrue {
return nil
}
return fmt.Errorf("cluster is not healthy: %s", cond.Status)
}
return errors.New("no health condition")
}).Should(Succeed())
})

It("should not finish switchover if replication delay occurs", func() {
cluster, err := getCluster("prevent-delete", "test")
Expect(err).NotTo(HaveOccurred())
primary := cluster.Status.CurrentPrimaryIndex

// set huge replication delay
setSourceDelay(0, 10000)

// wait for prevent-delete annotation to be added
Eventually(func() error {
out, err := kubectl(nil, "get", "pod", "-n", "prevent-delete", fmt.Sprintf("moco-test-%d", primary), "-o", "json")
Expect(err).NotTo(HaveOccurred())
pod := &corev1.Pod{}
err = json.Unmarshal(out, pod)
Expect(err).NotTo(HaveOccurred())
if val, exists := pod.Annotations[constants.AnnPreventDelete]; !exists {
return errors.New("annotation is not added")
} else if val != "true" {
return fmt.Errorf("annotation value is not true: %s", val)
}
return nil
}).Should(Succeed())

// switchover must not finish while the replication delay persists
kubectlSafe(nil, "moco", "switchover", "-n", "prevent-delete", "test")
Consistently(func() error {
cluster, err := getCluster("prevent-delete", "test")
Expect(err).NotTo(HaveOccurred())
if cluster.Status.CurrentPrimaryIndex == primary {
return errors.New("switchover is not finished")
}
return nil
}, 1*time.Minute).ShouldNot(Succeed())

// resolve replication delay
setSourceDelay(0, 0)

// wait for switchover to be finished
Eventually(func() error {
cluster, err := getCluster("prevent-delete", "test")
Expect(err).NotTo(HaveOccurred())
if cluster.Status.CurrentPrimaryIndex == primary {
return errors.New("switchover is not finished")
}
return nil
}).Should(Succeed())

// wait for cluster to be healthy
Eventually(func() error {
cluster, err := getCluster("prevent-delete", "test")
Expand Down

0 comments on commit d32c707

Please sign in to comment.