Skip to content

Commit 011bd7f

Browse files
committed
Validate no WAL corruption when both nodes shut down gracefully
1 parent 02f8514 commit 011bd7f

1 file changed

Lines changed: 56 additions & 0 deletions

File tree

test/extended/two_node/tnf_recovery.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"math/rand"
77
"os"
8+
"strings"
89
"time"
910

1011
g "github.com/onsi/ginkgo/v2"
@@ -409,6 +410,61 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
409410
})
410411
})
411412

413+
var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][Suite:openshift/two-node][Serial][Disruptive] Two Node with Fencing etcd recovery", func() {
414+
defer g.GinkgoRecover()
415+
416+
var (
417+
oc = exutil.NewCLIWithoutNamespace("").AsAdmin()
418+
etcdClientFactory *helpers.EtcdClientFactoryImpl
419+
peerNode, targetNode corev1.Node
420+
)
421+
422+
g.BeforeEach(func() {
423+
utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode)
424+
425+
etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient())
426+
427+
utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory)
428+
429+
nodes, err := utils.GetNodes(oc, utils.AllNodes)
430+
o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
431+
432+
randomIndex := rand.Intn(len(nodes.Items))
433+
peerNode = nodes.Items[randomIndex]
434+
targetNode = nodes.Items[(randomIndex+1)%len(nodes.Items)]
435+
})
436+
437+
g.It("should recover after simultaneous graceful shutdown of both nodes", func() {
438+
g.GinkgoT().Printf("Gracefully rebooting both nodes: %s and %s\n",
439+
targetNode.Name, peerNode.Name)
440+
441+
g.By(fmt.Sprintf("Triggering graceful reboot on %s", targetNode.Name))
442+
err := exutil.TriggerNodeRebootGraceful(oc.KubeClient(), targetNode.Name)
443+
o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to trigger graceful reboot on %s without error", targetNode.Name))
444+
445+
g.By(fmt.Sprintf("Triggering graceful reboot on %s", peerNode.Name))
446+
err = exutil.TriggerNodeRebootGraceful(oc.KubeClient(), peerNode.Name)
447+
o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to trigger graceful reboot on %s without error", peerNode.Name))
448+
449+
g.By("Waiting for graceful shutdown to take effect (shutdown -r 1 schedules reboot in 1 minute)")
450+
time.Sleep(90 * time.Second)
451+
452+
g.By(fmt.Sprintf("Waiting for both etcd members to become healthy (timeout: %v)", membersHealthyAfterDoubleReboot))
453+
validateEtcdRecoveryState(oc, etcdClientFactory,
454+
&targetNode,
455+
&peerNode, true, false,
456+
membersHealthyAfterDoubleReboot, utils.FiveSecondPollInterval)
457+
458+
g.By("Verifying etcd containers are running on both nodes")
459+
for _, node := range []corev1.Node{targetNode, peerNode} {
460+
got, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, node.Name, "openshift-etcd",
461+
strings.Split(ensurePodmanEtcdContainerIsRunning, " ")...)
462+
o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected no error checking etcd on %s", node.Name))
463+
o.Expect(got).To(o.Equal("'true'"), fmt.Sprintf("Expected etcd container running on %s", node.Name))
464+
}
465+
})
466+
})
467+
412468
func validateEtcdRecoveryState(
413469
oc *exutil.CLI, e *helpers.EtcdClientFactoryImpl,
414470
survivedNode, targetNode *corev1.Node,

0 commit comments

Comments
 (0)