From e143725f8cc2cdf2017ff58e603f16e8a3d82e83 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 9 Apr 2026 06:01:08 +0000 Subject: [PATCH 1/4] Add Raft consensus testing under failure conditions (#84) - Add 3-node orchestrator Raft cluster to docker-compose.yml - Add Raft config files for each node (orchestrator-raft{1,2,3}.conf.json) - Add test-raft.sh with 4 test phases: cluster formation & leader election, leader serves topology, leader failure & re-election, node rejoin - Add functional-raft CI job in GitHub Actions workflow --- .github/workflows/functional.yml | 82 +++++ tests/functional/docker-compose.yml | 75 ++++ tests/functional/orchestrator-raft1.conf.json | 25 ++ tests/functional/orchestrator-raft2.conf.json | 25 ++ tests/functional/orchestrator-raft3.conf.json | 25 ++ tests/functional/test-raft.sh | 323 ++++++++++++++++++ 6 files changed, 555 insertions(+) create mode 100644 tests/functional/orchestrator-raft1.conf.json create mode 100644 tests/functional/orchestrator-raft2.conf.json create mode 100644 tests/functional/orchestrator-raft3.conf.json create mode 100755 tests/functional/test-raft.sh diff --git a/.github/workflows/functional.yml b/.github/workflows/functional.yml index 63e8e4b1..fc734e72 100644 --- a/.github/workflows/functional.yml +++ b/.github/workflows/functional.yml @@ -289,3 +289,85 @@ jobs: env: PG_IMAGE: postgres:${{ matrix.pg_version }} run: docker compose down -v --remove-orphans 2>/dev/null || true + + functional-raft: + runs-on: ubuntu-latest + timeout-minutes: 30 + needs: build + + steps: + - uses: actions/checkout@v4 + + - name: Download orchestrator binary + uses: actions/download-artifact@v4 + with: + name: orchestrator-binary + path: bin + + - name: Make binary executable + run: chmod +x bin/orchestrator + + - name: Start MySQL infrastructure + working-directory: tests/functional + run: | + docker compose up -d mysql1 mysql2 mysql3 + echo "Waiting for MySQL to be healthy..." 
+ timeout 120 bash -c ' + while true; do + HEALTHY=$(docker compose ps --format json 2>/dev/null | python3 -c " + import json, sys + healthy = 0 + for line in sys.stdin: + svc = json.loads(line) + if \"healthy\" in svc.get(\"Status\",\"\").lower(): + healthy += 1 + print(healthy) + " 2>/dev/null || echo "0") + if [ "$HEALTHY" -ge 3 ]; then + echo "All 3 MySQL services healthy" + exit 0 + fi + sleep 2 + done + ' || { echo "Timeout"; docker compose ps; docker compose logs --tail=30; exit 1; } + + - name: Setup replication + run: bash tests/functional/setup-replication.sh + + - name: Run Raft consensus tests + run: bash tests/functional/test-raft.sh + + - name: Collect Raft orchestrator logs + if: always() + working-directory: tests/functional + run: | + docker compose logs orchestrator-raft1 > /tmp/orchestrator-raft1.log 2>&1 || true + docker compose logs orchestrator-raft2 > /tmp/orchestrator-raft2.log 2>&1 || true + docker compose logs orchestrator-raft3 > /tmp/orchestrator-raft3.log 2>&1 || true + + - name: Upload Raft orchestrator logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: orchestrator-raft-logs + path: | + /tmp/orchestrator-raft1.log + /tmp/orchestrator-raft2.log + /tmp/orchestrator-raft3.log + + - name: Collect all docker logs on failure + if: failure() + working-directory: tests/functional + run: docker compose logs > /tmp/docker-compose-raft-logs.txt 2>&1 || true + + - name: Upload docker logs on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: docker-compose-raft-logs + path: /tmp/docker-compose-raft-logs.txt + + - name: Cleanup + if: always() + working-directory: tests/functional + run: docker compose down -v --remove-orphans 2>/dev/null || true diff --git a/tests/functional/docker-compose.yml b/tests/functional/docker-compose.yml index 3bceac1a..1da3871f 100644 --- a/tests/functional/docker-compose.yml +++ b/tests/functional/docker-compose.yml @@ -205,6 +205,81 @@ services: aliases: - orchestrator-pg + 
orchestrator-raft1: + image: ubuntu:24.04 + hostname: orchestrator-raft1 + volumes: + - ../../bin/orchestrator:/usr/local/bin/orchestrator:ro + - ../../resources:/orchestrator/resources:ro + - ./orchestrator-raft1.conf.json:/orchestrator/orchestrator.conf.json:ro + command: > + bash -c " + apt-get update -qq && apt-get install -y -qq curl sqlite3 > /dev/null 2>&1 && + mkdir -p /tmp/raft1 && + cd /orchestrator && + orchestrator -config orchestrator.conf.json http + " + ports: + - "3100:3099" + depends_on: + mysql1: + condition: service_healthy + networks: + orchnet: + ipv4_address: 172.30.0.40 + aliases: + - orchestrator-raft1 + + orchestrator-raft2: + image: ubuntu:24.04 + hostname: orchestrator-raft2 + volumes: + - ../../bin/orchestrator:/usr/local/bin/orchestrator:ro + - ../../resources:/orchestrator/resources:ro + - ./orchestrator-raft2.conf.json:/orchestrator/orchestrator.conf.json:ro + command: > + bash -c " + apt-get update -qq && apt-get install -y -qq curl sqlite3 > /dev/null 2>&1 && + mkdir -p /tmp/raft2 && + cd /orchestrator && + orchestrator -config orchestrator.conf.json http + " + ports: + - "3101:3099" + depends_on: + mysql1: + condition: service_healthy + networks: + orchnet: + ipv4_address: 172.30.0.41 + aliases: + - orchestrator-raft2 + + orchestrator-raft3: + image: ubuntu:24.04 + hostname: orchestrator-raft3 + volumes: + - ../../bin/orchestrator:/usr/local/bin/orchestrator:ro + - ../../resources:/orchestrator/resources:ro + - ./orchestrator-raft3.conf.json:/orchestrator/orchestrator.conf.json:ro + command: > + bash -c " + apt-get update -qq && apt-get install -y -qq curl sqlite3 > /dev/null 2>&1 && + mkdir -p /tmp/raft3 && + cd /orchestrator && + orchestrator -config orchestrator.conf.json http + " + ports: + - "3102:3099" + depends_on: + mysql1: + condition: service_healthy + networks: + orchnet: + ipv4_address: 172.30.0.42 + aliases: + - orchestrator-raft3 + networks: orchnet: driver: bridge diff --git 
a/tests/functional/orchestrator-raft1.conf.json b/tests/functional/orchestrator-raft1.conf.json new file mode 100644 index 00000000..9871d3e9 --- /dev/null +++ b/tests/functional/orchestrator-raft1.conf.json @@ -0,0 +1,25 @@ +{ + "Debug": true, + "ListenAddress": ":3099", + "HTTPAdvertise": "http://172.30.0.40:3099", + "MySQLTopologyUser": "orchestrator", + "MySQLTopologyPassword": "orch_pass", + "MySQLOrchestratorHost": "", + "MySQLOrchestratorPort": 0, + "BackendDB": "sqlite", + "SQLite3DataFile": "/tmp/raft1/orchestrator.sqlite3", + "DiscoverByShowSlaveHosts": false, + "InstancePollSeconds": 5, + "RecoveryPeriodBlockSeconds": 10, + "RecoverMasterClusterFilters": [".*"], + "RecoverIntermediateMasterClusterFilters": [".*"], + "AutoPseudoGTID": false, + "DetectClusterAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)", + "DetectInstanceAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)", + "PrometheusEnabled": false, + "RaftEnabled": true, + "RaftDataDir": "/tmp/raft1", + "RaftBind": "172.30.0.40", + "DefaultRaftPort": 10008, + "RaftNodes": ["172.30.0.40", "172.30.0.41", "172.30.0.42"] +} diff --git a/tests/functional/orchestrator-raft2.conf.json b/tests/functional/orchestrator-raft2.conf.json new file mode 100644 index 00000000..51fbd241 --- /dev/null +++ b/tests/functional/orchestrator-raft2.conf.json @@ -0,0 +1,25 @@ +{ + "Debug": true, + "ListenAddress": ":3099", + "HTTPAdvertise": "http://172.30.0.41:3099", + "MySQLTopologyUser": "orchestrator", + "MySQLTopologyPassword": "orch_pass", + "MySQLOrchestratorHost": "", + "MySQLOrchestratorPort": 0, + "BackendDB": "sqlite", + "SQLite3DataFile": "/tmp/raft2/orchestrator.sqlite3", + "DiscoverByShowSlaveHosts": false, + "InstancePollSeconds": 5, + "RecoveryPeriodBlockSeconds": 10, + "RecoverMasterClusterFilters": [".*"], + "RecoverIntermediateMasterClusterFilters": [".*"], + "AutoPseudoGTID": false, + "DetectClusterAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)", + "DetectInstanceAliasQuery": "SELECT 
CONCAT(@@hostname, ':', @@port)", + "PrometheusEnabled": false, + "RaftEnabled": true, + "RaftDataDir": "/tmp/raft2", + "RaftBind": "172.30.0.41", + "DefaultRaftPort": 10008, + "RaftNodes": ["172.30.0.40", "172.30.0.41", "172.30.0.42"] +} diff --git a/tests/functional/orchestrator-raft3.conf.json b/tests/functional/orchestrator-raft3.conf.json new file mode 100644 index 00000000..3a93ad29 --- /dev/null +++ b/tests/functional/orchestrator-raft3.conf.json @@ -0,0 +1,25 @@ +{ + "Debug": true, + "ListenAddress": ":3099", + "HTTPAdvertise": "http://172.30.0.42:3099", + "MySQLTopologyUser": "orchestrator", + "MySQLTopologyPassword": "orch_pass", + "MySQLOrchestratorHost": "", + "MySQLOrchestratorPort": 0, + "BackendDB": "sqlite", + "SQLite3DataFile": "/tmp/raft3/orchestrator.sqlite3", + "DiscoverByShowSlaveHosts": false, + "InstancePollSeconds": 5, + "RecoveryPeriodBlockSeconds": 10, + "RecoverMasterClusterFilters": [".*"], + "RecoverIntermediateMasterClusterFilters": [".*"], + "AutoPseudoGTID": false, + "DetectClusterAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)", + "DetectInstanceAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)", + "PrometheusEnabled": false, + "RaftEnabled": true, + "RaftDataDir": "/tmp/raft3", + "RaftBind": "172.30.0.42", + "DefaultRaftPort": 10008, + "RaftNodes": ["172.30.0.40", "172.30.0.41", "172.30.0.42"] +} diff --git a/tests/functional/test-raft.sh b/tests/functional/test-raft.sh new file mode 100755 index 00000000..778995f6 --- /dev/null +++ b/tests/functional/test-raft.sh @@ -0,0 +1,323 @@ +#!/bin/bash +# Raft consensus tests -- verify leader election, failover, and follower redirect +set -uo pipefail +cd "$(dirname "$0")/../.." 
+source tests/functional/lib.sh + +echo "=== RAFT CONSENSUS TESTS ===" + +# Port mapping: raft1->3100, raft2->3101, raft3->3102 +RAFT_PORTS=(3100 3101 3102) +RAFT_NODES=(orchestrator-raft1 orchestrator-raft2 orchestrator-raft3) +COMPOSE_FILE="tests/functional/docker-compose.yml" + +# ============================================================ +# Phase 1: Cluster Formation & Leader Election +# ============================================================ +echo "" +echo "--- Phase 1: Cluster Formation & Leader Election ---" + +docker compose -f "$COMPOSE_FILE" up -d orchestrator-raft1 orchestrator-raft2 orchestrator-raft3 + +# Wait for all 3 nodes to be reachable and for a leader to be elected +echo "Waiting for Raft cluster to form and elect a leader (up to 90s)..." +LEADER="" +for i in $(seq 1 90); do + ALL_UP=true + for port in "${RAFT_PORTS[@]}"; do + if ! curl -sf "http://localhost:${port}/api/raft-leader" > /dev/null 2>&1; then + ALL_UP=false + break + fi + done + if $ALL_UP; then + # Check if all nodes agree on a leader + LEADER1=$(curl -sf "http://localhost:3100/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + LEADER2=$(curl -sf "http://localhost:3101/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + LEADER3=$(curl -sf "http://localhost:3102/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ -n "$LEADER1" ] && [ "$LEADER1" = "$LEADER2" ] && [ "$LEADER2" = "$LEADER3" ]; then + LEADER="$LEADER1" + echo "Leader elected: $LEADER (after ${i}s)" + break + fi + fi + sleep 1 +done + +if [ -n "$LEADER" ]; then + pass "Raft leader elected: $LEADER" +else + fail "Raft leader not elected within 90s" + # Print debug info + for port in "${RAFT_PORTS[@]}"; do + echo " Node :${port} raft-status: $(curl -sf http://localhost:${port}/api/raft-status 2>/dev/null || echo 
'unreachable')" + done +fi + +# Verify all nodes agree on the same leader +LEADERS_AGREE=true +for port in "${RAFT_PORTS[@]}"; do + NODE_LEADER=$(curl -sf "http://localhost:${port}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ "$NODE_LEADER" != "$LEADER" ]; then + LEADERS_AGREE=false + break + fi +done +if $LEADERS_AGREE && [ -n "$LEADER" ]; then + pass "All 3 nodes agree on the same leader" +else + fail "Nodes do not agree on the leader" +fi + +# Verify exactly one node reports itself as Leader state +LEADER_COUNT=0 +for port in "${RAFT_PORTS[@]}"; do + STATE=$(curl -sf "http://localhost:${port}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ "$STATE" = "Leader" ]; then + ((LEADER_COUNT++)) + fi +done +if [ "$LEADER_COUNT" -eq 1 ]; then + pass "Exactly one node is in Leader state" +else + fail "Expected 1 leader, found $LEADER_COUNT" +fi + +# ============================================================ +# Phase 2: Leader Serves Topology +# ============================================================ +echo "" +echo "--- Phase 2: Leader Serves Topology ---" + +# Determine leader port (map leader IP to host port) +LEADER_PORT="" +LEADER_INDEX="" +for idx in 0 1 2; do + STATE=$(curl -sf "http://localhost:${RAFT_PORTS[$idx]}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ "$STATE" = "Leader" ]; then + LEADER_PORT="${RAFT_PORTS[$idx]}" + LEADER_INDEX=$idx + break + fi +done + +if [ -z "$LEADER_PORT" ]; then + fail "Could not identify leader port" +else + echo "Leader is on localhost:${LEADER_PORT} (${RAFT_NODES[$LEADER_INDEX]})" + + # Discover MySQL topology through the leader + echo "Discovering MySQL topology through the leader..." 
+ curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql1/3306" > /dev/null 2>&1 + curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql2/3306" > /dev/null 2>&1 + curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql3/3306" > /dev/null 2>&1 + + # Wait for topology discovery + echo "Waiting for topology discovery (up to 60s)..." + CLUSTER_FOUND=false + for i in $(seq 1 60); do + CLUSTERS=$(curl -sf "http://localhost:${LEADER_PORT}/api/clusters" 2>/dev/null || echo "[]") + COUNT=$(echo "$CLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0") + if [ "$COUNT" -ge 1 ]; then + echo "Cluster discovered after ${i}s" + CLUSTER_FOUND=true + break + fi + # Re-seed discovery periodically + if [ "$((i % 10))" = "0" ]; then + curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql1/3306" > /dev/null 2>&1 + curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql2/3306" > /dev/null 2>&1 + curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql3/3306" > /dev/null 2>&1 + fi + sleep 1 + done + + if $CLUSTER_FOUND; then + pass "Leader serves cluster data via /api/clusters" + else + fail "Leader did not return cluster data within 60s" + fi + + # Verify followers can also return cluster data (Raft replicates state) + FOLLOWER_HAS_DATA=true + for idx in 0 1 2; do + if [ "$idx" = "$LEADER_INDEX" ]; then + continue + fi + FPORT="${RAFT_PORTS[$idx]}" + # Followers may redirect or serve data directly; either is valid + FCLUSTERS=$(curl -sfL "http://localhost:${FPORT}/api/clusters" 2>/dev/null || echo "[]") + FCOUNT=$(echo "$FCLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0") + if [ "$FCOUNT" -lt 1 ]; then + FOLLOWER_HAS_DATA=false + echo " Follower on :${FPORT} returned $FCOUNT clusters" + fi + done + if $FOLLOWER_HAS_DATA; then + pass "Followers serve cluster data (Raft state replicated)" + else + # This is not necessarily a failure -- followers may need 
more time + skip "Some followers do not yet serve cluster data (may need more replication time)" + fi +fi + +# ============================================================ +# Phase 3: Leader Failure & Re-election +# ============================================================ +echo "" +echo "--- Phase 3: Leader Failure & Re-election ---" + +OLD_LEADER="$LEADER" +OLD_LEADER_NODE="" +if [ -n "$LEADER_INDEX" ]; then + OLD_LEADER_NODE="${RAFT_NODES[$LEADER_INDEX]}" +fi + +if [ -z "$OLD_LEADER_NODE" ]; then + fail "Cannot test leader failure: no leader identified" +else + echo "Stopping leader node: $OLD_LEADER_NODE" + docker compose -f "$COMPOSE_FILE" stop "$OLD_LEADER_NODE" + + # Determine which nodes are still running + REMAINING_PORTS=() + REMAINING_INDICES=() + for idx in 0 1 2; do + if [ "$idx" != "$LEADER_INDEX" ]; then + REMAINING_PORTS+=("${RAFT_PORTS[$idx]}") + REMAINING_INDICES+=("$idx") + fi + done + + # Wait for re-election + echo "Waiting for re-election (up to 60s)..." + NEW_LEADER="" + for i in $(seq 1 60); do + L1=$(curl -sf "http://localhost:${REMAINING_PORTS[0]}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + L2=$(curl -sf "http://localhost:${REMAINING_PORTS[1]}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ -n "$L1" ] && [ "$L1" = "$L2" ] && [ "$L1" != "$OLD_LEADER" ]; then + NEW_LEADER="$L1" + echo "New leader elected: $NEW_LEADER (after ${i}s)" + break + fi + sleep 1 + done + + if [ -n "$NEW_LEADER" ]; then + pass "New leader elected after stopping old leader: $NEW_LEADER" + else + fail "No new leader elected within 60s" + for port in "${REMAINING_PORTS[@]}"; do + echo " Node :${port} status: $(curl -sf http://localhost:${port}/api/raft-status 2>/dev/null || echo 'unreachable')" + done + fi + + # Verify new leader is different from old + if [ -n "$NEW_LEADER" ] && [ "$NEW_LEADER" != "$OLD_LEADER" ]; then + pass 
"New leader is different from old leader" + elif [ -n "$NEW_LEADER" ]; then + fail "New leader is the same as old leader (should not happen)" + fi + + # Verify new leader can serve API requests + if [ -n "$NEW_LEADER" ]; then + NEW_LEADER_PORT="" + for idx in "${REMAINING_INDICES[@]}"; do + STATE=$(curl -sf "http://localhost:${RAFT_PORTS[$idx]}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ "$STATE" = "Leader" ]; then + NEW_LEADER_PORT="${RAFT_PORTS[$idx]}" + break + fi + done + if [ -n "$NEW_LEADER_PORT" ]; then + CLUSTERS=$(curl -sf "http://localhost:${NEW_LEADER_PORT}/api/clusters" 2>/dev/null || echo "[]") + COUNT=$(echo "$CLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0") + if [ "$COUNT" -ge 1 ]; then + pass "New leader serves cluster data via API" + else + skip "New leader returned 0 clusters (state may not have fully replicated yet)" + fi + fi + fi + + # ============================================================ + # Phase 4: Node Rejoin + # ============================================================ + echo "" + echo "--- Phase 4: Node Rejoin ---" + + echo "Restarting stopped node: $OLD_LEADER_NODE" + docker compose -f "$COMPOSE_FILE" start "$OLD_LEADER_NODE" + + # Wait for the restarted node to rejoin + RESTARTED_PORT="${RAFT_PORTS[$LEADER_INDEX]}" + echo "Waiting for restarted node (:${RESTARTED_PORT}) to rejoin (up to 60s)..." 
+ REJOINED=false + for i in $(seq 1 60); do + RLEADER=$(curl -sf "http://localhost:${RESTARTED_PORT}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ -n "$RLEADER" ] && [ "$RLEADER" = "$NEW_LEADER" ]; then + echo "Node rejoined after ${i}s" + REJOINED=true + break + fi + sleep 1 + done + + if $REJOINED; then + pass "Restarted node rejoined the cluster" + else + fail "Restarted node did not rejoin within 60s" + fi + + # Verify the restarted node is a follower (not a new leader) + RSTATE=$(curl -sf "http://localhost:${RESTARTED_PORT}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ "$RSTATE" = "Follower" ]; then + pass "Restarted node is a Follower (stable leader)" + elif [ "$RSTATE" = "Leader" ]; then + # Leadership may have shifted -- still valid if all agree + skip "Restarted node became Leader (leadership may have shifted)" + else + fail "Restarted node in unexpected state: $RSTATE" + fi + + # Verify all 3 nodes agree on the current leader + ALL_AGREE=true + CURRENT_LEADER="" + for port in "${RAFT_PORTS[@]}"; do + NL=$(curl -sf "http://localhost:${port}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ -z "$CURRENT_LEADER" ]; then + CURRENT_LEADER="$NL" + elif [ "$NL" != "$CURRENT_LEADER" ]; then + ALL_AGREE=false + fi + done + if $ALL_AGREE && [ -n "$CURRENT_LEADER" ]; then + pass "All 3 nodes agree on current leader after rejoin: $CURRENT_LEADER" + else + fail "Nodes do not agree on leader after rejoin" + fi + + # Verify cluster is healthy (all 3 nodes report healthy) + HEALTHY_COUNT=0 + for port in "${RAFT_PORTS[@]}"; do + HEALTH=$(curl -sf "http://localhost:${port}/api/raft-health" 2>/dev/null || echo "") + if echo "$HEALTH" | grep -q "healthy"; then + ((HEALTHY_COUNT++)) + fi + done + if [ "$HEALTHY_COUNT" -eq 3 ]; then + pass "All 3 nodes report 
healthy" + else + skip "Only $HEALTHY_COUNT/3 nodes report healthy (may need more time)" + fi +fi + +# ============================================================ +# Cleanup +# ============================================================ +echo "" +echo "--- Cleanup ---" +docker compose -f "$COMPOSE_FILE" stop orchestrator-raft1 orchestrator-raft2 orchestrator-raft3 2>/dev/null || true +echo "Raft containers stopped." + +summary From 84f7361ed5b5096a9536e91fe3b9c0d9034129af Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 9 Apr 2026 09:38:16 +0000 Subject: [PATCH 2/4] Add curl timeouts to test-raft.sh and test-failover-advanced.sh All curl calls lacked --max-time, causing potential test hangs if any API endpoint was slow to respond. --- tests/functional/test-raft.sh | 50 +++++++++++++++++------------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/tests/functional/test-raft.sh b/tests/functional/test-raft.sh index 778995f6..63a45570 100755 --- a/tests/functional/test-raft.sh +++ b/tests/functional/test-raft.sh @@ -25,16 +25,16 @@ LEADER="" for i in $(seq 1 90); do ALL_UP=true for port in "${RAFT_PORTS[@]}"; do - if ! curl -sf "http://localhost:${port}/api/raft-leader" > /dev/null 2>&1; then + if ! 
curl -sf --max-time 10 "http://localhost:${port}/api/raft-leader" > /dev/null 2>&1; then ALL_UP=false break fi done if $ALL_UP; then # Check if all nodes agree on a leader - LEADER1=$(curl -sf "http://localhost:3100/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") - LEADER2=$(curl -sf "http://localhost:3101/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") - LEADER3=$(curl -sf "http://localhost:3102/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + LEADER1=$(curl -sf --max-time 10 "http://localhost:3100/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + LEADER2=$(curl -sf --max-time 10 "http://localhost:3101/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + LEADER3=$(curl -sf --max-time 10 "http://localhost:3102/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") if [ -n "$LEADER1" ] && [ "$LEADER1" = "$LEADER2" ] && [ "$LEADER2" = "$LEADER3" ]; then LEADER="$LEADER1" echo "Leader elected: $LEADER (after ${i}s)" @@ -50,14 +50,14 @@ else fail "Raft leader not elected within 90s" # Print debug info for port in "${RAFT_PORTS[@]}"; do - echo " Node :${port} raft-status: $(curl -sf http://localhost:${port}/api/raft-status 2>/dev/null || echo 'unreachable')" + echo " Node :${port} raft-status: $(curl -sf --max-time 10 http://localhost:${port}/api/raft-status 2>/dev/null || echo 'unreachable')" done fi # Verify all nodes agree on the same leader LEADERS_AGREE=true for port in "${RAFT_PORTS[@]}"; do - NODE_LEADER=$(curl -sf "http://localhost:${port}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + NODE_LEADER=$(curl -sf --max-time 10 
"http://localhost:${port}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") if [ "$NODE_LEADER" != "$LEADER" ]; then LEADERS_AGREE=false break @@ -72,7 +72,7 @@ fi # Verify exactly one node reports itself as Leader state LEADER_COUNT=0 for port in "${RAFT_PORTS[@]}"; do - STATE=$(curl -sf "http://localhost:${port}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + STATE=$(curl -sf --max-time 10 "http://localhost:${port}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") if [ "$STATE" = "Leader" ]; then ((LEADER_COUNT++)) fi @@ -93,7 +93,7 @@ echo "--- Phase 2: Leader Serves Topology ---" LEADER_PORT="" LEADER_INDEX="" for idx in 0 1 2; do - STATE=$(curl -sf "http://localhost:${RAFT_PORTS[$idx]}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + STATE=$(curl -sf --max-time 10 "http://localhost:${RAFT_PORTS[$idx]}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") if [ "$STATE" = "Leader" ]; then LEADER_PORT="${RAFT_PORTS[$idx]}" LEADER_INDEX=$idx @@ -108,15 +108,15 @@ else # Discover MySQL topology through the leader echo "Discovering MySQL topology through the leader..." 
- curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql1/3306" > /dev/null 2>&1 - curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql2/3306" > /dev/null 2>&1 - curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql3/3306" > /dev/null 2>&1 + curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql1/3306" > /dev/null 2>&1 + curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql2/3306" > /dev/null 2>&1 + curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql3/3306" > /dev/null 2>&1 # Wait for topology discovery echo "Waiting for topology discovery (up to 60s)..." CLUSTER_FOUND=false for i in $(seq 1 60); do - CLUSTERS=$(curl -sf "http://localhost:${LEADER_PORT}/api/clusters" 2>/dev/null || echo "[]") + CLUSTERS=$(curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/clusters" 2>/dev/null || echo "[]") COUNT=$(echo "$CLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0") if [ "$COUNT" -ge 1 ]; then echo "Cluster discovered after ${i}s" @@ -125,9 +125,9 @@ else fi # Re-seed discovery periodically if [ "$((i % 10))" = "0" ]; then - curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql1/3306" > /dev/null 2>&1 - curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql2/3306" > /dev/null 2>&1 - curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql3/3306" > /dev/null 2>&1 + curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql1/3306" > /dev/null 2>&1 + curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql2/3306" > /dev/null 2>&1 + curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql3/3306" > /dev/null 2>&1 fi sleep 1 done @@ -146,7 +146,7 @@ else fi FPORT="${RAFT_PORTS[$idx]}" # Followers may redirect or serve data directly; either is valid - FCLUSTERS=$(curl -sfL "http://localhost:${FPORT}/api/clusters" 2>/dev/null || echo "[]") + FCLUSTERS=$(curl -sf 
--max-time 10 -L "http://localhost:${FPORT}/api/clusters" 2>/dev/null || echo "[]") FCOUNT=$(echo "$FCLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0") if [ "$FCOUNT" -lt 1 ]; then FOLLOWER_HAS_DATA=false @@ -193,8 +193,8 @@ else echo "Waiting for re-election (up to 60s)..." NEW_LEADER="" for i in $(seq 1 60); do - L1=$(curl -sf "http://localhost:${REMAINING_PORTS[0]}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") - L2=$(curl -sf "http://localhost:${REMAINING_PORTS[1]}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + L1=$(curl -sf --max-time 10 "http://localhost:${REMAINING_PORTS[0]}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + L2=$(curl -sf --max-time 10 "http://localhost:${REMAINING_PORTS[1]}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") if [ -n "$L1" ] && [ "$L1" = "$L2" ] && [ "$L1" != "$OLD_LEADER" ]; then NEW_LEADER="$L1" echo "New leader elected: $NEW_LEADER (after ${i}s)" @@ -208,7 +208,7 @@ else else fail "No new leader elected within 60s" for port in "${REMAINING_PORTS[@]}"; do - echo " Node :${port} status: $(curl -sf http://localhost:${port}/api/raft-status 2>/dev/null || echo 'unreachable')" + echo " Node :${port} status: $(curl -sf --max-time 10 http://localhost:${port}/api/raft-status 2>/dev/null || echo 'unreachable')" done fi @@ -223,14 +223,14 @@ else if [ -n "$NEW_LEADER" ]; then NEW_LEADER_PORT="" for idx in "${REMAINING_INDICES[@]}"; do - STATE=$(curl -sf "http://localhost:${RAFT_PORTS[$idx]}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + STATE=$(curl -sf --max-time 10 "http://localhost:${RAFT_PORTS[$idx]}/api/raft-state" 2>/dev/null | python3 -c "import json,sys;
print(json.load(sys.stdin))" 2>/dev/null || echo "") if [ "$STATE" = "Leader" ]; then NEW_LEADER_PORT="${RAFT_PORTS[$idx]}" break fi done if [ -n "$NEW_LEADER_PORT" ]; then - CLUSTERS=$(curl -sf "http://localhost:${NEW_LEADER_PORT}/api/clusters" 2>/dev/null || echo "[]") + CLUSTERS=$(curl -sf --max-time 10 "http://localhost:${NEW_LEADER_PORT}/api/clusters" 2>/dev/null || echo "[]") COUNT=$(echo "$CLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0") if [ "$COUNT" -ge 1 ]; then pass "New leader serves cluster data via API" @@ -254,7 +254,7 @@ else echo "Waiting for restarted node (:${RESTARTED_PORT}) to rejoin (up to 60s)..." REJOINED=false for i in $(seq 1 60); do - RLEADER=$(curl -sf "http://localhost:${RESTARTED_PORT}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + RLEADER=$(curl -sf --max-time 10 "http://localhost:${RESTARTED_PORT}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") if [ -n "$RLEADER" ] && [ "$RLEADER" = "$NEW_LEADER" ]; then echo "Node rejoined after ${i}s" REJOINED=true @@ -270,7 +270,7 @@ else fi # Verify the restarted node is a follower (not a new leader) - RSTATE=$(curl -sf "http://localhost:${RESTARTED_PORT}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + RSTATE=$(curl -sf --max-time 10 "http://localhost:${RESTARTED_PORT}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") if [ "$RSTATE" = "Follower" ]; then pass "Restarted node is a Follower (stable leader)" elif [ "$RSTATE" = "Leader" ]; then @@ -284,7 +284,7 @@ else ALL_AGREE=true CURRENT_LEADER="" for port in "${RAFT_PORTS[@]}"; do - NL=$(curl -sf "http://localhost:${port}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + 
NL=$(curl -sf --max-time 10 "http://localhost:${port}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") if [ -z "$CURRENT_LEADER" ]; then CURRENT_LEADER="$NL" elif [ "$NL" != "$CURRENT_LEADER" ]; then @@ -300,7 +300,7 @@ else # Verify cluster is healthy (all 3 nodes report healthy) HEALTHY_COUNT=0 for port in "${RAFT_PORTS[@]}"; do - HEALTH=$(curl -sf "http://localhost:${port}/api/raft-health" 2>/dev/null || echo "") + HEALTH=$(curl -sf --max-time 10 "http://localhost:${port}/api/raft-health" 2>/dev/null || echo "") if echo "$HEALTH" | grep -q "healthy"; then ((HEALTHY_COUNT++)) fi From 4b980805c3f97f34a92fec8e1402603a3edda6f2 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 9 Apr 2026 11:17:10 +0000 Subject: [PATCH 3/4] Fix Raft test: stagger node startup to avoid bootstrap conflicts Starting all 3 Raft nodes simultaneously causes each to call BootstrapCluster independently, creating conflicting initial states and perpetual election cycles. Fix by starting node 1 first, waiting for it to be ready, then starting nodes 2 and 3. --- tests/functional/test-raft.sh | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/functional/test-raft.sh b/tests/functional/test-raft.sh index 63a45570..2af3e895 100755 --- a/tests/functional/test-raft.sh +++ b/tests/functional/test-raft.sh @@ -17,7 +17,34 @@ COMPOSE_FILE="tests/functional/docker-compose.yml" echo "" echo "--- Phase 1: Cluster Formation & Leader Election ---" -docker compose -f "$COMPOSE_FILE" up -d orchestrator-raft1 orchestrator-raft2 orchestrator-raft3 +# Start node 1 first to let it bootstrap the cluster before other nodes join. +# Starting all 3 simultaneously causes each to call BootstrapCluster independently, +# creating conflicting initial states and perpetual election cycles. +echo "Starting first Raft node (bootstrap node)..." 
+docker compose -f "$COMPOSE_FILE" up -d orchestrator-raft1 + +# Wait for node 1 to be reachable (includes apt-get install time) +echo "Waiting for bootstrap node to be ready (up to 90s)..." +BOOTSTRAP_READY=false +for i in $(seq 1 90); do + if curl -sf --max-time 5 "http://localhost:3100/api/raft-status" > /dev/null 2>&1; then + BOOTSTRAP_READY=true + echo "Bootstrap node ready after ${i}s" + break + fi + sleep 1 +done + +if ! $BOOTSTRAP_READY; then + fail "Bootstrap Raft node (orchestrator-raft1) not ready within 90s" + docker compose -f "$COMPOSE_FILE" logs orchestrator-raft1 2>/dev/null | tail -30 + summary +fi +pass "Bootstrap Raft node started and ready" + +# Now start the remaining nodes — they will find the bootstrapped cluster +echo "Starting remaining Raft nodes..." +docker compose -f "$COMPOSE_FILE" up -d orchestrator-raft2 orchestrator-raft3 # Wait for all 3 nodes to be reachable and for a leader to be elected echo "Waiting for Raft cluster to form and elect a leader (up to 90s)..." From 2ea8a252773b8ec094f43464851549cb8804d7a9 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 9 Apr 2026 12:02:54 +0000 Subject: [PATCH 4/4] Fix Raft SQLite store: allow NULL data in raft_log and raft_store The raft_log.data and raft_store.store_value columns were NOT NULL, but hashicorp/raft can produce log entries with nil Data (e.g. LogNoop when a new leader takes over). This caused "NOT NULL constraint failed: raft_log.data" errors immediately after election, causing the leader to step down and creating perpetual election cycles. 
--- go/raft/rel_store.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go/raft/rel_store.go b/go/raft/rel_store.go index b9c25ee3..881063f1 100644 --- a/go/raft/rel_store.go +++ b/go/raft/rel_store.go @@ -36,7 +36,7 @@ var createQueries = []string{ log_index integer, term bigint not null, log_type int not null, - data blob not null, + data blob, PRIMARY KEY (log_index) ) `, @@ -44,7 +44,7 @@ var createQueries = []string{ CREATE TABLE IF NOT EXISTS raft_store ( store_id integer, store_key varbinary(512) not null, - store_value blob not null, + store_value blob, PRIMARY KEY (store_id) ) `,