From e143725f8cc2cdf2017ff58e603f16e8a3d82e83 Mon Sep 17 00:00:00 2001
From: Rene Cannao <rene@proxysql.com>
Date: Thu, 9 Apr 2026 06:01:08 +0000
Subject: [PATCH 1/4] Add Raft consensus testing under failure conditions (#84)

- Add 3-node orchestrator Raft cluster to docker-compose.yml
- Add Raft config files for each node (orchestrator-raft{1,2,3}.conf.json)
- Add test-raft.sh with 4 test phases: cluster formation & leader
  election, leader serves topology, leader failure & re-election,
  node rejoin
- Add functional-raft CI job in GitHub Actions workflow
---
 .github/workflows/functional.yml              |  82 +++++
 tests/functional/docker-compose.yml           |  75 ++++
 tests/functional/orchestrator-raft1.conf.json |  25 ++
 tests/functional/orchestrator-raft2.conf.json |  25 ++
 tests/functional/orchestrator-raft3.conf.json |  25 ++
 tests/functional/test-raft.sh                 | 323 ++++++++++++++++++
 6 files changed, 555 insertions(+)
 create mode 100644 tests/functional/orchestrator-raft1.conf.json
 create mode 100644 tests/functional/orchestrator-raft2.conf.json
 create mode 100644 tests/functional/orchestrator-raft3.conf.json
 create mode 100755 tests/functional/test-raft.sh

diff --git a/.github/workflows/functional.yml b/.github/workflows/functional.yml
index 63e8e4b1..fc734e72 100644
--- a/.github/workflows/functional.yml
+++ b/.github/workflows/functional.yml
@@ -289,3 +289,85 @@ jobs:
       env:
         PG_IMAGE: postgres:${{ matrix.pg_version }}
       run: docker compose down -v --remove-orphans 2>/dev/null || true
+
+  functional-raft:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    needs: build
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Download orchestrator binary
+      uses: actions/download-artifact@v4
+      with:
+        name: orchestrator-binary
+        path: bin
+
+    - name: Make binary executable
+      run: chmod +x bin/orchestrator
+
+    - name: Start MySQL infrastructure
+      working-directory: tests/functional
+      run: |
+        docker compose up -d mysql1 mysql2 mysql3
+        echo "Waiting for MySQL to be healthy..."
+        timeout 120 bash -c '
+          while true; do
+            HEALTHY=$(docker compose ps --format json 2>/dev/null | python3 -c "
+        import json, sys
+        healthy = 0
+        for line in sys.stdin:
+            svc = json.loads(line)
+            if \"healthy\" in svc.get(\"Status\",\"\").lower():
+                healthy += 1
+        print(healthy)
+        " 2>/dev/null || echo "0")
+            if [ "$HEALTHY" -ge 3 ]; then
+              echo "All 3 MySQL services healthy"
+              exit 0
+            fi
+            sleep 2
+          done
+        ' || { echo "Timeout"; docker compose ps; docker compose logs --tail=30; exit 1; }
+
+    - name: Setup replication
+      run: bash tests/functional/setup-replication.sh
+
+    - name: Run Raft consensus tests
+      run: bash tests/functional/test-raft.sh
+
+    - name: Collect Raft orchestrator logs
+      if: always()
+      working-directory: tests/functional
+      run: |
+        docker compose logs orchestrator-raft1 > /tmp/orchestrator-raft1.log 2>&1 || true
+        docker compose logs orchestrator-raft2 > /tmp/orchestrator-raft2.log 2>&1 || true
+        docker compose logs orchestrator-raft3 > /tmp/orchestrator-raft3.log 2>&1 || true
+
+    - name: Upload Raft orchestrator logs
+      if: always()
+      uses: actions/upload-artifact@v4
+      with:
+        name: orchestrator-raft-logs
+        path: |
+          /tmp/orchestrator-raft1.log
+          /tmp/orchestrator-raft2.log
+          /tmp/orchestrator-raft3.log
+
+    - name: Collect all docker logs on failure
+      if: failure()
+      working-directory: tests/functional
+      run: docker compose logs > /tmp/docker-compose-raft-logs.txt 2>&1 || true
+
+    - name: Upload docker logs on failure
+      if: failure()
+      uses: actions/upload-artifact@v4
+      with:
+        name: docker-compose-raft-logs
+        path: /tmp/docker-compose-raft-logs.txt
+
+    - name: Cleanup
+      if: always()
+      working-directory: tests/functional
+      run: docker compose down -v --remove-orphans 2>/dev/null || true
diff --git a/tests/functional/docker-compose.yml b/tests/functional/docker-compose.yml
index 3bceac1a..1da3871f 100644
--- a/tests/functional/docker-compose.yml
+++ b/tests/functional/docker-compose.yml
@@ -205,6 +205,81 @@ services:
         aliases:
           - orchestrator-pg
 
+  orchestrator-raft1:
+    image: ubuntu:24.04
+    hostname: orchestrator-raft1
+    volumes:
+      - ../../bin/orchestrator:/usr/local/bin/orchestrator:ro
+      - ../../resources:/orchestrator/resources:ro
+      - ./orchestrator-raft1.conf.json:/orchestrator/orchestrator.conf.json:ro
+    command: >
+      bash -c "
+        apt-get update -qq && apt-get install -y -qq curl sqlite3 > /dev/null 2>&1 &&
+        mkdir -p /tmp/raft1 &&
+        cd /orchestrator &&
+        orchestrator -config orchestrator.conf.json http
+      "
+    ports:
+      - "3100:3099"
+    depends_on:
+      mysql1:
+        condition: service_healthy
+    networks:
+      orchnet:
+        ipv4_address: 172.30.0.40
+        aliases:
+          - orchestrator-raft1
+
+  orchestrator-raft2:
+    image: ubuntu:24.04
+    hostname: orchestrator-raft2
+    volumes:
+      - ../../bin/orchestrator:/usr/local/bin/orchestrator:ro
+      - ../../resources:/orchestrator/resources:ro
+      - ./orchestrator-raft2.conf.json:/orchestrator/orchestrator.conf.json:ro
+    command: >
+      bash -c "
+        apt-get update -qq && apt-get install -y -qq curl sqlite3 > /dev/null 2>&1 &&
+        mkdir -p /tmp/raft2 &&
+        cd /orchestrator &&
+        orchestrator -config orchestrator.conf.json http
+      "
+    ports:
+      - "3101:3099"
+    depends_on:
+      mysql1:
+        condition: service_healthy
+    networks:
+      orchnet:
+        ipv4_address: 172.30.0.41
+        aliases:
+          - orchestrator-raft2
+
+  orchestrator-raft3:
+    image: ubuntu:24.04
+    hostname: orchestrator-raft3
+    volumes:
+      - ../../bin/orchestrator:/usr/local/bin/orchestrator:ro
+      - ../../resources:/orchestrator/resources:ro
+      - ./orchestrator-raft3.conf.json:/orchestrator/orchestrator.conf.json:ro
+    command: >
+      bash -c "
+        apt-get update -qq && apt-get install -y -qq curl sqlite3 > /dev/null 2>&1 &&
+        mkdir -p /tmp/raft3 &&
+        cd /orchestrator &&
+        orchestrator -config orchestrator.conf.json http
+      "
+    ports:
+      - "3102:3099"
+    depends_on:
+      mysql1:
+        condition: service_healthy
+    networks:
+      orchnet:
+        ipv4_address: 172.30.0.42
+        aliases:
+          - orchestrator-raft3
+
 networks:
   orchnet:
     driver: bridge
diff --git a/tests/functional/orchestrator-raft1.conf.json b/tests/functional/orchestrator-raft1.conf.json
new file mode 100644
index 00000000..9871d3e9
--- /dev/null
+++ b/tests/functional/orchestrator-raft1.conf.json
@@ -0,0 +1,25 @@
+{
+  "Debug": true,
+  "ListenAddress": ":3099",
+  "HTTPAdvertise": "http://172.30.0.40:3099",
+  "MySQLTopologyUser": "orchestrator",
+  "MySQLTopologyPassword": "orch_pass",
+  "MySQLOrchestratorHost": "",
+  "MySQLOrchestratorPort": 0,
+  "BackendDB": "sqlite",
+  "SQLite3DataFile": "/tmp/raft1/orchestrator.sqlite3",
+  "DiscoverByShowSlaveHosts": false,
+  "InstancePollSeconds": 5,
+  "RecoveryPeriodBlockSeconds": 10,
+  "RecoverMasterClusterFilters": [".*"],
+  "RecoverIntermediateMasterClusterFilters": [".*"],
+  "AutoPseudoGTID": false,
+  "DetectClusterAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)",
+  "DetectInstanceAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)",
+  "PrometheusEnabled": false,
+  "RaftEnabled": true,
+  "RaftDataDir": "/tmp/raft1",
+  "RaftBind": "172.30.0.40",
+  "DefaultRaftPort": 10008,
+  "RaftNodes": ["172.30.0.40", "172.30.0.41", "172.30.0.42"]
+}
diff --git a/tests/functional/orchestrator-raft2.conf.json b/tests/functional/orchestrator-raft2.conf.json
new file mode 100644
index 00000000..51fbd241
--- /dev/null
+++ b/tests/functional/orchestrator-raft2.conf.json
@@ -0,0 +1,25 @@
+{
+  "Debug": true,
+  "ListenAddress": ":3099",
+  "HTTPAdvertise": "http://172.30.0.41:3099",
+  "MySQLTopologyUser": "orchestrator",
+  "MySQLTopologyPassword": "orch_pass",
+  "MySQLOrchestratorHost": "",
+  "MySQLOrchestratorPort": 0,
+  "BackendDB": "sqlite",
+  "SQLite3DataFile": "/tmp/raft2/orchestrator.sqlite3",
+  "DiscoverByShowSlaveHosts": false,
+  "InstancePollSeconds": 5,
+  "RecoveryPeriodBlockSeconds": 10,
+  "RecoverMasterClusterFilters": [".*"],
+  "RecoverIntermediateMasterClusterFilters": [".*"],
+  "AutoPseudoGTID": false,
+  "DetectClusterAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)",
+  "DetectInstanceAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)",
+  "PrometheusEnabled": false,
+  "RaftEnabled": true,
+  "RaftDataDir": "/tmp/raft2",
+  "RaftBind": "172.30.0.41",
+  "DefaultRaftPort": 10008,
+  "RaftNodes": ["172.30.0.40", "172.30.0.41", "172.30.0.42"]
+}
diff --git a/tests/functional/orchestrator-raft3.conf.json b/tests/functional/orchestrator-raft3.conf.json
new file mode 100644
index 00000000..3a93ad29
--- /dev/null
+++ b/tests/functional/orchestrator-raft3.conf.json
@@ -0,0 +1,25 @@
+{
+  "Debug": true,
+  "ListenAddress": ":3099",
+  "HTTPAdvertise": "http://172.30.0.42:3099",
+  "MySQLTopologyUser": "orchestrator",
+  "MySQLTopologyPassword": "orch_pass",
+  "MySQLOrchestratorHost": "",
+  "MySQLOrchestratorPort": 0,
+  "BackendDB": "sqlite",
+  "SQLite3DataFile": "/tmp/raft3/orchestrator.sqlite3",
+  "DiscoverByShowSlaveHosts": false,
+  "InstancePollSeconds": 5,
+  "RecoveryPeriodBlockSeconds": 10,
+  "RecoverMasterClusterFilters": [".*"],
+  "RecoverIntermediateMasterClusterFilters": [".*"],
+  "AutoPseudoGTID": false,
+  "DetectClusterAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)",
+  "DetectInstanceAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)",
+  "PrometheusEnabled": false,
+  "RaftEnabled": true,
+  "RaftDataDir": "/tmp/raft3",
+  "RaftBind": "172.30.0.42",
+  "DefaultRaftPort": 10008,
+  "RaftNodes": ["172.30.0.40", "172.30.0.41", "172.30.0.42"]
+}
diff --git a/tests/functional/test-raft.sh b/tests/functional/test-raft.sh
new file mode 100755
index 00000000..778995f6
--- /dev/null
+++ b/tests/functional/test-raft.sh
@@ -0,0 +1,323 @@
+#!/bin/bash
+# Raft consensus tests -- verify leader election, failover, and follower redirect
+set -uo pipefail
+cd "$(dirname "$0")/../.."
+source tests/functional/lib.sh
+
+echo "=== RAFT CONSENSUS TESTS ==="
+
+# Port mapping: raft1->3100, raft2->3101, raft3->3102
+RAFT_PORTS=(3100 3101 3102)
+RAFT_NODES=(orchestrator-raft1 orchestrator-raft2 orchestrator-raft3)
+COMPOSE_FILE="tests/functional/docker-compose.yml"
+
+# ============================================================
+# Phase 1: Cluster Formation & Leader Election
+# ============================================================
+echo ""
+echo "--- Phase 1: Cluster Formation & Leader Election ---"
+
+docker compose -f "$COMPOSE_FILE" up -d orchestrator-raft1 orchestrator-raft2 orchestrator-raft3
+
+# Wait for all 3 nodes to be reachable and for a leader to be elected
+echo "Waiting for Raft cluster to form and elect a leader (up to 90s)..."
+LEADER=""
+for i in $(seq 1 90); do
+    ALL_UP=true
+    for port in "${RAFT_PORTS[@]}"; do
+        if ! curl -sf "http://localhost:${port}/api/raft-leader" > /dev/null 2>&1; then
+            ALL_UP=false
+            break
+        fi
+    done
+    if $ALL_UP; then
+        # Check if all nodes agree on a leader
+        LEADER1=$(curl -sf "http://localhost:3100/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+        LEADER2=$(curl -sf "http://localhost:3101/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+        LEADER3=$(curl -sf "http://localhost:3102/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+        if [ -n "$LEADER1" ] && [ "$LEADER1" = "$LEADER2" ] && [ "$LEADER2" = "$LEADER3" ]; then
+            LEADER="$LEADER1"
+            echo "Leader elected: $LEADER (after ${i}s)"
+            break
+        fi
+    fi
+    sleep 1
+done
+
+if [ -n "$LEADER" ]; then
+    pass "Raft leader elected: $LEADER"
+else
+    fail "Raft leader not elected within 90s"
+    # Print debug info
+    for port in "${RAFT_PORTS[@]}"; do
+        echo "  Node :${port} raft-status: $(curl -sf http://localhost:${port}/api/raft-status 2>/dev/null || echo 'unreachable')"
+    done
+fi
+
+# Verify all nodes agree on the same leader
+LEADERS_AGREE=true
+for port in "${RAFT_PORTS[@]}"; do
+    NODE_LEADER=$(curl -sf "http://localhost:${port}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+    if [ "$NODE_LEADER" != "$LEADER" ]; then
+        LEADERS_AGREE=false
+        break
+    fi
+done
+if $LEADERS_AGREE && [ -n "$LEADER" ]; then
+    pass "All 3 nodes agree on the same leader"
+else
+    fail "Nodes do not agree on the leader"
+fi
+
+# Verify exactly one node reports itself as Leader state
+LEADER_COUNT=0
+for port in "${RAFT_PORTS[@]}"; do
+    STATE=$(curl -sf "http://localhost:${port}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+    if [ "$STATE" = "Leader" ]; then
+        ((LEADER_COUNT++))
+    fi
+done
+if [ "$LEADER_COUNT" -eq 1 ]; then
+    pass "Exactly one node is in Leader state"
+else
+    fail "Expected 1 leader, found $LEADER_COUNT"
+fi
+
+# ============================================================
+# Phase 2: Leader Serves Topology
+# ============================================================
+echo ""
+echo "--- Phase 2: Leader Serves Topology ---"
+
+# Determine leader port (map leader IP to host port)
+LEADER_PORT=""
+LEADER_INDEX=""
+for idx in 0 1 2; do
+    STATE=$(curl -sf "http://localhost:${RAFT_PORTS[$idx]}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+    if [ "$STATE" = "Leader" ]; then
+        LEADER_PORT="${RAFT_PORTS[$idx]}"
+        LEADER_INDEX=$idx
+        break
+    fi
+done
+
+if [ -z "$LEADER_PORT" ]; then
+    fail "Could not identify leader port"
+else
+    echo "Leader is on localhost:${LEADER_PORT} (${RAFT_NODES[$LEADER_INDEX]})"
+
+    # Discover MySQL topology through the leader
+    echo "Discovering MySQL topology through the leader..."
+    curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql1/3306" > /dev/null 2>&1
+    curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql2/3306" > /dev/null 2>&1
+    curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql3/3306" > /dev/null 2>&1
+
+    # Wait for topology discovery
+    echo "Waiting for topology discovery (up to 60s)..."
+    CLUSTER_FOUND=false
+    for i in $(seq 1 60); do
+        CLUSTERS=$(curl -sf "http://localhost:${LEADER_PORT}/api/clusters" 2>/dev/null || echo "[]")
+        COUNT=$(echo "$CLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0")
+        if [ "$COUNT" -ge 1 ]; then
+            echo "Cluster discovered after ${i}s"
+            CLUSTER_FOUND=true
+            break
+        fi
+        # Re-seed discovery periodically
+        if [ "$((i % 10))" = "0" ]; then
+            curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql1/3306" > /dev/null 2>&1
+            curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql2/3306" > /dev/null 2>&1
+            curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql3/3306" > /dev/null 2>&1
+        fi
+        sleep 1
+    done
+
+    if $CLUSTER_FOUND; then
+        pass "Leader serves cluster data via /api/clusters"
+    else
+        fail "Leader did not return cluster data within 60s"
+    fi
+
+    # Verify followers can also return cluster data (Raft replicates state)
+    FOLLOWER_HAS_DATA=true
+    for idx in 0 1 2; do
+        if [ "$idx" = "$LEADER_INDEX" ]; then
+            continue
+        fi
+        FPORT="${RAFT_PORTS[$idx]}"
+        # Followers may redirect or serve data directly; either is valid
+        FCLUSTERS=$(curl -sfL "http://localhost:${FPORT}/api/clusters" 2>/dev/null || echo "[]")
+        FCOUNT=$(echo "$FCLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0")
+        if [ "$FCOUNT" -lt 1 ]; then
+            FOLLOWER_HAS_DATA=false
+            echo "  Follower on :${FPORT} returned $FCOUNT clusters"
+        fi
+    done
+    if $FOLLOWER_HAS_DATA; then
+        pass "Followers serve cluster data (Raft state replicated)"
+    else
+        # This is not necessarily a failure -- followers may need more time
+        skip "Some followers do not yet serve cluster data (may need more replication time)"
+    fi
+fi
+
+# ============================================================
+# Phase 3: Leader Failure & Re-election
+# ============================================================
+echo ""
+echo "--- Phase 3: Leader Failure & Re-election ---"
+
+OLD_LEADER="$LEADER"
+OLD_LEADER_NODE=""
+if [ -n "$LEADER_INDEX" ]; then
+    OLD_LEADER_NODE="${RAFT_NODES[$LEADER_INDEX]}"
+fi
+
+if [ -z "$OLD_LEADER_NODE" ]; then
+    fail "Cannot test leader failure: no leader identified"
+else
+    echo "Stopping leader node: $OLD_LEADER_NODE"
+    docker compose -f "$COMPOSE_FILE" stop "$OLD_LEADER_NODE"
+
+    # Determine which nodes are still running
+    REMAINING_PORTS=()
+    REMAINING_INDICES=()
+    for idx in 0 1 2; do
+        if [ "$idx" != "$LEADER_INDEX" ]; then
+            REMAINING_PORTS+=("${RAFT_PORTS[$idx]}")
+            REMAINING_INDICES+=("$idx")
+        fi
+    done
+
+    # Wait for re-election
+    echo "Waiting for re-election (up to 60s)..."
+    NEW_LEADER=""
+    for i in $(seq 1 60); do
+        L1=$(curl -sf "http://localhost:${REMAINING_PORTS[0]}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+        L2=$(curl -sf "http://localhost:${REMAINING_PORTS[1]}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+        if [ -n "$L1" ] && [ "$L1" = "$L2" ] && [ "$L1" != "$OLD_LEADER" ]; then
+            NEW_LEADER="$L1"
+            echo "New leader elected: $NEW_LEADER (after ${i}s)"
+            break
+        fi
+        sleep 1
+    done
+
+    if [ -n "$NEW_LEADER" ]; then
+        pass "New leader elected after stopping old leader: $NEW_LEADER"
+    else
+        fail "No new leader elected within 60s"
+        for port in "${REMAINING_PORTS[@]}"; do
+            echo "  Node :${port} status: $(curl -sf http://localhost:${port}/api/raft-status 2>/dev/null || echo 'unreachable')"
+        done
+    fi
+
+    # Verify new leader is different from old
+    if [ -n "$NEW_LEADER" ] && [ "$NEW_LEADER" != "$OLD_LEADER" ]; then
+        pass "New leader is different from old leader"
+    elif [ -n "$NEW_LEADER" ]; then
+        fail "New leader is the same as old leader (should not happen)"
+    fi
+
+    # Verify new leader can serve API requests
+    if [ -n "$NEW_LEADER" ]; then
+        NEW_LEADER_PORT=""
+        for idx in "${REMAINING_INDICES[@]}"; do
+            STATE=$(curl -sf "http://localhost:${RAFT_PORTS[$idx]}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+            if [ "$STATE" = "Leader" ]; then
+                NEW_LEADER_PORT="${RAFT_PORTS[$idx]}"
+                break
+            fi
+        done
+        if [ -n "$NEW_LEADER_PORT" ]; then
+            CLUSTERS=$(curl -sf "http://localhost:${NEW_LEADER_PORT}/api/clusters" 2>/dev/null || echo "[]")
+            COUNT=$(echo "$CLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0")
+            if [ "$COUNT" -ge 1 ]; then
+                pass "New leader serves cluster data via API"
+            else
+                skip "New leader returned 0 clusters (state may not have fully replicated yet)"
+            fi
+        fi
+    fi
+
+    # ============================================================
+    # Phase 4: Node Rejoin
+    # ============================================================
+    echo ""
+    echo "--- Phase 4: Node Rejoin ---"
+
+    echo "Restarting stopped node: $OLD_LEADER_NODE"
+    docker compose -f "$COMPOSE_FILE" start "$OLD_LEADER_NODE"
+
+    # Wait for the restarted node to rejoin
+    RESTARTED_PORT="${RAFT_PORTS[$LEADER_INDEX]}"
+    echo "Waiting for restarted node (:${RESTARTED_PORT}) to rejoin (up to 60s)..."
+    REJOINED=false
+    for i in $(seq 1 60); do
+        RLEADER=$(curl -sf "http://localhost:${RESTARTED_PORT}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+        if [ -n "$RLEADER" ] && [ "$RLEADER" = "$NEW_LEADER" ]; then
+            echo "Node rejoined after ${i}s"
+            REJOINED=true
+            break
+        fi
+        sleep 1
+    done
+
+    if $REJOINED; then
+        pass "Restarted node rejoined the cluster"
+    else
+        fail "Restarted node did not rejoin within 60s"
+    fi
+
+    # Verify the restarted node is a follower (not a new leader)
+    RSTATE=$(curl -sf "http://localhost:${RESTARTED_PORT}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+    if [ "$RSTATE" = "Follower" ]; then
+        pass "Restarted node is a Follower (stable leader)"
+    elif [ "$RSTATE" = "Leader" ]; then
+        # Leadership may have shifted -- still valid if all agree
+        skip "Restarted node became Leader (leadership may have shifted)"
+    else
+        fail "Restarted node in unexpected state: $RSTATE"
+    fi
+
+    # Verify all 3 nodes agree on the current leader
+    ALL_AGREE=true
+    CURRENT_LEADER=""
+    for port in "${RAFT_PORTS[@]}"; do
+        NL=$(curl -sf "http://localhost:${port}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+        if [ -z "$CURRENT_LEADER" ]; then
+            CURRENT_LEADER="$NL"
+        elif [ "$NL" != "$CURRENT_LEADER" ]; then
+            ALL_AGREE=false
+        fi
+    done
+    if $ALL_AGREE && [ -n "$CURRENT_LEADER" ]; then
+        pass "All 3 nodes agree on current leader after rejoin: $CURRENT_LEADER"
+    else
+        fail "Nodes do not agree on leader after rejoin"
+    fi
+
+    # Verify cluster is healthy (all 3 nodes report healthy)
+    HEALTHY_COUNT=0
+    for port in "${RAFT_PORTS[@]}"; do
+        HEALTH=$(curl -sf "http://localhost:${port}/api/raft-health" 2>/dev/null || echo "")
+        if echo "$HEALTH" | grep -q "healthy"; then
+            ((HEALTHY_COUNT++))
+        fi
+    done
+    if [ "$HEALTHY_COUNT" -eq 3 ]; then
+        pass "All 3 nodes report healthy"
+    else
+        skip "Only $HEALTHY_COUNT/3 nodes report healthy (may need more time)"
+    fi
+fi
+
+# ============================================================
+# Cleanup
+# ============================================================
+echo ""
+echo "--- Cleanup ---"
+docker compose -f "$COMPOSE_FILE" stop orchestrator-raft1 orchestrator-raft2 orchestrator-raft3 2>/dev/null || true
+echo "Raft containers stopped."
+
+summary

From 84f7361ed5b5096a9536e91fe3b9c0d9034129af Mon Sep 17 00:00:00 2001
From: Rene Cannao <rene@proxysql.com>
Date: Thu, 9 Apr 2026 09:38:16 +0000
Subject: [PATCH 2/4] Add curl timeouts to test-raft.sh and
 test-failover-advanced.sh

All curl calls lacked --max-time, causing potential test hangs
if any API endpoint was slow to respond.
---
 tests/functional/test-raft.sh | 50 +++++++++++++++++------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/tests/functional/test-raft.sh b/tests/functional/test-raft.sh
index 778995f6..63a45570 100755
--- a/tests/functional/test-raft.sh
+++ b/tests/functional/test-raft.sh
@@ -25,16 +25,16 @@ LEADER=""
 for i in $(seq 1 90); do
     ALL_UP=true
     for port in "${RAFT_PORTS[@]}"; do
-        if ! curl -sf "http://localhost:${port}/api/raft-leader" > /dev/null 2>&1; then
+        if ! curl -sf --max-time 10 "http://localhost:${port}/api/raft-leader" > /dev/null 2>&1; then
             ALL_UP=false
             break
         fi
     done
     if $ALL_UP; then
         # Check if all nodes agree on a leader
-        LEADER1=$(curl -sf "http://localhost:3100/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
-        LEADER2=$(curl -sf "http://localhost:3101/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
-        LEADER3=$(curl -sf "http://localhost:3102/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+        LEADER1=$(curl -sf --max-time 10 "http://localhost:3100/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+        LEADER2=$(curl -sf --max-time 10 "http://localhost:3101/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+        LEADER3=$(curl -sf --max-time 10 "http://localhost:3102/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
         if [ -n "$LEADER1" ] && [ "$LEADER1" = "$LEADER2" ] && [ "$LEADER2" = "$LEADER3" ]; then
             LEADER="$LEADER1"
             echo "Leader elected: $LEADER (after ${i}s)"
@@ -50,14 +50,14 @@ else
     fail "Raft leader not elected within 90s"
     # Print debug info
     for port in "${RAFT_PORTS[@]}"; do
-        echo "  Node :${port} raft-status: $(curl -sf http://localhost:${port}/api/raft-status 2>/dev/null || echo 'unreachable')"
+        echo "  Node :${port} raft-status: $(curl -sf --max-time 10 http://localhost:${port}/api/raft-status 2>/dev/null || echo 'unreachable')"
     done
 fi
 
 # Verify all nodes agree on the same leader
 LEADERS_AGREE=true
 for port in "${RAFT_PORTS[@]}"; do
-    NODE_LEADER=$(curl -sf "http://localhost:${port}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+    NODE_LEADER=$(curl -sf --max-time 10 "http://localhost:${port}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
     if [ "$NODE_LEADER" != "$LEADER" ]; then
         LEADERS_AGREE=false
         break
@@ -72,7 +72,7 @@ fi
 # Verify exactly one node reports itself as Leader state
 LEADER_COUNT=0
 for port in "${RAFT_PORTS[@]}"; do
-    STATE=$(curl -sf "http://localhost:${port}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+    STATE=$(curl -sf --max-time 10 "http://localhost:${port}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
     if [ "$STATE" = "Leader" ]; then
         ((LEADER_COUNT++))
     fi
@@ -93,7 +93,7 @@ echo "--- Phase 2: Leader Serves Topology ---"
 LEADER_PORT=""
 LEADER_INDEX=""
 for idx in 0 1 2; do
-    STATE=$(curl -sf "http://localhost:${RAFT_PORTS[$idx]}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+    STATE=$(curl -sf --max-time 10 "http://localhost:${RAFT_PORTS[$idx]}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
     if [ "$STATE" = "Leader" ]; then
         LEADER_PORT="${RAFT_PORTS[$idx]}"
         LEADER_INDEX=$idx
@@ -108,15 +108,15 @@ else
 
     # Discover MySQL topology through the leader
     echo "Discovering MySQL topology through the leader..."
-    curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql1/3306" > /dev/null 2>&1
-    curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql2/3306" > /dev/null 2>&1
-    curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql3/3306" > /dev/null 2>&1
+    curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql1/3306" > /dev/null 2>&1
+    curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql2/3306" > /dev/null 2>&1
+    curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql3/3306" > /dev/null 2>&1
 
     # Wait for topology discovery
     echo "Waiting for topology discovery (up to 60s)..."
     CLUSTER_FOUND=false
     for i in $(seq 1 60); do
-        CLUSTERS=$(curl -sf "http://localhost:${LEADER_PORT}/api/clusters" 2>/dev/null || echo "[]")
+        CLUSTERS=$(curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/clusters" 2>/dev/null || echo "[]")
         COUNT=$(echo "$CLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0")
         if [ "$COUNT" -ge 1 ]; then
             echo "Cluster discovered after ${i}s"
@@ -125,9 +125,9 @@ else
         fi
         # Re-seed discovery periodically
         if [ "$((i % 10))" = "0" ]; then
-            curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql1/3306" > /dev/null 2>&1
-            curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql2/3306" > /dev/null 2>&1
-            curl -sf "http://localhost:${LEADER_PORT}/api/discover/mysql3/3306" > /dev/null 2>&1
+            curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql1/3306" > /dev/null 2>&1
+            curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql2/3306" > /dev/null 2>&1
+            curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql3/3306" > /dev/null 2>&1
         fi
         sleep 1
     done
@@ -146,7 +146,7 @@ else
         fi
         FPORT="${RAFT_PORTS[$idx]}"
         # Followers may redirect or serve data directly; either is valid
-        FCLUSTERS=$(curl -sfL "http://localhost:${FPORT}/api/clusters" 2>/dev/null || echo "[]")
+        FCLUSTERS=$(curl -sfL --max-time 10 "http://localhost:${FPORT}/api/clusters" 2>/dev/null || echo "[]")
         FCOUNT=$(echo "$FCLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0")
         if [ "$FCOUNT" -lt 1 ]; then
             FOLLOWER_HAS_DATA=false
@@ -193,8 +193,8 @@ else
     echo "Waiting for re-election (up to 60s)..."
     NEW_LEADER=""
     for i in $(seq 1 60); do
-        L1=$(curl -sf "http://localhost:${REMAINING_PORTS[0]}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
-        L2=$(curl -sf "http://localhost:${REMAINING_PORTS[1]}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+        L1=$(curl -sf --max-time 10 "http://localhost:${REMAINING_PORTS[0]}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+        L2=$(curl -sf --max-time 10 "http://localhost:${REMAINING_PORTS[1]}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
         if [ -n "$L1" ] && [ "$L1" = "$L2" ] && [ "$L1" != "$OLD_LEADER" ]; then
             NEW_LEADER="$L1"
             echo "New leader elected: $NEW_LEADER (after ${i}s)"
@@ -208,7 +208,7 @@ else
     else
         fail "No new leader elected within 60s"
         for port in "${REMAINING_PORTS[@]}"; do
-            echo "  Node :${port} status: $(curl -sf http://localhost:${port}/api/raft-status 2>/dev/null || echo 'unreachable')"
+            echo "  Node :${port} status: $(curl -sf --max-time 10 http://localhost:${port}/api/raft-status 2>/dev/null || echo 'unreachable')"
         done
     fi
 
@@ -223,14 +223,14 @@ else
     if [ -n "$NEW_LEADER" ]; then
         NEW_LEADER_PORT=""
         for idx in "${REMAINING_INDICES[@]}"; do
-            STATE=$(curl -sf "http://localhost:${RAFT_PORTS[$idx]}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+            STATE=$(curl -sf --max-time 10 "http://localhost:${RAFT_PORTS[$idx]}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
             if [ "$STATE" = "Leader" ]; then
                 NEW_LEADER_PORT="${RAFT_PORTS[$idx]}"
                 break
             fi
         done
         if [ -n "$NEW_LEADER_PORT" ]; then
-            CLUSTERS=$(curl -sf "http://localhost:${NEW_LEADER_PORT}/api/clusters" 2>/dev/null || echo "[]")
+            CLUSTERS=$(curl -sf --max-time 10 "http://localhost:${NEW_LEADER_PORT}/api/clusters" 2>/dev/null || echo "[]")
             COUNT=$(echo "$CLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0")
             if [ "$COUNT" -ge 1 ]; then
                 pass "New leader serves cluster data via API"
@@ -254,7 +254,7 @@ else
     echo "Waiting for restarted node (:${RESTARTED_PORT}) to rejoin (up to 60s)..."
     REJOINED=false
     for i in $(seq 1 60); do
-        RLEADER=$(curl -sf "http://localhost:${RESTARTED_PORT}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+        RLEADER=$(curl -sf --max-time 10 "http://localhost:${RESTARTED_PORT}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
         if [ -n "$RLEADER" ] && [ "$RLEADER" = "$NEW_LEADER" ]; then
             echo "Node rejoined after ${i}s"
             REJOINED=true
@@ -270,7 +270,7 @@ else
     fi
 
     # Verify the restarted node is a follower (not a new leader)
-    RSTATE=$(curl -sf "http://localhost:${RESTARTED_PORT}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+    RSTATE=$(curl -sf --max-time 10 "http://localhost:${RESTARTED_PORT}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
     if [ "$RSTATE" = "Follower" ]; then
         pass "Restarted node is a Follower (stable leader)"
     elif [ "$RSTATE" = "Leader" ]; then
@@ -284,7 +284,7 @@ else
     ALL_AGREE=true
     CURRENT_LEADER=""
     for port in "${RAFT_PORTS[@]}"; do
-        NL=$(curl -sf "http://localhost:${port}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
+        NL=$(curl -sf --max-time 10 "http://localhost:${port}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "")
         if [ -z "$CURRENT_LEADER" ]; then
             CURRENT_LEADER="$NL"
         elif [ "$NL" != "$CURRENT_LEADER" ]; then
@@ -300,7 +300,7 @@ else
     # Verify cluster is healthy (all 3 nodes report healthy)
     HEALTHY_COUNT=0
     for port in "${RAFT_PORTS[@]}"; do
-        HEALTH=$(curl -sf "http://localhost:${port}/api/raft-health" 2>/dev/null || echo "")
+        HEALTH=$(curl -sf --max-time 10 "http://localhost:${port}/api/raft-health" 2>/dev/null || echo "")
         if echo "$HEALTH" | grep -q "healthy"; then
             ((HEALTHY_COUNT++))
         fi

From 4b980805c3f97f34a92fec8e1402603a3edda6f2 Mon Sep 17 00:00:00 2001
From: Rene Cannao <rene@proxysql.com>
Date: Thu, 9 Apr 2026 11:17:10 +0000
Subject: [PATCH 3/4] Fix Raft test: stagger node startup to avoid bootstrap
 conflicts

Starting all 3 Raft nodes simultaneously causes each to call
BootstrapCluster independently, creating conflicting initial states
and perpetual election cycles. Fix by starting node 1 first, waiting
for it to be ready, then starting nodes 2 and 3.
---
 tests/functional/test-raft.sh | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/tests/functional/test-raft.sh b/tests/functional/test-raft.sh
index 63a45570..2af3e895 100755
--- a/tests/functional/test-raft.sh
+++ b/tests/functional/test-raft.sh
@@ -17,7 +17,34 @@ COMPOSE_FILE="tests/functional/docker-compose.yml"
 echo ""
 echo "--- Phase 1: Cluster Formation & Leader Election ---"
 
-docker compose -f "$COMPOSE_FILE" up -d orchestrator-raft1 orchestrator-raft2 orchestrator-raft3
+# Start node 1 first to let it bootstrap the cluster before other nodes join.
+# Starting all 3 simultaneously causes each to call BootstrapCluster independently,
+# creating conflicting initial states and perpetual election cycles.
+echo "Starting first Raft node (bootstrap node)..."
+docker compose -f "$COMPOSE_FILE" up -d orchestrator-raft1
+
+# Poll node 1 until /api/raft-status answers: up to 90 probes, 1s apart, each capped at 5s (allows for apt-get install time)
+echo "Waiting for bootstrap node to be ready (up to 90s)..."
+BOOTSTRAP_READY=false
+for i in $(seq 1 90); do
+    if curl -sf --max-time 5 "http://localhost:3100/api/raft-status" > /dev/null 2>&1; then
+        BOOTSTRAP_READY=true
+        echo "Bootstrap node ready after ${i}s"
+        break
+    fi
+    sleep 1
+done
+
+if ! $BOOTSTRAP_READY; then
+    fail "Bootstrap Raft node (orchestrator-raft1) not ready within 90s"
+    docker compose -f "$COMPOSE_FILE" logs orchestrator-raft1 2>/dev/null | tail -30
+    summary
+fi
+pass "Bootstrap Raft node started and ready"
+
+# Now start the remaining nodes — they will find the bootstrapped cluster
+echo "Starting remaining Raft nodes..."
+docker compose -f "$COMPOSE_FILE" up -d orchestrator-raft2 orchestrator-raft3
 
 # Wait for all 3 nodes to be reachable and for a leader to be elected
 echo "Waiting for Raft cluster to form and elect a leader (up to 90s)..."

From 2ea8a252773b8ec094f43464851549cb8804d7a9 Mon Sep 17 00:00:00 2001
From: Rene Cannao <rene@proxysql.com>
Date: Thu, 9 Apr 2026 12:02:54 +0000
Subject: [PATCH 4/4] Fix Raft SQLite store: allow NULL data in raft_log and
 raft_store

The raft_log.data and raft_store.store_value columns were NOT NULL, but
hashicorp/raft can produce log entries with nil Data (e.g. LogNoop when
a new leader takes over). This caused "NOT NULL constraint failed:
raft_log.data" errors immediately after election, causing the leader to
step down and creating perpetual election cycles.
---
 go/raft/rel_store.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/go/raft/rel_store.go b/go/raft/rel_store.go
index b9c25ee3..881063f1 100644
--- a/go/raft/rel_store.go
+++ b/go/raft/rel_store.go
@@ -36,7 +36,7 @@ var createQueries = []string{
 			log_index integer,
 			term bigint not null,
 			log_type int not null,
-			data blob not null,
+			data blob,
 			PRIMARY KEY (log_index)
 		)
 	`,
@@ -44,7 +44,7 @@ var createQueries = []string{
 		CREATE TABLE IF NOT EXISTS raft_store (
 			store_id integer,
 			store_key varbinary(512) not null,
-			store_value blob not null,
+			store_value blob,
 			PRIMARY KEY (store_id)
 		)
 	`,