diff --git a/.github/workflows/functional.yml b/.github/workflows/functional.yml index 1bf68fc5..3e3c9010 100644 --- a/.github/workflows/functional.yml +++ b/.github/workflows/functional.yml @@ -12,7 +12,7 @@ on: jobs: functional: runs-on: ubuntu-latest - timeout-minutes: 20 + timeout-minutes: 30 steps: - uses: actions/checkout@v4 @@ -77,17 +77,84 @@ jobs: - name: Run failover tests run: bash tests/functional/test-failover.sh + # ---- PostgreSQL functional tests ---- + - name: Start PostgreSQL containers + working-directory: tests/functional + run: | + docker compose up -d pgprimary + echo "Waiting for pgprimary to be healthy..." + timeout 120 bash -c ' + while true; do + STATUS=$(docker compose ps pgprimary --format json 2>/dev/null | python3 -c " + import json, sys + for line in sys.stdin: + svc = json.loads(line) + if \"healthy\" in svc.get(\"Status\",\"\").lower(): + print(\"healthy\") + sys.exit(0) + print(\"waiting\") + " 2>/dev/null || echo "waiting") + if [ "$STATUS" = "healthy" ]; then + echo "pgprimary healthy" + exit 0 + fi + sleep 2 + done + ' || { echo "Timeout waiting for pgprimary"; docker compose logs pgprimary --tail=30; exit 1; } + docker compose up -d pgstandby1 + echo "Waiting for pgstandby1 to be healthy..." + timeout 120 bash -c ' + while true; do + STATUS=$(docker compose ps pgstandby1 --format json 2>/dev/null | python3 -c " + import json, sys + for line in sys.stdin: + svc = json.loads(line) + if \"healthy\" in svc.get(\"Status\",\"\").lower(): + print(\"healthy\") + sys.exit(0) + print(\"waiting\") + " 2>/dev/null || echo "waiting") + if [ "$STATUS" = "healthy" ]; then + echo "pgstandby1 healthy" + exit 0 + fi + sleep 2 + done + ' || { echo "Timeout waiting for pgstandby1"; docker compose logs pgstandby1 --tail=30; exit 1; } + + - name: Start PostgreSQL orchestrator + working-directory: tests/functional + run: | + docker compose up -d orchestrator-pg + echo "Waiting for PostgreSQL orchestrator to be ready..." 
+ timeout 120 bash -c ' + while true; do + if curl -sf http://localhost:3098/api/clusters > /dev/null 2>&1; then + echo "PostgreSQL orchestrator ready" + exit 0 + fi + sleep 2 + done + ' || { echo "PostgreSQL orchestrator not ready"; docker compose logs orchestrator-pg --tail=50; exit 1; } + + - name: Run PostgreSQL tests + run: bash tests/functional/test-postgresql.sh + - name: Collect orchestrator logs if: always() working-directory: tests/functional - run: docker compose logs orchestrator > /tmp/orchestrator-test.log 2>&1 || true + run: | + docker compose logs orchestrator > /tmp/orchestrator-test.log 2>&1 || true + docker compose logs orchestrator-pg > /tmp/orchestrator-pg-test.log 2>&1 || true - name: Upload orchestrator logs if: always() uses: actions/upload-artifact@v4 with: name: orchestrator-test-logs - path: /tmp/orchestrator-test.log + path: | + /tmp/orchestrator-test.log + /tmp/orchestrator-pg-test.log - name: Collect all docker logs on failure if: failure() diff --git a/tests/functional/docker-compose.yml b/tests/functional/docker-compose.yml index 3ab62623..79efa604 100644 --- a/tests/functional/docker-compose.yml +++ b/tests/functional/docker-compose.yml @@ -92,6 +92,51 @@ services: aliases: - proxysql + pgprimary: + image: postgres:17 + hostname: pgprimary + environment: + POSTGRES_PASSWORD: testpass + POSTGRES_USER: postgres + volumes: + - ./postgres/init-primary.sh:/docker-entrypoint-initdb.d/init.sh + ports: + - "15432:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 5s + timeout: 3s + retries: 30 + networks: + orchnet: + aliases: + - pgprimary + + pgstandby1: + image: postgres:17 + hostname: pgstandby1 + environment: + POSTGRES_PASSWORD: testpass + PGUSER: postgres + PGPASSWORD: repl_pass + volumes: + - ./postgres/init-standby.sh:/init-standby.sh + entrypoint: ["/bin/bash", "/init-standby.sh"] + depends_on: + pgprimary: + condition: service_healthy + ports: + - "15433:5432" + healthcheck: + test: ["CMD-SHELL", 
"pg_isready -U postgres"] + interval: 5s + timeout: 3s + retries: 30 + networks: + orchnet: + aliases: + - pgstandby1 + orchestrator: image: ubuntu:24.04 hostname: orchestrator @@ -122,6 +167,36 @@ services: aliases: - orchestrator + orchestrator-pg: + image: ubuntu:24.04 + hostname: orchestrator-pg + volumes: + - ../../bin/orchestrator:/usr/local/bin/orchestrator:ro + - ../../resources:/orchestrator/resources:ro + - ./orchestrator-pg-test.conf.json:/orchestrator/orchestrator.conf.json:ro + command: > + bash -c " + apt-get update -qq && apt-get install -y -qq curl sqlite3 > /dev/null 2>&1 && + rm -f /tmp/orchestrator-pg-test.sqlite3 && + cd /orchestrator && + orchestrator -config orchestrator.conf.json http + " + ports: + - "3098:3098" + depends_on: + pgprimary: + condition: service_healthy + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:3098/api/clusters"] + interval: 5s + timeout: 3s + retries: 60 + start_period: 15s + networks: + orchnet: + aliases: + - orchestrator-pg + networks: orchnet: driver: bridge diff --git a/tests/functional/orchestrator-pg-test.conf.json b/tests/functional/orchestrator-pg-test.conf.json new file mode 100644 index 00000000..95ea29e0 --- /dev/null +++ b/tests/functional/orchestrator-pg-test.conf.json @@ -0,0 +1,19 @@ +{ + "Debug": true, + "ListenAddress": ":3098", + "ProviderType": "postgresql", + "PostgreSQLTopologyUser": "orchestrator", + "PostgreSQLTopologyPassword": "orch_pass", + "PostgreSQLSSLMode": "disable", + "MySQLOrchestratorHost": "", + "MySQLOrchestratorPort": 0, + "BackendDB": "sqlite", + "SQLite3DataFile": "/tmp/orchestrator-pg-test.sqlite3", + "DiscoverByShowSlaveHosts": false, + "InstancePollSeconds": 5, + "RecoveryPeriodBlockSeconds": 10, + "RecoverMasterClusterFilters": [".*"], + "RecoverIntermediateMasterClusterFilters": [".*"], + "AutoPseudoGTID": false, + "PrometheusEnabled": true +} diff --git a/tests/functional/postgres/init-primary.sh b/tests/functional/postgres/init-primary.sh new file mode 
100755 index 00000000..0f69899c --- /dev/null +++ b/tests/functional/postgres/init-primary.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Initialize PostgreSQL primary for functional tests. +# This script runs inside the postgres Docker entrypoint (initdb phase). + +set -e + +# ---- WAL / replication settings ---- +cat >> "$PGDATA/postgresql.conf" <> "$PGDATA/pg_hba.conf" < "$PGDATA/postgresql.auto.conf" < /dev/null + +echo "Waiting for topology discovery..." +PG_CLUSTER="" +for i in $(seq 1 60); do + PG_CLUSTER=$(curl -s "$ORC_URL/api/clusters" 2>/dev/null | python3 -c " +import json, sys +c = json.load(sys.stdin) +for name in c: + if 'pgprimary' in name or 'pg' in name.lower(): + print(name) + sys.exit(0) +if c: + print(c[0]) +" 2>/dev/null || echo "") + if [ -n "$PG_CLUSTER" ]; then + COUNT=$(curl -s "$ORC_URL/api/cluster/$PG_CLUSTER" 2>/dev/null | python3 -c "import json,sys; print(len(json.load(sys.stdin)))" 2>/dev/null || echo "0") + if [ "$COUNT" -ge 2 ] 2>/dev/null; then + echo "PostgreSQL topology discovered (${COUNT} instances, cluster=$PG_CLUSTER) after ${i}s" + break + fi + fi + # Re-seed standby periodically + if [ "$((i % 10))" = "0" ]; then + curl -s "$ORC_URL/api/discover/pgstandby1/5432" > /dev/null 2>&1 + fi + sleep 1 +done + +if [ -n "$PG_CLUSTER" ]; then + pass "PostgreSQL cluster discovered: $PG_CLUSTER" +else + fail "No PostgreSQL cluster discovered" +fi + +INST_COUNT=$(curl -s "$ORC_URL/api/cluster/$PG_CLUSTER" 2>/dev/null | python3 -c "import json,sys; print(len(json.load(sys.stdin)))" 2>/dev/null || echo "0") +if [ "$INST_COUNT" -ge 2 ]; then + pass "PostgreSQL instances discovered: $INST_COUNT" +else + fail "PostgreSQL instances discovered: $INST_COUNT (expected >= 2)" +fi + +# Verify primary is not read-only (not in recovery) +PRIMARY_RO=$(curl -s "$ORC_URL/api/cluster/$PG_CLUSTER" 2>/dev/null | python3 -c " +import json, sys +instances = json.load(sys.stdin) +for inst in instances: + if 'pgprimary' in inst.get('Key', {}).get('Hostname', ''): + 
print('true' if inst.get('ReadOnly', True) else 'false')
+        sys.exit(0)
+print('unknown')
+" 2>/dev/null || echo "unknown")
+
+if [ "$PRIMARY_RO" = "false" ]; then
+  pass "pgprimary is read_only=false (primary)"
+else
+  fail "pgprimary read_only=$PRIMARY_RO (expected false)"
+fi
+
+# Verify standby is read-only (in recovery)
+STANDBY_RO=$(curl -s "$ORC_URL/api/cluster/$PG_CLUSTER" 2>/dev/null | python3 -c "
+import json, sys
+instances = json.load(sys.stdin)
+for inst in instances:
+    if 'pgstandby1' in inst.get('Key', {}).get('Hostname', ''):
+        print('true' if inst.get('ReadOnly', False) else 'false')
+        sys.exit(0)
+print('unknown')
+" 2>/dev/null || echo "unknown")
+
+if [ "$STANDBY_RO" = "true" ]; then
+  pass "pgstandby1 is read_only=true (standby)"
+else
+  fail "pgstandby1 read_only=$STANDBY_RO (expected true)"
+fi
+
+# ----------------------------------------------------------------
+echo ""
+echo "--- API tests ---"
+
+test_endpoint "GET /api/clusters" "$ORC_URL/api/clusters" "200"
+test_endpoint "GET /api/v2/clusters" "$ORC_URL/api/v2/clusters" "200"
+test_endpoint "GET /api/v2/status" "$ORC_URL/api/v2/status" "200"
+test_body_contains "/api/v2/status healthy" "$ORC_URL/api/v2/status" '"status"'
+test_body_contains "/api/clusters contains PG cluster" "$ORC_URL/api/clusters" "pgprimary"
+
+# ----------------------------------------------------------------
+echo ""
+echo "--- Failover test: kill pgprimary ---"
+
+echo "Stopping pgprimary container..."
+$COMPOSE stop pgprimary
+
+echo "Waiting for orchestrator to detect DeadPrimary and recover (max 90s)..."
+RECOVERED=false
+SUCCESSOR=""
+for i in $(seq 1 90); do
+  RECOVERIES=$(curl -s "$ORC_URL/api/v2/recoveries" 2>/dev/null)
+  HAS_RECOVERY=$(echo "$RECOVERIES" | python3 -c "
+import json, sys
+d = json.load(sys.stdin)
+data = d.get('data', [])
+for r in data:
+    a = r.get('AnalysisEntry', {}).get('Analysis', '')
+    s = r.get('IsSuccessful', False)
+    successor = r.get('SuccessorKey', {}).get('Hostname', '')
+    if 'DeadPrimary' in a and s and successor:
+        print(f'RECOVERED:{successor}')
+        sys.exit(0)
+print('WAITING')
+" 2>/dev/null)
+  if echo "$HAS_RECOVERY" | grep -q "RECOVERED:"; then
+    SUCCESSOR=$(echo "$HAS_RECOVERY" | sed 's/RECOVERED://')
+    echo "Recovery detected after ${i}s -- successor: $SUCCESSOR"
+    RECOVERED=true
+    break
+  fi
+  sleep 1
+done
+
+if [ "$RECOVERED" = "true" ]; then
+  pass "DeadPrimary detected and recovered (successor: $SUCCESSOR)"
+else
+  fail "DeadPrimary: no recovery detected within 90s"
+  # Dump debug info
+  echo "  DEBUG: Recent recoveries:"
+  curl -s "$ORC_URL/api/v2/recoveries" 2>/dev/null | python3 -m json.tool 2>/dev/null | head -30
+  echo "  DEBUG: Cluster topology:"
+  curl -s "$ORC_URL/api/cluster/$PG_CLUSTER" 2>/dev/null | python3 -m json.tool 2>/dev/null | head -30
+fi
+
+# Verify successor is no longer in recovery (promoted to primary)
+if [ "$RECOVERED" = "true" ]; then
+  sleep 3
+  SUCCESSOR_RO=$(curl -s "$ORC_URL/api/cluster/$PG_CLUSTER" 2>/dev/null | python3 -c "
+import json, sys
+instances = json.load(sys.stdin)
+for inst in instances:
+    hostname = inst.get('Key', {}).get('Hostname', '')
+    if hostname == '$SUCCESSOR':
+        print('true' if inst.get('ReadOnly', True) else 'false')
+        sys.exit(0)
+print('unknown')
+" 2>/dev/null || echo "unknown")
+
+  if [ "$SUCCESSOR_RO" = "false" ]; then
+    pass "Successor $SUCCESSOR promoted (read_only=false)"
+  else
+    # After promotion the instance needs a poll cycle to update
+    skip "Successor read_only=$SUCCESSOR_RO (may need additional poll cycle)"
+  fi
+fi
+
+# Verify recovery is recorded
+RECOVERY_API=$(curl -s "$ORC_URL/api/v2/recoveries" 2>/dev/null)
+if echo "$RECOVERY_API" | grep -qE '"IsSuccessful": ?true'; then
+  pass "Recovery audit: /api/v2/recoveries shows successful recovery"
+else
+  fail "Recovery audit: no successful recovery in API response"
+fi
+
+# ----------------------------------------------------------------
+echo ""
+echo "--- Cleanup: restart pgprimary ---"
+$COMPOSE start pgprimary
+sleep 5
+echo "pgprimary restarted"
+
+summary