diff --git a/.github/workflows/jepsen-test-scheduled-dedup.yml b/.github/workflows/jepsen-test-scheduled-dedup.yml deleted file mode 100644 index 527158f3..00000000 --- a/.github/workflows/jepsen-test-scheduled-dedup.yml +++ /dev/null @@ -1,248 +0,0 @@ -# Jepsen Scheduled Stress Test — Option-2 Dedup Mode -# -# Daily run with ELASTICKV_REDIS_ONEPHASE_DEDUP=1 and -# ELASTICKV_DYNAMODB_ONEPHASE_DEDUP=1 pinned so the demo cluster exercises the -# option-2 idempotency path on both adapters, independent of future default -# changes. This preserves the high-pressure dedup signal for -# :duplicate-elements / :G-single-item-realtime anomalies in each workload's -# analysis output. -# -# Scope: Redis + DynamoDB workloads. The dedup feature is controlled by the -# Redis adapter's onePhaseTxnDedup flag (RPUSH/LPUSH, MULTI/EXEC, -# standalone SET) and the DynamoDB adapter's onePhaseTxnDedup flag -# (single-item UpdateItem/PutItem/DeleteItem). S3 / SQS do -# not yet route through the dedup loop, so re-running them here would add -# hours of CI for no signal on the new code path. -# -# Cadence: 03:17 UTC daily (off-peak; non-zero minute per ScheduleWakeup -# guidance). The general 6-hourly scheduled workflow continues to run -# with explicit dedup opt-outs so the legacy path also stays covered. - -on: - schedule: - - cron: '17 3 * * *' - workflow_dispatch: - inputs: - time-limit: - description: "Workload runtime seconds" - required: false - default: "300" - rate: - description: "Ops/sec per worker" - required: false - default: "10" - concurrency: - description: "Number of worker threads" - required: false - default: "8" - key-count: - description: "Number of distinct keys" - required: false - default: "16" - max-writes-per-key: - # Keep this comfortably above rate * concurrency * time-limit / key-count. 
- # The scheduled default is 10 * 8 * 300 / 16 = 1500 writes/key before - # multi-op expansion; 3000 prevents the append generator from exhausting - # its value space before the 300s stress window completes. - description: "Maximum writes per key before exhaustion" - required: false - default: "3000" - max-txn-length: - description: "Maximum micro-ops per transaction" - required: false - default: "4" - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-jepsen-dedup-scheduled - -name: Jepsen Scheduled Stress Test (Option-2 Dedup) -permissions: - contents: read -jobs: - test: - runs-on: ubuntu-latest - env: - GOCACHE: /tmp/go-build - # Enable the Redis adapter option-2 dedup gate for this run. This - # is the load-bearing differentiator from the general scheduled - # workflow — the demo cluster's redis adapter routes RPUSH/LPUSH, - # MULTI/EXEC, and standalone SET through runTransactionWithDedup, - # exercising the FSM exact-ts probe and the reusable retry - # state. Anomalies in :duplicate-elements / :G-single-item-realtime - # under this flag indicate a regression in option-2 plumbing. - ELASTICKV_REDIS_ONEPHASE_DEDUP: "1" - # Pin the DynamoDB adapter option-2 dedup path on. demo.go wires it - # via adapter.NewDynamoDBServer, which reads this env var; - # the single-item write path (UpdateItem/PutItem/DeleteItem) then - # routes through retryItemWriteWithGenerationDedup on the leader, - # exercising the same FSM exact-ts probe as the Redis path. The - # DynamoDB workload below validates no :duplicate-elements under it. 
- ELASTICKV_DYNAMODB_ONEPHASE_DEDUP: "1" - steps: - - uses: actions/checkout@v6 - with: - submodules: recursive - - uses: actions/setup-java@v5 - with: - distribution: temurin - java-version: '21' - - uses: actions/setup-go@v6 - with: - go-version-file: 'go.mod' - - name: Install netcat and graphviz - run: sudo apt-get update && sudo apt-get install -y netcat-openbsd graphviz - - name: Install Leiningen - run: | - curl -L https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein > ~/lein - chmod +x ~/lein - ~/lein version - - name: Cache Maven and Leiningen artifacts - uses: actions/cache@v5 - with: - path: | - ~/.m2/repository - ~/.lein - key: ${{ runner.os }}-maven-${{ hashFiles('jepsen/project.clj') }} - restore-keys: | - ${{ runner.os }}-maven- - - name: Pre-fetch Go modules - run: | - mkdir -p "$GOCACHE" /tmp/go-tmp - export GOCACHE GOTMPDIR=/tmp/go-tmp - go mod download - - name: Warm Leiningen Maven cache - working-directory: jepsen - run: | - # Matches the retry pattern used in jepsen-test-scheduled.yml so - # both workflows fail the step (not silently succeed) when Maven - # Central exhausts the retry budget. The previous shape - # `until [ "$n" -ge 3 ]; do ~/lein deps && break; done` exited - # the loop on the iteration count rather than on lein-deps - # success; when every attempt failed the loop terminated with - # the last command being `sleep` (exit 0), reporting the step - # as green despite no dependencies being warmed -- claude[bot] - # PR #889 blocking finding. Backoff also aligned to 30*n - # seconds for parity with the general workflow. - set -uo pipefail - n=0 - max=3 - until ~/lein deps; do - n=$((n + 1)) - if [ "$n" -ge "$max" ]; then - echo "lein deps failed after $n attempts" >&2 - exit 1 - fi - sleep_secs=$((n * 30)) - echo "lein deps failed (attempt $n/$max), retrying in ${sleep_secs}s..." 
>&2 - sleep "$sleep_secs" - done - - name: Launch demo cluster (dedup gate ON) - run: | - set -euo pipefail - mkdir -p "$GOCACHE" /tmp/go-tmp - export GOTMPDIR=/tmp/go-tmp - # The ELASTICKV_REDIS_ONEPHASE_DEDUP=1 and - # ELASTICKV_DYNAMODB_ONEPHASE_DEDUP=1 env vars are inherited - # from the job env above. demo.go reads them via the redis - # server's WithOnePhaseTxnDedup option (adapter/redis.go - # NewRedisServer) and the DynamoDB server's NewDynamoDBServer - # (adapter/dynamodb.go), both of which call os.Getenv. - nohup go run cmd/server/demo.go > /tmp/elastickv-demo.log 2>&1 & - echo $! > /tmp/elastickv-demo.pid - - echo "ELASTICKV_REDIS_ONEPHASE_DEDUP=${ELASTICKV_REDIS_ONEPHASE_DEDUP}" - echo "ELASTICKV_DYNAMODB_ONEPHASE_DEDUP=${ELASTICKV_DYNAMODB_ONEPHASE_DEDUP}" - # The env vars are set at the JOB level above and inherited by - # all `run:` steps; nothing in demo.go can intercept or unset - # them before the adapters read os.Getenv. So if they are "1" - # here, the dedup paths ARE active in the cluster. We print - # them explicitly so a failed run's log makes the configuration - # unambiguous (vs the general 6-hourly workflow whose runs would - # have empty values here). - if [ "${ELASTICKV_REDIS_ONEPHASE_DEDUP:-}" != "1" ]; then - echo "FATAL: ELASTICKV_REDIS_ONEPHASE_DEDUP is not '1' — this workflow runs only with the dedup gate on" - exit 2 - fi - if [ "${ELASTICKV_DYNAMODB_ONEPHASE_DEDUP:-}" != "1" ]; then - echo "FATAL: ELASTICKV_DYNAMODB_ONEPHASE_DEDUP is not '1' — this workflow pins the dedup path on" - exit 2 - fi - - echo "Waiting for redis (63791-63793) and dynamo (63801-63803) listeners..." 
- for i in {1..90}; do - if nc -z 127.0.0.1 63791 && nc -z 127.0.0.1 63792 && nc -z 127.0.0.1 63793 \ - && nc -z 127.0.0.1 63801 && nc -z 127.0.0.1 63802 && nc -z 127.0.0.1 63803; then - echo "Cluster is up" - exit 0 - fi - sleep 1 - done - - echo "Demo cluster failed to start; dumping log:" - tail -n 200 /tmp/elastickv-demo.log || true - exit 1 - - name: Run Redis Jepsen workload (dedup mode) against elastickv - working-directory: jepsen - timeout-minutes: 10 - run: | - timeout 480 ~/lein run -m elastickv.redis-workload \ - --time-limit ${{ inputs.time-limit || '300' }} \ - --rate ${{ inputs.rate || '10' }} \ - --concurrency ${{ inputs.concurrency || '8' }} \ - --key-count ${{ inputs.key-count || '16' }} \ - --max-writes-per-key ${{ inputs.max-writes-per-key || '3000' }} \ - --max-txn-length ${{ inputs.max-txn-length || '4' }} \ - --ports 63791,63792,63793 \ - --host 127.0.0.1 - - name: Run DynamoDB Jepsen workload (dedup mode) against elastickv - working-directory: jepsen - timeout-minutes: 10 - # --local connects to the already-running demo cluster's dynamo - # ports (dedup ON via ELASTICKV_DYNAMODB_ONEPHASE_DEDUP), so the - # single-item list_append writes exercise the option-2 reuse + - # exact-ts probe path. A :duplicate-elements anomaly here is the - # regression this workflow is meant to catch. 
- run: | - timeout 480 ~/lein run -m elastickv.dynamodb-workload --local \ - --time-limit ${{ inputs.time-limit || '300' }} \ - --rate ${{ inputs.rate || '10' }} \ - --concurrency ${{ inputs.concurrency || '8' }} \ - --key-count ${{ inputs.key-count || '16' }} \ - --max-writes-per-key ${{ inputs.max-writes-per-key || '3000' }} \ - --max-txn-length ${{ inputs.max-txn-length || '4' }} \ - --dynamo-ports 63801,63802,63803 \ - --host 127.0.0.1 - - name: Dump demo cluster log on failure - if: failure() - run: | - echo "=== first 200 lines (startup) ===" - head -n 200 /tmp/elastickv-demo.log || true - echo "=== last 1000 lines (most recent activity) ===" - tail -n 1000 /tmp/elastickv-demo.log || true - echo "=== full log line count ===" - wc -l /tmp/elastickv-demo.log || true - - name: Upload demo cluster log on failure - if: failure() - uses: actions/upload-artifact@v7 - with: - name: elastickv-demo-log-dedup - path: /tmp/elastickv-demo.log - retention-days: 14 - if-no-files-found: warn - - name: Upload Jepsen store on failure - if: failure() - uses: actions/upload-artifact@v7 - with: - # Covers both the redis and dynamodb dedup-mode runs (each - # workload writes its own subdir under jepsen/store). - name: jepsen-store-dedup - path: jepsen/store - retention-days: 14 - - name: Stop demo cluster - if: always() - run: | - if [ -f /tmp/elastickv-demo.pid ]; then - pid=$(cat /tmp/elastickv-demo.pid) - kill "$pid" 2>/dev/null || true - wait "$pid" 2>/dev/null || true - fi diff --git a/.github/workflows/jepsen-test-scheduled.yml b/.github/workflows/jepsen-test-scheduled.yml index 9bc2869c..37785da1 100644 --- a/.github/workflows/jepsen-test-scheduled.yml +++ b/.github/workflows/jepsen-test-scheduled.yml @@ -39,16 +39,6 @@ jobs: runs-on: ubuntu-latest env: GOCACHE: /tmp/go-build - # Explicit dedup-OFF control baseline. The Redis and DynamoDB adapter - # onePhaseTxnDedup flags are default-on, so this workflow is preserved - # as legacy-path coverage. 
Pair with the dedup-ON workflow - # (.github/workflows/jepsen-test-scheduled-dedup.yml) which pins both - # env vars to 1. Retirement of this workflow is a follow-up after 30 - # days of post-flip data; until then, do NOT remove these env vars — - # without them the two workflows would exercise the same path under - # the new defaults. - ELASTICKV_REDIS_ONEPHASE_DEDUP: "0" - ELASTICKV_DYNAMODB_ONEPHASE_DEDUP: "0" steps: - uses: actions/checkout@v6 with: diff --git a/docs/design/2026_05_21_proposed_txn_secondary_idempotency.md b/docs/design/2026_05_21_proposed_txn_secondary_idempotency.md index 98dd6d4b..df691b7b 100644 --- a/docs/design/2026_05_21_proposed_txn_secondary_idempotency.md +++ b/docs/design/2026_05_21_proposed_txn_secondary_idempotency.md @@ -542,11 +542,10 @@ preserves availability and adds correctness. `cmd/server/demo.go` with `ELASTICKV_REDIS_ONEPHASE_DEDUP=1`. - **Scheduled Jepsen run criterion.** 7 consecutive days without `:duplicate-elements` / `:G-single-item-realtime` in the dedup-mode - workflow (`.github/workflows/jepsen-test-scheduled-dedup.yml`, - daily at 03:17 UTC). The general scheduled workflow - (`.github/workflows/jepsen-test-scheduled.yml`, every 6 h) continues to run *without* - the gate so the legacy path stays covered — both must stay green - for option-2 to be safe to default-on. + workflow. During rollout this was a dedicated daily workflow; after the + default-on soak period, the dedicated workflow was retired and the general + scheduled workflow (`.github/workflows/jepsen-test-scheduled.yml`, every 6 h) + now covers the default dedup-on path. - **Workflow scope rationale.** The dedup-mode workflow exercises only the Redis workload. 
The dedup feature ships behind the Redis adapter's `onePhaseTxnDedup` flag (RPUSH/LPUSH via diff --git a/docs/design/2026_06_03_partial_dynamodb_onephase_dedup.md b/docs/design/2026_06_03_partial_dynamodb_onephase_dedup.md index c36281c6..adb299dd 100644 --- a/docs/design/2026_06_03_partial_dynamodb_onephase_dedup.md +++ b/docs/design/2026_06_03_partial_dynamodb_onephase_dedup.md @@ -410,15 +410,11 @@ every replica applying the same log entry. timeouts (`cmd/server/demo.go`) so leadership flaps during the DynamoDB workload. The default path has DynamoDB dedup enabled; set `ELASTICKV_DYNAMODB_ONEPHASE_DEDUP=0` only to reproduce the legacy path. -- **CI — LANDED.** The DynamoDB list-append workload is added to the dedup-mode - workflow (`.github/workflows/jepsen-test-scheduled-dedup.yml`) with - `ELASTICKV_DYNAMODB_ONEPHASE_DEDUP=1` pinned at the job env (read by - `adapter.NewDynamoDBServer` in the demo cluster), a fail-closed gate - assertion before the listeners come up (mirroring the Redis assertion), and - the launch step now also waits on the dynamo listeners (63801-63803). The - general workflow (`.github/workflows/jepsen-test-scheduled.yml`) explicitly - sets `ELASTICKV_DYNAMODB_ONEPHASE_DEDUP=0` so the legacy path stays covered - as a control baseline after default-on. +- **CI — LANDED.** The DynamoDB list-append workload was added to the + dedup-mode workflow during rollout. After the default-on soak period, the + dedicated dedup workflow was retired; the general scheduled workflow + (`.github/workflows/jepsen-test-scheduled.yml`) now runs the default + `DynamoDBServer.onePhaseTxnDedup` path without an env-var opt-out. - Criterion to default-on: 7 consecutive days without `:duplicate-elements` in the dedup-mode DynamoDB workload, both workflows green. **Satisfied; this PR flips `DynamoDBServer.onePhaseTxnDedup`'s default and the env-var sense to @@ -473,8 +469,10 @@ change (the probe already exists), no proto change, no new store primitive. 
- (2026-06-18) Default-on follow-up: `DynamoDBServer.onePhaseTxnDedup` now defaults on because the probe-aware FSM reader is everywhere. Operators can still set `ELASTICKV_DYNAMODB_ONEPHASE_DEDUP=0` or - `WithDynamoOnePhaseTxnDedup(false)` for rollback; the general scheduled - Jepsen workflow pins that opt-out to keep legacy-path coverage. + `WithDynamoOnePhaseTxnDedup(false)` for rollback. +- (2026-06-26) Post-flip CI cleanup: retired the legacy-path scheduled control + by removing the `ELASTICKV_DYNAMODB_ONEPHASE_DEDUP=0` opt-out from the general + scheduled Jepsen workflow and deleting the dedicated dedup-mode workflow. - (2026-06-03, PR #920 round-1) **Leader-only dedup guard added** per codex P1: the adapter-local `commitTS` allocation is only safe on the leader, so the dedup path is gated on `d.coordinator.IsLeader()` (+ `NextFenced` ceiling diff --git a/docs/design/2026_06_10_proposed_redis_onephase_dedup_default_on.md b/docs/design/2026_06_10_proposed_redis_onephase_dedup_default_on.md index 0197d54e..53aa2bc1 100644 --- a/docs/design/2026_06_10_proposed_redis_onephase_dedup_default_on.md +++ b/docs/design/2026_06_10_proposed_redis_onephase_dedup_default_on.md @@ -35,22 +35,21 @@ The parent design landed an FSM-side dedup probe and an adapter-side write-set reuse path keyed on a stale `commit_ts` ridden through `OperationGroup.PrevCommitTS` into a V2 `TxnMeta`. The probe is always present; emission of `prev_commit_ts != 0` is gated by -`RedisServer.onePhaseTxnDedup` (constructor: `WithOnePhaseTxnDedup`, env: -`ELASTICKV_REDIS_ONEPHASE_DEDUP=1`). The gate stays default-off until -the cluster has uniformly upgraded — the parent's R5 "ship the reader -before the writer" sequencing. +`RedisServer.onePhaseTxnDedup` (constructor: `WithOnePhaseTxnDedup`, env +rollback: `ELASTICKV_REDIS_ONEPHASE_DEDUP=0`). The gate now defaults on +because the cluster has uniformly upgraded — the parent's R5 "ship the reader +before the writer" sequencing is satisfied. 
-Two Jepsen workflows run the same stress profile (`--time-limit 150 ---rate 10 --concurrency 8 --key-count 16 --max-writes-per-key 250 ---max-txn-length 4`) every day against `main`: +During rollout, two Jepsen workflows ran the same stress profile +(`--time-limit 150 --rate 10 --concurrency 8 --key-count 16 +--max-writes-per-key 250 --max-txn-length 4`) every day against `main`: | Workflow | Env | Purpose | |---|---|---| -| [`jepsen-test-scheduled.yml`][off] | `ELASTICKV_REDIS_ONEPHASE_DEDUP` unset (off) | Legacy-path baseline — expected to surface the parent's anomaly class until default-on lands. | -| [`jepsen-test-scheduled-dedup.yml`][on] | `ELASTICKV_REDIS_ONEPHASE_DEDUP=1` (on) | M4 validation — must stay green to authorize default-on. | +| [`jepsen-test-scheduled.yml`][scheduled] | `ELASTICKV_REDIS_ONEPHASE_DEDUP=0` during the temporary control window | Legacy-path baseline after the default flip. The `=0` opt-out (not the workflow itself) was retired on 2026-06-26; the workflow now runs the default dedup-on path. | +| `jepsen-test-scheduled-dedup.yml` | `ELASTICKV_REDIS_ONEPHASE_DEDUP=1` (on) | M4 validation to authorize default-on. Deleted on 2026-06-26 after dedup-on became the standard scheduled path. 
| -[off]: ../../.github/workflows/jepsen-test-scheduled.yml -[on]: ../../.github/workflows/jepsen-test-scheduled-dedup.yml +[scheduled]: ../../.github/workflows/jepsen-test-scheduled.yml ## M4 evidence @@ -58,7 +57,8 @@ The parent design's `M4` criterion is *"7 consecutive days without `:duplicate-elements` / `:G-single-item-realtime` in the dedup-mode workflow."* -Dedup-mode (`jepsen-test-scheduled-dedup.yml`) run history on `main`: +Dedup-mode run history on `main` from the retired +`jepsen-test-scheduled-dedup.yml` workflow: | Date (UTC) | Run | Conclusion | |---|---|---| @@ -145,7 +145,7 @@ subsequent `GET` returns `v` — it fails on the pre-fix build ### M2 — Control workflow disposition After default-on, `jepsen-test-scheduled.yml` would silently exercise -the same path as `jepsen-test-scheduled-dedup.yml` (unset env → true), +the same path as `jepsen-test-scheduled-dedup.yml` (unset env -> true), so the two workflows would collapse to the same coverage. Two options: | Option | Effect | Recommendation | @@ -158,6 +158,10 @@ to `jepsen-test-scheduled.yml`'s top-level `env:` so the control retains its meaning across the default flip. The 30-day retirement decision becomes a follow-up issue. +Post-flip cleanup on 2026-06-26 retires that temporary control: the +`ELASTICKV_REDIS_ONEPHASE_DEDUP=0` opt-out was removed from +`jepsen-test-scheduled.yml`, and the dedicated dedup-mode workflow was deleted. + ### M3 — Issue #937 closure Update [#937](https://github.com/bootjp/elastickv/issues/937) with the @@ -245,5 +249,7 @@ One PR, two commits: After merge: monitor the next 2–3 daily runs of both scheduled workflows. The dedup-mode workflow must stay green; the control workflow may or may not surface anomalies — both outcomes are -informative. Roll back via env var (no binary revert) if anything +informative. After the post-flip soak period, the control workflow was +retired and the standard scheduled workflow now covers the dedup-on path. 
+Roll back via env var (no binary revert) if anything unexpected appears.