Skip to content
Merged
83 changes: 81 additions & 2 deletions cmd/elastickv-admin/encryption.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"io"
"os"
"slices"
"strconv"
"time"

pb "github.com/bootjp/elastickv/proto"
Expand Down Expand Up @@ -42,21 +43,99 @@ func encryptionMain(args []string) error {
return runEncryptionRegisterWriter(rest, os.Stdout)
case "bootstrap":
return runEncryptionBootstrap(rest, os.Stdout)
case "probe-node-id":
return runEncryptionProbeNodeID(rest, os.Stdout)
case "-h", "--help", "help":
// `-h` is the universal "show usage" affordance for CLI
// subcommands; returning nil keeps the exit code at 0
// so shell scripts using $? to detect success do not
// trip on a help request.
_, err := fmt.Fprintln(os.Stdout, "usage: elastickv-admin encryption <subcommand> [flags]\n\nsubcommands:\n status\n rotate-dek\n register-writer\n bootstrap")
_, err := fmt.Fprintln(os.Stdout, "usage: elastickv-admin encryption <subcommand> [flags]\n\nsubcommands:\n status\n rotate-dek\n register-writer\n bootstrap\n probe-node-id")
if err != nil {
return errors.Wrap(err, "write usage")
}
return nil
default:
return errors.Errorf("encryption: unknown subcommand %q (supported: status, rotate-dek, register-writer, bootstrap)", sub)
return errors.Errorf("encryption: unknown subcommand %q (supported: status, rotate-dek, register-writer, bootstrap, probe-node-id)", sub)
}
}

// runEncryptionProbeNodeID implements the §5.1 collision-
// mitigation helper from the 6D design doc: it derives the
// 16-bit `node_id` from a candidate `full_node_id` so the
// operator can verify the value is free in the cluster's
// allocated node_id space BEFORE joining the node and
// triggering an `ErrNodeIDCollision` refusal.
//
// The narrowing — `uint16(full_node_id & 0xFFFF)` — matches the
// shipped writer-registry keying and §4.1 GCM nonce prefix
// (see `internal/encryption/applier.go::nodeIDMask`). Anything
// other than this narrowing would compute a value that diverges
// from what the runtime uses, defeating the purpose of the
// probe.
//
// No RPC — this is a pure local computation. The operator runs
// it against a candidate `full_node_id` and a current cluster's
// allocated node_id list (out of scope for this subcommand; the
// operator can read `gh ... encryption status` on each member
// to gather them).
func runEncryptionProbeNodeID(args []string, out io.Writer) error {
fs := flag.NewFlagSet("encryption probe-node-id", flag.ContinueOnError)
fullNodeIDStr := fs.String("full-node-id", "", "candidate 64-bit full_node_id to probe (decimal or 0x-prefixed hex)")
if err := fs.Parse(args); err != nil {
// flag.ContinueOnError reports `-h`/`--help` via the
// sentinel flag.ErrHelp after writing usage to
// fs.Output(). Match the convention used by every other
// encryption subcommand (status, rotate-dek,
// register-writer, bootstrap): treat ErrHelp as a usage
// request, exit 0 so shell scripts that test $? on help
// invocations don't trip.
if errors.Is(err, flag.ErrHelp) {
return nil
}
return errors.Wrap(err, "parse probe-node-id flags")
Comment on lines +85 to +96

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Treat probe-node-id help as non-error exit

runEncryptionProbeNodeID wraps every fs.Parse error, so elastickv-admin encryption probe-node-id -h/--help returns an error and exits non-zero instead of behaving like other encryption subcommands that treat flag.ErrHelp as a successful usage request. This breaks shell/help workflows that rely on help returning success.

Useful? React with 👍 / 👎.

}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
if *fullNodeIDStr == "" {
return errors.New("--full-node-id is required (decimal or 0x-prefixed hex)")
}
full, err := parseUint64WithRadix(*fullNodeIDStr)
if err != nil {
return errors.Wrapf(err, "parse --full-node-id=%q", *fullNodeIDStr)
}
const nodeIDMask = 0xFFFF
// keep masked as uint64: the value is only formatted for display, so the uint16 cast (and its gosec G115 nolint) is not load-bearing here.
masked := full & nodeIDMask
if _, err := fmt.Fprintf(out, "full_node_id: %#016x (%d)\nnode_id: %#04x (%d)\n",
full, full, masked, masked); err != nil {
return errors.Wrap(err, "write probe-node-id result")
}
return nil
}

// parseUint64WithRadix accepts either decimal ("12345") or
// 0x-prefixed hex ("0xDEADBEEF") so operators can paste values
// in whichever form their inventory uses. Uses strconv.ParseUint
// rather than fmt.Sscanf: ParseUint requires the ENTIRE input to
// be valid for the chosen radix, whereas Sscanf stops at the
// first non-matching character and silently returns the prefix.
// The silent-prefix behaviour would let "0x1234ZZZZ" parse as
// 0x1234 (or "1234abc" as 1234), which is unsafe for an
// operator-visible identifier where every digit matters.
func parseUint64WithRadix(s string) (uint64, error) {
if len(s) >= 2 && (s[0:2] == "0x" || s[0:2] == "0X") {
v, err := strconv.ParseUint(s[2:], 16, 64)
if err != nil {
return 0, errors.Wrap(err, "hex parse")
}
return v, nil
}
v, err := strconv.ParseUint(s, 10, 64)
if err != nil {
return 0, errors.Wrap(err, "decimal parse")
}
return v, nil
}

type encryptionEndpointFlags struct {
endpoint *string
timeout *time.Duration
Expand Down
137 changes: 137 additions & 0 deletions cmd/elastickv-admin/encryption_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -462,3 +462,140 @@ func startCustomEncryptionAdminTestServer(t *testing.T, impl pb.EncryptionAdminS
})
return lis.Addr().String()
}

// TestRunEncryptionProbeNodeID_Hex pins the §5.1 collision-
// mitigation helper from the 6D design doc: a hex full_node_id
// is parsed and narrowed via uint16(full_node_id & 0xFFFF) — the
// same mask that applier.go (writer-registry keying) and §4.1
// (GCM nonce prefix) already use. The CLI prints both the
// full_node_id (canonical 16-hex-digit form) and the derived
// node_id so the operator can verify the value is free in the
// cluster's allocated node_id space before joining.
func TestRunEncryptionProbeNodeID_Hex(t *testing.T) {
t.Parallel()
var buf bytes.Buffer
err := runEncryptionProbeNodeID([]string{"--full-node-id=0xDEADBEEFCAFE1234"}, &buf)
if err != nil {
t.Fatalf("runEncryptionProbeNodeID: %v", err)
}
out := buf.String()
for _, needle := range []string{
"full_node_id: 0xdeadbeefcafe1234",
"node_id: 0x1234",
} {
if !strings.Contains(out, needle) {
t.Errorf("output missing %q:\n%s", needle, out)
}
}
}

// TestRunEncryptionProbeNodeID_Decimal pins the dual-radix
// parser: decimal input works alongside the 0x-prefixed hex case.
func TestRunEncryptionProbeNodeID_Decimal(t *testing.T) {
t.Parallel()
var buf bytes.Buffer
err := runEncryptionProbeNodeID([]string{"--full-node-id=305419896"}, &buf)
if err != nil {
t.Fatalf("runEncryptionProbeNodeID: %v", err)
}
// 305419896 = 0x12345678; low 16 bits = 0x5678
out := buf.String()
for _, needle := range []string{
"full_node_id: 0x0000000012345678",
"node_id: 0x5678",
} {
if !strings.Contains(out, needle) {
t.Errorf("output missing %q:\n%s", needle, out)
}
}
}

// TestRunEncryptionProbeNodeID_RequiresFlag pins the
// required-flag contract: omitting --full-node-id returns an
// explicit error rather than producing a misleading "probe of
// zero" output.
func TestRunEncryptionProbeNodeID_RequiresFlag(t *testing.T) {
t.Parallel()
var buf bytes.Buffer
err := runEncryptionProbeNodeID([]string{}, &buf)
if err == nil {
t.Fatal("missing --full-node-id: want error, got nil")
}
if !strings.Contains(err.Error(), "--full-node-id is required") {
t.Errorf("error message: want mention of required flag, got %v", err)
}
if buf.Len() != 0 {
t.Errorf("must not print output on error: got %q", buf.String())
}
}

// TestRunEncryptionProbeNodeID_RejectsBadInput pins the
// invalid-input case: non-numeric, non-hex input returns an
// error rather than silently truncating.
func TestRunEncryptionProbeNodeID_RejectsBadInput(t *testing.T) {
t.Parallel()
var buf bytes.Buffer
err := runEncryptionProbeNodeID([]string{"--full-node-id=notanumber"}, &buf)
if err == nil {
t.Fatal("invalid input: want error, got nil")
}
}

// TestRunEncryptionProbeNodeID_HelpFlagExitsZero pins the
// flag.ErrHelp special-case: every other encryption subcommand
// (status, rotate-dek, register-writer, bootstrap) returns nil
// on `-h`/`--help` so shell scripts that test $? on help
// invocations don't trip. probe-node-id must match the
// convention. Codex r2 flagged the absent ErrHelp branch.
func TestRunEncryptionProbeNodeID_HelpFlagExitsZero(t *testing.T) {
t.Parallel()
var buf bytes.Buffer
err := runEncryptionProbeNodeID([]string{"-h"}, &buf)
if err != nil {
t.Fatalf("-h: want nil, got %v", err)
}
}

// TestRunEncryptionProbeNodeID_RejectsPartialInput pins the
// strconv.ParseUint vs fmt.Sscanf distinction: parseUint64WithRadix
// MUST reject inputs where only a prefix is parseable, because
// "0x1234ZZZZ" silently parsing as 0x1234 would mislead the
// operator into joining a node under a different node_id than
// the one they meant to probe. Sscanf accepts partial input;
// ParseUint rejects it.
func TestRunEncryptionProbeNodeID_RejectsPartialInput(t *testing.T) {
t.Parallel()
cases := []string{
"--full-node-id=0x1234ZZZZ", // hex prefix with non-hex tail
"--full-node-id=1234abc", // decimal prefix with letter tail
"--full-node-id=0xDEAD GHI", // hex with embedded space
}
for _, tc := range cases {
var buf bytes.Buffer
err := runEncryptionProbeNodeID([]string{tc}, &buf)
if err == nil {
t.Errorf("partial-input %q: want error, got nil (output=%q)", tc, buf.String())
}
}
}

// TestEncryptionMain_ProbeNodeIDSubcommand pins the dispatch in
// encryptionMain: the new probe-node-id case must route to the
// runner. Without this, a typo in encryptionMain's switch
// statement would route the subcommand to the default branch
// and fail with "unknown subcommand", and the dispatch test
// would not catch it.
func TestEncryptionMain_ProbeNodeIDSubcommand(t *testing.T) {
t.Parallel()
// Help-flag path: probe-node-id without flags errors with
// "required" rather than "unknown subcommand". A successful
// dispatch route reaches runEncryptionProbeNodeID's
// missing-flag branch.
err := encryptionMain([]string{"probe-node-id"})
if err == nil {
t.Fatal("probe-node-id with no flags: want error, got nil")
}
if !strings.Contains(err.Error(), "--full-node-id is required") {
t.Errorf("dispatch reached wrong handler: got %v", err)
}
}
61 changes: 61 additions & 0 deletions internal/encryption/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,4 +208,65 @@ var (
// from forcing a spurious refusal. See §5.5 for the
// IsEncryptionRelevantOpcode rationale.
ErrSidecarBehindRaftLog = errors.New("encryption: sidecar raft_applied_index is behind the raftengine's persisted applied index and the gap covers an encryption-relevant entry; refusing to start (run `encryption resync-sidecar` to advance the sidecar past the encryption-relevant entries before retrying — see §9.1 + §5.5)")

// ErrNodeIDCollision is the §9.1 / 6C-3 startup-refusal guard
// raised when two distinct `full_node_id` values in the local
// route-catalog snapshot map to the same 16-bit `node_id`. The
// derivation is `uint16(full_node_id & 0xFFFF)` — same narrowing
// the writer-registry keying and §4.1 GCM nonce prefix already
// use (see `applier.go` `nodeIDMask`). A collision means the two
// members would write under the same `(node_id, local_epoch)`
// prefix and reuse the GCM counter under the same DEK, which
// breaks AES-GCM's nonce-uniqueness requirement and is a
// catastrophic confidentiality + integrity failure.
//
// The guard reads the LOCAL route-catalog snapshot only — no
// RPC fan-out — because the startup-guard phase runs BEFORE the
// gRPC server is up, and the local Raft log is authoritative
// for cluster membership (every node applies the same
// `ConfChange` entries). See `2026_05_18_proposed_6d_enable_storage_envelope.md`
// §5.1 for the lifecycle rationale and the operator-mitigation
// menu (`probe-node-id` CLI, full_node_id re-roll, etc.).
//
// Skip conditions: encryption disabled (no nonce-reuse risk),
// empty membership snapshot (single-node pre-bootstrap; nothing
// to compare yet).
ErrNodeIDCollision = errors.New("encryption: two distinct full_node_id values in the local route-catalog hash to the same 16-bit node_id, which would reuse GCM nonces under the active DEK; refusing to start (re-roll one of the colliding full_node_id values — use `elastickv-admin encryption probe-node-id --full-node-id=<u64>` to verify before joining — see §9.1 + §5.1 of the 6D design doc)")

// ErrLocalEpochRollback is the §9.1 / 6C-3 startup-refusal
// guard raised when this node's sidecar `local_epoch` for the
// active storage DEK is less than OR equal to the local Pebble
// writer-registry's `LastSeenLocalEpoch` for the
// `(full_node_id, active_storage_dek_id)` row. The strict-ahead
// posture — `sidecar > registry`, not `>=` — is the §5.2
// nonce-monotonicity invariant: a node may only resume issuing
// nonces when its sidecar is STRICTLY ahead of the registry
// record. The equality case would replay the same
// `(node_id, local_epoch)` prefix and reuse the GCM counter
// under the same DEK, identical to the collision scenario
// `ErrNodeIDCollision` catches but at the single-node-restart
// timescale rather than the cluster-membership timescale.
//
// Classic trigger: operator restored the sidecar from an old
// backup, leaving `local_epoch` behind the registry record
// that the node has already used for prior writes.
//
// The guard reads LOCAL Pebble state only — no RPC. The
// writer-registry is local state on every node post-bootstrap
// (every node applies the same OpRegistration and OpBootstrap
// entries), so the local row is authoritative for this node's
// own monotonicity contract.
//
// Skip conditions: encryption disabled, sidecar's
// `Active.Storage == 0` (bootstrap not yet committed). The
// missing-registry-row case is NOT an unconditional skip —
// the §5.2 split fires based on `sidecar.StorageEnvelopeActive`:
// pre-cutover (active=false) the missing-row case is a
// freshly-joined-learner skip and `CheckLocalEpochRollback`
// returns nil; post-cutover (active=true) the missing-row
// case is a missing rollback anchor and the function fires
// `ErrLocalEpochRollback`. See the comprehensive doc on
// `CheckLocalEpochRollback` (`local_epoch_rollback.go`) for
// the full split.
ErrLocalEpochRollback = errors.New("encryption: sidecar local_epoch for the active storage DEK is at or below the local writer-registry's last_seen_local_epoch, which would replay the GCM nonce prefix under the same DEK; refusing to start (verify the sidecar was not restored from an old backup; run `encryption resync-sidecar` if appropriate, or rotate the affected DEK — see §9.1 + §5.2 of the 6D design doc)")
)
Loading
Loading