diff --git a/adapter/sqs.go b/adapter/sqs.go index 9e775a725..f32754fa1 100644 --- a/adapter/sqs.go +++ b/adapter/sqs.go @@ -5,6 +5,7 @@ import ( "io" "net" "net/http" + "strconv" "time" "github.com/bootjp/elastickv/kv" @@ -55,6 +56,11 @@ const ( sqsErrInternalFailure = "InternalFailure" sqsErrServiceUnavailable = "ServiceUnavailable" sqsErrMalformedRequest = "MalformedQueryString" + // sqsErrThrottling is the per-queue rate-limit rejection code. + // Returned with HTTP 400 and a Retry-After header derived from the + // bucket's refillRate + the request's charge count (see + // computeRetryAfter in sqs_throttle.go for the formula). + sqsErrThrottling = "Throttling" ) type SQSServerOption func(*SQSServer) @@ -76,6 +82,11 @@ type SQSServer struct { // goroutines without ordering between them. reaperCtx context.Context reaperCancel context.CancelFunc + // throttle is the per-queue rate-limit bucket store. Always + // non-nil; charge() short-circuits when the queue's meta has no + // throttle config so unconfigured queues pay one nil-check per + // request and nothing else (see sqs_throttle.go). + throttle *bucketStore } // WithSQSLeaderMap configures the Raft-address-to-SQS-address mapping used to @@ -98,6 +109,7 @@ func NewSQSServer(listen net.Listener, st store.MVCCStore, coordinate kv.Coordin coordinator: coordinate, reaperCtx: reaperCtx, reaperCancel: reaperCancel, + throttle: newBucketStoreDefault(), } s.targetHandlers = map[string]func(http.ResponseWriter, *http.Request){ sqsCreateQueueTarget: s.createQueue, @@ -131,6 +143,10 @@ func NewSQSServer(listen net.Listener, st store.MVCCStore, coordinate kv.Coordin func (s *SQSServer) Run() error { s.startReaper(s.reaperCtx) + // Throttle bucket idle-evict runs on a background ticker so the + // request hot path never pays the O(N) sweep cost. Cleaned up by + // the same reaperCtx cancellation that stops the message reaper. + go s.throttle.runSweepLoop(s.reaperCtx) if err := s.httpServer.Serve(s.listen); err != nil && !errors.Is(err, http.ErrServerClosed) { return errors.WithStack(err) } @@ -302,3 +318,19 @@ func writeSQSError(w http.ResponseWriter, status int, code string, message strin w.WriteHeader(status) _ = json.NewEncoder(w).Encode(resp) } + +// writeSQSThrottlingError emits the rate-limit rejection envelope: 400 +// + the AWS-shaped JSON error body + a Retry-After header carrying +// the integer-second wait derived from the bucket's refill rate and +// the request's charge count. The action argument is the bucket-action +// vocabulary ("Send" | "Receive" | "*") so the operator-visible +// message names the bucket that ran out, not just the queue. +func writeSQSThrottlingError(w http.ResponseWriter, queue, action string, retryAfter time.Duration) { + if retryAfter < time.Second { + retryAfter = time.Second + } + secs := int(retryAfter / time.Second) + w.Header().Set("Retry-After", strconv.Itoa(secs)) + writeSQSError(w, http.StatusBadRequest, sqsErrThrottling, + "Rate exceeded for queue '"+queue+"' action '"+action+"'") +} diff --git a/adapter/sqs_catalog.go b/adapter/sqs_catalog.go index f73d8dc3b..44bbadfb2 100644 --- a/adapter/sqs_catalog.go +++ b/adapter/sqs_catalog.go @@ -5,6 +5,7 @@ import ( "context" "io" "log/slog" + "math" "net/http" "net/url" "regexp" @@ -67,8 +68,26 @@ var sqsQueueNamePattern = regexp.MustCompile(`^[a-zA-Z0-9_-]{1,80}(\.fifo)?$`) // !sqs|queue|meta|. Serialized as JSON with a short magic prefix so // future schema migrations can switch encoding without reading back garbage. type sqsQueueMeta struct { - Name string `json:"name"` - Generation uint64 `json:"generation"` + Name string `json:"name"` + // Generation bumps on CreateQueue, DeleteQueue, and PurgeQueue — + // every operation that needs to invalidate the message keyspace + // (msg keys are prefixed with the generation so old data becomes + // unreachable). NOT suitable as a throttle-bucket key because a + // PurgeQueue would re-key the bucket and let a caller bypass the + // rate limit by repeatedly purging. + Generation uint64 `json:"generation"` + // Incarnation bumps on CreateQueue only; PurgeQueue and + // SetQueueAttributes preserve the prior value (the read-modify- + // write of the meta record carries it forward), and DeleteQueue + // removes the meta entirely. The next CreateQueue computes + // Incarnation = lastGen + 1, which is monotonic across delete / + // recreate cycles thanks to the persistent generation counter + // (sqsQueueGenKey survives DeleteQueue, see deleteQueueWithRetry). + // This is the throttle-bucket-key field — keying by Incarnation + // instead of Generation prevents PurgeQueue from resetting the + // in-memory token bucket while still isolating delete+recreate + // incarnations from each other. + Incarnation uint64 `json:"incarnation,omitempty"` CreatedAtHLC uint64 `json:"created_at_hlc,omitempty"` IsFIFO bool `json:"is_fifo,omitempty"` ContentBasedDedup bool `json:"content_based_dedup,omitempty"` @@ -92,6 +111,72 @@ type sqsQueueMeta struct { // commit time and trust HLC monotonicity to keep ordering sane. CreatedAtMillis int64 `json:"created_at_millis,omitempty"` LastModifiedAtMillis int64 `json:"last_modified_at_millis,omitempty"` + // Throttle is the per-queue rate-limit configuration. nil disables + // throttling (default). Set via SetQueueAttributes with the AWS-style + // names ThrottleSendCapacity / ThrottleSendRefillPerSecond / etc. + // Persisted on the meta so a leader failover loads the configuration + // along with the rest of the queue. + Throttle *sqsQueueThrottle `json:"throttle,omitempty"` + // PartitionCount is the number of FIFO partitions for this queue + // (Phase 3.D HT-FIFO, see docs/design/2026_04_26_proposed_sqs_split_queue_fifo.md). + // Zero or 1 means the legacy single-partition layout — no schema + // change. Greater than 1 enables HT-FIFO. Set at CreateQueue time + // and immutable thereafter (SetQueueAttributes rejects any change). + // Power-of-two values only (validator rejects others). PR 2 of the + // rollout introduces this field but a temporary CreateQueue gate + // rejects PartitionCount > 1 until PR 5 lifts the gate atomically + // with the data-plane fanout — so the schema exists but no + // partitioned data can land before the data plane is wired. + PartitionCount uint32 `json:"partition_count,omitempty"` + // FifoThroughputLimit mirrors the AWS attribute. "perMessageGroupId" + // (default for HT-FIFO) keeps the §3.3 hash-by-MessageGroupId + // routing; "perQueue" activates the partition-0 short-circuit so + // every group ID routes to one partition (effectively N=1). + // Set at CreateQueue time and immutable thereafter — flipping it + // live would re-route in-flight messages and silently violate + // within-group FIFO ordering (see §3.2 of the design). + FifoThroughputLimit string `json:"fifo_throughput_limit,omitempty"` + // DeduplicationScope mirrors the AWS attribute. "messageGroup" + // (default for HT-FIFO) means the dedup window is per + // (queue, partition, MessageGroupId, dedupId); "queue" is the + // legacy single-window behaviour. Set at CreateQueue time and + // immutable thereafter — changing live can resurrect or suppress + // messages depending on the direction of the change. The + // validator additionally rejects {PartitionCount > 1, + // DeduplicationScope = "queue"} at CreateQueue time because the + // dedup key cannot be globally unique across partitions without + // a cross-partition OCC transaction. + DeduplicationScope string `json:"deduplication_scope,omitempty"` +} + +// sqsQueueThrottle is the per-queue token-bucket configuration. Three +// independent buckets per queue: Send (SendMessage[Batch]), Recv +// (ReceiveMessage / DeleteMessage[Batch] / ChangeMessageVisibility[Batch], +// charged on the consumer side), Default (catch-all for any future +// non-Send/Recv verb that gets wired into the throttle path). +// +// Field-name vocabulary uses short forms (Send*, Recv*, Default*) for the +// JSON contract and AWS-style attribute names; the in-memory bucketKey +// uses the canonical action vocabulary ("Send" | "Receive" | "*"). +// throttleConfigToBucketAction and bucketActionForCharge bridge the two. +type sqsQueueThrottle struct { + SendCapacity float64 `json:"send_capacity,omitempty"` + SendRefillPerSecond float64 `json:"send_refill_per_second,omitempty"` + RecvCapacity float64 `json:"recv_capacity,omitempty"` + RecvRefillPerSecond float64 `json:"recv_refill_per_second,omitempty"` + DefaultCapacity float64 `json:"default_capacity,omitempty"` + DefaultRefillPerSecond float64 `json:"default_refill_per_second,omitempty"` +} + +// IsEmpty reports whether the configuration is the no-op (all six +// fields zero), in which case throttling is disabled for the queue. +func (t *sqsQueueThrottle) IsEmpty() bool { + if t == nil { + return true + } + return t.SendCapacity == 0 && t.SendRefillPerSecond == 0 && + t.RecvCapacity == 0 && t.RecvRefillPerSecond == 0 && + t.DefaultCapacity == 0 && t.DefaultRefillPerSecond == 0 } var storedSQSMetaPrefix = []byte{0x00, 'S', 'Q', 0x01} @@ -297,6 +382,14 @@ func parseAttributesIntoMeta(name string, attrs map[string]string) (*sqsQueueMet if meta.ContentBasedDedup && !meta.IsFIFO { return nil, newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, "ContentBasedDeduplication is only valid on FIFO queues") } + // HT-FIFO validation runs after resolveFifoQueueFlag so the + // IsFIFO-only checks see the post-resolution flag. The temporary + // dormancy gate (§11 PR 2) runs separately in createQueue so + // SetQueueAttributes paths share the schema validator without + // re-rejecting on the gate. + if err := validatePartitionConfig(meta); err != nil { + return nil, err + } return meta, nil } @@ -384,6 +477,61 @@ var sqsAttributeAppliers = map[string]attributeApplier{ m.ContentBasedDedup = b return nil }, + // PartitionCount enables HT-FIFO when > 1 (Phase 3.D, see + // docs/design/2026_04_26_proposed_sqs_split_queue_fifo.md). Set + // at CreateQueue time; SetQueueAttributes attempts to change it + // reject via the immutability check in trySetQueueAttributesOnce. + // PR 2 of the rollout introduces the field but the temporary + // dormancy gate in tryCreateQueueOnce rejects PartitionCount > 1 + // until PR 5 lifts the gate atomically with the data plane. + "PartitionCount": func(m *sqsQueueMeta, v string) error { + // Parse at uint64 width and bound-check explicitly so the + // uint32 narrowing below is gosec-clean. + n, err := strconv.ParseUint(strings.TrimSpace(v), 10, 64) + if err != nil { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "PartitionCount must be a non-negative integer") + } + if n > math.MaxUint32 { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "PartitionCount is out of range") + } + m.PartitionCount = uint32(n) + return nil + }, + "FifoThroughputLimit": func(m *sqsQueueMeta, v string) error { + v = strings.TrimSpace(v) + switch v { + case "", htfifoThroughputPerMessageGroupID, htfifoThroughputPerQueue: + m.FifoThroughputLimit = v + return nil + } + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "FifoThroughputLimit must be 'perMessageGroupId' or 'perQueue'") + }, + "DeduplicationScope": func(m *sqsQueueMeta, v string) error { + v = strings.TrimSpace(v) + switch v { + case "", htfifoDedupeScopeMessageGroup, htfifoDedupeScopeQueue: + m.DeduplicationScope = v + return nil + } + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "DeduplicationScope must be 'messageGroup' or 'queue'") + }, + // Throttle* are non-AWS extensions for per-queue rate limiting, + // see docs/design/2026_04_26_proposed_sqs_per_queue_throttling.md. + // Each accepts a non-negative float64; the cross-attribute + // validation that enforces both-zero-or-both-positive on each + // (capacity, refill) pair, capacity ≥ refill, hard ceiling, and + // the capacity ≥ 10 floor for batch-charging buckets runs in + // validateThrottleConfig after every Throttle* applier has fired. + "ThrottleSendCapacity": applyThrottleField(throttleSetSendCapacity), + "ThrottleSendRefillPerSecond": applyThrottleField(throttleSetSendRefill), + "ThrottleRecvCapacity": applyThrottleField(throttleSetRecvCapacity), + "ThrottleRecvRefillPerSecond": applyThrottleField(throttleSetRecvRefill), + "ThrottleDefaultCapacity": applyThrottleField(throttleSetDefaultCapacity), + "ThrottleDefaultRefillPerSecond": applyThrottleField(throttleSetDefaultRefill), "RedrivePolicy": func(m *sqsQueueMeta, v string) error { // Validate the policy at attribute-apply time so a malformed // RedrivePolicy never makes it onto the queue meta record. The @@ -419,6 +567,174 @@ func applyAttributes(meta *sqsQueueMeta, attrs map[string]string) error { return err } } + // Throttle* validation has to run after every applier so the + // pair-wise rules (both-zero-or-both-positive, capacity ≥ refill, + // capacity ≥ 10 for batch buckets) see the post-update meta as a + // whole. Running per-applier would reject a valid two-attribute + // update (e.g. SendCapacity + SendRefillPerSecond) on the first + // applier because the second value is not yet present. + if err := validateThrottleConfig(meta); err != nil { + return err + } + // HT-FIFO partition validation runs in parseAttributesIntoMeta / + // trySetQueueAttributesOnce, AFTER resolveFifoQueueFlag, so the + // IsFIFO-only checks see the post-resolution flag. Running here + // would reject a valid CreateQueue with FifoQueue=true + + // FifoThroughputLimit=perMessageGroupId because IsFIFO is still + // false at this point in the flow. + return nil +} + +// applyThrottleField wraps a setter that writes one Throttle* field +// into meta.Throttle, allocating the struct lazily on first use. The +// per-field setter does the float parse + non-negative + hard-ceiling +// check; cross-field rules run later in validateThrottleConfig. +func applyThrottleField(set func(*sqsQueueThrottle, float64)) attributeApplier { + return func(m *sqsQueueMeta, v string) error { + f, err := parseThrottleFloat(v) + if err != nil { + return err + } + if m.Throttle == nil { + m.Throttle = &sqsQueueThrottle{} + } + set(m.Throttle, f) + return nil + } +} + +// parseThrottleFloat parses the wire string into a non-negative float +// bounded by the hard ceiling. Any malformed or out-of-range value +// turns into InvalidAttributeValue with a self-describing message so +// the operator sees the cause without grepping the server log. +func parseThrottleFloat(value string) (float64, error) { + v := strings.TrimSpace(value) + f, err := strconv.ParseFloat(v, 64) + if err != nil { + return 0, newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "throttle attribute must be a non-negative number") + } + if math.IsNaN(f) || math.IsInf(f, 0) || f < 0 { + return 0, newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "throttle attribute must be finite and non-negative") + } + if f > throttleHardCeilingPerSecond { + return 0, newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "throttle attribute exceeds hard ceiling 100000") + } + return f, nil +} + +// Per-field setters keep applyThrottleField a one-liner per attribute +// and let validateThrottleConfig stay outside the applier dispatch +// table. Defined as functions (not closures) so a future caller from +// outside applyAttributes — e.g. a programmatic admin surface — can +// reuse them without recreating the closure boilerplate. +func throttleSetSendCapacity(t *sqsQueueThrottle, f float64) { t.SendCapacity = f } +func throttleSetSendRefill(t *sqsQueueThrottle, f float64) { t.SendRefillPerSecond = f } +func throttleSetRecvCapacity(t *sqsQueueThrottle, f float64) { t.RecvCapacity = f } +func throttleSetRecvRefill(t *sqsQueueThrottle, f float64) { t.RecvRefillPerSecond = f } +func throttleSetDefaultCapacity(t *sqsQueueThrottle, f float64) { t.DefaultCapacity = f } +func throttleSetDefaultRefill(t *sqsQueueThrottle, f float64) { t.DefaultRefillPerSecond = f } + +// validateThrottleConfig enforces the §3.2 cross-attribute rules on +// the post-applier meta. The single-field constraints (non-negative, +// hard ceiling) are already enforced inside parseThrottleFloat; +// what's left is pair-wise: +// +// - Each (capacity, refill) pair must be both zero (action disabled) +// or both positive. A capacity-without-refill bucket would never +// refill; a refill-without-capacity bucket has no burst headroom. +// - capacity ≥ refill, otherwise the bucket can never burst above +// steady state (the bucket can only ever hold one second's worth). +// - For action buckets that cover a batch verb (Send, Recv) the +// capacity must be ≥ throttleMinBatchCapacity (== 10). A capacity +// below the largest single charge is permanently unserviceable +// for full batches. +// +// If meta.Throttle is empty (the IsEmpty short-circuit) the function +// also drops the empty struct so a round-trip GetQueueAttributes +// reports the queue as unthrottled rather than zero-valued. Mirrors +// how nil throttle on the meta means "not configured". +func validateThrottleConfig(meta *sqsQueueMeta) error { + if meta.Throttle == nil { + return nil + } + t := meta.Throttle + if err := validateThrottlePair("ThrottleSend", t.SendCapacity, t.SendRefillPerSecond, true); err != nil { + return err + } + if err := validateThrottlePair("ThrottleRecv", t.RecvCapacity, t.RecvRefillPerSecond, true); err != nil { + return err + } + // Default* gets the same batch-capacity floor as Send*/Recv* + // because resolveActionConfig in sqs_throttle.go falls Send and + // Receive traffic through to Default whenever the corresponding + // Send*/Recv* pair is unset. Without the floor, a config like + // `ThrottleDefaultCapacity=5, ThrottleDefaultRefillPerSecond=1` + // would be accepted but make every full SendMessageBatch / + // DeleteMessageBatch (charge=10) permanently unserviceable — + // the bucket can never accumulate the 10 tokens. + if err := validateThrottlePair("ThrottleDefault", t.DefaultCapacity, t.DefaultRefillPerSecond, true); err != nil { + return err + } + if t.IsEmpty() { + // All-zero post-apply means the operator wrote a "disable" + // command; canonicalise to nil so downstream code hits the + // nil-throttle short-circuit rather than the IsEmpty branch. + meta.Throttle = nil + } + return nil +} + +// addThrottleAttributes renders the non-zero Throttle* pairs into out. +// Per §3.2 the wire-side vocabulary stays Send*/Recv*/Default*; the +// canonical bucket-action vocabulary is internal to the bucket store. +func addThrottleAttributes(out map[string]string, t *sqsQueueThrottle) { + if t.IsEmpty() { + return + } + if t.SendCapacity > 0 { + out["ThrottleSendCapacity"] = strconv.FormatFloat(t.SendCapacity, 'g', -1, 64) + out["ThrottleSendRefillPerSecond"] = strconv.FormatFloat(t.SendRefillPerSecond, 'g', -1, 64) + } + if t.RecvCapacity > 0 { + out["ThrottleRecvCapacity"] = strconv.FormatFloat(t.RecvCapacity, 'g', -1, 64) + out["ThrottleRecvRefillPerSecond"] = strconv.FormatFloat(t.RecvRefillPerSecond, 'g', -1, 64) + } + if t.DefaultCapacity > 0 { + out["ThrottleDefaultCapacity"] = strconv.FormatFloat(t.DefaultCapacity, 'g', -1, 64) + out["ThrottleDefaultRefillPerSecond"] = strconv.FormatFloat(t.DefaultRefillPerSecond, 'g', -1, 64) + } +} + +// validateThrottlePair runs the per-(action, capacity, refill) checks. +// requireBatchCapacity gates the capacity ≥ 10 rule. All three action +// sets (Send*, Recv*, Default*) currently pass requireBatchCapacity = +// true: an earlier draft exempted Default* on the assumption that the +// catch-all bucket had no batch verb in scope, but resolveActionConfig +// in sqs_throttle.go falls Send/Receive traffic through to Default +// whenever the dedicated pair is unset, so batch verbs (SendMessageBatch +// charges up to 10, DeleteMessageBatch charges up to 10) can still +// land on the Default bucket. The flag stays a parameter so a future +// caller that has a true non-batch action can opt out, but no +// production caller does today. +func validateThrottlePair(prefix string, capacity, refill float64, requireBatchCapacity bool) error { + if capacity == 0 && refill == 0 { + return nil + } + if capacity == 0 || refill == 0 { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + prefix+"Capacity and "+prefix+"RefillPerSecond must both be zero (disabled) or both positive") + } + if capacity < refill { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + prefix+"Capacity must be ≥ "+prefix+"RefillPerSecond (capacity is the burst cap; below refill the bucket cannot accumulate)") + } + if requireBatchCapacity && capacity < throttleMinBatchCapacity { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + prefix+"Capacity must be ≥ 10 — batch verbs (SendMessageBatch / DeleteMessageBatch) charge up to 10 tokens per call; a smaller capacity makes every full batch permanently unserviceable") + } return nil } @@ -440,6 +756,15 @@ func attributesEqual(a, b *sqsQueueMeta) bool { if a == nil || b == nil { return false } + return baseAttributesEqual(a, b) && + throttleConfigEqual(a.Throttle, b.Throttle) && + htfifoAttributesEqual(a, b) +} + +// baseAttributesEqual compares the pre-Phase-3.C/3.D attribute set. +// Split from attributesEqual so adding fields per phase does not +// push the function over the cyclop ceiling. +func baseAttributesEqual(a, b *sqsQueueMeta) bool { return a.IsFIFO == b.IsFIFO && a.ContentBasedDedup == b.ContentBasedDedup && a.VisibilityTimeoutSeconds == b.VisibilityTimeoutSeconds && @@ -450,6 +775,53 @@ func attributesEqual(a, b *sqsQueueMeta) bool { a.RedrivePolicy == b.RedrivePolicy } +// throttleConfigEqual compares two Throttle configs for the +// CreateQueue idempotency check. Without including the throttle +// fields in attributesEqual, a re-create with different limits would +// be treated as idempotent and silently keep the old limits. +func throttleConfigEqual(a, b *sqsQueueThrottle) bool { + aEmpty := a.IsEmpty() + bEmpty := b.IsEmpty() + if aEmpty && bEmpty { + return true + } + if aEmpty != bEmpty { + return false + } + return a.SendCapacity == b.SendCapacity && + a.SendRefillPerSecond == b.SendRefillPerSecond && + a.RecvCapacity == b.RecvCapacity && + a.RecvRefillPerSecond == b.RecvRefillPerSecond && + a.DefaultCapacity == b.DefaultCapacity && + a.DefaultRefillPerSecond == b.DefaultRefillPerSecond +} + +// htfifoAttributesEqual compares the Phase 3.D HT-FIFO fields. +// +// PartitionCount normalisation: validatePartitionConfig documents 0 +// and 1 as equivalent ("unset" / "single-partition"); a queue created +// without PartitionCount is stored as 0 while a queue created with +// explicit PartitionCount=1 is stored as 1, so strict equality would +// have CreateQueue reject the second call as "different attributes" +// even though the queues are semantically identical. +// normalisePartitionCount maps both to 1 for the idempotency check. +func htfifoAttributesEqual(a, b *sqsQueueMeta) bool { + return normalisePartitionCount(a.PartitionCount) == normalisePartitionCount(b.PartitionCount) && + a.FifoThroughputLimit == b.FifoThroughputLimit && + a.DeduplicationScope == b.DeduplicationScope +} + +// normalisePartitionCount collapses the two "single-partition" forms +// (0 = unset, 1 = explicit) into a single canonical value so equality +// checks treat them as identical. Any value > 1 is returned unchanged +// — those cases must already match exactly to be considered equal. +func normalisePartitionCount(n uint32) uint32 { + if n == 0 { + return 1 + } + return n +} + // ------------------------ storage primitives ------------------------ func (s *SQSServer) nextTxnReadTS(ctx context.Context) uint64 { @@ -531,6 +903,15 @@ func (s *SQSServer) createQueueCore(ctx context.Context, in *sqsCreateQueueInput if err != nil { return "", err } + // Temporary dormancy gate (Phase 3.D §11 PR 2). PartitionCount > 1 + // must reject until PR 5 wires the data plane atomically with the + // gate-lift. Without this, accepting a partitioned-queue create + // would let SendMessage write under the legacy single-partition + // prefix; the PR 5 reader would never find those messages and the + // reaper would not enumerate them — silent message loss. + if err := validatePartitionDormancyGate(requested); err != nil { + return "", err + } if len(in.Tags) > sqsMaxTagsPerQueue { // AWS caps tags per queue at 50. CreateQueue must reject // over-cap tag bundles up front; a silent slice-and-store @@ -585,6 +966,12 @@ func (s *SQSServer) tryCreateQueueOnce(ctx context.Context, requested *sqsQueueM return false, errors.WithStack(err) } requested.Generation = lastGen + 1 + // Incarnation == Generation at create time (both monotonic from + // the persistent gen counter), but it stops moving on PurgeQueue + // and SetQueueAttributes — those paths preserve meta.Incarnation + // while bumping or leaving Generation. The throttle bucket key + // hangs off Incarnation so a Purge cannot reset the bucket. + requested.Incarnation = requested.Generation if clock := s.coordinator.Clock(); clock != nil { requested.CreatedAtHLC = clock.Current() } @@ -616,6 +1003,15 @@ func (s *SQSServer) tryCreateQueueOnce(ctx context.Context, requested *sqsQueueM if _, err := s.coordinator.Dispatch(ctx, req); err != nil { return false, errors.WithStack(err) } + // Drop any throttle bucket that survived a delete-then-create + // race. DeleteQueue invalidates after its + // commit, but a sendMessage holding pre-delete meta can recreate + // a bucket between that invalidate and this CreateQueue commit; + // invalidating again here on a genuine create (not the idempotent + // return path above, which exits before this point) guarantees + // the new queue starts with a fresh full-capacity bucket + // regardless of in-flight traffic to the prior incarnation. + s.throttle.invalidateQueue(requested.Name) return true, nil } @@ -634,6 +1030,14 @@ func (s *SQSServer) deleteQueue(w http.ResponseWriter, r *http.Request) { writeSQSErrorFromErr(w, err) return } + // Drop in-memory throttle buckets belonging to this queue so a + // same-name CreateQueue immediately after this delete starts with + // a fresh full-capacity bucket, not the stale balance from the + // previous incarnation. Without this step the old throttle would + // keep enforcing for up to the idle-evict window (default 1 h), + // surprising operators who use DeleteQueue+CreateQueue to reset + // queue state. + s.throttle.invalidateQueue(name) // SQS DeleteQueue returns 200 with an empty body. writeSQSJSON(w, map[string]any{}) } @@ -987,6 +1391,16 @@ func queueMetaToAttributes(meta *sqsQueueMeta, selection sqsAttributeSelection, if meta.RedrivePolicy != "" { all["RedrivePolicy"] = meta.RedrivePolicy } + // Throttle* are non-AWS extensions. Surfacing them in + // GetQueueAttributes lets operators read back what they set; SDKs + // that strictly validate the attribute set will ignore unknown + // keys. Extracted into a helper so queueMetaToAttributes stays + // under the cyclop ceiling. + addThrottleAttributes(all, meta.Throttle) + // HT-FIFO attributes (Phase 3.D). Same omission rule as Throttle*: + // only present when configured. Extracted into a helper so this + // function stays under the cyclop ceiling. + addHTFIFOAttributes(all, meta) if selection.expandAll { return all } @@ -1018,58 +1432,120 @@ func (s *SQSServer) setQueueAttributes(w http.ResponseWriter, r *http.Request) { writeSQSError(w, http.StatusBadRequest, sqsErrMissingParameter, "Attributes is required") return } - if err := s.setQueueAttributesWithRetry(r.Context(), name, in.Attributes); err != nil { + throttleChanged, err := s.setQueueAttributesWithRetry(r.Context(), name, in.Attributes) + if err != nil { writeSQSErrorFromErr(w, err) return } + // Drop the in-memory bucket entries belonging to this queue *after* + // the Raft commit so the next request rebuilds from the freshly + // committed throttle config. Gated on whether the request actually + // changed a Throttle* value — an unconditional invalidate would + // reset the bucket on every unrelated SetQueueAttributes (e.g. + // VisibilityTimeout-only update), giving any caller a way to + // silently restore a noisy tenant's burst capacity by writing a + // no-op SetQueueAttributes. Gating on key-presence alone is not + // enough either — a same-value Throttle* write would still pass + // the presence check and invalidate, so a caller could repeat + // their own current throttle config to bump the bucket back to + // full capacity. trySetQueueAttributesOnce therefore + // compares the old and new throttle configs under the same Raft + // read snapshot used for the commit and reports whether the values + // actually moved. The bucket reconciliation in loadOrInit also + // catches a stale bucket if a throttle change slips past this gate + // (e.g. via a future admin path), so the gating here is purely a + // hot-path optimisation plus a no-op-bypass guard. + if throttleChanged { + s.throttle.invalidateQueue(name) + } writeSQSJSON(w, map[string]any{}) } -func (s *SQSServer) setQueueAttributesWithRetry(ctx context.Context, queueName string, attrs map[string]string) error { +func (s *SQSServer) setQueueAttributesWithRetry(ctx context.Context, queueName string, attrs map[string]string) (bool, error) { backoff := transactRetryInitialBackoff deadline := time.Now().Add(transactRetryMaxDuration) for range transactRetryMaxAttempts { - done, err := s.trySetQueueAttributesOnce(ctx, queueName, attrs) + throttleChanged, done, err := s.trySetQueueAttributesOnce(ctx, queueName, attrs) if err == nil && done { - return nil + return throttleChanged, nil } if err != nil && !isRetryableTransactWriteError(err) { - return err + return false, err } if err := waitRetryWithDeadline(ctx, deadline, backoff); err != nil { - return errors.WithStack(err) + return false, errors.WithStack(err) } backoff = nextTransactRetryBackoff(backoff) } - return newSQSAPIError(http.StatusInternalServerError, sqsErrInternalFailure, "set queue attributes retry attempts exhausted") + return false, newSQSAPIError(http.StatusInternalServerError, sqsErrInternalFailure, "set queue attributes retry attempts exhausted") } -// trySetQueueAttributesOnce is one read-validate-commit pass. The first -// return reports whether the caller should stop retrying (the attrs -// are now committed); an error means either a non-retryable failure -// (propagate) or a retryable write conflict (retry after backoff). -func (s *SQSServer) trySetQueueAttributesOnce(ctx context.Context, queueName string, attrs map[string]string) (bool, error) { - readTS := s.nextTxnReadTS(ctx) - meta, exists, err := s.loadQueueMetaAt(ctx, queueName, readTS) - if err != nil { - return false, errors.WithStack(err) - } - if !exists { - return false, newSQSAPIError(http.StatusBadRequest, sqsErrQueueDoesNotExist, "queue does not exist") +// applyAndValidateSetAttributes runs the apply + cross-validator +// chain for a SetQueueAttributes request. Extracted from +// trySetQueueAttributesOnce so that function stays under the cyclop +// ceiling once HT-FIFO immutability + Throttle validators were +// added. Returns nil on success; on rejection returns the typed +// sqsAPIError the caller forwards to writeSQSErrorFromErr. +// +// preApply snapshot allocation is gated on htfifoAttributesPresent +// so the common "mutable-only update" path stays alloc-free. +func applyAndValidateSetAttributes(meta *sqsQueueMeta, attrs map[string]string) error { + var preApply *sqsQueueMeta + if htfifoAttributesPresent(attrs) { + preApply = snapshotImmutableHTFIFO(meta) } if err := applyAttributes(meta, attrs); err != nil { - return false, err + return err + } + if preApply != nil { + if err := validatePartitionImmutability(preApply, meta); err != nil { + return err + } } // ContentBasedDeduplication is FIFO-only; a Standard queue // silently accepting it would advertise unsupported behavior to // clients. Same rule enforced on CreateQueue. if meta.ContentBasedDedup && !meta.IsFIFO { - return false, newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, "ContentBasedDeduplication is only valid on FIFO queues") + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, "ContentBasedDeduplication is only valid on FIFO queues") } + // HT-FIFO schema validator runs after applyAttributes so the + // FIFO-only checks see the post-apply state. IsFIFO comes from + // the loaded meta record (immutable from CreateQueue) so the + // validator sees the same flag CreateQueue set. + return validatePartitionConfig(meta) +} + +// trySetQueueAttributesOnce is one read-validate-commit pass. The +// returns are (throttleChanged, done, err). done reports whether the +// caller should stop retrying (the attrs are now committed); an error +// means either a non-retryable failure (propagate) or a retryable +// write conflict (retry after backoff). throttleChanged is true iff +// the post-apply meta's Throttle config differs from the pre-apply +// snapshot — the caller uses it to gate the cache invalidation so +// that a no-op same-value SetQueueAttributes does not reset the +// bucket to full capacity. +func (s *SQSServer) trySetQueueAttributesOnce(ctx context.Context, queueName string, attrs map[string]string) (bool, bool, error) { + readTS := s.nextTxnReadTS(ctx) + meta, exists, err := s.loadQueueMetaAt(ctx, queueName, readTS) + if err != nil { + return false, false, errors.WithStack(err) + } + if !exists { + return false, false, newSQSAPIError(http.StatusBadRequest, sqsErrQueueDoesNotExist, "queue does not exist") + } + // Snapshot the throttle config under the same read TS used for the + // commit so the comparison sees the value the writer is racing + // against, not a later one. Cheap because the value is small (six + // floats wrapped in a struct). + preThrottle := snapshotThrottle(meta.Throttle) + if err := applyAndValidateSetAttributes(meta, attrs); err != nil { + return false, false, err + } + throttleChanged := !throttleConfigEqual(preThrottle, meta.Throttle) meta.LastModifiedAtMillis = time.Now().UnixMilli() metaBytes, err := encodeSQSQueueMeta(meta) if err != nil { - return false, errors.WithStack(err) + return false, false, errors.WithStack(err) } metaKey := sqsQueueMetaKey(queueName) // StartTS + ReadKeys prevent two concurrent SetQueueAttributes from @@ -1084,9 +1560,22 @@ func (s *SQSServer) trySetQueueAttributesOnce(ctx context.Context, queueName str }, } if _, err := s.coordinator.Dispatch(ctx, req); err != nil { - return false, errors.WithStack(err) + return false, false, errors.WithStack(err) } - return true, nil + return throttleChanged, true, nil +} + +// snapshotThrottle returns a value-copy of the Throttle config so a +// later mutation of meta.Throttle (via applyAttributes) does not +// retroactively change the snapshot. throttleConfigEqual handles the +// nil-vs-empty case, so the snapshot can be the typed zero value +// when meta.Throttle was nil. +func snapshotThrottle(t *sqsQueueThrottle) *sqsQueueThrottle { + if t == nil { + return nil + } + cp := *t + return &cp } // sqsApproxCounterScanLimit caps a single GetQueueAttributes scan. AWS diff --git a/adapter/sqs_catalog_test.go b/adapter/sqs_catalog_test.go index 291732ded..eb71ebe56 100644 --- a/adapter/sqs_catalog_test.go +++ b/adapter/sqs_catalog_test.go @@ -136,6 +136,29 @@ func TestSQSServer_CatalogCreateIsIdempotent(t *testing.T) { if got, _ := out3["__type"].(string); got != sqsErrQueueNameExists { t.Fatalf("differing-attrs error type: got %q want %q", got, sqsErrQueueNameExists) } + + // Fourth call: same name, same non-throttle attrs as the original + // create, but different Throttle* values. The original create had + // no Throttle config; this one adds one. throttleConfigEqual must + // notice the diff and the call must reject as QueueNameExists. + // Without this case a bug in throttleConfigEqual (e.g. always + // returning true) would slip past the existing VisibilityTimeout- + // only test. + withThrottle := map[string]any{ + "QueueName": "idempotent", + "Attributes": map[string]string{ + "VisibilityTimeout": "60", + "ThrottleSendCapacity": "10", + "ThrottleSendRefillPerSecond": "1", + }, + } + status4, out4 := callSQS(t, node, sqsCreateQueueTarget, withThrottle) + if status4 != http.StatusBadRequest { + t.Fatalf("re-create with added Throttle*: got %d want 400; body %v", status4, out4) + } + if got, _ := out4["__type"].(string); got != sqsErrQueueNameExists { + t.Fatalf("Throttle*-diff error type: got %q want %q", got, sqsErrQueueNameExists) + } } func TestSQSServer_CatalogGetAndSetAttributes(t *testing.T) { diff --git a/adapter/sqs_messages.go b/adapter/sqs_messages.go index a64a0006e..dcf8cfd64 100644 --- a/adapter/sqs_messages.go +++ b/adapter/sqs_messages.go @@ -294,33 +294,67 @@ type sqsChangeVisibilityInput struct { // ------------------------ handlers ------------------------ -func (s *SQSServer) sendMessage(w http.ResponseWriter, r *http.Request) { +// prepareSendMessage decodes the SendMessage payload and resolves +// the queue name. Throttle charging happens after the meta load in +// validateSend so we don't pay an extra meta read just to discover +// throttling is off. +func (s *SQSServer) prepareSendMessage(w http.ResponseWriter, r *http.Request) (sqsSendMessageInput, string, bool) { var in sqsSendMessageInput if err := decodeSQSJSONInput(r, &in); err != nil { writeSQSErrorFromErr(w, err) - return + return in, "", false } queueName, err := queueNameFromURL(in.QueueUrl) if err != nil { writeSQSErrorFromErr(w, err) - return + return in, "", false } + return in, queueName, true +} + +// validateSend loads queue meta, runs the throttle charge against +// the loaded throttle config (no extra meta read), then validates +// message attributes / FIFO params and resolves the delay. Returns +// ok=false if any step has already written the error response. +// +// Throttle check sits AFTER the meta load (so we have the throttle +// config) and AFTER the QueueDoesNotExist branch (so a missing +// queue is reported as 400 QueueDoesNotExist, not as a Throttling +// 400 against a non-existent bucket). It still sits OUTSIDE the +// OCC transaction (§4.2): a rejected request never reaches the +// coordinator. +func (s *SQSServer) validateSend(w http.ResponseWriter, r *http.Request, queueName string, in sqsSendMessageInput) (*sqsQueueMeta, uint64, int64, bool) { meta, readTS, apiErr := s.loadQueueMetaForSend(r.Context(), queueName, []byte(in.MessageBody)) if apiErr != nil { writeSQSErrorFromErr(w, apiErr) - return + return nil, 0, 0, false + } + if !s.chargeQueueWithThrottle(w, queueName, bucketActionSend, 1, meta.Throttle, meta.Incarnation) { + return nil, 0, 0, false } if apiErr := validateMessageAttributes(in.MessageAttributes); apiErr != nil { writeSQSErrorFromErr(w, apiErr) - return + return nil, 0, 0, false } if apiErr := validateSendFIFOParams(meta, in); apiErr != nil { writeSQSErrorFromErr(w, apiErr) - return + return nil, 0, 0, false } delay, apiErr := resolveSendDelay(meta, in.DelaySeconds) if apiErr != nil { writeSQSErrorFromErr(w, apiErr) + return nil, 0, 0, false + } + return meta, readTS, delay, true +} + +func (s *SQSServer) sendMessage(w http.ResponseWriter, r *http.Request) { + in, queueName, ok := s.prepareSendMessage(w, r) + if !ok { + return + } + meta, readTS, delay, ok := s.validateSend(w, r, queueName, in) + if !ok { return } if meta.IsFIFO { @@ -530,6 +564,13 @@ func (s *SQSServer) receiveMessage(w http.ResponseWriter, r *http.Request) { writeSQSError(w, http.StatusBadRequest, sqsErrQueueDoesNotExist, "queue does not exist") return } + // Throttle check uses the loaded meta's throttle config so we + // don't pay an extra meta read just to discover throttling is + // off. Sits AFTER the QueueDoesNotExist branch — a missing queue + // should not consume a Recv token. + if !s.chargeQueueWithThrottle(w, queueName, bucketActionReceive, 1, meta.Throttle, meta.Incarnation) { + return + } max, maxErr := resolveReceiveMaxMessages(in.MaxNumberOfMessages) if maxErr != nil { writeSQSErrorFromErr(w, maxErr) @@ -1106,6 +1147,9 @@ func (s *SQSServer) deleteMessage(w http.ResponseWriter, r *http.Request) { writeSQSErrorFromErr(w, err) return } + if !s.chargeQueue(w, r, queueName, bucketActionReceive, 1) { + return + } if err := s.deleteMessageWithRetry(r.Context(), queueName, handle); err != nil { writeSQSErrorFromErr(w, err) return @@ -1258,6 +1302,9 @@ func (s *SQSServer) changeMessageVisibility(w http.ResponseWriter, r *http.Reque writeSQSErrorFromErr(w, err) return } + if !s.chargeQueue(w, r, queueName, bucketActionReceive, 1) { + return + } if err := s.changeVisibilityWithRetry(r.Context(), queueName, handle, timeout); err != nil { writeSQSErrorFromErr(w, err) return diff --git a/adapter/sqs_messages_batch.go b/adapter/sqs_messages_batch.go index b8adca271..68d47b7b2 100644 --- a/adapter/sqs_messages_batch.go +++ b/adapter/sqs_messages_batch.go @@ -94,6 +94,9 @@ func (s *SQSServer) sendMessageBatch(w http.ResponseWriter, r *http.Request) { "total batch payload exceeds 262144 bytes") return } + if !s.chargeQueue(w, r, queueName, bucketActionSend, throttleChargeCount(len(in.Entries))) { + return + } successful, failed, err := s.sendMessageBatchWithRetry(r.Context(), queueName, in.Entries) if err != nil { @@ -453,6 +456,9 @@ func (s *SQSServer) deleteMessageBatch(w http.ResponseWriter, r *http.Request) { writeSQSErrorFromErr(w, err) return } + if !s.chargeQueue(w, r, queueName, bucketActionReceive, throttleChargeCount(len(in.Entries))) { + return + } successful := make([]sqsBatchResultEntry, 0, len(in.Entries)) failed := make([]sqsBatchResultErrorEntry, 0) @@ -519,6 +525,9 @@ func (s *SQSServer) changeMessageVisibilityBatch(w http.ResponseWriter, r *http. writeSQSErrorFromErr(w, err) return } + if !s.chargeQueue(w, r, queueName, bucketActionReceive, throttleChargeCount(len(in.Entries))) { + return + } successful := make([]sqsBatchResultEntry, 0, len(in.Entries)) failed := make([]sqsBatchResultErrorEntry, 0) diff --git a/adapter/sqs_partitioning.go b/adapter/sqs_partitioning.go new file mode 100644 index 000000000..39c0f7b7f --- /dev/null +++ b/adapter/sqs_partitioning.go @@ -0,0 +1,325 @@ +package adapter + +import ( + "net/http" + "strconv" +) + +// HT-FIFO (Phase 3.D split-queue FIFO) configuration vocabulary and +// the routing primitive partitionFor. See the design doc at +// docs/design/2026_04_26_proposed_sqs_split_queue_fifo.md. +// +// PR 2 of the §11 rollout introduces the schema fields plus the +// validation surface — including the temporary dormancy gate that +// rejects PartitionCount > 1 at CreateQueue. PR 5 lifts the gate +// atomically with the data-plane fanout so a half-deployed cluster +// can never accept a partitioned queue without the data plane to +// serve it. Until then the field exists in the meta type and the +// router function compiles, but no partitioned queue can land. + +const ( + // htfifoMaxPartitions caps the per-queue partition count. 32 is + // enough for ~30,000 RPS per queue at the per-shard ~1,000 RPS + // limit. Higher would require larger per-queue meta records and + // more reaper cycles; bumping the cap is a follow-up if operators + // demand it. See §10 of the design. + htfifoMaxPartitions uint32 = 32 + + // htfifoThroughputPerMessageGroupID is the default + // FifoThroughputLimit value for HT-FIFO queues — every group ID + // hashes to a partition independently, giving the throughput + // scaling HT-FIFO is designed for. + htfifoThroughputPerMessageGroupID = "perMessageGroupId" + // htfifoThroughputPerQueue activates the §3.3 short-circuit: every + // group ID routes to partition 0, collapsing throughput back to + // what a single-partition queue gets. Useful for clients that want + // the AWS attribute set without the extra capacity. + htfifoThroughputPerQueue = "perQueue" + + // htfifoDedupeScopeMessageGroup is the default DeduplicationScope + // value for HT-FIFO queues — the dedup window is per (queue, + // partition, MessageGroupId, dedupId). + htfifoDedupeScopeMessageGroup = "messageGroup" + // htfifoDedupeScopeQueue is the legacy single-window scope. Per + // §3.2 this is incompatible with PartitionCount > 1 (the dedup + // key cannot be globally unique across partitions without a + // cross-partition OCC transaction); the validator rejects the + // combination at CreateQueue time. + htfifoDedupeScopeQueue = "queue" +) + +// htfifoTemporaryGateMessage is the operator-facing reason the +// CreateQueue gate uses while PR 2-4 are in production. Removed in +// PR 5 in the same commit that wires the data-plane fanout. +const htfifoTemporaryGateMessage = "PartitionCount > 1 requires HT-FIFO data plane — not yet enabled" + +// partitionFor maps a (queue meta, MessageGroupId) pair to a +// partition index in [0, PartitionCount). Edge cases: +// +// - PartitionCount == 0 or 1 → always 0 (legacy single-partition). +// - FifoThroughputLimit == "perQueue" → always 0 (the §3.3 +// short-circuit; collapses every group to one partition). +// - Empty MessageGroupId → 0 (defensive; FIFO send validation +// should already have rejected this). +// +// Hashing uses FNV-1a per §3.3 of the design: fast, no SIMD setup +// cost, deterministic across Go versions and architectures, no key. +// Operators do not need this to be cryptographically strong — +// well-distributed and deterministic is what matters. +func partitionFor(meta *sqsQueueMeta, messageGroupID string) uint32 { + if meta == nil { + return 0 + } + if meta.PartitionCount <= 1 { + return 0 + } + if meta.FifoThroughputLimit == htfifoThroughputPerQueue { + return 0 + } + if messageGroupID == "" { + return 0 + } + // Inlined FNV-1a 32-bit over the string to avoid the []byte + // allocation hash/fnv.New32a + h.Write would force. + // MessageGroupId is capped at 128 chars by validation, + // so this loop bounds at 128 iterations of integer arithmetic per + // SendMessage — measurably faster than the hash.Hash interface + // path on the routing hot path. The 32-bit variant keeps the + // computation in uint32 throughout, sidestepping the uint64 → + // uint32 narrowing that the 64-bit variant would have required + // for the partition mask AND (which gosec G115 would otherwise + // flag and force a //nolint suppression on a safe-by-construction + // narrow). PartitionCount + // ≤ htfifoMaxPartitions = 32 so log2(PartitionCount) ≤ 5; only the + // low 5 bits of the hash ever survive the mask, and 32-bit FNV-1a + // is more than enough entropy to spread MessageGroupId values + // uniformly across that range. + const ( + fnv32Offset uint32 = 2166136261 + fnv32Prime uint32 = 16777619 + ) + hash := fnv32Offset + for i := 0; i < len(messageGroupID); i++ { + hash ^= uint32(messageGroupID[i]) + hash *= fnv32Prime + } + // PartitionCount is a power of two (validator-enforced); mod is + // equivalent to mask-AND. The mask is meta.PartitionCount - 1. + return hash & (meta.PartitionCount - 1) +} + +// isPowerOfTwo returns true when n is a positive power of two. +// PartitionCount must satisfy this so partitionFor's bitwise mask +// (h & (n-1)) is equivalent to (h % n) — without the constraint the +// distribution would be biased toward the lower indices. +func isPowerOfTwo(n uint32) bool { + return n > 0 && (n&(n-1)) == 0 +} + +// validatePartitionConfig enforces the §3.2 cross-attribute rules on +// the post-applier meta. Per-field constraints (parse, range) live +// inside the per-attribute appliers. Cross-field rules: +// +// - PartitionCount must be a power of two in [1, htfifoMaxPartitions] +// when set. PartitionCount == 0 is canonical "unset" and is +// equivalent to 1 for routing purposes. +// - FifoThroughputLimit / DeduplicationScope are FIFO-only — +// setting either on a Standard queue rejects with +// InvalidAttributeValue. +// - {PartitionCount > 1, DeduplicationScope = "queue"} rejects +// with InvalidParameterValue: queue-scoped dedup is incompatible +// with multi-partition FIFO because the dedup key cannot be +// globally unique across partitions without a cross-partition +// OCC transaction. +// - The §11 PR 2 dormancy gate (PartitionCount > 1 rejected at +// CreateQueue) lives in validatePartitionDormancyGate so the +// dormancy check can be turned off in unit tests that want to +// exercise the full schema path. Production CreateQueue calls +// both validators. +func validatePartitionConfig(meta *sqsQueueMeta) error { + if err := validatePartitionShape(meta); err != nil { + return err + } + if !meta.IsFIFO { + if err := validateStandardQueueRejectsHTFIFO(meta); err != nil { + return err + } + } + if meta.PartitionCount > 1 && meta.DeduplicationScope == htfifoDedupeScopeQueue { + // sqsErrValidation is "InvalidParameterValue"; uses the + // existing constant rather than a duplicate-value alias. + return newSQSAPIError(http.StatusBadRequest, sqsErrValidation, + "queue-scoped deduplication is incompatible with multi-partition FIFO because the dedup key cannot be globally unique across partitions without a cross-partition OCC transaction") + } + // FifoThroughputLimit=perMessageGroupId requires an explicit + // PartitionCount > 1. §7.2 of the design used to suggest + // "infer a sensible default, e.g. + // 8" for HT-FIFO callers that omit PartitionCount, but a hidden + // default makes CreateQueue idempotency depend on deployment + // state — the same wire payload could resolve to a 4-partition + // queue today and an 8-partition queue tomorrow if an operator + // changed the default. The rest of this design treats + // PartitionCount as immutable create-time state, so reject the + // underspecified request to keep the operator's intent always + // explicit on the wire. + if meta.IsFIFO && meta.FifoThroughputLimit == htfifoThroughputPerMessageGroupID && meta.PartitionCount <= 1 { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "FifoThroughputLimit=perMessageGroupId requires an explicit PartitionCount > 1; set PartitionCount or omit FifoThroughputLimit to use the legacy single-partition layout") + } + return nil +} + +// validatePartitionShape enforces the structural rules on +// PartitionCount: power-of-two and within the per-queue cap. Split +// out of validatePartitionConfig to keep that function under the +// cyclop ceiling once the perMessageGroupId-requires-explicit rule +// landed. +func validatePartitionShape(meta *sqsQueueMeta) error { + if meta.PartitionCount == 0 { + return nil + } + if !isPowerOfTwo(meta.PartitionCount) { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "PartitionCount must be a power of two") + } + if meta.PartitionCount > htfifoMaxPartitions { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "PartitionCount exceeds the per-queue cap of "+strconv.FormatUint(uint64(htfifoMaxPartitions), 10)) + } + return nil +} + +// validateStandardQueueRejectsHTFIFO enforces the FIFO-only rule on +// the HT-FIFO attributes. PartitionCount > 1 only makes sense on FIFO +// queues (HT-FIFO is by definition a FIFO feature). Without this guard +// a Standard queue with PartitionCount=2 would slip past the validator +// once PR 5 lifts the dormancy gate. PartitionCount=0 and 1 are +// accepted because both mean +// "single-partition layout" which is valid on Standard queues. +func validateStandardQueueRejectsHTFIFO(meta *sqsQueueMeta) error { + if meta.PartitionCount > 1 { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "PartitionCount > 1 is only valid on FIFO queues") + } + if meta.FifoThroughputLimit != "" { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "FifoThroughputLimit is only valid on FIFO queues") + } + if meta.DeduplicationScope != "" { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "DeduplicationScope is only valid on FIFO queues") + } + return nil +} + +// validatePartitionDormancyGate is the temporary §11 PR 2 gate. As +// long as the data-plane fanout (PR 5) has not landed, accepting a +// partitioned-queue CreateQueue would let SendMessage write under +// the legacy single-partition prefix — the PR 5 reader would never +// find those messages and the reaper would not enumerate them. This +// gate makes the wrong-layout-data class of bug impossible. +// +// Removed in PR 5 in the same commit that wires the data plane so +// the gate-and-lift land atomically. +func validatePartitionDormancyGate(meta *sqsQueueMeta) error { + if meta.PartitionCount > 1 { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + htfifoTemporaryGateMessage) + } + return nil +} + +// validatePartitionImmutability enforces the §3.2 rule that +// PartitionCount, FifoThroughputLimit, and DeduplicationScope are +// all immutable from CreateQueue onward. Called from +// trySetQueueAttributesOnce after the meta is loaded; rejects the +// whole SetQueueAttributes call (all-or-nothing — even mutable +// attributes in the same request do not commit when an immutable +// one is invalid) per §3.2. +// +// requested is the post-apply meta; current is the on-disk meta. +// If any of the three immutable fields differs, the validator +// returns InvalidAttributeValue naming the attribute so the +// operator sees the cause directly. A same-value "no-op" succeeds. +// +// PartitionCount uses normalisePartitionCount so a SetQueueAttributes +// request that passes the canonical-equivalent value (e.g. 1 on a +// queue stored with 0, or 0 on a queue stored with 1) is treated as +// the no-op it semantically is — strict equality would reject with +// "PartitionCount is immutable" even though the partition layout +// hasn't changed. +func validatePartitionImmutability(current, requested *sqsQueueMeta) error { + if current == nil || requested == nil { + return nil + } + if normalisePartitionCount(current.PartitionCount) != normalisePartitionCount(requested.PartitionCount) { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "PartitionCount is immutable; SetQueueAttributes cannot change it (DeleteQueue + CreateQueue to reconfigure)") + } + if current.FifoThroughputLimit != requested.FifoThroughputLimit { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "FifoThroughputLimit is immutable; SetQueueAttributes cannot change it (DeleteQueue + CreateQueue to reconfigure)") + } + if current.DeduplicationScope != requested.DeduplicationScope { + return newSQSAPIError(http.StatusBadRequest, sqsErrInvalidAttributeValue, + "DeduplicationScope is immutable; SetQueueAttributes cannot change it (DeleteQueue + CreateQueue to reconfigure)") + } + return nil +} + +// htfifoAttributeKeys lists the wire-side attribute names that this +// PR introduces. Used by the immutability check (and future +// admin-surface code) to know which keys a SetQueueAttributes +// request might attempt to change. +var htfifoAttributeKeys = []string{ + "PartitionCount", + "FifoThroughputLimit", + "DeduplicationScope", +} + +// htfifoAttributesPresent reports whether any HT-FIFO attribute key +// appears in attrs. Cheap helper used by the validator to short- +// circuit the immutability check for SetQueueAttributes requests +// that touch only mutable attributes. +func htfifoAttributesPresent(attrs map[string]string) bool { + for _, k := range htfifoAttributeKeys { + if _, ok := attrs[k]; ok { + return true + } + } + return false +} + +// addHTFIFOAttributes renders the configured HT-FIFO attributes into +// out. Mirrors the Throttle* renderer in addThrottleAttributes; same +// omission rule (only present when set), same wire-side names. Kept +// in this file so the HT-FIFO surface lives in one place. +func addHTFIFOAttributes(out map[string]string, meta *sqsQueueMeta) { + if meta == nil { + return + } + if meta.PartitionCount > 0 { + out["PartitionCount"] = strconv.FormatUint(uint64(meta.PartitionCount), 10) + } + if meta.FifoThroughputLimit != "" { + out["FifoThroughputLimit"] = meta.FifoThroughputLimit + } + if meta.DeduplicationScope != "" { + out["DeduplicationScope"] = meta.DeduplicationScope + } +} + +// snapshotImmutableHTFIFO captures the three immutable HT-FIFO field +// values from a meta record. Returned struct is shallow-equal-comparable +// — validatePartitionImmutability uses the snapshot to check for any +// differing value after applyAttributes runs. +func snapshotImmutableHTFIFO(meta *sqsQueueMeta) *sqsQueueMeta { + if meta == nil { + return nil + } + return &sqsQueueMeta{ + PartitionCount: meta.PartitionCount, + FifoThroughputLimit: meta.FifoThroughputLimit, + DeduplicationScope: meta.DeduplicationScope, + } +} diff --git a/adapter/sqs_partitioning_integration_test.go b/adapter/sqs_partitioning_integration_test.go new file mode 100644 index 000000000..b0c501788 --- /dev/null +++ b/adapter/sqs_partitioning_integration_test.go @@ -0,0 +1,272 @@ +package adapter + +import ( + "net/http" + "strings" + "testing" +) + +// TestSQSServer_HTFIFO_DormancyGate_RejectsPartitionedCreate pins +// the §11 PR 2 dormancy gate at the wire layer: CreateQueue with +// PartitionCount > 1 rejects with InvalidAttributeValue and the +// gate's reason ("not yet enabled") makes it into the operator- +// visible message. Removed in PR 5 in the same commit that wires +// the data plane. +func TestSQSServer_HTFIFO_DormancyGate_RejectsPartitionedCreate(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + for _, n := range []string{"2", "4", "8", "32"} { + status, out := callSQS(t, node, sqsCreateQueueTarget, map[string]any{ + "QueueName": "htfifo-gate-" + n + ".fifo", + "Attributes": map[string]string{ + "FifoQueue": "true", + "PartitionCount": n, + }, + }) + if status != http.StatusBadRequest { + t.Fatalf("PartitionCount=%s: status %d (expected 400 from dormancy gate); body=%v", n, status, out) + } + if got, _ := out["__type"].(string); got != sqsErrInvalidAttributeValue { + t.Fatalf("PartitionCount=%s: __type=%q (expected InvalidAttributeValue)", n, got) + } + msg, _ := out["message"].(string) + if msg == "" || !strings.Contains(msg, "not yet enabled") { + t.Fatalf("PartitionCount=%s: message %q must mention the gate reason", n, msg) + } + } +} + +// TestSQSServer_HTFIFO_DormancyGate_AllowsPartitionCountOne pins +// the no-op-partition-count path: PartitionCount=1 is the legacy +// single-partition layout and must pass the dormancy gate even on +// FIFO queues that explicitly set the field. +func TestSQSServer_HTFIFO_DormancyGate_AllowsPartitionCountOne(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + status, out := callSQS(t, node, sqsCreateQueueTarget, map[string]any{ + "QueueName": "htfifo-singlepart.fifo", + "Attributes": map[string]string{ + "FifoQueue": "true", + "PartitionCount": "1", + }, + }) + if status != http.StatusOK { + t.Fatalf("PartitionCount=1 must be accepted: status %d body %v", status, out) + } +} + +// TestSQSServer_HTFIFO_RejectsNonPowerOfTwoPartitionCount pins the +// validator's power-of-two rule. The validator runs before the +// dormancy gate so an invalid count (3) reports the validator's +// reason, not the gate's. +func TestSQSServer_HTFIFO_RejectsNonPowerOfTwoPartitionCount(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + status, out := callSQS(t, node, sqsCreateQueueTarget, map[string]any{ + "QueueName": "htfifo-bad-count.fifo", + "Attributes": map[string]string{ + "FifoQueue": "true", + "PartitionCount": "3", + }, + }) + if status != http.StatusBadRequest { + t.Fatalf("PartitionCount=3 must reject: status %d", status) + } + if msg, _ := out["message"].(string); msg == "" || !strings.Contains(msg, "power of two") { + t.Fatalf("expected 'power of two' in message, got %q", msg) + } +} + +// TestSQSServer_HTFIFO_RejectsHTFIFOAttrsOnStandardQueue pins the +// FIFO-only rule: setting FifoThroughputLimit or DeduplicationScope +// on a Standard queue rejects with InvalidAttributeValue. Without +// this, the queue would silently land with no-op attributes that +// SDK clients might mistake for actually configured. +func TestSQSServer_HTFIFO_RejectsHTFIFOAttrsOnStandardQueue(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + status, _ := callSQS(t, node, sqsCreateQueueTarget, map[string]any{ + "QueueName": "standard-with-htfifo-attr", + "Attributes": map[string]string{ + "FifoThroughputLimit": htfifoThroughputPerMessageGroupID, + }, + }) + if status != http.StatusBadRequest { + t.Fatalf("FifoThroughputLimit on Standard queue: status %d (expected 400)", status) + } +} + +// TestSQSServer_HTFIFO_RejectsQueueScopedDedupOnPartitioned pins +// the §3.2 cross-attribute control-plane gate at the wire layer. +// {PartitionCount > 1, DeduplicationScope = "queue"} is rejected by +// validatePartitionConfig (the schema validator) which runs inside +// parseAttributesIntoMeta — that is, BEFORE validatePartitionDormancyGate +// runs in createQueue. So the cross-attr rejection is what the wire +// layer sees today, even though the dormancy gate would also reject +// the same input on its own. After PR 5 lifts the dormancy gate the +// cross-attr rule remains the sole rejection path. +// +// The test only checks the 400 status to stay agnostic about which +// validator fires first — both are correct behaviour, and a future +// reordering of the createQueue control flow does not need to break +// this test. +func TestSQSServer_HTFIFO_RejectsQueueScopedDedupOnPartitioned(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + status, out := callSQS(t, node, sqsCreateQueueTarget, map[string]any{ + "QueueName": "htfifo-bad-dedup.fifo", + "Attributes": map[string]string{ + "FifoQueue": "true", + "PartitionCount": "2", + "DeduplicationScope": htfifoDedupeScopeQueue, + }, + }) + if status != http.StatusBadRequest { + t.Fatalf("PartitionCount=2 + DeduplicationScope=queue must reject: status %d body %v", status, out) + } +} + +// TestSQSServer_HTFIFO_ImmutabilitySetQueueAttributesRejects pins +// the §3.2 immutability rule at the wire layer: SetQueueAttributes +// attempts to change PartitionCount / FifoThroughputLimit / +// DeduplicationScope reject with InvalidAttributeValue. Test creates +// a single-partition FIFO queue (allowed by dormancy) with +// FifoThroughputLimit set, then tries to change it. +func TestSQSServer_HTFIFO_ImmutabilitySetQueueAttributesRejects(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + url := mustCreateFIFOWithThroughputLimit(t, node, "htfifo-immutable.fifo", htfifoThroughputPerQueue) + + // Try to flip FifoThroughputLimit. Must reject — both the + // immutability rule and the rule that perMessageGroupId requires + // PartitionCount > 1 fire on this attempt; either one rejecting + // is the correct outcome from the wire. + status, out := callSQS(t, node, sqsSetQueueAttributesTarget, map[string]any{ + "QueueUrl": url, + "Attributes": map[string]string{ + "FifoThroughputLimit": htfifoThroughputPerMessageGroupID, + }, + }) + if status != http.StatusBadRequest { + t.Fatalf("FifoThroughputLimit change: status %d body %v (expected 400 immutable)", status, out) + } + // Same-value no-op succeeds. + status, _ = callSQS(t, node, sqsSetQueueAttributesTarget, map[string]any{ + "QueueUrl": url, + "Attributes": map[string]string{ + "FifoThroughputLimit": htfifoThroughputPerQueue, + }, + }) + if status != http.StatusOK { + t.Fatalf("same-value no-op SetQueueAttributes: status %d (expected 200)", status) + } +} + +// TestSQSServer_HTFIFO_ImmutabilityAllOrNothing pins the §3.2 all- +// or-nothing rule: a SetQueueAttributes that touches a *mutable* +// attribute alongside an attempted *immutable* change rejects the +// whole request, leaving the mutable attribute unchanged on the +// meta record. +func TestSQSServer_HTFIFO_ImmutabilityAllOrNothing(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + url := mustCreateFIFOWithThroughputLimit(t, node, "htfifo-allornothing.fifo", htfifoThroughputPerQueue) + + // Combined: mutable VisibilityTimeout + immutable FifoThroughputLimit + // change. Must reject as a whole, mutable change must not commit. + status, _ := callSQS(t, node, sqsSetQueueAttributesTarget, map[string]any{ + "QueueUrl": url, + "Attributes": map[string]string{ + "VisibilityTimeout": "60", + "FifoThroughputLimit": htfifoThroughputPerMessageGroupID, + }, + }) + if status != http.StatusBadRequest { + t.Fatalf("mutable+immutable combined: status %d (expected 400)", status) + } + // Confirm VisibilityTimeout did NOT commit by reading it back. + status, out := callSQS(t, node, sqsGetQueueAttributesTarget, map[string]any{ + "QueueUrl": url, + "AttributeNames": []string{"VisibilityTimeout"}, + }) + if status != http.StatusOK { + t.Fatalf("get attrs: %d", status) + } + attrs, _ := out["Attributes"].(map[string]any) + if got, _ := attrs["VisibilityTimeout"].(string); got == "60" { + t.Fatalf("all-or-nothing violated: VisibilityTimeout committed even though immutable change rejected (got %q)", got) + } +} + +// TestSQSServer_HTFIFO_GetQueueAttributesRoundTrip pins the wire +// surface for the configured HT-FIFO attributes: SetQueueAttributes +// (or CreateQueue with the attribute) followed by GetQueueAttributes +// returns the same value. +func TestSQSServer_HTFIFO_GetQueueAttributesRoundTrip(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + url := mustCreateFIFOWithThroughputLimit(t, node, "htfifo-roundtrip.fifo", htfifoThroughputPerQueue) + status, out := callSQS(t, node, sqsGetQueueAttributesTarget, map[string]any{ + "QueueUrl": url, + "AttributeNames": []string{"All"}, + }) + if status != http.StatusOK { + t.Fatalf("GetQueueAttributes: status %d", status) + } + attrs, _ := out["Attributes"].(map[string]any) + if got, _ := attrs["FifoThroughputLimit"].(string); got != htfifoThroughputPerQueue { + t.Fatalf("FifoThroughputLimit round-trip: got %q want %q", got, htfifoThroughputPerQueue) + } + if _, present := attrs["DeduplicationScope"]; present { + t.Fatalf("DeduplicationScope must be omitted when not set; attrs=%v", attrs) + } + if _, present := attrs["PartitionCount"]; present { + t.Fatalf("PartitionCount must be omitted when not set / left at zero; attrs=%v", attrs) + } +} + +// --- helpers --- + +// mustCreateFIFOWithThroughputLimit creates a single-partition FIFO +// queue (allowed by the §11 PR 2 dormancy gate) with the requested +// FifoThroughputLimit set. Used by the immutability tests so they +// have a non-empty FifoThroughputLimit to attempt to change. +func mustCreateFIFOWithThroughputLimit(t *testing.T, node Node, name, limit string) string { + t.Helper() + status, out := callSQS(t, node, sqsCreateQueueTarget, map[string]any{ + "QueueName": name, + "Attributes": map[string]string{ + "FifoQueue": "true", + "FifoThroughputLimit": limit, + }, + }) + if status != http.StatusOK { + t.Fatalf("createQueue %q: status %d body %v", name, status, out) + } + url, _ := out["QueueUrl"].(string) + return url +} diff --git a/adapter/sqs_partitioning_test.go b/adapter/sqs_partitioning_test.go new file mode 100644 index 000000000..0183674a9 --- /dev/null +++ b/adapter/sqs_partitioning_test.go @@ -0,0 +1,398 @@ +package adapter + +import ( + "errors" + "strconv" + "testing" + + "github.com/stretchr/testify/require" +) + +// --- partitionFor unit tests --- + +// TestPartitionFor_LegacyZeroOrOneAlwaysPartitionZero pins the +// single-partition compatibility contract: a queue with +// PartitionCount == 0 (the unset state) or 1 routes every group ID +// to partition 0. Without this guarantee an existing single- +// partition queue would re-shuffle messages once PR 5 lands the +// data plane. +func TestPartitionFor_LegacyZeroOrOneAlwaysPartitionZero(t *testing.T) { + t.Parallel() + for _, count := range []uint32{0, 1} { + meta := &sqsQueueMeta{PartitionCount: count} + for _, gid := range []string{"a", "b", "user-1", "long-group-id-blah"} { + require.Equal(t, uint32(0), partitionFor(meta, gid), + "PartitionCount=%d, group=%q must route to 0", count, gid) + } + } +} + +// TestPartitionFor_PerQueueShortCircuits pins the §3.3 short-circuit: +// FifoThroughputLimit=perQueue collapses every group ID to +// partition 0 regardless of PartitionCount. Operators who want the +// AWS attribute set without the throughput scaling depend on this. +func TestPartitionFor_PerQueueShortCircuits(t *testing.T) { + t.Parallel() + meta := &sqsQueueMeta{PartitionCount: 8, FifoThroughputLimit: htfifoThroughputPerQueue} + for _, gid := range []string{"a", "b", "user-1", "long-group-id"} { + require.Equal(t, uint32(0), partitionFor(meta, gid)) + } +} + +// TestPartitionFor_EmptyMessageGroupIdRoutesZero pins the defensive +// fallback. FIFO send validation rejects empty MessageGroupId so +// this case should never reach the router; the test ensures the +// router doesn't crash if it does. +func TestPartitionFor_EmptyMessageGroupIdRoutesZero(t *testing.T) { + t.Parallel() + meta := &sqsQueueMeta{PartitionCount: 8} + require.Equal(t, uint32(0), partitionFor(meta, "")) +} + +// TestPartitionFor_DeterministicAcrossRuns pins the §3.3 +// determinism contract: the same group ID always returns the same +// partition. Without it, a consumer that pulls from a partition by +// group ID could see messages re-routed to a different partition on +// a process restart and lose ordering guarantees. +func TestPartitionFor_DeterministicAcrossRuns(t *testing.T) { + t.Parallel() + meta := &sqsQueueMeta{PartitionCount: 8} + gid := "user-1234" + first := partitionFor(meta, gid) + for range 100 { + require.Equal(t, first, partitionFor(meta, gid)) + } +} + +// TestPartitionFor_DistributionApproximatelyUniform pins the §9 unit +// test from the design: 100k random group IDs across 8 partitions +// must land within ±5% of equal share. FNV-1a is not a CSPRNG but +// for non-adversarial input the distribution is well-behaved. +func TestPartitionFor_DistributionApproximatelyUniform(t *testing.T) { + t.Parallel() + const partitions uint32 = 8 + const sample = 100_000 + meta := &sqsQueueMeta{PartitionCount: partitions} + hits := make(map[uint32]int, partitions) + for i := range sample { + hits[partitionFor(meta, "group-"+strconv.Itoa(i))]++ + } + expected := sample / int(partitions) + tolerance := expected / 20 // ±5% + for p := uint32(0); p < partitions; p++ { + count := hits[p] + if count < expected-tolerance || count > expected+tolerance { + t.Fatalf("partition %d: %d hits, expected within ±%d of %d (full distribution: %v)", + p, count, tolerance, expected, hits) + } + } +} + +// TestPartitionFor_PowerOfTwoMaskingMatchesMod is a regression +// guard for the bitwise-mask optimisation in partitionFor. The +// optimisation is equivalent to `% PartitionCount` only when +// PartitionCount is a power of two — the validator enforces this +// at config time, but if a future bug leaks a non-power-of-two +// value through validation, this test will catch the distribution +// bias immediately. +func TestPartitionFor_PowerOfTwoMaskingMatchesMod(t *testing.T) { + t.Parallel() + for _, n := range []uint32{2, 4, 8, 16, 32} { + meta := &sqsQueueMeta{PartitionCount: n} + for i := range 1000 { + gid := "g-" + strconv.Itoa(i) + require.Less(t, partitionFor(meta, gid), n, + "partitionFor must always be < PartitionCount=%d", n) + } + } +} + +// --- isPowerOfTwo unit tests --- + +func TestIsPowerOfTwo(t *testing.T) { + t.Parallel() + cases := []struct { + n uint32 + want bool + }{ + {0, false}, + {1, true}, + {2, true}, + {3, false}, + {4, true}, + {7, false}, + {8, true}, + {16, true}, + {32, true}, + {33, false}, + } + for _, tc := range cases { + require.Equal(t, tc.want, isPowerOfTwo(tc.n), "n=%d", tc.n) + } +} + +// --- validatePartitionConfig unit tests --- + +// TestValidatePartitionConfig_PowerOfTwo pins the §3.2 rule that +// PartitionCount must be a power of two. The bitwise-mask routing +// in partitionFor depends on this; non-powers would distribute +// unevenly. +func TestValidatePartitionConfig_PowerOfTwo(t *testing.T) { + t.Parallel() + bad := []uint32{3, 5, 6, 7, 9, 10, 12, 15} + for _, n := range bad { + err := validatePartitionConfig(&sqsQueueMeta{PartitionCount: n, IsFIFO: true}) + require.Error(t, err, "n=%d must reject", n) + } + good := []uint32{1, 2, 4, 8, 16, 32} + for _, n := range good { + err := validatePartitionConfig(&sqsQueueMeta{PartitionCount: n, IsFIFO: true}) + require.NoError(t, err, "n=%d must be accepted", n) + } +} + +// TestValidatePartitionConfig_RejectsAboveMax pins the §10 +// per-queue cap. 64 must reject; 32 must succeed. +func TestValidatePartitionConfig_RejectsAboveMax(t *testing.T) { + t.Parallel() + require.Error(t, validatePartitionConfig(&sqsQueueMeta{PartitionCount: 64, IsFIFO: true})) + require.NoError(t, validatePartitionConfig(&sqsQueueMeta{PartitionCount: 32, IsFIFO: true})) +} + +// TestValidatePartitionConfig_StandardQueueRejectsHTFIFOAttrs pins +// the §3.2 FIFO-only rule: HT-FIFO attributes on a non-FIFO queue +// reject with InvalidAttributeValue. Setting them silently on a +// Standard queue would advertise unsupported behaviour. +// +// PartitionCount > 1 is also FIFO-only — without the guard a +// Standard queue with PartitionCount=2 would slip past the validator +// after PR 5 lifts the dormancy gate. PartitionCount 0/1 are still +// accepted +// on Standard queues because both mean "single-partition layout". +func TestValidatePartitionConfig_StandardQueueRejectsHTFIFOAttrs(t *testing.T) { + t.Parallel() + require.Error(t, validatePartitionConfig(&sqsQueueMeta{IsFIFO: false, FifoThroughputLimit: htfifoThroughputPerQueue})) + require.Error(t, validatePartitionConfig(&sqsQueueMeta{IsFIFO: false, DeduplicationScope: htfifoDedupeScopeMessageGroup})) + for _, n := range []uint32{2, 4, 8, 16, 32} { + require.Error(t, validatePartitionConfig(&sqsQueueMeta{IsFIFO: false, PartitionCount: n}), + "PartitionCount=%d on Standard queue must reject", n) + } + require.NoError(t, validatePartitionConfig(&sqsQueueMeta{IsFIFO: false, PartitionCount: 0})) + require.NoError(t, validatePartitionConfig(&sqsQueueMeta{IsFIFO: false, PartitionCount: 1})) + require.NoError(t, validatePartitionConfig(&sqsQueueMeta{IsFIFO: true, FifoThroughputLimit: htfifoThroughputPerMessageGroupID, PartitionCount: 8})) +} + +// TestValidatePartitionConfig_RejectsQueueScopedDedupOnPartitioned +// pins the §3.2 cross-attribute control-plane gate: queue-scoped +// dedup is incompatible with multi-partition FIFO because the dedup +// key cannot be globally unique across partitions without a cross- +// partition OCC transaction. Rejected as InvalidParameterValue at +// CreateQueue / SetQueueAttributes time so the operator sees the +// error before a single SendMessage. +func TestValidatePartitionConfig_RejectsQueueScopedDedupOnPartitioned(t *testing.T) { + t.Parallel() + err := validatePartitionConfig(&sqsQueueMeta{ + IsFIFO: true, + PartitionCount: 8, + DeduplicationScope: htfifoDedupeScopeQueue, + }) + require.Error(t, err) + var apiErr *sqsAPIError + require.True(t, errors.As(err, &apiErr), "expected sqsAPIError, got %T", err) + require.Equal(t, sqsErrValidation, apiErr.errorType, + "the cross-attribute rejection must use InvalidParameterValue (incoherent params, sqsErrValidation), not InvalidAttributeValue (malformed individual value)") + // Single-partition + queue-scoped dedup is fine (legacy behaviour). + require.NoError(t, validatePartitionConfig(&sqsQueueMeta{ + IsFIFO: true, + PartitionCount: 1, + DeduplicationScope: htfifoDedupeScopeQueue, + })) +} + +// TestValidatePartitionConfig_PerMessageGroupIDRequiresExplicitPartitionCount +// pins the no-implicit-default rule: an HT-FIFO +// CreateQueue request that sets FifoThroughputLimit=perMessageGroupId +// without an explicit PartitionCount > 1 must be rejected. Earlier +// drafts of §7.2 proposed inferring a server-side default (e.g. 8), +// but a hidden default would make CreateQueue idempotency depend on +// deployment state — the same wire payload could resolve to a +// 4-partition queue today and an 8-partition queue tomorrow. +func TestValidatePartitionConfig_PerMessageGroupIDRequiresExplicitPartitionCount(t *testing.T) { + t.Parallel() + // FIFO + perMessageGroupId + PartitionCount=0 (or 1): reject. + for _, n := range []uint32{0, 1} { + err := validatePartitionConfig(&sqsQueueMeta{ + IsFIFO: true, + FifoThroughputLimit: htfifoThroughputPerMessageGroupID, + PartitionCount: n, + }) + require.Errorf(t, err, "PartitionCount=%d must reject under perMessageGroupId", n) + var apiErr *sqsAPIError + require.True(t, errors.As(err, &apiErr)) + require.Equal(t, sqsErrInvalidAttributeValue, apiErr.errorType) + } + // FIFO + perMessageGroupId + PartitionCount=8: accept (the + // dormancy gate runs separately on CreateQueue and rejects this + // at the wire today, but the cross-attribute validator on its + // own does not). + require.NoError(t, validatePartitionConfig(&sqsQueueMeta{ + IsFIFO: true, + FifoThroughputLimit: htfifoThroughputPerMessageGroupID, + PartitionCount: 8, + })) + // FIFO + perQueue + PartitionCount=0: accept (legitimate + // single-partition FIFO with explicit per-queue throughput). + require.NoError(t, validatePartitionConfig(&sqsQueueMeta{ + IsFIFO: true, + FifoThroughputLimit: htfifoThroughputPerQueue, + })) + // FIFO + no FifoThroughputLimit + PartitionCount=0: accept + // (legitimate legacy FIFO, AWS behaviour). + require.NoError(t, validatePartitionConfig(&sqsQueueMeta{ + IsFIFO: true, + })) +} + +// --- validatePartitionDormancyGate unit tests --- + +// TestValidatePartitionDormancyGate_RejectsAboveOne pins the §11 +// PR 2 dormancy gate: PartitionCount > 1 must reject until PR 5 +// lifts the gate. PartitionCount 0 or 1 must pass (both are the +// legacy single-partition layout). +func TestValidatePartitionDormancyGate_RejectsAboveOne(t *testing.T) { + t.Parallel() + require.NoError(t, validatePartitionDormancyGate(&sqsQueueMeta{PartitionCount: 0})) + require.NoError(t, validatePartitionDormancyGate(&sqsQueueMeta{PartitionCount: 1})) + for _, n := range []uint32{2, 4, 8, 16, 32} { + err := validatePartitionDormancyGate(&sqsQueueMeta{PartitionCount: n}) + require.Error(t, err, "PartitionCount=%d must reject under the dormancy gate", n) + require.Contains(t, err.Error(), "not yet enabled", + "the gate's reason must surface to the operator") + } +} + +// --- validatePartitionImmutability unit tests --- + +// TestValidatePartitionImmutability_RejectsAnyChange pins the §3.2 +// immutability rule: SetQueueAttributes attempts to change any of +// the three immutable HT-FIFO fields reject with +// InvalidAttributeValue. +func TestValidatePartitionImmutability_RejectsAnyChange(t *testing.T) { + t.Parallel() + current := &sqsQueueMeta{ + PartitionCount: 8, + FifoThroughputLimit: htfifoThroughputPerMessageGroupID, + DeduplicationScope: htfifoDedupeScopeMessageGroup, + } + cases := []struct { + name string + mutate func(*sqsQueueMeta) + mustError bool + }{ + {"PartitionCount changed", func(m *sqsQueueMeta) { m.PartitionCount = 4 }, true}, + {"FifoThroughputLimit changed", func(m *sqsQueueMeta) { m.FifoThroughputLimit = htfifoThroughputPerQueue }, true}, + {"DeduplicationScope changed", func(m *sqsQueueMeta) { m.DeduplicationScope = htfifoDedupeScopeQueue }, true}, + {"no immutable change (same-value no-op)", func(m *sqsQueueMeta) {}, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + req := *current + tc.mutate(&req) + err := validatePartitionImmutability(current, &req) + if tc.mustError { + require.Error(t, err) + } else { + require.NoError(t, err) + } + }) + } +} + +// TestValidatePartitionImmutability_PartitionCountZeroAndOneEquivalent +// pins the 0/1 normalisation: the on-disk +// PartitionCount=0 ("unset") is canonical-equivalent to an explicit +// PartitionCount=1 ("single partition"), so a SetQueueAttributes +// that reaffirms the default ought to be a no-op rather than a hard +// "PartitionCount is immutable" rejection. validatePartitionImmutability +// uses normalisePartitionCount on both sides for exactly this case. +func TestValidatePartitionImmutability_PartitionCountZeroAndOneEquivalent(t *testing.T) { + t.Parallel() + cases := []struct { + name string + current uint32 + req uint32 + wantErr bool + }{ + {"stored 0, requested 1 (no-op)", 0, 1, false}, + {"stored 1, requested 0 (no-op)", 1, 0, false}, + {"stored 0, requested 0 (no-op)", 0, 0, false}, + {"stored 1, requested 1 (no-op)", 1, 1, false}, + {"stored 0, requested 2 (real change)", 0, 2, true}, + {"stored 1, requested 2 (real change)", 1, 2, true}, + {"stored 2, requested 1 (real change)", 2, 1, true}, + {"stored 2, requested 0 (real change)", 2, 0, true}, + {"stored 4, requested 8 (real change)", 4, 8, true}, + {"stored 8, requested 8 (no-op)", 8, 8, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + cur := &sqsQueueMeta{PartitionCount: tc.current} + req := &sqsQueueMeta{PartitionCount: tc.req} + err := validatePartitionImmutability(cur, req) + if tc.wantErr { + require.Error(t, err) + } else { + require.NoError(t, err) + } + }) + } +} + +// --- htfifoAttributesPresent --- + +func TestHTFIFOAttributesPresent(t *testing.T) { + t.Parallel() + require.False(t, htfifoAttributesPresent(map[string]string{})) + require.False(t, htfifoAttributesPresent(map[string]string{"VisibilityTimeout": "30"})) + require.True(t, htfifoAttributesPresent(map[string]string{"PartitionCount": "8"})) + require.True(t, htfifoAttributesPresent(map[string]string{"FifoThroughputLimit": htfifoThroughputPerMessageGroupID})) + require.True(t, htfifoAttributesPresent(map[string]string{"DeduplicationScope": htfifoDedupeScopeMessageGroup})) +} + +// TestHTFIFOAttributesEqual_PartitionCountZeroAndOneEquivalent pins +// the idempotency normalisation: validatePartitionConfig documents +// PartitionCount=0 (unset) and =1 (explicit single +// partition) as semantically identical legacy/single-partition +// routing, so CreateQueue idempotency must treat them as equal — +// otherwise a queue created without PartitionCount (stored as 0) is +// rejected as "different attributes" by a retry that explicitly +// passes PartitionCount=1. +func TestHTFIFOAttributesEqual_PartitionCountZeroAndOneEquivalent(t *testing.T) { + t.Parallel() + a := &sqsQueueMeta{PartitionCount: 0} + b := &sqsQueueMeta{PartitionCount: 1} + require.True(t, htfifoAttributesEqual(a, b), + "PartitionCount 0 (unset) and 1 (explicit single partition) must compare equal") + require.True(t, htfifoAttributesEqual(b, a), + "equality must be symmetric") + // Real divergence (>1 vs 0/1) still rejects. + c := &sqsQueueMeta{PartitionCount: 2} + require.False(t, htfifoAttributesEqual(a, c), + "PartitionCount=2 must differ from unset") + require.False(t, htfifoAttributesEqual(b, c), + "PartitionCount=2 must differ from explicit 1") + // Same > 1 value still equal. + d := &sqsQueueMeta{PartitionCount: 2} + require.True(t, htfifoAttributesEqual(c, d), + "identical PartitionCount > 1 must compare equal") +} + +func TestNormalisePartitionCount(t *testing.T) { + t.Parallel() + require.Equal(t, uint32(1), normalisePartitionCount(0)) + require.Equal(t, uint32(1), normalisePartitionCount(1)) + require.Equal(t, uint32(2), normalisePartitionCount(2)) + require.Equal(t, uint32(8), normalisePartitionCount(8)) +} diff --git a/adapter/sqs_query_protocol.go b/adapter/sqs_query_protocol.go index adfadf7be..71fe013f1 100644 --- a/adapter/sqs_query_protocol.go +++ b/adapter/sqs_query_protocol.go @@ -147,9 +147,9 @@ func readQueryForm(r *http.Request) (url.Values, error) { return nil, newSQSAPIError(http.StatusBadRequest, sqsErrMalformedRequest, "missing request URL") } // r.URL.Query() returns a fresh map on each call so we can adopt - // it directly as the base instead of copying entries one-by-one - // (Gemini high-priority on PR #662). On GET we are done; on POST - // the body's form values are merged into the same map. + // it directly as the base instead of copying entries one-by-one. + // On GET we are done; on POST the body's form values are merged + // into the same map. values := r.URL.Query() if r.Method == http.MethodGet { return values, nil @@ -508,8 +508,8 @@ func errorTypeForStatus(status int) string { // like the AWS RequestId: 22 chars of base32 (no padding). Base32 // emits 1 char per 5 input bits, so 22 chars require ceil(22*5/8)=14 // random bytes (110 bits, comfortably more entropy than a UUID-v4). -// Gemini medium on PR #662 flagged the prior 16-byte source — that -// produces a 26-char ID, not 22, and the comment was a lie. +// A 16-byte source would produce a 26-char ID, not 22, so the byte +// count is pinned at 14 to match the documented length. func newQueryRequestID() string { var raw [14]byte if _, err := rand.Read(raw[:]); err != nil { diff --git a/adapter/sqs_query_protocol_test.go b/adapter/sqs_query_protocol_test.go index e1d68ed63..1dd0ec9fd 100644 --- a/adapter/sqs_query_protocol_test.go +++ b/adapter/sqs_query_protocol_test.go @@ -148,8 +148,8 @@ func TestCollectIndexedKVPairs_Empty(t *testing.T) { } // TestNewQueryRequestID_Length pins the AWS shape: 22 base32 chars. -// Gemini medium on PR #662 caught the prior 26-char output that -// contradicted the function's own doc comment. +// A regression where this function returned 26 chars contradicted +// its own doc comment, so the length is asserted explicitly. func TestNewQueryRequestID_Length(t *testing.T) { t.Parallel() for i := 0; i < 64; i++ { diff --git a/adapter/sqs_throttle.go b/adapter/sqs_throttle.go new file mode 100644 index 000000000..a726c7847 --- /dev/null +++ b/adapter/sqs_throttle.go @@ -0,0 +1,657 @@ +package adapter + +import ( + "context" + "math" + "net/http" + "sync" + "time" + + "github.com/cockroachdb/errors" +) + +// Per-queue throttling — token-bucket store that hangs off *SQSServer. +// See docs/design/2026_04_26_proposed_sqs_per_queue_throttling.md for +// the full design and rollout context. This file implements §3.1 (bucket +// store + token bucket), §3.3 (charging model), §3.4 (Throttling +// envelope helpers) and the cache-invalidation primitives §3.1 calls +// out for SetQueueAttributes / DeleteQueue. + +// Canonical bucket-action vocabulary. The JSON field-name prefixes +// (Send*, Recv*, Default*) are the operator-facing contract; these +// values are what the in-memory map is keyed on. The mapping is fixed +// per the design's §3.2 "Config-field → bucket-action mapping" table. +const ( + bucketActionSend = "Send" + bucketActionReceive = "Receive" + bucketActionAny = "*" +) + +// throttleAttributeNames is the wire-side set of Throttle* +// attributes a SetQueueAttributes request can carry. Used by the +// invalidation gate in setQueueAttributes so an unrelated update +// (e.g. VisibilityTimeout only) does not pay the cache invalidation +// cost or, worse, give the caller a way to silently reset bucket +// state via a no-op SetQueueAttributes. +var throttleAttributeNames = []string{ + "ThrottleSendCapacity", + "ThrottleSendRefillPerSecond", + "ThrottleRecvCapacity", + "ThrottleRecvRefillPerSecond", + "ThrottleDefaultCapacity", + "ThrottleDefaultRefillPerSecond", +} + +// throttleAttributesPresent reports whether attrs carries any +// Throttle* key. Cheap O(6) check; the throttleAttributeNames slice +// is the source of truth so a future Throttle* attribute name added +// in one place automatically participates in the gate. +func throttleAttributesPresent(attrs map[string]string) bool { + for _, k := range throttleAttributeNames { + if _, ok := attrs[k]; ok { + return true + } + } + return false +} + +// throttleHardCeilingPerSecond bounds any user-supplied capacity or +// refill rate. A typo like SendCapacity=1e9 silently meaning "no limit" +// is more dangerous than an explicit InvalidAttributeValue: a +// wide-open queue masks itself as "throttled". +const throttleHardCeilingPerSecond = 100_000.0 + +// throttleMinBatchCapacity is the smallest acceptable per-action +// capacity once the action covers a batch verb. SendMessageBatch and +// DeleteMessageBatch each charge up to 10 tokens (AWS caps batch size +// at 10), so a SendCapacity below 10 makes every full batch +// permanently unserviceable. +const throttleMinBatchCapacity = float64(sqsBatchMaxEntries) + +// throttleIdleEvictAfter is the idle window after which a quiet bucket +// is dropped from the in-memory store. A background goroutine +// (runSweepLoop) fires the eviction sweep on each +// throttleEvictSweepEvery tick; the hot path never calls sweep(). +// A queue that resumes activity rebuilds its bucket from the meta +// record at full capacity, matching the failover semantics +// documented in §3.1. +const throttleIdleEvictAfter = time.Hour + +// throttleEvictSweepEvery is the interval at which runSweepLoop fires +// the idle-evict sweep in its background goroutine. The hot-path +// charge() never calls into the sweep so a many-queue cluster pays +// the O(N) cost only on the goroutine's tick, never on a request. +const throttleEvictSweepEvery = time.Minute + +// bucketKey is the in-memory map key. incarnation is the queue's +// CreateQueue-only counter (sqsQueueMeta.Incarnation): it bumps on +// each successful CreateQueue and is preserved across PurgeQueue and +// SetQueueAttributes. Including it isolates token state across +// DeleteQueue+CreateQueue cycles and across leader changes where a +// different node holds an in-memory bucket from an earlier +// incarnation. Using Incarnation rather than Generation also blocks +// a PurgeQueue-driven reset bypass: Generation bumps on every Purge, +// so a (queue, action, incarnation)-keyed bucket would be re-keyed +// on each purge and the next request would mint a fresh full bucket +// — letting a caller exceed the configured rate by repeatedly +// purging. +type bucketKey struct { + queue string + action string + incarnation uint64 +} + +// tokenBucket is one bucket's mutable state. mu is per-bucket so +// concurrent traffic on different queues never serialises on the same +// lock; refill + take + release of a single bucket is the only +// critical section. evicted flips to true exactly once (under mu) when +// the bucket is removed from the store by sweep / invalidateQueue / +// loadOrInit reconciliation; charge re-checks it after acquiring mu so +// goroutines that loaded the now-orphaned bucket retry and converge on +// the live entry. Without that retry, sweep racing N concurrent chargers +// could let them drain up to one full capacity from the orphan while +// later requests get a fresh full-capacity bucket — a one-time burst +// of up to 2× capacity per evict cycle. +// Never held across the bucketStore's sync.Map. +type tokenBucket struct { + mu sync.Mutex + capacity float64 + refillRate float64 + tokens float64 + lastRefill time.Time + evicted bool +} + +// bucketStore holds every active bucket for an SQS server process. +// sync.Map matches the read-mostly access pattern: lookups are nearly +// always Load hits; LoadOrStore pays the write cost only on first use. +// +// The idle-evict sweep runs from runSweepLoop on a background ticker +// — there is no hot-path serialisation primitive because the only +// caller of sweep() is the sole goroutine the ticker drives. +type bucketStore struct { + buckets sync.Map // map[bucketKey]*tokenBucket + clock func() time.Time + evictedAfter time.Duration + sweepEvery time.Duration +} + +// newBucketStore constructs a store whose clock + idle-evict window +// can be overridden for tests. The sweep cadence is fixed at +// throttleEvictSweepEvery; tests that want a different cadence have +// no use case yet (the sweep itself is a low-cost no-op when the +// store is small). Production calls newBucketStoreDefault. +func newBucketStore(clock func() time.Time, evictedAfter time.Duration) *bucketStore { + if clock == nil { + clock = time.Now + } + return &bucketStore{ + clock: clock, + evictedAfter: evictedAfter, + sweepEvery: throttleEvictSweepEvery, + } +} + +// newBucketStoreDefault uses the production constants. Kept as a +// separate constructor so test wiring stays explicit about the +// time-window overrides. +func newBucketStoreDefault() *bucketStore { + return newBucketStore(time.Now, throttleIdleEvictAfter) +} + +// chargeOutcome is returned from charge so the caller can build the +// Throttling envelope (Retry-After computed from refillRate + +// requestedCount, see §3.4) without re-loading the bucket. +type chargeOutcome struct { + allowed bool + retryAfter time.Duration + tokensAfter float64 + bucketPresent bool +} + +// charge takes count tokens from the bucket identified by (queue, +// action, incarnation) using cfg as the source-of-truth for capacity / +// refillRate. cfg may be nil — in which case throttling is disabled for +// the queue and charge returns allowed=true without touching the map. +// incarnation is sqsQueueMeta.Incarnation so a same-name recreate or +// cross-leader failover never reuses an older incarnation's tokens. +// +// count must be ≥ 1; the caller has already validated batch size at +// the request layer (sqs_messages_batch.go bounds it to +// sqsBatchMaxEntries). +func (b *bucketStore) charge(cfg *sqsQueueThrottle, queue, action string, incarnation uint64, count int) chargeOutcome { + if b == nil || cfg == nil || cfg.IsEmpty() { + // Throttling disabled (default): every request allowed, no + // bucket allocated. The hot path stays a single nil-check. + return chargeOutcome{allowed: true, bucketPresent: false} + } + resolvedAction, capacity, refill := resolveActionConfig(cfg, action) + if capacity == 0 || refill == 0 { + // This action has no throttle configured (e.g. only Send is + // configured and the request is a Recv). Default* covers any + // remaining unconfigured action; if Default* is also zero the + // request is unthrottled. + return chargeOutcome{allowed: true, bucketPresent: false} + } + if count < 1 { + count = 1 + } + // Bucket key uses the *resolved* action so Send-falls-through-to- + // Default and Recv-falls-through-to-Default share the same Default + // bucket. Without the resolution, an operator who configures only + // Default would still get one bucket per requesting action — three + // independent quotas instead of one shared cap. + // + // Loop bound: each retry happens only when the bucket we loaded was + // evicted between the Load and the mu acquisition. Two iterations + // is the realistic ceiling (sweep / reconciliation can't repeatedly + // evict the same fresh bucket without time advancing past + // evictedAfter); the cap keeps a pathological invariant violation + // from spinning the goroutine. + for range 4 { + bucket := b.loadOrInit(queue, resolvedAction, incarnation, capacity, refill) + outcome, retry := chargeBucket(bucket, b.clock(), count) + if !retry { + return outcome + } + } + // Loop exhaustion means we kept finding a freshly-evicted bucket + // for four passes without ever landing on a live one — pathological + // (sweep / reconciliation cannot repeatedly evict the same fresh + // bucket without time advancing past evictedAfter). Fail closed + // rather than allow the request: a fail-open here would turn the + // invalidate/reconcile race into a throttle bypass on the exact + // path that is supposed to enforce limits. Returning a Throttling + // response with a + // non-zero retryAfter gives the client a normal back-off cue + // rather than a hard 500 — the next attempt will almost certainly + // land on the now-stable bucket. + return chargeOutcome{ + allowed: false, + retryAfter: time.Second, + bucketPresent: false, + } +} + +// chargeBucket runs the under-mu refill+take for a single bucket and +// returns retry=true if the caller should drop the reference and reload +// from the store. Retry is the orphan-bucket signal: sweep / +// invalidateQueue / loadOrInit reconciliation set evicted=true under +// mu before dropping the bucket from the map, so a goroutine that +// loaded the bucket pre-eviction can detect it here. +func chargeBucket(bucket *tokenBucket, now time.Time, count int) (chargeOutcome, bool) { + bucket.mu.Lock() + defer bucket.mu.Unlock() + if bucket.evicted { + return chargeOutcome{}, true + } + // Refill before reading: tokens accrue at refillRate * elapsed, + // capped at the configured capacity. This is the single place that + // advances tokens forward in time so the "fresh bucket on failover" + // guarantee from §3.1 holds: a new leader's bucket starts at full + // capacity and refills only based on elapsed time on this process. + if elapsed := now.Sub(bucket.lastRefill).Seconds(); elapsed > 0 { + bucket.tokens += elapsed * bucket.refillRate + if bucket.tokens > bucket.capacity { + bucket.tokens = bucket.capacity + } + bucket.lastRefill = now + } + requested := float64(count) + if bucket.tokens >= requested { + bucket.tokens -= requested + return chargeOutcome{allowed: true, tokensAfter: bucket.tokens, bucketPresent: true}, false + } + // Reject the whole batch — partial throttling within a batch is + // hard to reason about and AWS rejects the whole call. + return chargeOutcome{ + allowed: false, + retryAfter: computeRetryAfter(requested, bucket.tokens, bucket.refillRate), + tokensAfter: bucket.tokens, + bucketPresent: true, + }, false +} + +// loadOrInit handles the first-use insert race. Two concurrent first +// requests for the same (queue, action, incarnation) both arrive at +// LoadOrStore; one wins and the loser's freshly-built bucket is +// discarded. This is safe because both racers compute identical +// (capacity, refillRate) from the same meta snapshot — the bucket +// they would build is behaviourally interchangeable. +// +// Reconciliation against stale config: if a +// cached bucket's capacity/refillRate differ from the cfg's current +// values, the bucket is replaced with a fresh one built from the +// current config. Without this check, a node that lost leadership +// during a SetQueueAttributes commit and then regained leadership +// later would keep enforcing the prior leader-term's limits — the +// SetQueueAttributes invalidation only runs on the leader that +// processed the commit, so a different leader's stale buckets +// survive. The reconciliation also covers the case where the +// invalidation gate in setQueueAttributes is bypassed (e.g. by a +// future admin path that mutates throttle config without touching +// SetQueueAttributes). +// +// incarnation participates in the key: a +// DeleteQueue+CreateQueue cycle bumps Generation, so the new +// incarnation lands in a different map entry and starts from a +// fresh full bucket regardless of what stale per-process cache +// the prior incarnation left on this or any other node. +func (b *bucketStore) loadOrInit(queue, action string, incarnation uint64, capacity, refill float64) *tokenBucket { + key := bucketKey{queue: queue, action: action, incarnation: incarnation} + if v, ok := b.buckets.Load(key); ok { + // type assertion is sound: only tokenBucket pointers are stored. + bucket, _ := v.(*tokenBucket) + // Cheap field comparison under the bucket's own lock — if the + // cached bucket matches the current config we return it + // directly. A mismatch means the on-disk meta moved while + // this node held a stale bucket; rebuild from the current + // config (full capacity, matching the failover semantics). + // + // Hold mu across the matches check, the CompareAndDelete, and + // the evicted=true flag. The + // earlier ordering — unlock-after-matches, CompareAndDelete, + // re-lock-and-flag — left a window between the + // CompareAndDelete success and the re-lock during which a + // concurrent charger blocked on bucket.mu could acquire it, + // see evicted=false (the flag is only set on the next-but-one + // statement), and spend tokens against the orphaned bucket. + // Subsequent requests would mint a fresh full-capacity bucket + // via LoadOrStore — a one-time burst above the configured + // limit right after a config reconciliation. Holding mu + // across the whole sequence makes the lock-then-delete-then- + // flag ordering match sweep / invalidateQueue, so a charger + // blocked on mu sees evicted=true on entry and retries. + bucket.mu.Lock() + matches := bucket.capacity == capacity && bucket.refillRate == refill + if matches { + bucket.mu.Unlock() + return bucket + } + // CompareAndDelete is mandatory here: an unconditional Delete + // races against a concurrent goroutine that already detected + // the same mismatch and replaced the entry with its own fresh + // bucket — our Delete would evict its fresh entry, then our + // LoadOrStore would put another fresh bucket. The map ends up + // holding our bucket, but the racer's bucket might have + // already been handed out via LoadOrStore to a third + // goroutine that is now charging a bucket no longer in the + // map, while later requests get a different fresh bucket at + // full capacity. CompareAndDelete makes our Delete a no-op + // when the map already holds someone else's fresh bucket. + if b.buckets.CompareAndDelete(key, v) { + bucket.evicted = true + } + bucket.mu.Unlock() + // fall through to LoadOrStore — a concurrent racer might + // have already inserted a fresh bucket with the current + // config, in which case LoadOrStore picks it up and the new + // bucket below is discarded. + } + now := b.clock() + fresh := &tokenBucket{ + capacity: capacity, + refillRate: refill, + tokens: capacity, // start at full capacity, matches failover semantics. + lastRefill: now, + } + actual, _ := b.buckets.LoadOrStore(key, fresh) + bucket, _ := actual.(*tokenBucket) + return bucket +} + +// invalidateQueue drops every bucket belonging to the named queue. +// Called *after* the Raft commit on SetQueueAttributes / DeleteQueue / +// CreateQueue so the next request rebuilds from the freshly committed +// meta. Mirrors sweep's lock-then-delete-then-flag ordering: +// +// 1. Load the pointer from the map (without deleting). +// 2. Acquire bucket.mu so any concurrent charger either runs to +// completion before us (and we observe its updated lastRefill +// but still proceed — invalidation is unconditional) or blocks +// on mu until we set evicted=true. +// 3. CompareAndDelete with the loaded v to avoid evicting a +// replacement bucket inserted by a concurrent reconciliation. +// 4. Set evicted=true under mu so the next mu acquirer sees the +// orphan signal and retries against the live entry. +// +// LoadAndDelete-then-lock would let a concurrent charger that +// loaded the pointer pre-LoadAndDelete acquire mu first and charge +// while evicted is still false, then later requests would create a +// fresh full-capacity bucket — a 2x burst on every invalidation +// event. +func (b *bucketStore) invalidateQueue(queue string) { + if b == nil { + return + } + // Incarnation participates in the key: we do + // not know which incarnations have buckets cached, so range the map + // and remove any entry whose queue matches. A SetQueueAttributes + // invalidation on the same incarnation must drop the same-incarnation + // bucket so the new throttle config takes effect; a DeleteQueue / + // CreateQueue cycle would also drop any pre-existing incarnation's + // bucket here, although those entries also fall out via idle + // eviction since the new incarnation lands under a different key + // and the old key never sees traffic again. + b.buckets.Range(func(k, v any) bool { + key, _ := k.(bucketKey) + if key.queue != queue { + return true + } + bucket, _ := v.(*tokenBucket) + bucket.mu.Lock() + if b.buckets.CompareAndDelete(k, v) { + bucket.evicted = true + } + bucket.mu.Unlock() + return true + }) +} + +// runSweepLoop runs the idle-evict sweep on a background ticker so +// the request hot path never pays the O(N) sync.Map.Range cost: a +// many-queue cluster would otherwise see latency spikes on whichever +// request was unlucky enough to trigger the per-minute on-hot-path +// sweep. Returns when ctx is done — the +// SQSServer wires this to s.reaperCtx so a Stop() call cleans the +// goroutine up alongside the existing reaper. +func (b *bucketStore) runSweepLoop(ctx context.Context) { + if b == nil || b.evictedAfter <= 0 || b.sweepEvery <= 0 { + return + } + t := time.NewTicker(b.sweepEvery) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + b.sweep() + } + } +} + +// sweep walks the bucket store dropping any bucket idle longer than +// evictedAfter. Called from runSweepLoop on a background ticker — +// the ticker is the only caller, so sweep() does not need its own +// serialisation. Bucket lookups stay O(1) on the hot path; sweep +// iterates every entry under the per-bucket lock so it can re-check +// idle and the map entry atomically. +// +// Eviction race: three +// guards work together to keep idle eviction from inflating the burst +// budget for a queue: +// 1. Hold bucket.mu across the Delete so the idle observation +// cannot be invalidated between check and delete. A concurrent +// charge() that loaded the bucket either runs to completion +// before sweep acquires mu (sweep then sees the updated +// lastRefill and skips delete) or blocks on mu until sweep +// releases — and on release sees evicted=true and retries. +// 2. CompareAndDelete with v ensures sweep does not evict a +// replacement bucket inserted by invalidateQueue or a future +// reconciliation path. +// 3. Set evicted=true under mu after a successful CompareAndDelete. +// Without this signal, goroutines that loaded the bucket +// pre-eviction and were blocked on mu would charge the orphan +// after release while later requests get a fresh full-capacity +// bucket — a one-time burst of up to 2× capacity per evict +// cycle on workloads where requests align with sweep ticks. +// +// Holding bucket.mu across sync.Map.Delete is safe — sync.Map.Load +// is wait-free on the read-only path and never blocks waiting for +// bucket.mu, so there is no AB-BA cycle with charge(). +func (b *bucketStore) sweep() { + cutoff := b.clock().Add(-b.evictedAfter) + b.buckets.Range(func(k, v any) bool { + bucket, _ := v.(*tokenBucket) + bucket.mu.Lock() + if bucket.lastRefill.Before(cutoff) { + if b.buckets.CompareAndDelete(k, v) { + bucket.evicted = true + } + } + bucket.mu.Unlock() + return true + }) +} + +// resolveActionConfig maps a charge() action to (effective bucket +// action, capacity, refillRate) from cfg. Send* and Recv* keep their +// own buckets when configured; otherwise the action falls through to +// the Default bucket and gets the canonical "*" key so all +// fall-through actions share one bucket. Returning (_, 0, 0) means +// "no throttle for this action" and the caller short-circuits. +func resolveActionConfig(cfg *sqsQueueThrottle, action string) (string, float64, float64) { + switch action { + case bucketActionSend: + if cfg.SendCapacity > 0 { + return bucketActionSend, cfg.SendCapacity, cfg.SendRefillPerSecond + } + case bucketActionReceive: + if cfg.RecvCapacity > 0 { + return bucketActionReceive, cfg.RecvCapacity, cfg.RecvRefillPerSecond + } + } + if cfg.DefaultCapacity > 0 { + return bucketActionAny, cfg.DefaultCapacity, cfg.DefaultRefillPerSecond + } + return action, 0, 0 +} + +// throttleRetryAfterCap bounds the Retry-After value the client sees. +// Without a cap, a tiny refillRate plus +// a large requested count would compute a multi-day wait — and +// time.Duration arithmetic can overflow at the upper end. One hour +// matches the bucket store's idle-evict window: by the time the +// suggested retry would otherwise expire, the bucket would have +// been evicted and rebuilt at full capacity anyway, so a longer +// suggestion is meaningless. Producers that hit the cap are also +// strongly mis-configured; capping is a guard rail, not a feature. +const throttleRetryAfterCap = time.Hour + +// computeRetryAfter implements the §3.4 formula: +// +// needed := requested - currentTokens +// secondsToNextRefill := ceil(needed / refillRate) +// retryAfter := max(1, int(secondsToNextRefill)) +// +// requested is the same count the charge step uses (1 for single-message +// verbs, len(Entries) for batch verbs). The min-1 floor matches the +// HTTP/1.1 §10.2.3 integer-second granularity. The validator keeps +// refillRate > 0 so no divide-by-zero guard is needed. +// +// Capped at throttleRetryAfterCap to bound time.Duration arithmetic +// against pathologically small refillRate / large requested values. +func computeRetryAfter(requested, current, refillRate float64) time.Duration { + needed := requested - current + if needed <= 0 { + // Pathological — caller invoked us with allowed=false but + // tokens >= requested. Treat as "wait one tick" rather than + // zero so the client backs off at least once. + return time.Second + } + secs := math.Ceil(needed / refillRate) + if secs < 1 { + secs = 1 + } + // Cap before multiplying to avoid time.Duration overflow on + // pathological inputs (e.g. refillRate just above zero). + const capSecs = float64(throttleRetryAfterCap / time.Second) + if secs > capSecs { + secs = capSecs + } + return time.Duration(secs) * time.Second +} + +// throttleChargeCount maps a request to the token count the bucket +// should be charged for. Single-message verbs charge 1; batch verbs +// charge len(Entries). The bucket store itself takes count as a +// parameter so this helper can stay close to the wire-protocol layer +// in the request path. +func throttleChargeCount(entries int) int { + if entries < 1 { + return 1 + } + return entries +} + +// chargeQueue is the per-handler entry point used by handlers that +// do not pre-load the queue meta before the throttle check +// (deleteMessage, changeMessageVisibility, sendMessageBatch, +// receiveMessageBatch, deleteMessageBatch — the batch handlers run +// the throttle check OUTSIDE their per-entry OCC retry loop, so the +// meta they later load inside the loop is not in scope here). It +// loads the meta at a fresh read timestamp (Pebble cache makes this +// cheap) and runs the bucket store's charge against the queue's +// Throttle config. +// +// Handlers that DO pre-load the meta before charging — sendMessage +// and receiveMessage — should use chargeQueueWithThrottle to avoid +// the redundant load. The batch handlers +// could be refactored similarly but each entry-loop iteration would +// then need its own lookup, defeating the savings; the single +// throttle-time meta load there pays for the whole batch. +// +// chargeQueue intentionally swallows missing-queue errors: the caller +// is going to discover that the queue does not exist a few lines +// later and respond with QueueDoesNotExist. Letting the throttle +// check race the catalog read avoids two lookups in the fast path. +// +// Designed to sit OUTSIDE the OCC transaction (§4.2): a rejected +// request never reaches the coordinator. The retry loop in +// sendMessageWithRetry et al. would otherwise busy-loop on a +// permanent rate-limit reject, burning leader CPU. +func (s *SQSServer) chargeQueue(w http.ResponseWriter, r *http.Request, queueName, action string, count int) bool { + if s.throttle == nil { + return true + } + throttle, incarnation, err := s.queueThrottleConfig(r, queueName) + if err != nil { + // Fail closed on a transient storage error. Earlier code + // converted the error to (nil, 0) + // which made the throttle check short-circuit to "allowed"; + // if the same storage hiccup did not also break the OCC + // commit a few lines later, the request would be processed + // unthrottled — a silent rate-limit bypass under storage + // instability. 500 here matches what the OCC layer would + // also surface for a meta read failure. + writeSQSErrorFromErr(w, err) + return false + } + return s.chargeQueueWithThrottle(w, queueName, action, count, throttle, incarnation) +} + +// chargeQueueWithThrottle is the variant for handlers that already +// have the throttle config in hand from their own meta load. Drops +// the per-request meta load chargeQueue does, avoiding redundant +// storage reads on the hot path. incarnation is +// sqsQueueMeta.Incarnation: it must come from the same meta snapshot +// the throttle config was read from so a recreate committed +// mid-request lands in a fresh bucket on the next call rather than +// mixing tokens with the prior incarnation. NOTE: meta.Incarnation, +// NOT meta.Generation — PurgeQueue bumps Generation but preserves +// Incarnation, so keying the bucket by Generation would let a caller +// bypass the rate limit by repeatedly purging. +func (s *SQSServer) chargeQueueWithThrottle(w http.ResponseWriter, queueName, action string, count int, throttle *sqsQueueThrottle, incarnation uint64) bool { + if s.throttle == nil { + return true + } + outcome := s.throttle.charge(throttle, queueName, action, incarnation, count) + if outcome.allowed { + return true + } + writeSQSThrottlingError(w, queueName, action, outcome.retryAfter) + return false +} + +// queueThrottleConfig loads the Throttle config and Incarnation off a +// queue's meta record. Incarnation participates in the bucket key, so +// it must travel with the throttle snapshot to avoid a stale-meta +// read that mints a fresh bucket under the wrong incarnation. +// +// Returns: +// - (cfg, incarnation, nil) on a successful read of an existing queue. +// - (nil, 0, nil) when the queue does not exist — the caller's +// handler will surface QueueDoesNotExist a few lines later, and a +// nil throttle config short-circuits the charge to "allowed". +// - (nil, 0, err) on a storage-layer error. The caller MUST fail +// the request closed; converting an error to (nil, 0) silently +// would let traffic bypass the throttle check during a transient +// storage outage if the OCC commit later succeeded. +// +// Held as a method on *SQSServer so a test can swap the meta loader +// via the existing nextTxnReadTS / loadQueueMetaAt seam. +func (s *SQSServer) queueThrottleConfig(r *http.Request, queueName string) (*sqsQueueThrottle, uint64, error) { + if s.store == nil { + return nil, 0, nil + } + readTS := s.nextTxnReadTS(r.Context()) + meta, exists, err := s.loadQueueMetaAt(r.Context(), queueName, readTS) + if err != nil { + return nil, 0, errors.WithStack(err) + } + if !exists || meta == nil { + return nil, 0, nil + } + return meta.Throttle, meta.Incarnation, nil +} diff --git a/adapter/sqs_throttle_integration_test.go b/adapter/sqs_throttle_integration_test.go new file mode 100644 index 000000000..26f0b6880 --- /dev/null +++ b/adapter/sqs_throttle_integration_test.go @@ -0,0 +1,376 @@ +package adapter + +import ( + "net/http" + "strconv" + "testing" +) + +// TestSQSServer_Throttle_DefaultOff_AllowsUnboundedSends pins the +// default-off contract: a queue without any Throttle* attributes +// accepts arbitrary send volume. The hot path for unconfigured queues +// must never reject — anything else would change steady-state +// behaviour for every existing deployment that hasn't opted in. +func TestSQSServer_Throttle_DefaultOff_AllowsUnboundedSends(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + url := mustCreateQueue(t, node, "throttle-off") + for i := range 50 { + status, _ := callSQS(t, node, sqsSendMessageTarget, map[string]any{ + "QueueUrl": url, + "MessageBody": "msg-" + strconv.Itoa(i), + }) + if status != http.StatusOK { + t.Fatalf("default-off send #%d: status %d", i+1, status) + } + } +} + +// TestSQSServer_Throttle_SendBucketRejectsAfterCapacity is the §6 +// item 2 end-to-end test: configure SendCapacity=10 RefillPerSec=1, +// send 10 → 200, send 11th immediately → 400 Throttling with +// Retry-After. Pins the wire-level contract: the 400 carries the +// AWS-shaped error envelope and the Retry-After header. +func TestSQSServer_Throttle_SendBucketRejectsAfterCapacity(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + url := mustCreateQueue(t, node, "throttle-send") + mustSetQueueAttributes(t, node, url, map[string]string{ + "ThrottleSendCapacity": "10", + "ThrottleSendRefillPerSecond": "1", + }) + for i := range 10 { + status, _ := callSQS(t, node, sqsSendMessageTarget, map[string]any{ + "QueueUrl": url, + "MessageBody": "msg-" + strconv.Itoa(i), + }) + if status != http.StatusOK { + t.Fatalf("send #%d: status %d (expected 200)", i+1, status) + } + } + // 11th must reject with Throttling + Retry-After. + resp := postSQSRequest(t, "http://"+node.sqsAddress+"/", sqsSendMessageTarget, `{"QueueUrl":"`+url+`","MessageBody":"overflow"}`) + defer resp.Body.Close() + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("11th send: status %d (expected 400)", resp.StatusCode) + } + if got := resp.Header.Get("x-amzn-ErrorType"); got != sqsErrThrottling { + t.Fatalf("11th send: x-amzn-ErrorType=%q (expected Throttling)", got) + } + if ra := resp.Header.Get("Retry-After"); ra != "1" { + t.Fatalf("11th send: Retry-After=%q (expected 1)", ra) + } +} + +// TestSQSServer_Throttle_RecvBucketRejectsAfterCapacity is the +// receive-side mirror of the send test. ReceiveMessage charges 1 +// from Recv regardless of MaxNumberOfMessages, so 11 calls drain a +// capacity-10 bucket. Empty queue is fine — the throttle check sits +// before any catalog read. +func TestSQSServer_Throttle_RecvBucketRejectsAfterCapacity(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + url := mustCreateQueue(t, node, "throttle-recv") + mustSetQueueAttributes(t, node, url, map[string]string{ + "ThrottleRecvCapacity": "10", + "ThrottleRecvRefillPerSecond": "1", + }) + for i := range 10 { + status, _ := callSQS(t, node, sqsReceiveMessageTarget, map[string]any{"QueueUrl": url}) + if status != http.StatusOK { + t.Fatalf("recv #%d: status %d (expected 200)", i+1, status) + } + } + resp := postSQSRequest(t, "http://"+node.sqsAddress+"/", sqsReceiveMessageTarget, `{"QueueUrl":"`+url+`"}`) + defer resp.Body.Close() + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("11th recv: status %d (expected 400)", resp.StatusCode) + } + if got := resp.Header.Get("x-amzn-ErrorType"); got != sqsErrThrottling { + t.Fatalf("11th recv: x-amzn-ErrorType=%q", got) + } +} + +// TestSQSServer_Throttle_BatchChargesByEntryCount pins the §3.3 +// charging table for SendMessageBatch: 10 entries → 10 tokens. With +// SendCapacity=10 a single 10-entry batch drains the bucket and the +// next single SendMessage rejects. +func TestSQSServer_Throttle_BatchChargesByEntryCount(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + url := mustCreateQueue(t, node, "throttle-batch") + mustSetQueueAttributes(t, node, url, map[string]string{ + "ThrottleSendCapacity": "10", + "ThrottleSendRefillPerSecond": "1", + }) + entries := make([]map[string]any, 10) + for i := range entries { + entries[i] = map[string]any{ + "Id": "id" + strconv.Itoa(i), + "MessageBody": "body-" + strconv.Itoa(i), + } + } + status, _ := callSQS(t, node, sqsSendMessageBatchTarget, map[string]any{ + "QueueUrl": url, + "Entries": entries, + }) + if status != http.StatusOK { + t.Fatalf("10-entry batch: status %d (expected 200)", status) + } + // Bucket now empty. Single send must reject. + resp := postSQSRequest(t, "http://"+node.sqsAddress+"/", sqsSendMessageTarget, `{"QueueUrl":"`+url+`","MessageBody":"overflow"}`) + defer resp.Body.Close() + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("post-batch single send: status %d (expected 400)", resp.StatusCode) + } +} + +// TestSQSServer_Throttle_SetQueueAttributesInvalidatesBucket pins the +// §3.1 cache-invalidation contract for SetQueueAttributes: a raise of +// the limit must take effect on the very next request, not after the +// 1h idle-evict sweep. Without invalidation the test's final send +// would still reject under the old (now-empty) bucket. +func TestSQSServer_Throttle_SetQueueAttributesInvalidatesBucket(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + url := mustCreateQueue(t, node, "throttle-invalidate") + mustSetQueueAttributes(t, node, url, map[string]string{ + "ThrottleSendCapacity": "10", + "ThrottleSendRefillPerSecond": "1", + }) + // Drain. + for range 10 { + status, _ := callSQS(t, node, sqsSendMessageTarget, map[string]any{ + "QueueUrl": url, "MessageBody": "drain", + }) + if status != http.StatusOK { + t.Fatalf("drain send: status %d", status) + } + } + // Sanity-check exhaustion. + status, _ := callSQS(t, node, sqsSendMessageTarget, map[string]any{ + "QueueUrl": url, "MessageBody": "should-throttle", + }) + if status != http.StatusBadRequest { + t.Fatalf("expected throttle, got %d", status) + } + // Raise capacity and refill. + mustSetQueueAttributes(t, node, url, map[string]string{ + "ThrottleSendCapacity": "20", + "ThrottleSendRefillPerSecond": "20", + }) + // Immediate send must succeed — a fresh bucket starts at capacity. + status, _ = callSQS(t, node, sqsSendMessageTarget, map[string]any{ + "QueueUrl": url, "MessageBody": "post-set", + }) + if status != http.StatusOK { + t.Fatalf("post-SetQueueAttributes send: status %d (expected 200; bucket invalidation broken)", status) + } +} + +// TestSQSServer_Throttle_DeleteQueueInvalidatesBucket pins the §3.1 +// cache-invalidation contract for DeleteQueue: a same-name recreate +// gets a fresh bucket, not the stale balance from the previous +// incarnation. Without invalidation the post-recreate send would +// inherit the drained bucket and reject. +func TestSQSServer_Throttle_DeleteQueueInvalidatesBucket(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + queue := "throttle-recreate" + url := mustCreateQueue(t, node, queue) + mustSetQueueAttributes(t, node, url, map[string]string{ + "ThrottleSendCapacity": "10", + "ThrottleSendRefillPerSecond": "1", + }) + for range 10 { + _, _ = callSQS(t, node, sqsSendMessageTarget, map[string]any{ + "QueueUrl": url, "MessageBody": "drain", + }) + } + // Delete and recreate. + if status, _ := callSQS(t, node, sqsDeleteQueueTarget, map[string]any{"QueueUrl": url}); status != http.StatusOK { + t.Fatalf("delete: %d", status) + } + url2 := mustCreateQueue(t, node, queue) + mustSetQueueAttributes(t, node, url2, map[string]string{ + "ThrottleSendCapacity": "10", + "ThrottleSendRefillPerSecond": "1", + }) + // First send on the recreated queue must succeed (full-capacity bucket). + status, _ := callSQS(t, node, sqsSendMessageTarget, map[string]any{ + "QueueUrl": url2, "MessageBody": "fresh", + }) + if status != http.StatusOK { + t.Fatalf("post-recreate send: status %d (bucket invalidation on DeleteQueue broken)", status) + } +} + +// TestSQSServer_Throttle_GetQueueAttributesRoundTrip pins the §6 item +// 4 contract: SetQueueAttributes(Throttle*) followed by +// GetQueueAttributes("All") returns the same values. SDKs use the +// round-trip to confirm the config landed; a missing field on +// GetQueueAttributes would make operators think the SetQueueAttributes +// call silently failed. +func TestSQSServer_Throttle_GetQueueAttributesRoundTrip(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + url := mustCreateQueue(t, node, "throttle-roundtrip") + mustSetQueueAttributes(t, node, url, map[string]string{ + "ThrottleSendCapacity": "100", + "ThrottleSendRefillPerSecond": "50", + "ThrottleRecvCapacity": "20", + "ThrottleRecvRefillPerSecond": "5", + }) + status, out := callSQS(t, node, sqsGetQueueAttributesTarget, map[string]any{ + "QueueUrl": url, + "AttributeNames": []string{"All"}, + }) + if status != http.StatusOK { + t.Fatalf("GetQueueAttributes: status %d", status) + } + attrs, _ := out["Attributes"].(map[string]any) + if attrs == nil { + t.Fatalf("missing Attributes in response: %v", out) + } + expect := map[string]string{ + "ThrottleSendCapacity": "100", + "ThrottleSendRefillPerSecond": "50", + "ThrottleRecvCapacity": "20", + "ThrottleRecvRefillPerSecond": "5", + } + for k, want := range expect { + got, _ := attrs[k].(string) + if got != want { + t.Fatalf("attr %s: got %q want %q (full attrs: %v)", k, got, want, attrs) + } + } +} + +// TestSQSServer_Throttle_RejectsCapacityBelowBatchMin pins the §3.2 +// validator's batch-floor rule: SendCapacity < 10 makes every full +// batch permanently unserviceable. The SetQueueAttributes call is +// rejected with InvalidAttributeValue rather than landing. +func TestSQSServer_Throttle_RejectsCapacityBelowBatchMin(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + url := mustCreateQueue(t, node, "throttle-bad-cap") + status, out := callSQS(t, node, sqsSetQueueAttributesTarget, map[string]any{ + "QueueUrl": url, + "Attributes": map[string]string{ + "ThrottleSendCapacity": "5", + "ThrottleSendRefillPerSecond": "1", + }, + }) + if status != http.StatusBadRequest { + t.Fatalf("expected 400 for SendCapacity=5 (< batch min 10), got %d body %v", status, out) + } + if got, _ := out["__type"].(string); got != sqsErrInvalidAttributeValue { + t.Fatalf("expected __type=InvalidAttributeValue, got %q", got) + } +} + +// --- helpers --- + +func mustCreateQueue(t *testing.T, node Node, name string) string { + t.Helper() + status, out := callSQS(t, node, sqsCreateQueueTarget, map[string]any{"QueueName": name}) + if status != http.StatusOK { + t.Fatalf("createQueue %q: status %d body %v", name, status, out) + } + url, _ := out["QueueUrl"].(string) + if url == "" { + t.Fatalf("createQueue %q: missing QueueUrl", name) + } + return url +} + +// TestSQSServer_Throttle_NoOpSetQueueAttributesPreservesBucket pins +// the no-op gate. Earlier code invalidated the +// throttle bucket whenever any Throttle* attribute appeared in a +// SetQueueAttributes request — including same-value writes. A caller +// could therefore force the bucket back to full capacity by +// re-submitting their own current throttle config in a tight loop, +// effectively bypassing the rate limit. The fix gates the +// invalidation on a real value change (snapshot before applyAttributes, +// throttleConfigEqual after), so a no-op SetQueueAttributes leaves +// the in-memory bucket alone. +func TestSQSServer_Throttle_NoOpSetQueueAttributesPreservesBucket(t *testing.T) { + t.Parallel() + nodes, _, _ := createNode(t, 1) + defer shutdown(nodes) + node := sqsLeaderNode(t, nodes) + + url := mustCreateQueue(t, node, "throttle-noop-set") + mustSetQueueAttributes(t, node, url, map[string]string{ + "ThrottleSendCapacity": "10", + "ThrottleSendRefillPerSecond": "1", + }) + // Drain the bucket so the next charge would only succeed if the + // bucket was reset to a fresh full-capacity replacement. + for range 10 { + status, _ := callSQS(t, node, sqsSendMessageTarget, map[string]any{ + "QueueUrl": url, "MessageBody": "drain", + }) + if status != http.StatusOK { + t.Fatalf("drain send: status %d", status) + } + } + // Sanity-check: drained bucket rejects. + status, _ := callSQS(t, node, sqsSendMessageTarget, map[string]any{ + "QueueUrl": url, "MessageBody": "should-throttle", + }) + if status != http.StatusBadRequest { + t.Fatalf("expected throttle, got %d", status) + } + // Re-submit identical Throttle* values. Old code invalidated on + // key presence and the next send would have been allowed against + // a fresh full bucket. + mustSetQueueAttributes(t, node, url, map[string]string{ + "ThrottleSendCapacity": "10", + "ThrottleSendRefillPerSecond": "1", + }) + // Bucket must still be drained — no-op SetQueueAttributes must not + // reset the rate limiter. + status, _ = callSQS(t, node, sqsSendMessageTarget, map[string]any{ + "QueueUrl": url, "MessageBody": "post-noop-set", + }) + if status != http.StatusBadRequest { + t.Fatalf("post-no-op SetQueueAttributes send: status %d (expected 400; "+ + "no-op invalidate-bypass regression)", status) + } +} + +func mustSetQueueAttributes(t *testing.T, node Node, url string, attrs map[string]string) { + t.Helper() + status, out := callSQS(t, node, sqsSetQueueAttributesTarget, map[string]any{ + "QueueUrl": url, + "Attributes": attrs, + }) + if status != http.StatusOK { + t.Fatalf("setQueueAttributes(%v): status %d body %v", attrs, status, out) + } +} diff --git a/adapter/sqs_throttle_test.go b/adapter/sqs_throttle_test.go new file mode 100644 index 000000000..eddc732d4 --- /dev/null +++ b/adapter/sqs_throttle_test.go @@ -0,0 +1,880 @@ +package adapter + +import ( + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +// TestBucketStore_DefaultOff_ShortCircuit pins the contract that a +// nil throttle config never allocates a bucket and never rejects. +// This is the hot path for unconfigured queues — every nil-check that +// short-circuits keeps the per-request cost at one map-load on the +// SQSServer struct and one nil-comparison in charge(). +func TestBucketStore_DefaultOff_ShortCircuit(t *testing.T) { + t.Parallel() + store := newBucketStoreDefault() + for range 100 { + out := store.charge(nil, "orders", bucketActionSend, 1, 1) + require.True(t, out.allowed) + require.False(t, out.bucketPresent, "nil cfg must not allocate a bucket") + } +} + +// TestBucketStore_Empty_ShortCircuit covers the post-validator +// canonicalisation path: an all-zero sqsQueueThrottle is equivalent +// to nil. Without this branch, a queue whose operator wrote +// "ThrottleSendCapacity=0" would still pay the bucket allocation. +func TestBucketStore_Empty_ShortCircuit(t *testing.T) { + t.Parallel() + store := newBucketStoreDefault() + out := store.charge(&sqsQueueThrottle{}, "orders", bucketActionSend, 1, 1) + require.True(t, out.allowed) + require.False(t, out.bucketPresent) +} + +// TestBucketStore_FreshAllowsUpToCapacity checks the fresh-bucket +// initial-state contract: a brand-new bucket starts at full capacity +// and accepts exactly that many tokens before rejecting the next one. +// This matches both the AWS rate-limiter behaviour and the §3.1 +// failover semantic ("fresh bucket on failover starts at capacity"). +func TestBucketStore_FreshAllowsUpToCapacity(t *testing.T) { + t.Parallel() + cfg := &sqsQueueThrottle{SendCapacity: 10, SendRefillPerSecond: 1} + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + for i := range 10 { + out := store.charge(cfg, "orders", bucketActionSend, 1, 1) + require.True(t, out.allowed, "send %d must be allowed", i+1) + } + out := store.charge(cfg, "orders", bucketActionSend, 1, 1) + require.False(t, out.allowed, "11th send must be rejected") + require.Equal(t, time.Second, out.retryAfter, "Retry-After floor is 1s") +} + +// TestBucketStore_RefillBetweenChargesUsesElapsed pins the refill +// math: tokens accrue at refillRate per elapsed second, capped at +// capacity. Time is injected so the test does not race the wall +// clock. +func TestBucketStore_RefillBetweenChargesUsesElapsed(t *testing.T) { + t.Parallel() + cfg := &sqsQueueThrottle{SendCapacity: 10, SendRefillPerSecond: 5} + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + // Drain. + for range 10 { + require.True(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + } + require.False(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + // Advance 1.5s → 7.5 tokens accrued (capped under capacity 10). + now = now.Add(1500 * time.Millisecond) + for range 7 { + require.True(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed, + "after 1.5s refill at 5 RPS, 7 sends must succeed") + } + // 8th must reject — only 7.5 tokens accrued, charged 7, leaves 0.5. + require.False(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) +} + +// TestBucketStore_RefillCapsAtCapacity pins the upper bound on +// long-idle refill: a queue idle for an hour does NOT come back with +// 3600 tokens — the bucket caps at the configured capacity. +func TestBucketStore_RefillCapsAtCapacity(t *testing.T) { + t.Parallel() + cfg := &sqsQueueThrottle{SendCapacity: 10, SendRefillPerSecond: 1} + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, 2*time.Hour) + require.True(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + now = now.Add(time.Hour) // 3600 seconds, would be 3600 tokens uncapped + for range 10 { + require.True(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + } + require.False(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed, + "refill capped at capacity: 11th send post-idle must reject") +} + +// TestBucketStore_BatchRejectsWholeBatchWhenShort pins the §3.3 +// "batch verbs charge before dispatching individual entries" rule. +// A bucket with 3 tokens facing a 10-entry batch rejects the whole +// call and consumes nothing — partial-credit behaviour would make the +// "I have 3, you wanted 10" semantics ambiguous and AWS itself +// rejects the whole call. +func TestBucketStore_BatchRejectsWholeBatchWhenShort(t *testing.T) { + t.Parallel() + cfg := &sqsQueueThrottle{SendCapacity: 10, SendRefillPerSecond: 1} + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + // Drain to 3. + for range 7 { + require.True(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + } + // Try a 10-entry batch — should reject without consuming the 3. + out := store.charge(cfg, "orders", bucketActionSend, 1, 10) + require.False(t, out.allowed) + require.Equal(t, 7*time.Second, out.retryAfter, + "Retry-After computed from (10-3)/1 = 7s") + // The 3 leftover tokens are still spendable. + for range 3 { + require.True(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed, + "the rejected batch must not have drained the leftover credit") + } +} + +// TestBucketStore_RetryAfterUsesRequestedCount pins the §3.4 fix +// where the formula's numerator is the requested count, not 1. A +// SendMessageBatch of 10 against refillRate=1 with 0 tokens needs 10s +// to refill, not 1s — telling the client to wait 1s creates a busy- +// loop of premature retries that all reject again. +func TestBucketStore_RetryAfterUsesRequestedCount(t *testing.T) { + t.Parallel() + cfg := &sqsQueueThrottle{SendCapacity: 10, SendRefillPerSecond: 1} + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + for range 10 { + store.charge(cfg, "orders", bucketActionSend, 1, 1) + } + // Now batch of 10 against an empty bucket: needs 10s to refill. + out := store.charge(cfg, "orders", bucketActionSend, 1, 10) + require.False(t, out.allowed) + require.Equal(t, 10*time.Second, out.retryAfter) +} + +// TestBucketStore_RetryAfterFloorWithSlowRefill pins the §3.4 rule +// for sub-1-RPS rates: SendRefillPerSecond=0.1 with 0 tokens needs +// 10s for the next single token, not 1s. +func TestBucketStore_RetryAfterFloorWithSlowRefill(t *testing.T) { + t.Parallel() + cfg := &sqsQueueThrottle{SendCapacity: 10, SendRefillPerSecond: 0.1} + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + for range 10 { + store.charge(cfg, "orders", bucketActionSend, 1, 1) + } + out := store.charge(cfg, "orders", bucketActionSend, 1, 1) + require.False(t, out.allowed) + require.Equal(t, 10*time.Second, out.retryAfter) +} + +// TestBucketStore_ActionsHaveSeparateBuckets pins the (queue, action) +// granularity: a Send-bucket exhaustion does not leak into the Recv +// bucket's accounting and vice versa. +func TestBucketStore_ActionsHaveSeparateBuckets(t *testing.T) { + t.Parallel() + cfg := &sqsQueueThrottle{ + SendCapacity: 10, SendRefillPerSecond: 1, + RecvCapacity: 10, RecvRefillPerSecond: 1, + } + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + // Drain Send. + for range 10 { + require.True(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + } + require.False(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + // Recv must still have full capacity. + for range 10 { + require.True(t, store.charge(cfg, "orders", bucketActionReceive, 1, 1).allowed) + } +} + +// TestBucketStore_QueuesHaveSeparateBuckets pins per-queue isolation: +// a noisy queue does not consume another queue's budget. +func TestBucketStore_QueuesHaveSeparateBuckets(t *testing.T) { + t.Parallel() + cfg := &sqsQueueThrottle{SendCapacity: 10, SendRefillPerSecond: 1} + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + for range 10 { + store.charge(cfg, "orders", bucketActionSend, 1, 1) + } + require.False(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + // Other queue, same cfg → fresh bucket. + for range 10 { + require.True(t, store.charge(cfg, "events", bucketActionSend, 1, 1).allowed) + } +} + +// TestBucketStore_DefaultBucketCovers covers the §3.2 "Default*" +// fallback: a verb that doesn't match Send or Recv falls through to +// Default, allowing operators to set one cap that covers everything. +func TestBucketStore_DefaultBucketCovers(t *testing.T) { + t.Parallel() + cfg := &sqsQueueThrottle{ + DefaultCapacity: 5, DefaultRefillPerSecond: 1, + } + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + for range 5 { + require.True(t, store.charge(cfg, "orders", bucketActionAny, 1, 1).allowed) + } + require.False(t, store.charge(cfg, "orders", bucketActionAny, 1, 1).allowed) + // And Send falls through to Default too when only Default is set. + for range 5 { + require.False(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed, + "Send falls through to Default which is empty") + } +} + +// TestBucketStore_ReconcilesBucketOnConfigChange pins the +// reconciliation contract: a cached bucket whose capacity/refillRate +// no longer match the queue's current Throttle config gets rebuilt +// on the next charge() call. Without this, a node that loses leadership +// during a SetQueueAttributes commit and regains it later would keep +// enforcing the prior leader-term's limits — the SetQueueAttributes +// invalidation only runs on the leader that processed the commit, +// so a different leader's stale buckets survive. +func TestBucketStore_ReconcilesBucketOnConfigChange(t *testing.T) { + t.Parallel() + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + cfgOld := &sqsQueueThrottle{SendCapacity: 10, SendRefillPerSecond: 1} + // Drain the old bucket entirely. + for range 10 { + require.True(t, store.charge(cfgOld, "orders", bucketActionSend, 1, 1).allowed) + } + require.False(t, store.charge(cfgOld, "orders", bucketActionSend, 1, 1).allowed, + "sanity: old config bucket exhausted") + // Now charge with a NEW config — capacity 100, refill 50. The + // bucket reconciliation must spot the cap/refill mismatch and + // rebuild a fresh bucket at the new full capacity. + cfgNew := &sqsQueueThrottle{SendCapacity: 100, SendRefillPerSecond: 50} + for range 100 { + require.True(t, store.charge(cfgNew, "orders", bucketActionSend, 1, 1).allowed, + "new config charge must succeed against a fresh bucket; stale-bucket bug would reject") + } + // 101st must reject under the new cap. + require.False(t, store.charge(cfgNew, "orders", bucketActionSend, 1, 1).allowed) +} + +// TestBucketStore_ConcurrentReconciliationRespectsNewCapacity pins +// the CompareAndDelete fix: two concurrent +// goroutines hitting a stale bucket must not race each other into +// double-replacing the map entry. Without CompareAndDelete the +// second goroutine's unconditional Delete would evict the first +// goroutine's fresh bucket, leaving the second's fresh bucket +// behind — but the first's bucket is already being charged, so +// total charges across the mismatch window can exceed the new +// capacity. +// +// Race the test by having N goroutines each invoke charge() with +// the new config (post-mismatch) on the same (queue, action). The +// first one through builds the fresh bucket; every later one must +// observe the same fresh bucket and share its capacity. After all +// goroutines finish, total successful charges must equal exactly +// the new capacity — anything more means a Delete-after-replace +// orphaned a fresh bucket. +func TestBucketStore_ConcurrentReconciliationRespectsNewCapacity(t *testing.T) { + t.Parallel() + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + + // Seed the store with a stale bucket from cfgOld. + cfgOld := &sqsQueueThrottle{SendCapacity: 5, SendRefillPerSecond: 1} + for range 5 { + require.True(t, store.charge(cfgOld, "orders", bucketActionSend, 1, 1).allowed) + } + + // Now race many goroutines through the new config. Each charge + // triggers reconciliation against cfgNew. The race window is + // between Load detecting the stale bucket and CompareAndDelete + + // LoadOrStore committing the replacement; without + // CompareAndDelete, two racers can each Delete + LoadOrStore and + // the loser's fresh bucket may end up orphaned while still being + // charged through a leaked pointer. + cfgNew := &sqsQueueThrottle{SendCapacity: 50, SendRefillPerSecond: 1} + const goroutines = 200 + var ( + wg sync.WaitGroup + successes int64 + mu sync.Mutex + ) + for range goroutines { + wg.Add(1) + go func() { + defer wg.Done() + if store.charge(cfgNew, "orders", bucketActionSend, 1, 1).allowed { + mu.Lock() + successes++ + mu.Unlock() + } + }() + } + wg.Wait() + require.EqualValues(t, 50, successes, + "exactly cfgNew.SendCapacity successes; a Delete-after-replace race would let some past the cap") +} + +// TestBucketStore_SweepRaceDoesNotInflateBudget pins the sweep +// race fix. The earlier code path was: +// +// sweep computes idle under mu, releases mu, then Deletes. +// A concurrent charge() that loaded the same bucket pre-Delete +// would refill+take after sweep released mu, then later requests +// would miss the map and create a fresh full-capacity bucket — +// a one-time burst of up to 2× capacity per evict cycle. +// +// Two complementary guards close it: holding mu across the Delete +// closes half the window, and setting evicted=true under mu closes +// the rest — goroutines that loaded the bucket *before* sweep +// removed it from the map see the flag on their mu acquisition and +// retry against the live entry instead of charging the orphan. +// +// The test is a -race stress test: race sweep against many chargers +// hammering the same bucket. The integrity assertion is on the total +// successful-charge count: with a fully-refilled idle bucket of +// capacity=N entering the race, the maximum tokens any sequence of +// charges should observe is N. The old buggy code could yield up to +// 2N when the race triggered the orphan path. +func TestBucketStore_SweepRaceDoesNotInflateBudget(t *testing.T) { + t.Parallel() + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + clk := now + store := newBucketStore(func() time.Time { return clk }, time.Hour) + const capacity = 10 + cfg := &sqsQueueThrottle{SendCapacity: capacity, SendRefillPerSecond: 1} + // Build the bucket via a single charge so it lands in the store, + // then backdate it past the evict cutoff. The clock is then frozen + // so refill cannot top up tokens during the race — every charge + // either spends an existing token or fails, making the total- + // success count a tight bound on the burst budget. + require.True(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + key := bucketKey{queue: "orders", action: bucketActionSend, incarnation: 1} + v, ok := store.buckets.Load(key) + require.True(t, ok) + bucket, _ := v.(*tokenBucket) + bucket.mu.Lock() + bucket.lastRefill = now.Add(-2 * time.Hour) + bucket.tokens = capacity + bucket.mu.Unlock() + clk = now.Add(2 * time.Hour) + + var wg sync.WaitGroup + var successes atomic.Int64 + const chargers = 64 + const sweeps = 4 + wg.Add(chargers + sweeps) + for range sweeps { + go func() { + defer wg.Done() + store.sweep() + }() + } + for range chargers { + go func() { + defer wg.Done() + if store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed { + successes.Add(1) + } + }() + } + wg.Wait() + + // Old code: a charger that loaded the pre-Delete bucket could take + // a token, then later chargers would create a fresh full-capacity + // bucket — total successes could climb to 2*capacity. With the + // evicted-flag retry, every charger converges on a single live + // bucket and total successes are bounded by capacity. + require.LessOrEqualf(t, successes.Load(), int64(capacity), + "sweep race must not let total successful charges exceed capacity (got %d, capacity %d)", + successes.Load(), capacity) +} + +// TestBucketStore_OrphanedBucketRetriesToLiveEntry exercises the +// evicted-flag retry path in chargeBucket directly. The race the +// stress test above tries to trigger probabilistically is forced +// here deterministically by interleaving the charge / sweep steps +// by hand. +func TestBucketStore_OrphanedBucketRetriesToLiveEntry(t *testing.T) { + t.Parallel() + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + clk := now + store := newBucketStore(func() time.Time { return clk }, time.Hour) + cfg := &sqsQueueThrottle{SendCapacity: 5, SendRefillPerSecond: 1} + require.True(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + key := bucketKey{queue: "orders", action: bucketActionSend, incarnation: 1} + v, ok := store.buckets.Load(key) + require.True(t, ok) + original, _ := v.(*tokenBucket) + // Backdate the bucket and advance the clock so sweep evicts it. + original.mu.Lock() + original.lastRefill = now.Add(-2 * time.Hour) + original.mu.Unlock() + clk = now.Add(2 * time.Hour) + + store.sweep() + + // Sweep must have evicted the bucket from the map and marked it. + _, stillThere := store.buckets.Load(key) + require.False(t, stillThere, "sweep must remove the idle bucket from the map") + original.mu.Lock() + require.True(t, original.evicted, "sweep must mark the dropped bucket evicted") + original.mu.Unlock() + + // A charge against the live store reaches a fresh bucket via the + // loadOrInit path; any goroutine still holding the orphan would + // retry through chargeBucket's evicted check and converge here. + out := store.charge(cfg, "orders", bucketActionSend, 1, 1) + require.True(t, out.allowed) + v2, ok := store.buckets.Load(key) + require.True(t, ok) + live, _ := v2.(*tokenBucket) + require.NotSame(t, original, live, "post-eviction charge must allocate a fresh bucket") +} + +// TestBucketStore_LoadOrInitReconciliationMarksOrphanEvictedUnderMu +// pins the lock-then-delete-then-flag ordering. The earlier +// loadOrInit reconciliation path used: +// +// bucket.mu.Lock(); matches := ...; bucket.mu.Unlock() +// if !matches { CompareAndDelete; bucket.mu.Lock(); evicted=true; Unlock } +// +// which left an unlocked window between CompareAndDelete success +// and the re-lock. A concurrent charger that already held the +// stale bucket pointer could acquire bucket.mu in that window, +// observe evicted=false, and spend tokens against the orphaned +// bucket. The fix holds bucket.mu across the matches check, the +// CompareAndDelete, and the evicted=true assignment — matching +// the lock-then-delete-then-flag ordering sweep / invalidateQueue +// already use. This deterministic test replicates the post- +// reconciliation orphan check by forcing a config mismatch and +// verifying the orphan bucket is evicted=true after the +// reconciler's call returns. +func TestBucketStore_LoadOrInitReconciliationMarksOrphanEvictedUnderMu(t *testing.T) { + t.Parallel() + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + // Seed with a bucket built from cfgOld. + cfgOld := &sqsQueueThrottle{SendCapacity: 5, SendRefillPerSecond: 1} + require.True(t, store.charge(cfgOld, "orders", bucketActionSend, 1, 1).allowed) + key := bucketKey{queue: "orders", action: bucketActionSend, incarnation: 1} + v, ok := store.buckets.Load(key) + require.True(t, ok) + original, _ := v.(*tokenBucket) + + // Trigger reconciliation with a different config — loadOrInit + // must spot the cap/refill mismatch, evict the original bucket, + // and mark it evicted=true under the same mu acquisition. + cfgNew := &sqsQueueThrottle{SendCapacity: 50, SendRefillPerSecond: 50} + require.True(t, store.charge(cfgNew, "orders", bucketActionSend, 1, 1).allowed) + + // The map should now hold a fresh bucket (different pointer). + v2, ok := store.buckets.Load(key) + require.True(t, ok) + live, _ := v2.(*tokenBucket) + require.NotSame(t, original, live, + "reconciliation must replace the stale bucket with a fresh one") + + // The original (orphan) bucket must be evicted=true. With the + // pre-fix unlock-between-CompareAndDelete-and-flag ordering, a + // concurrent charger could observe evicted=false in the gap and + // spend a token before the flag flipped. + original.mu.Lock() + require.True(t, original.evicted, + "reconciliation must set evicted=true on the orphan bucket "+ + "under the same mu it held for the matches check, so a "+ + "concurrent charger blocked on mu sees the flag on entry "+ + "and retries against the live bucket") + original.mu.Unlock() +} + +// TestBucketStore_InvalidateMarksOrphanEvicted pins the +// invalidateQueue contract: dropped buckets must flip evicted=true under mu +// so a sendMessage that loaded meta pre-invalidation and is racing +// against DeleteQueue / SetQueueAttributes / CreateQueue retries +// rather than charging the orphan. +func TestBucketStore_InvalidateMarksOrphanEvicted(t *testing.T) { + t.Parallel() + store := newBucketStoreDefault() + cfg := &sqsQueueThrottle{SendCapacity: 5, SendRefillPerSecond: 1} + require.True(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + key := bucketKey{queue: "orders", action: bucketActionSend, incarnation: 1} + v, ok := store.buckets.Load(key) + require.True(t, ok) + original, _ := v.(*tokenBucket) + + store.invalidateQueue("orders") + + _, stillThere := store.buckets.Load(key) + require.False(t, stillThere, "invalidateQueue must remove the bucket from the map") + original.mu.Lock() + require.True(t, original.evicted, "invalidateQueue must mark the dropped bucket evicted") + original.mu.Unlock() +} + +// TestBucketStore_InvalidateUnderConcurrencyIsRaceFree pins the +// invalidateQueue lock ordering. The earlier invalidateQueue used +// LoadAndDelete-then-lock, which let a concurrent charger that loaded +// the pointer pre-LoadAndDelete acquire bucket.mu first and observe +// evicted=false on a bucket that had already been removed from the +// map. The fix mirrors sweep's lock-then-CompareAndDelete-then-flag +// ordering, so any charger blocked on mu sees evicted=true the moment +// it unblocks and retries against the live entry. +// +// Bounding the *successful charge count* across an invalidate race is +// not meaningful: invalidate is supposed to reset the bucket, so the +// post-invalidate fresh bucket can absorb up to capacity additional +// tokens by design — that 2× window is structural, not a bug. What +// the fix guarantees instead is that the race is data-race-clean +// (-race detector finds nothing) and that any bucket the store +// removed is observably evicted=true under mu when the next charger +// acquires it. The deterministic +// TestBucketStore_InvalidateMarksOrphanEvicted pins that property +// directly; this stress test exists to surface any new -race finding +// the new lock ordering might introduce. +func TestBucketStore_InvalidateUnderConcurrencyIsRaceFree(t *testing.T) { + t.Parallel() + const capacity = 10 + cfg := &sqsQueueThrottle{SendCapacity: capacity, SendRefillPerSecond: 1} + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + require.True(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + + var wg sync.WaitGroup + const chargers = 64 + const invalidates = 4 + wg.Add(chargers + invalidates) + for range invalidates { + go func() { + defer wg.Done() + store.invalidateQueue("orders") + }() + } + for range chargers { + go func() { + defer wg.Done() + store.charge(cfg, "orders", bucketActionSend, 1, 1) + }() + } + wg.Wait() +} + +// TestComputeRetryAfter_CapsAtMaximum pins the overflow guard: +// a tiny refillRate (e.g. 1e-9) plus a large requested +// count would otherwise compute a multi-day Retry-After and +// time.Duration arithmetic could overflow. Capped at +// throttleRetryAfterCap so the client always sees a sane value. +func TestComputeRetryAfter_CapsAtMaximum(t *testing.T) { + t.Parallel() + got := computeRetryAfter(1, 0, 1e-9) + require.Equal(t, throttleRetryAfterCap, got, + "computeRetryAfter must cap at throttleRetryAfterCap regardless of input") +} + +// TestThrottleAttributesPresent covers the request-gate helper used +// by setQueueAttributes to skip cache invalidation on unrelated +// updates. +func TestThrottleAttributesPresent(t *testing.T) { + t.Parallel() + require.False(t, throttleAttributesPresent(map[string]string{})) + require.False(t, throttleAttributesPresent(map[string]string{"VisibilityTimeout": "30"})) + require.True(t, throttleAttributesPresent(map[string]string{"ThrottleSendCapacity": "10"})) + require.True(t, throttleAttributesPresent(map[string]string{"ThrottleRecvRefillPerSecond": "5"})) + require.True(t, throttleAttributesPresent(map[string]string{"ThrottleDefaultCapacity": "5"})) +} + +// TestBucketStore_IncarnationKeyedDoesNotReuseAcrossIncarnations +// pins the incarnation-keyed contract: bucketKey includes incarnation +// so a DeleteQueue+CreateQueue cycle (or a leadership move to a node holding +// a stale per-process cache) lands the new incarnation under a different +// map entry and starts from a fresh full bucket. Without incarnation in +// the key, the recreated queue would inherit the drained token state +// from the previous incarnation. +// +// charge signature is (cfg, queue, action, incarnation, count). +func TestBucketStore_IncarnationKeyedDoesNotReuseAcrossIncarnations(t *testing.T) { + t.Parallel() + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + cfg := &sqsQueueThrottle{SendCapacity: 5, SendRefillPerSecond: 1} + // Drain incarnation 1 entirely (5 successful charges of 1 token). + for range 5 { + require.True(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + } + require.False(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed, + "sanity: incarnation=1 bucket drained") + // Incarnation 2 (recreate) must start from a fresh full bucket + // regardless of what incarnation=1 left in cache. Stale-key code + // would keep rejecting on the same drained bucket. + for range 5 { + require.True(t, store.charge(cfg, "orders", bucketActionSend, 2, 1).allowed, + "incarnation=2 charge must succeed against a fresh bucket; "+ + "shared-key bug would reuse incarnation=1's drained state") + } + require.False(t, store.charge(cfg, "orders", bucketActionSend, 2, 1).allowed, + "incarnation=2 bucket must enforce its own capacity once drained") + // incarnation=1's bucket should still be drained — the two + // incarnations must not cross-pollinate. + require.False(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed, + "incarnation=1 must remain drained; incarnation=2 charges must not refill incarnation=1") +} + +// TestBucketStore_InvalidateQueueClearsAllIncarnations pins that +// invalidateQueue ranges by queue prefix and removes every cached +// incarnation, not just one. Required because the leader's view of +// "current incarnation" can drift from a follower's during a +// failover-mid-CreateQueue, and an invalidate that only hit one +// incarnation would leave the other side stale. +func TestBucketStore_InvalidateQueueClearsAllIncarnations(t *testing.T) { + t.Parallel() + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + cfg := &sqsQueueThrottle{SendCapacity: 5, SendRefillPerSecond: 1} + // Populate orders@incarnation=1, orders@incarnation=2, events@incarnation=1. + require.True(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + require.True(t, store.charge(cfg, "orders", bucketActionSend, 2, 1).allowed) + require.True(t, store.charge(cfg, "events", bucketActionSend, 1, 1).allowed) + require.Equal(t, 3, countBuckets(store)) + + store.invalidateQueue("orders") + + // Both orders incarnations gone, events untouched. + require.Equal(t, 1, countBuckets(store), + "invalidateQueue must drop every incarnation belonging to the queue") + _, hasOrdersInc1 := store.buckets.Load(bucketKey{queue: "orders", action: bucketActionSend, incarnation: 1}) + _, hasOrdersInc2 := store.buckets.Load(bucketKey{queue: "orders", action: bucketActionSend, incarnation: 2}) + _, hasEvents := store.buckets.Load(bucketKey{queue: "events", action: bucketActionSend, incarnation: 1}) + require.False(t, hasOrdersInc1) + require.False(t, hasOrdersInc2) + require.True(t, hasEvents, "unrelated queue must not be evicted") +} + +// TestBucketStore_PurgeQueueDoesNotResetBucket pins the +// PurgeQueue-bypass guard: PurgeQueue bumps sqsQueueMeta.Generation +// but preserves Incarnation, and the throttle bucket keys by Incarnation. +// Earlier code keyed by Generation, which let a caller bypass the +// rate limit by repeatedly purging — every purge re-keyed the bucket +// and the next charge minted a fresh full-capacity replacement. The +// 60s AWS-spec PurgeQueue rate-limit bounded the bypass but it was +// real. This test simulates the purge by keeping the same +// incarnation across two charges with different generation values +// (which the throttle layer doesn't see, but would be the upstream +// signal — the API contract is "incarnation only changes on +// CreateQueue"). +func TestBucketStore_PurgeQueueDoesNotResetBucket(t *testing.T) { + t.Parallel() + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + cfg := &sqsQueueThrottle{SendCapacity: 5, SendRefillPerSecond: 1} + const incarnation uint64 = 1 + for range 5 { + require.True(t, store.charge(cfg, "orders", bucketActionSend, incarnation, 1).allowed) + } + // Bucket fully drained. + require.False(t, store.charge(cfg, "orders", bucketActionSend, incarnation, 1).allowed) + // "Purge" — in the real wire path, sqsQueueMeta.Generation bumps + // here but sqsQueueMeta.Incarnation does not. The throttle layer + // only sees incarnation, so the very next charge must still find + // the same drained bucket — not a fresh one. + require.False(t, store.charge(cfg, "orders", bucketActionSend, incarnation, 1).allowed, + "PurgeQueue (which preserves Incarnation) must not reset the throttle bucket; "+ + "a Generation-keyed bucket would be re-keyed and let a caller "+ + "bypass the rate limit by repeatedly purging") +} + +func countBuckets(b *bucketStore) int { + n := 0 + b.buckets.Range(func(_, _ any) bool { + n++ + return true + }) + return n +} + +func TestBucketStore_InvalidateQueueDropsAllActions(t *testing.T) { + t.Parallel() + cfg := &sqsQueueThrottle{ + SendCapacity: 10, SendRefillPerSecond: 1, + RecvCapacity: 10, RecvRefillPerSecond: 1, + } + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + // Drain both buckets. + for range 10 { + store.charge(cfg, "orders", bucketActionSend, 1, 1) + store.charge(cfg, "orders", bucketActionReceive, 1, 1) + } + require.False(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + require.False(t, store.charge(cfg, "orders", bucketActionReceive, 1, 1).allowed) + // Invalidate. + store.invalidateQueue("orders") + // Both buckets must now be at full capacity again. + for range 10 { + require.True(t, store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed) + require.True(t, store.charge(cfg, "orders", bucketActionReceive, 1, 1).allowed) + } +} + +// TestBucketStore_ConcurrentChargesPreserveCount pins the concurrency +// contract under -race: 100 goroutines race for tokens against a +// capacity-50 bucket. Exactly 50 must succeed; the other 50 must be +// rejected. Anything else (101 successes, partial-credit consumption +// during reject) means the per-bucket mutex is broken. +func TestBucketStore_ConcurrentChargesPreserveCount(t *testing.T) { + t.Parallel() + cfg := &sqsQueueThrottle{SendCapacity: 50, SendRefillPerSecond: 1} + now := time.Date(2026, 4, 27, 10, 0, 0, 0, time.UTC) + store := newBucketStore(func() time.Time { return now }, time.Hour) + var ( + wg sync.WaitGroup + successes int64 + mu sync.Mutex + ) + for range 100 { + wg.Add(1) + go func() { + defer wg.Done() + if store.charge(cfg, "orders", bucketActionSend, 1, 1).allowed { + mu.Lock() + successes++ + mu.Unlock() + } + }() + } + wg.Wait() + require.EqualValues(t, 50, successes, + "exactly capacity successes; broken mutex would let some race past or double-charge") +} + +// --- Validator tests --- + +// TestValidateThrottleConfig_NilOrEmpty is the no-op: a meta with no +// Throttle, or with the zero-valued struct, validates clean and gets +// canonicalised so downstream code only has to handle the nil case. +func TestValidateThrottleConfig_NilOrEmpty(t *testing.T) { + t.Parallel() + m := &sqsQueueMeta{} + require.NoError(t, validateThrottleConfig(m)) + require.Nil(t, m.Throttle) + m.Throttle = &sqsQueueThrottle{} + require.NoError(t, validateThrottleConfig(m)) + require.Nil(t, m.Throttle, "all-zero post-validate must canonicalise to nil") +} + +// TestValidateThrottleConfig_BothZeroOrBothPositive pins the §3.2 +// pair-wise rule: an action's capacity and refill must agree on +// whether the action is enabled. +func TestValidateThrottleConfig_BothZeroOrBothPositive(t *testing.T) { + t.Parallel() + cases := []struct { + name string + cfg sqsQueueThrottle + wantErr bool + }{ + {"send capacity without refill", sqsQueueThrottle{SendCapacity: 10}, true}, + {"send refill without capacity", sqsQueueThrottle{SendRefillPerSecond: 1}, true}, + {"recv capacity without refill", sqsQueueThrottle{RecvCapacity: 10}, true}, + {"recv refill without capacity", sqsQueueThrottle{RecvRefillPerSecond: 1}, true}, + {"both positive ok", sqsQueueThrottle{SendCapacity: 10, SendRefillPerSecond: 1}, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + cfg := tc.cfg + err := validateThrottleConfig(&sqsQueueMeta{Throttle: &cfg}) + if tc.wantErr { + require.Error(t, err) + } else { + require.NoError(t, err) + } + }) + } +} + +// TestValidateThrottleConfig_CapacityGEMaxBatchCharge pins the §3.2 +// floor for batch-covered actions: SendMessageBatch and +// DeleteMessageBatch each charge up to 10 tokens, so a capacity below +// 10 makes every full batch permanently unserviceable. +func TestValidateThrottleConfig_CapacityGEMaxBatchCharge(t *testing.T) { + t.Parallel() + err := validateThrottleConfig(&sqsQueueMeta{ + Throttle: &sqsQueueThrottle{SendCapacity: 5, SendRefillPerSecond: 1}, + }) + require.Error(t, err) + err = validateThrottleConfig(&sqsQueueMeta{ + Throttle: &sqsQueueThrottle{RecvCapacity: 9, RecvRefillPerSecond: 1}, + }) + require.Error(t, err) + err = validateThrottleConfig(&sqsQueueMeta{ + Throttle: &sqsQueueThrottle{SendCapacity: 10, SendRefillPerSecond: 1}, + }) + require.NoError(t, err) +} + +// TestValidateThrottleConfig_DefaultBucketBatchFloor pins the +// Default* batch floor: Default* gets the same batch-capacity ≥ 10 +// floor as Send/Recv because resolveActionConfig +// falls Send/Recv traffic through to Default when the dedicated +// pair is unset. Without the floor a Default-only config of +// {capacity=5, refill=1} would accept SendMessageBatch entries=10 +// requests at the validator and reject them forever at the bucket. +func TestValidateThrottleConfig_DefaultBucketBatchFloor(t *testing.T) { + t.Parallel() + // Capacity 1 (below the batch floor) must reject. + err := validateThrottleConfig(&sqsQueueMeta{ + Throttle: &sqsQueueThrottle{DefaultCapacity: 1, DefaultRefillPerSecond: 1}, + }) + require.Error(t, err) + // Capacity below batch floor at 5 must also reject. + err = validateThrottleConfig(&sqsQueueMeta{ + Throttle: &sqsQueueThrottle{DefaultCapacity: 5, DefaultRefillPerSecond: 1}, + }) + require.Error(t, err) + // Capacity exactly at the batch floor is accepted. + err = validateThrottleConfig(&sqsQueueMeta{ + Throttle: &sqsQueueThrottle{DefaultCapacity: 10, DefaultRefillPerSecond: 1}, + }) + require.NoError(t, err) +} + +// TestValidateThrottleConfig_CapacityGERefill pins the §3.2 burst +// rule: capacity below refill makes the bucket unable to accumulate +// any burst headroom — the capacity floor is the refill rate. +func TestValidateThrottleConfig_CapacityGERefill(t *testing.T) { + t.Parallel() + err := validateThrottleConfig(&sqsQueueMeta{ + Throttle: &sqsQueueThrottle{SendCapacity: 10, SendRefillPerSecond: 50}, + }) + require.Error(t, err) +} + +// TestParseThrottleFloat_RejectsBadInputs covers the per-field +// parser: NaN, infinity, negative values, malformed strings, and the +// hard ceiling all reject with InvalidAttributeValue. +func TestParseThrottleFloat_RejectsBadInputs(t *testing.T) { + t.Parallel() + bad := []string{ + "", + "not a number", + "NaN", + "Inf", + "-1", + "-0.5", + "1e100", // > hard ceiling + "100000.01", // > hard ceiling by epsilon + } + for _, in := range bad { + t.Run(in, func(t *testing.T) { + t.Parallel() + _, err := parseThrottleFloat(in) + require.Error(t, err, "input %q must be rejected", in) + }) + } + // Boundary: hard ceiling exactly is accepted. + v, err := parseThrottleFloat("100000") + require.NoError(t, err) + require.Equal(t, 100000.0, v) +} + +// TestComputeRetryAfter_FloorsAtOneSecond pins the §3.4 minimum-1 +// floor: HTTP/1.1 §10.2.3 specifies integer-second granularity, so +// even a sub-second wait is rounded up to 1. +func TestComputeRetryAfter_FloorsAtOneSecond(t *testing.T) { + t.Parallel() + // needed=0.5, refill=10 → ceil(0.05) = 1 + require.Equal(t, time.Second, computeRetryAfter(1, 0.5, 10)) + // needed=1, refill=100 → ceil(0.01) = 1 + require.Equal(t, time.Second, computeRetryAfter(1, 0, 100)) +} diff --git a/docs/design/2026_04_26_proposed_sqs_per_queue_throttling.md b/docs/design/2026_04_26_proposed_sqs_per_queue_throttling.md new file mode 100644 index 000000000..91e66be67 --- /dev/null +++ b/docs/design/2026_04_26_proposed_sqs_per_queue_throttling.md @@ -0,0 +1,338 @@ +# Per-Queue Throttling and Tenant Fairness for the SQS Adapter + +**Status:** Proposed +**Author:** bootjp +**Date:** 2026-04-26 + +--- + +## 1. Background and Motivation + +elastickv's SQS adapter currently has **no per-queue rate limiting**. A single tenant's runaway producer can: + +1. Saturate the leader's Raft proposal pipeline (one OCC dispatch per `SendMessage`), pushing latency on every other queue's writes through the same shard. +2. Exhaust the receive-path's visibility-index scan budget (`sqsVisScanPageLimit = 1024`), causing other tenants' `ReceiveMessage` calls to time out empty. +3. Fill the message keyspace fast enough that the retention reaper cannot keep up — the keyspace grows unbounded until the next manual purge. + +Phase 3.C in [`docs/design/2026_04_24_partial_sqs_compatible_adapter.md`](2026_04_24_partial_sqs_compatible_adapter.md) §16.5 marks this as TODO. AWS itself enforces per-account / per-API limits ("standard request throttle of 3000 RPS per region per AWS account" plus per-API limits like 300 TPS for batch APIs); operators running elastickv as a multi-tenant SQS facade need an equivalent control plane. Without it, the only knobs are (a) shard-level capacity (too coarse — adding a shard requires a Raft membership change) and (b) external load-balancer rate limiting (no visibility into per-queue cost). + +This document proposes per-queue token-bucket throttling, configured per-queue in queue meta, evaluated at the SQS-adapter layer on the leader, and surfaced as the same `Throttling.Sender` error AWS uses (so existing SDK retry/backoff logic engages naturally). + +--- + +## 2. Goals and Non-Goals + +### 2.1 Goals + +1. **Per-queue rate limits** that an operator can set via `SetQueueAttributes` and read back via `GetQueueAttributes`. Limits are persisted on the queue meta record (one Raft commit, no separate keyspace). +2. **Per-action granularity** — `SendMessage` and `ReceiveMessage` have independent buckets so a slow consumer cannot pin the producer or vice versa. Batch verbs charge by entry count, not by call count. +3. **AWS-shape errors**: throttled requests return HTTP `400` with the `Throttling` error code in whichever envelope the request protocol uses (`{"__type":"Throttling", ...}` for the JSON path, `Throttling` for the query/XML path — see §3.4 for the exact wire shape per protocol) and a `Retry-After` header. SDKs already special-case this code with exponential backoff; we do not invent a new code. +4. **Default-off**. Queues created before this feature, and queues created without explicit limits, are not throttled. Operators opt in per queue. +5. **No coordination per request**. Token replenishment is local to whichever node owns the bucket (the leader for the queue's shard); there is no Raft round-trip on the throttling check. +6. **Observable**: per-queue throttle counters are exposed via the existing Prometheus registry so dashboards can spot throttling before users do. + +### 2.2 Non-Goals + +1. **Cross-shard tenant fairness**. A noisy queue on shard A still affects other queues on shard A; the cross-shard story is the future "queue-level scheduling" RFC, not this one. +2. **Per-message-attribute limits** ("max 1MB/s of binary attributes"). Useful, but a bytes-budget on top of the count-budget is a follow-up. +3. **Cluster-wide limits** ("the entire elastickv deployment caps at 50,000 RPS"). Out of scope; operators put a global limit at the load balancer. +4. **Throttling on the read-only fast path** (`ListQueues`, `GetQueueUrl`, `GetQueueAttributes`). These are point reads or scans of the catalog; they do not touch user messages and have their own per-listener resource caps. +5. **Distributed token buckets** (Redis-backed, gossip-replicated, etc.). Per-leader buckets are sufficient for the queue-per-shard model; cross-shard distribution is a Phase 4 problem. + +--- + +## 3. High-Level Design + +```text + ┌────────────────────────────┐ + │ SQS HTTP listener │ + ┌──────────┐ │ (leader-resolved already) │ + │ producer │──▶│ │ + └──────────┘ │ authorize → pickProtocol │ + │ │ │ + │ ▼ │ + │ per-queue bucket lookup │ + │ │ │ + │ ┌────────┴─────────┐ │ + │ ▼ ▼ │ + │ allow throttle + │ │ │ │ + │ ▼ ▼ │ + │ handler 400 Throttling + └────────────────────────────┘ +``` + +The check sits **between SigV4 authorisation and the existing handler dispatch**. By the time we reach this point we already know: + +- the queue name (parsed from the request body's `QueueUrl` / `QueueName`); +- this node is the verified leader for the queue's shard (`isVerifiedSQSLeader`); any non-leader has been forwarded by `proxyToLeader` and re-evaluates the limit on landing; +- the action (`X-Amz-Target` for JSON, `Action` form parameter for query). + +### 3.1 Where the bucket lives + +A `bucketStore` instance hangs off `*SQSServer`. Internally: + +```go +// adapter/sqs_throttle.go (new in implementation PR) +type bucketStore struct { + // sync.Map rather than a single mu+map so the hot SendMessage / + // ReceiveMessage path does not contend on a process-wide lock. + // sync.Map's read-mostly optimisation matches the access pattern: + // bucket lookup is overwhelmingly read (the bucket already exists), + // and the rare insert-on-first-use happens once per (queue, action) + // pair. Each bucket's own mutation (charge / refill) is guarded by + // a per-bucket sync.Mutex inside *tokenBucket, scoped to one queue, + // so cross-queue traffic never serialises on the same lock. + // (Gemini medium on PR #664 flagged a single-mutex bucket store as + // a hot-path contention point; this design avoids that.) + buckets sync.Map // map[bucketKey]*tokenBucket + clock func() time.Time +} + +type bucketKey struct { + queue string + action string // "Send" | "Receive" | "*" + incarnation uint64 // sqsQueueMeta.Incarnation; isolates create-cycles. +} + +type tokenBucket struct { + mu sync.Mutex // per-bucket; never held across the bucketStore + capacity float64 // burst size + refillRate float64 // tokens per second + tokens float64 // current credit + lastRefill time.Time +} +``` + +The `charge` operation: + +1. `bucketStore.buckets.Load(key)` (lock-free read). +2. On miss, build the bucket from queue meta and `LoadOrStore` it (one-shot insert race tolerated — both racers will agree on the same configuration). +3. Acquire the bucket's own `mu`, refill based on elapsed time, take or reject the requested tokens, release `mu`. + +No global lock is held during step 3; concurrent traffic on different queues runs in parallel. + +**Cache invalidation on `SetQueueAttributes`**: when an operator updates the throttle config via `SetQueueAttributes`, the handler — *after* the Raft commit that persists the new `sqsQueueThrottle` — calls `buckets.invalidateQueue(name)` (the same path described in the `DeleteQueue` / `CreateQueue` paragraph below). `invalidateQueue` ranges the map and drops every entry whose `queue` matches under a lock-then-`CompareAndDelete`-then-`evicted=true` ordering; a raw per-key `buckets.Delete(key)` would reintroduce the orphan-bucket race that ordering closes (a charger holding the old pointer pre-Delete acquires the bucket's mu after the map entry is gone, sees `evicted=false`, spends a token, then later requests mint a fresh full-capacity bucket — a transient double-allotment window). Without this step at all, the in-memory bucket would keep enforcing the old limits until the idle-eviction sweep removes the stale entry (default 1 h window), defeating the operator's intent to throttle a noisy tenant in real time. The handler also gates the invalidation on a real value change — a same-value `SetQueueAttributes` does not reset the bucket — so a caller cannot bypass the rate limit by re-submitting their own current config. Claude P1 on PR #664 caught the gap; round 9 / round 12 refined the race-free semantics and the no-op gate. + +**Cache invalidation on `DeleteQueue` / `CreateQueue`**: when a queue is deleted, the handler — *after* the Raft commit that purges the queue meta — calls `buckets.invalidateQueue(name)`, which ranges the map and drops every `bucketKey` whose `queue` matches (regardless of incarnation), mirroring the `SetQueueAttributes` path above. The `CreateQueue` handler invokes the same call after a genuine create commit (the idempotent-return path skips it) so a same-name recreate that races with in-flight stale-meta traffic still resets the bucket. **Incarnation** participates in the key — `sqsQueueMeta.Incarnation` is set to `lastGen + 1` at `CreateQueue` time and is *preserved* across `PurgeQueue` and `SetQueueAttributes` (the read-modify-write on the meta record carries it forward). A `DeleteQueue`+`CreateQueue` cycle therefore lands the new incarnation at a different `bucketKey` and starts from a fresh full bucket regardless of any per-process cache the previous incarnation left behind on this or any other node. The two mechanisms are complementary — `invalidateQueue` is the cheap hot-path optimisation that keeps the in-memory map small, and the incarnation-keyed structure is the cross-leader correctness guarantee. + +**Why Incarnation, not Generation** (Codex P2 on PR #664 round 9): an earlier draft of this design used `sqsQueueMeta.Generation` as the bucket-key discriminator. `Generation` bumps on every `CreateQueue` *and* on every `PurgeQueue` (because message keys are prefixed with the generation, so a purge needs a new prefix to make the old data unreachable). Keying the throttle bucket by `Generation` would therefore re-key the bucket on every purge — letting any caller authorised to call `PurgeQueue` reset the rate limiter to a fresh full bucket once per purge. The 60-second AWS-spec rate limit on `PurgeQueue` bounds the bypass but does not eliminate it. `Incarnation` solves this by isolating the "queue identity changed" semantics (DeleteQueue+CreateQueue cycle) from the "data prefix changed" semantics (any of Create / Delete / Purge), and the throttle layer keys only on the former. + +The bucket map is per-process. There is no Raft replication of bucket state. The behaviour at the moment a node assumes leadership of a queue depends on which of three failover paths fires (Claude low on PR #664 round 7 caught the over-broad earlier wording): + +1. **First-time leader** (the most common case for a fresh process): no cached bucket, `loadOrInit` misses, the charge path mints a fresh bucket at full capacity. +2. **Re-elected node, throttle config changed during the prior leader's term**: `loadOrInit` finds the cached bucket but its `capacity`/`refillRate` no longer match the freshly-loaded meta, so the reconciliation path evicts the stale bucket and mints a fresh full-capacity replacement. +3. **Re-elected node, same queue incarnation, same throttle config**: `loadOrInit` finds the cached bucket and returns it as-is. The bucket keeps the token balance it had when this node last led the queue; the next charge runs the standard refill (`tokens += elapsed * refillRate`, capped at `capacity`) before deciding allow/reject. This is a correctness *feature*, not a bug — the elapsed time during which this node was a follower legitimately accrues credit, and resetting to full capacity on every re-election would let an operator trigger a burst by repeatedly bouncing leadership. + +**Why this is correct**: the worst-case behaviour across all three paths is that a noisy queue gets one extra burst worth of bandwidth right after a leader change (paths 1 and 2). Path 3 inherits the correct token-bucket semantics from the local cache. Replicating bucket state would cost a Raft commit per token decrement, which would defeat the entire point of the token bucket. AWS's own rate limiter has the same property at region failover boundaries. + +**Cross-leader incarnation isolation**: a node that previously led a queue could keep an old `(queue, action, inc=N)` bucket in cache until idle eviction. After a `DeleteQueue`+`CreateQueue` cycle that bumps the incarnation to `N+1`, leadership for the recreated queue could move back to that node. With incarnation in `bucketKey`, the new incarnation lands at `(queue, action, inc=N+1)` — a key the cache has never seen, so the charge path mints a fresh bucket. The stale `inc=N` entry is unreachable (no traffic uses it any more) and falls out via the idle-eviction sweep. Without incarnation in the key, the old leader's stale bucket would silently service the new incarnation's traffic. + +Buckets are created lazily on first request. They self-evict after a configurable idle window (default 1h) so a queue that goes silent does not keep its bucket entry forever. + +### 3.2 Configuration on queue meta + +`sqsQueueMeta` gains: + +```go +type sqsQueueMeta struct { + // ... existing fields ... + + // Throttle is the per-queue rate-limit configuration. Empty + // value disables throttling (default). Set via SetQueueAttributes + // with the AWS-style attribute names ThrottleSendCapacity / + // ThrottleSendRefillPerSecond / etc. Persisted on the meta + // record so a leader failover loads the configuration along + // with the rest of the queue. + Throttle *sqsQueueThrottle `json:"throttle,omitempty"` +} + +type sqsQueueThrottle struct { + SendCapacity float64 `json:"send_capacity,omitempty"` + SendRefillPerSecond float64 `json:"send_refill_per_second,omitempty"` + RecvCapacity float64 `json:"recv_capacity,omitempty"` + RecvRefillPerSecond float64 `json:"recv_refill_per_second,omitempty"` + DefaultCapacity float64 `json:"default_capacity,omitempty"` + DefaultRefillPerSecond float64 `json:"default_refill_per_second,omitempty"` +} +``` + +Setting `SendCapacity = 100, SendRefillPerSecond = 50` means: bursts up to 100 `SendMessage` requests, sustained 50 RPS, and any client overrun gets `Throttling`. + +`Default*` fields catch any action not covered by an action-specific pair (so a future `PurgeQueue` rate limit costs nothing once defaults are wired). + +**Config-field → bucket-action mapping** (Codex P1 on PR #664 sixth-round Codex review): the JSON config field-name prefixes use short forms (`Send*`, `Recv*`, `Default*`) but the in-memory `bucketKey.action` from §3.1 uses the canonical action vocabulary (`"Send"`, `"Receive"`, `"*"`). The mapping is fixed: `Send*` → `bucketKey{action:"Send"}`, `Recv*` → `bucketKey{action:"Receive"}`, `Default*` → `bucketKey{action:"*"}`. Cache invalidation paragraphs in §3.1 use the bucket-action vocabulary (the actual map keys). Use the config-field vocabulary when discussing the JSON contract (`SetQueueAttributes` payload, `GetQueueAttributes` response) and the bucket-action vocabulary when discussing the in-memory map. Implementation must apply this mapping when looking up buckets after a `SetQueueAttributes` commit. + +The `SetQueueAttributes` validator enforces: + +- All four `Send*` / `Recv*` fields must be either both zero (disabled) or both positive. +- Capacity ≥ refill (otherwise the bucket can never burst above the steady state). +- A hard ceiling per queue (e.g. 100,000 RPS) so a typo (`SendCapacity = 1e9`) does not silently mean "no limit at all" but rejects with `InvalidAttributeValue`. +- **Capacity ≥ max single-request charge** (Codex P1 on PR #664 sixth-round review). Per the §3.3 charging table, a `SendMessageBatch` charges up to 10 from the Send bucket and `DeleteMessageBatch` charges up to 10 from the Recv bucket (AWS caps both at 10 entries). Therefore: when `SendCapacity > 0` it must also be `≥ 10`, and when `RecvCapacity > 0` it must also be `≥ 10`. Without this rule, a queue configured with `SendCapacity = 5` enters a permanently unserviceable state for full batches — the bucket can never accumulate the 10 tokens a `SendMessageBatch(len=10)` requires, every full batch is rejected with `Throttling`, and `Retry-After` (§3.4) keeps reporting "wait N seconds" forever with no recovery path short of re-running `SetQueueAttributes`. The validator rejects with `InvalidAttributeValue` and an explicit message naming the per-bucket minimum so the operator sees the cause immediately. **`Default*` requires the same floor**: `resolveActionConfig` in `adapter/sqs_throttle.go` falls Send and Receive traffic through to the Default bucket whenever the dedicated `Send*` / `Recv*` pair is unset, so a `SendMessageBatch` or `DeleteMessageBatch` request can charge the Default bucket. A `DefaultCapacity < 10` therefore creates the same permanently-unserviceable-batch trap as `SendCapacity < 10`. (Earlier drafts of this proposal exempted `Default*` on the assumption that the catch-all set had no batch verb in scope; that was incorrect — the fall-through means batch verbs do hit Default*. The implementation passes `requireBatchCapacity = true` to `validateThrottlePair` for all three action sets — see Codex P1 on PR #679 round 5.) + +### 3.3 Charging model + +| Action | Charge | +|---|---| +| `SendMessage` | 1 from the Send bucket | +| `SendMessageBatch` | `len(Entries)` from the Send bucket (typically 1–10) | +| `ReceiveMessage` | 1 from the Recv bucket regardless of `MaxNumberOfMessages` | +| `DeleteMessage` | 1 from the Recv bucket (consumer-side action) | +| `DeleteMessageBatch` | `len(Entries)` from the Recv bucket | +| `ChangeMessageVisibility[Batch]` | same as Delete | +| Everything else (catalog ops, tag ops) | not throttled in this PR | + +Batch verbs charge **before** dispatching individual entries. If the bucket has 3 tokens and the batch carries 10 entries, the call is rejected as a whole — partial throttling within a batch is harder to reason about and AWS itself rejects the whole call. Recorded in §11 (open questions) as a possible future tweak. + +### 3.4 The `Throttling` envelope + +On rejection: + +| Protocol | Response | +|---|---| +| JSON | HTTP 400, body `{"__type":"Throttling","message":"Rate exceeded for queue '' action ''"}`, header `x-amzn-ErrorType: Throttling`, header `Retry-After: ` (computed per below) | +| Query | HTTP 400, body `SenderThrottling......`, headers as above | + +`Retry-After` is computed from the *actual* refill rate AND the *requested* token count so neither slow refill nor large batches cause a busy-loop of premature retries (two consecutive Claude reviews on PR #664 caught both: first the `Retry-After: 1` constant lying for sub-1-RPS refill — `SendRefillPerSecond = 0.1` needs 10 s for the next token; then the formula's hardcoded numerator `1.0` lying for batch verbs that charge >1 token — a `SendMessageBatch` of 10 against `refillRate = 1.0` and 0 tokens needs 10 s, not 1): + +```text +needed := float64(requestedCount) - currentTokens +secondsToNextRefill := math.Ceil(needed / refillRate) +retryAfter := max(1, int(secondsToNextRefill)) // never less than 1 +``` + +`requestedCount` is the same value the charge step uses: `1` for single-message verbs, `len(Entries)` for batch verbs (§3.3). A `SendMessageBatch` of 10 against a bucket with `refillRate = 1.0` and 0 tokens correctly returns `Retry-After: 10`; a single `SendMessage` against `refillRate = 0.1` and 0 tokens correctly returns `Retry-After: 10`; the common case (single op, fast refill) keeps the floor of 1. + +The minimum-1 floor matches `Retry-After`'s integer-second granularity (HTTP/1.1 §10.2.3). The validator (§3.2) keeps `refillRate > 0`, so the divide-by-zero guard is unnecessary in the formula above. + +--- + +## 4. Implementation Path + +### 4.1 Files touched + +| File | Change | +|---|---| +| `adapter/sqs_throttle.go` (new) | `bucketStore`, `tokenBucket`, charging helper. ~250 lines. | +| `adapter/sqs_catalog.go` | Add `Throttle` field to `sqsQueueMeta`. Extend `applyAttributes` with the new `Throttle*` attribute names. Render the four Throttle fields in `queueMetaToAttributes` so `GetQueueAttributes("All")` surfaces them. | +| `adapter/sqs.go` | After `authorizeSQSRequest`, call `bucketStore.charge(queueName, action, count)`. On reject, write the `Throttling` envelope and return. | +| `adapter/sqs_throttle_test.go` (new) | Unit tests for bucket math (edge cases: idle drift, burst, partial refill, batch over-charge, default-off). ~300 lines. | +| `adapter/sqs_throttle_integration_test.go` (new) | End-to-end: configure a queue with low limits, send N messages back-to-back, confirm the (N+1)th gets `Throttling` with `Retry-After`. ~150 lines. | +| `monitoring/registry.go` | New counter `sqs_throttled_requests_total{queue, action}` and new **gauge** `sqs_throttle_tokens_remaining{queue, action}`. (Codex P2 on PR #664: tokens go up *and* down so a counter is the wrong instrument.) | +| `docs/design/2026_04_24_partial_sqs_compatible_adapter.md` §16.5 | Status update once this lands: TODO → Landed. | + +### 4.2 OCC interaction + +Throttling sits *outside* the OCC transaction — a rejected request never touches the coordinator. This is critical: the existing OCC retry loop in `sendMessageWithRetry` would otherwise loop on a permanent rate-limit failure, burning leader CPU. Confirmed by reading `sqs_messages.go: tryPurgeQueueOnce` and friends — none of them treat `sqsAPIError` codes as retryable. + +### 4.3 Multi-shard correctness + +Each queue is owned by exactly one shard (queue-per-shard routing in `kv/shard_router.go`). The leader of that shard owns the bucket. A request that lands on a follower is forwarded by `proxyToLeader` *before* the bucket check, so the bucket is always evaluated by the leader that is also doing the OCC dispatch — no risk of a follower checking against a stale bucket and the leader committing without checking. + +Once Phase 3.D (split-queue FIFO) lands, a single queue may span multiple shards. At that point each *partition* gets its own bucket, **keyed by `(queueName, partitionID)`** — not by `MessageGroupId`. `MessageGroupId` is the *input* to `partitionFor`; using it directly as the bucket key would create one bucket per unique group value (unbounded, attacker-amplifiable map size, and hot groups would never share a budget). `partitionID` is bounded by `PartitionCount` so the worst-case bucket count per queue is tiny. The throttle proposal is forward-compatible: the bucket lookup key changes from `queueName` to `(queueName, partitionID)`, and the `bucketKey` struct in §3.1 grows a `partition uint32` field. Documented in §11. (Claude P1 on PR #664 caught the misnomer.) + +**Budget semantics per partition:** each partition's bucket gets the *full* configured `SendCapacity` / `RecvCapacity` / `DefaultCapacity`. The effective aggregate throughput of an N-partition queue is therefore N × the configured per-partition limit. This is intentional and analogous to how AWS High Throughput FIFO multiplies throughput by partition count; operators sizing the throttle should treat `SendCapacity` as the *per-partition* budget. A shared queue-level budget (divided across partitions) would require cross-shard coordination on every `SendMessage` — an extra Raft round-trip per call, defeating the point of partitioning. If per-queue aggregate throttling is needed after Phase 3.D lands, a new `SendCapacityTotal` attribute could be added that gets divided by `PartitionCount` at config time and stored as the per-partition capacity; that design is out of scope for this proposal. + +--- + +## 5. AWS-Compatibility Surface + +The throttling configuration is **non-AWS** — there is no `ThrottleSendCapacity` attribute in AWS SQS. The `Throttle*` names are exposed alongside the AWS-defined queue attributes; any principal whose SigV4 credentials grant `SetQueueAttributes` / `GetQueueAttributes` on the queue can read or modify them, the same access model the rest of the queue meta uses. Operators control who can mutate throttle config the same way they control any other queue attribute — at the credential / IAM-policy boundary, not via a separate role inside the SQS adapter. + +Strict-validation SDKs that reject unknown attribute names will reject `Throttle*` on read (or balk at the `Set*` call); operators using such SDKs either set the attribute through a non-strict client (the AWS CLI, `awscurl`, the elastickv admin RPC) or whitelist the names in their SDK config. The throttling enforcement itself runs for every authenticated principal — the cost of the bucket check is independent of which SigV4 key signed the request. + +(An earlier draft of this proposal scoped throttle config to an "admin principal" backed by an access-key-to-role mapping. The mapping does not exist at the SQS HTTP layer — only the operator-only admin RPCs (`AdminDeleteQueue`, etc.) carry an `AdminPrincipal`, and HTTP requests authenticate via the static credentials map (`WithSQSStaticCredentials`) which is access-key → secret only. Adding a role layer would be a separable change with its own design — see future work / §11.) + +--- + +## 6. Testing Strategy + +1. **Bucket math unit tests** (`adapter/sqs_throttle_test.go`): + - Fresh bucket allows up to capacity, then rejects. + - After idle T seconds, refills exactly `T * refillRate` tokens (cap at capacity). + - Batch charge of N rejects when `currentTokens < N`, no partial credit consumed. + - Concurrent `charge` calls preserve the count invariant under `-race`. + - Default-off: nil throttle config short-circuits to allow. + +2. **End-to-end** (`adapter/sqs_throttle_integration_test.go`): + - Configure a queue with `SendCapacity=10 SendRefillPerSecond=1`. (`SendCapacity` must be ≥ 10 per the §3.2 validator so the test mirrors a configuration the validator would actually accept; the earlier draft used 5 and would have been rejected at setup.) + - Send 10 messages back-to-back → all 200. + - Send 1 more immediately → 400 `Throttling` with `Retry-After: 1`. + - Sleep 2s, send → 200 (refill happened). + - Same shape for `ReceiveMessage`. + +3. **Immediate bucket invalidation on `SetQueueAttributes`** (`adapter/sqs_throttle_invalidation_test.go`): pins the §3.1 cache-invalidation contract so the `buckets.Delete` call after the Raft commit cannot be silently dropped during a refactor. + - Configure a queue with `SendCapacity=10 SendRefillPerSecond=1`. + - Send 10 messages back-to-back to exhaust the bucket. + - Send 1 more → 400 `Throttling` (sanity check that the bucket really is empty). + - Call `SetQueueAttributes` to raise `SendCapacity=20 SendRefillPerSecond=20`. + - **Immediately** (no sleep) send 1 more message → 200. Without the invalidation step the request would still be rejected for the next ≈19 seconds (or up to 1 h, until the idle-eviction sweep removes the stale entry). + - Same shape for `ReceiveMessage` (raise `RecvCapacity` after exhausting the Recv bucket). + - Same shape for `DeleteQueue` lifecycle: send 10 to exhaust → `DeleteQueue` → `CreateQueue` with the same name and `SendCapacity=10` → first `SendMessage` returns 200 (full-capacity fresh bucket), not the stale empty bucket from the previous incarnation. + +4. **Configuration round-trip**: `SetQueueAttributes` with throttle config → `GetQueueAttributes` returns the same values; an unknown `Throttle*` attribute name is rejected with 400 `InvalidAttributeName` (matching AWS behaviour for unrecognised attributes). + +5. **Cross-protocol parity**: throttled JSON and Query requests both surface `Throttling` (different envelope, same code). + +6. **Failover behaviour** (3-node cluster): kill the current leader after 3 messages, confirm the next leader starts the bucket fresh and accepts up to capacity again. Log line records the failover so operators can correlate. + +7. **Lint + race**: `go test -race ./adapter/...` must stay clean. The bucket store uses `sync.Mutex` (no atomic-only tricks); the race detector should have nothing to find. + +--- + +## 7. Operational and Configuration + +No new flags. Limits are per-queue, set via `SetQueueAttributes`. Defaults are zero (disabled). + +Two new Prometheus instruments (Section 4.1) expose the throttling activity: + +- `sqs_throttled_requests_total{queue, action}` — **counter**. Use `rate(...)` per queue in Grafana to spot the noisy tenant. +- `sqs_throttle_tokens_remaining{queue, action}` — **gauge** (Codex P2 on PR #664: token budgets go up *and* down over time, so a counter would mask the depletion that operators most need to see). Sample directly; trending toward zero is the early warning sign. + +--- + +## 8. Failure Handling + +1. **Bucket evicted while in flight**: a burst-after-eviction request creates a new bucket at capacity. Same as the failover case — at most one extra burst. Acceptable. +2. **Clock skew between leader and request**: the bucket uses the leader's local wall clock for refill, so per-leader skew is irrelevant. Cross-leader skew is bounded by §3.1's "fresh bucket on failover". +3. **Over-saturated bucket store** (millions of queues): the eviction goroutine sweeps every `bucketEvictionInterval = 1m`. Per-queue map entries are ~80 bytes; 1M queues = 80MB worst case. Operators concerned about this can lower the eviction window. + +--- + +## 9. Alternatives Considered + +### 9.1 Replicate bucket state via Raft + +Every `charge` proposes a bucket update through the FSM. **Rejected**: an extra Raft commit per `SendMessage` defeats the SQS adapter's existing throughput; AWS's own throttling does not replicate state. + +### 9.2 External rate limiter (Envoy / NGINX in front) + +**Rejected**: those layers do not see the `QueueName` (it's inside the request body), so per-queue limits are not expressible. They also do not know which node is the leader for a given queue. + +### 9.3 In-memory per-IP rate limit + +**Rejected**: producers behind NAT or a load balancer share an IP. AWS-shape per-queue limits are what operators actually want. + +### 9.4 Token bucket on the catalog layer (one big shared bucket) + +**Rejected**: defeats the multi-tenant goal. A single queue still pins everyone else. + +--- + +## 10. Rollout Plan + +| Phase | Content | +|---|---| +| 1 | Doc lands (this PR). No code yet. Operators have time to comment. | +| 2 | Implementation PR per §4.1. Default-off; existing queues unaffected. | +| 3 | Operators opt in per queue via `SetQueueAttributes`. Monitor `sqs_throttled_requests_total` for false positives. | +| 4 | Once stable, the partial doc's TODO list moves 3.C from TODO to Landed. | + +--- + +## 11. Open Questions + +1. **Partial-batch throttling**: should `SendMessageBatch` accept what fits and reject the rest, or all-or-nothing as proposed? AWS does the latter; sticking with AWS for parity is the conservative answer but a per-entry breakdown is more efficient under spiky load. +2. **Per-MessageGroupId throttling for FIFO**: a single producer hammering one MessageGroupId on a multi-partition FIFO (Phase 3.D) can still hot-spot one partition. A `(queue, group, action)` key would bound it, but adds map cardinality. Defer to the Phase 3.D design. +3. **Should the `Throttling` body name the offending action**? Right now the `` includes it. Useful for operators; SDKs ignore the message body. Trade-off is that the message is user-visible — a noisy producer's logs will spam the queue name. Probably fine; flagged here for review. +4. **Should idle-bucket eviction emit a metric**? Eviction is a normal lifecycle event but a sudden spike could indicate someone shotgun-creating queues. Probably yes; cheap to add. diff --git a/docs/design/2026_04_26_proposed_sqs_split_queue_fifo.md b/docs/design/2026_04_26_proposed_sqs_split_queue_fifo.md new file mode 100644 index 000000000..c5f7c2b61 --- /dev/null +++ b/docs/design/2026_04_26_proposed_sqs_split_queue_fifo.md @@ -0,0 +1,419 @@ +# Split-Queue FIFO for the SQS Adapter + +**Status:** Proposed +**Author:** bootjp +**Date:** 2026-04-26 + +--- + +## 1. Background and Motivation + +elastickv's SQS adapter implements FIFO queues with **single-partition** semantics: every message in a `.fifo` queue lives in exactly one Raft group, ordered globally by send time, with the group lock keyed on `(queue, generation, MessageGroupId)` so messages with the same group ID deliver in order while messages with different group IDs can deliver in parallel. + +This matches AWS's Standard FIFO contract for *modest* throughput — but AWS's **High Throughput FIFO** (HT-FIFO) feature lifts the per-FIFO-queue ceiling from 300 transactions per second (TPS) per API per region to 70,000+ TPS per region by **partitioning** the queue across multiple data planes, with ordering preserved *within* each `MessageGroupId`. AWS exposes this as the `DeduplicationScope` and `FifoThroughputLimit` queue attributes. + +The Phase 1+2 of `docs/design/2026_04_24_partial_sqs_compatible_adapter.md` deliberately deferred this — the design doc §16.6 marks it as TODO and notes "**the** large item in Phase 3" because it touches replication topology, routing, FIFO group-lock semantics, the reaper, the metrics surface, and the migration path for queues that already exist. + +This document is the proposal that unblocks the implementation. It is **not the implementation**: the work splits naturally into multiple PRs, and any one of them is too big to land without prior agreement on the partition assignment scheme, the migration story, and the rollback story. Concretely, this proposal is **gate-of-no-return** material — once a partitioned FIFO queue exists in production, the data layout cannot change without a full migration. + +--- + +## 2. Goals and Non-Goals + +### 2.1 Goals + +1. **Multiple partitions per FIFO queue**, each owned by its own Raft group, with `MessageGroupId` deterministically routed to a partition. +2. **Within-group ordering preserved**: any two messages with the same `MessageGroupId` land on the same partition and deliver in send order, exactly as today's single-partition FIFO does. +3. **Across-group parallelism**: different `MessageGroupId` values may land on different partitions and deliver concurrently. Effective throughput scales with the partition count when the producer's group ID distribution is even. +4. **Backward compatibility**: existing `.fifo` queues created before this feature stay single-partition forever (one partition is the special case of N=1). No migration runs implicitly. +5. **AWS-shape configuration**: `DeduplicationScope` (`messageGroup` | `queue`) and `FifoThroughputLimit` (`perMessageGroupId` | `perQueue`) are accepted via `CreateQueue` / `SetQueueAttributes` and rejected on Standard queues, matching AWS. +6. **Operational safety**: a partitioned FIFO queue's per-partition keyspace is reapable, observable, and survives leader failover with the same OCC discipline today's queues do. + +### 2.2 Non-Goals + +1. **Auto-rebalancing**. A queue's partition count is set at create time and never changes. AWS's HT-FIFO works the same way; resharding a live FIFO queue while preserving order is fundamentally hard and is not in this proposal. +2. **Cross-partition transactions**. Operations on a partitioned queue still touch one partition per request. There is no "atomic delete from all partitions"; admin operations like PurgeQueue iterate partitions and tolerate partial progress. +3. **Group-level throttling**. The §16.5 (per-queue throttling) proposal is forward-compatible with this design, but the per-`MessageGroupId` rate limit is its own follow-up. +4. **Pluggable partition assignment**. The hash function is fixed (FNV-1a 32 over the UTF-8 bytes of `MessageGroupId`); operators do not get a knob. +5. **Migration of existing single-partition `.fifo` queues to N partitions**. Out of scope. Operators who want HT-FIFO on an existing workload create a new queue, drain the old one, and switch producers. +6. **Standard (non-FIFO) queue partitioning**. Standard queues already parallelize across the consumer pool because they have no ordering contract; partitioning them adds complexity for no win. + +--- + +## 3. Data Model + +### 3.1 Partition identity + +Each partitioned FIFO queue has `N` partitions where `N ∈ {1, 2, 4, 8, 16, 32}`. The `N=1` case is the existing single-partition layout, unchanged. Powers of two only so the hash → partition step is a `hash & (N-1)` (cheap and consistent) and so future `N` changes via offline rebuild stay tractable. + +A partition is identified by the tuple `(queueName, partitionIndex)` where `partitionIndex ∈ [0, N)`. The key shape is **conditional on whether the queue is partitioned** (Codex P1 + Gemini high on PR #664 — naively inserting a `` segment into every key would shift offsets for `` and `` and break readback of every existing message on disk): + +- **Legacy / `PartitionCount = 0` / Standard queues** keep today's `!sqs|msg|data|||` byte-for-byte. No partition segment is written or read. Existing data on disk is unaffected; existing key constructors stay unchanged on this code path. +- **Partitioned FIFO queues (`PartitionCount > 1`)** use a *new* keyspace prefix that explicitly includes the partition: `!sqs|msg|data|p||||` (note the extra `p|` discriminator after `data|`). The discriminator is what guarantees no collision with the legacy prefix even when ` = 0` happens to match the first 8 bytes of a legacy ``. + +The `p|` discriminator is **safe by name-validator construction**, not by accident: AWS SQS queue names (and elastickv's `validateQueueName`) admit only `[A-Za-z0-9_-]` plus the optional `.fifo` suffix, so no queue name can contain `|`. The existing `!sqs|msg|data||...` segment is therefore terminated by a `|` that no queue name can produce, and the new `!sqs|msg|data|p||...` segment starts with a literal byte sequence (`p|`) that cannot appear at the same position in any legacy key (it would require a queue name of `p`, which would still be followed by a `|` from the *segment* terminator, not from the queue name itself — but the prefix routing reads the bytes as `data|p|` vs `data||`, and `` is base32-encoded so it never starts with the literal ASCII `p`). The implementation PR's name validator must continue to reject `|` in queue names; any future relaxation of that rule has to revisit this prefix scheme first. + +Concretely, the implementation PR exposes **two named constructors** rather than a variadic dispatcher (Claude review on PR #664 flagged the variadic form as a footgun: `sqsMsgDataKey(q, gen, id, p0, p1)` would silently ignore `p1` and the compiler would not catch it). The dispatch lives at the call site, where `meta.PartitionCount` is already in scope: + +```go +// Two distinct constructors, one per keyspace. +func legacyMsgDataKey(queueName string, gen uint64, messageID string) []byte +func partitionedMsgDataKey(queueName string, partition uint32, gen uint64, messageID string) []byte + +// Dispatch at the call site. No variadic, no silent argument loss. +var dataKey []byte +if meta.PartitionCount > 1 { + dataKey = partitionedMsgDataKey(queueName, partition, gen, msgID) +} else { + dataKey = legacyMsgDataKey(queueName, gen, msgID) +} +``` + +The reaper enumerates **both** prefixes when reaping a queue, so a queue that was created legacy and later (in a hypothetical future migration) gains partitions does not strand its old data — out of scope today, but the prefix choice keeps that door open. + +Scans on a partitioned queue use `!sqs|msg|data|p|||` so a worker handling partition `k` never sees keys for partition `k+1`. Scans on a legacy queue use `!sqs|msg|data||`, identical to today. + +### 3.2 Queue meta extensions + +`sqsQueueMeta` gains: + +```go +type sqsQueueMeta struct { + // ... existing fields ... + + // PartitionCount is the number of FIFO partitions for this queue. + // 1 (or 0, treated as 1) means the existing single-partition + // layout — no schema change. >1 enables HT-FIFO. Set at + // CreateQueue time and immutable thereafter (see §3.2 below for + // the enforcement rule and the reason for the create-time gate). + // Power-of-two values only (validator rejects others). + PartitionCount uint32 `json:"partition_count,omitempty"` + + // DeduplicationScope mirrors the AWS attribute. "messageGroup" + // means the dedup window is per (partition, MessageGroupId) + // pair; "queue" means it is per queue (the legacy behaviour). + // Only meaningful when PartitionCount > 1. + DeduplicationScope string `json:"deduplication_scope,omitempty"` + + // FifoThroughputLimit mirrors the AWS attribute. Defaults to + // "perMessageGroupId" when PartitionCount > 1; the alternative + // "perQueue" reduces the partition assignment to a single + // partition (effectively N=1) and is mostly useful for clients + // that want the AWS attribute set without the extra capacity. + FifoThroughputLimit string `json:"fifo_throughput_limit,omitempty"` +} +``` + +`PartitionCount`, `FifoThroughputLimit`, and `DeduplicationScope` are **all immutable from `CreateQueue` onward** (Codex P1 on PR #664 sixth-round Codex review; gate boundary refined to AWS-aligned create-time per Claude P1 on PR #664 seventh-round Claude review). The validator on `SetQueueAttributes` rejects any change to any of the three; operators who want different values create a new queue. Why all three: + +- **`PartitionCount`** — changing it would require re-hashing every existing message into a new partition, which (a) breaks ordering for in-flight messages of every group whose hash bucket changed, and (b) is a multi-second / multi-minute operation that cannot be expressed as one OCC transaction. +- **`FifoThroughputLimit`** — a flip from `perMessageGroupId` → `perQueue` activates the §3.3 short-circuit that collapses every group ID to partition 0. In-flight messages from groups previously routed to partitions 1…N-1 stay where they are; new messages for those same groups land on partition 0; consumers see the group split across partitions and within-group FIFO ordering is silently violated. The reverse flip has the symmetric problem. +- **`DeduplicationScope`** — affects how the dedup key is scoped (`(queue, dedupId)` vs. `(queue, partitionId, dedupId)` vs. `(queue, MessageGroupId, dedupId)` depending on the value). Changing it live can either resurrect duplicates the previous scope had de-duped (when narrowing scope) or suppress a legitimately new send (when widening scope and the new key collides with a still-cached entry from the prior scope). + +The pattern is the same: each attribute participates in a routing or dedup decision whose correctness depends on every existing message having been written under a single, consistent value. Live mutation creates a "before" set and an "after" set with incompatible invariants that the runtime cannot reconcile without a full drain. + +**Cross-attribute validation at `CreateQueue` and `SetQueueAttributes`** (Codex P1 on PR #664 tenth-round Codex review): the same validator that enforces immutability also rejects incoherent attribute combinations *before* the queue is created. Specifically, `{PartitionCount > 1, DeduplicationScope = "queue"}` is rejected with `InvalidParameterValue` ("queue-scoped deduplication is incompatible with multi-partition FIFO because the dedup key cannot be globally unique across partitions without a cross-partition OCC transaction"). Without this control-plane gate, an operator who mis-configures the combination at `CreateQueue` gets a successful response and only discovers the problem when every subsequent `SendMessage` fails — a created-but-unserviceable queue with no recovery short of `DeleteQueue`+`CreateQueue`. Any other invalid combinations the implementation discovers during PR 2 should land in the same validator (and the §4.x rejection paragraphs that mention them should be reframed as "cannot reach this code" notes, not runtime rejections). + +In practice this rule is only reachable at `CreateQueue` time. `SetQueueAttributes` cannot present the `{PartitionCount > 1, DeduplicationScope = "queue"}` combination because both `PartitionCount` and `DeduplicationScope` are immutable: any change to either is caught by the immutability check (described in the next paragraph) before the cross-attribute check has the chance to run. Implementing the cross-attribute rule on the `SetQueueAttributes` path would therefore be dead code — it is harmless to add for symmetry, but PR 2's author should know it is unreachable rather than spend time hunting for the test case that hits it. + +**Enforcement (gate is `CreateQueue`, not first send)**: when `SetQueueAttributes` is called, the validator loads the current `sqsQueueMeta` (one Raft-consistent point read against the catalog — already required for the OCC compare-and-set) and rejects with `InvalidAttributeValue` if any of the three attributes in the request differs from the value already on the meta. **`SetQueueAttributes` is all-or-nothing**: if any immutable attribute in the request carries a differing value, the entire request is rejected before any attribute is persisted — including the *mutable* attributes in the same call (e.g. `VisibilityTimeout` paired with an attempted `PartitionCount` change is rejected as a whole; the `VisibilityTimeout` change does not commit on its own). The §9 immutability test pins this rule. No range scan over the message keyspace is required; no `firstSendAt` timestamp needs to be added; no concurrent-send race exists, because the meta value is set once at `CreateQueue` commit and never changes thereafter. This matches AWS's published behaviour ("you can't change the queue type after you create it" extends to `FifoThroughputLimit` and `DeduplicationScope` in HT-FIFO queues), so SDK clients see the same rejection envelope on the same request as on AWS proper. Picking the create-time gate over a first-send gate is also defensible from a correctness lens: the corner case "operator creates a queue with `PartitionCount=8` and changes their mind to `PartitionCount=4` before any producer connects" is rare and can be solved by `DeleteQueue`+`CreateQueue` (which the operator can also do post-first-send for any other reason). The simplicity of a stateless validator is worth more than the vanishingly small set of operators who would benefit from a brief mutability window. + +### 3.3 Routing + +`partitionFor(meta, messageGroupId) uint32`: + +```go +// Single-partition path: cheap fast path. Three cases collapse here: +// 1. Standard queues + N=1 FIFOs (PartitionCount <= 1). +// 2. FIFOs explicitly configured with FifoThroughputLimit=perQueue, +// which §3.2 documents as "reduces routing to a single partition +// regardless of PartitionCount." A queue created with +// PartitionCount=8 + perQueue MUST land every group on partition 0; +// hashing across all 8 would directly contradict the documented +// semantics that operators selected when they picked perQueue. +// (Codex P2 + Claude on PR #664 caught this.) +if meta.PartitionCount <= 1 || meta.FifoThroughputLimit == "perQueue" { + return 0 +} +if messageGroupId == "" { + // Defensive: a FIFO send without MessageGroupId is rejected + // upstream by validateSendFIFOParams. If we somehow reach here, + // route to partition 0 so the failure is contained, not fanned + // out across every partition. + return 0 +} +// Inlined FNV-1a 32-bit (sidesteps a uint64 → uint32 narrowing the +// 64-bit variant would require for the partition mask AND, which +// gosec G115 would otherwise flag and force a //nolint comment). +const ( + fnv32Offset uint32 = 2166136261 + fnv32Prime uint32 = 16777619 +) +hash := fnv32Offset +for i := 0; i < len(messageGroupId); i++ { + hash ^= uint32(messageGroupId[i]) + hash *= fnv32Prime +} +return hash & (meta.PartitionCount - 1) +``` + +The choice of FNV-1a 32-bit is deliberate. FNV-1a is fast (no SIMD setup), has no key, and is identical across Go versions and architectures. The 32-bit variant gives ample entropy for the `PartitionCount ≤ htfifoMaxPartitions = 32` regime — only the low `log2(PartitionCount)` bits (≤ 5) survive the mask, so the upper 32 bits the 64-bit variant would have produced are wasted work that costs a `uint64 → uint32` narrowing on the routing hot path. Operators do not need this to be cryptographically strong — they need it to be deterministic and well-distributed, both of which FNV-1a satisfies. + +### 3.4 Cross-shard placement + +Partitions live in **separate Raft groups** when the queue's shard config maps each partition to a different group. The router's *dispatch algorithm* (`kv/shard_router.go`) does not change — it already routes by an arbitrary byte string today, and each `(queueName, partition)` pair becomes its own routing key with no algorithmic work. PR 4 in §11 *does* still touch `shard_router.go`, but only to wire in the `(queueName, partition)` composite key format and to load per-partition shard assignments from the new `--sqsFifoPartitionMap` flag (see §5); the byte-key dispatch core stays the way it is. The *configuration surface* for partition-to-Raft-group assignment is the new `--sqsFifoPartitionMap` flag, **not** the existing `--raftSqsMap` (which maps `raftAddr=sqsAddr` for `proxyToLeader` endpoint resolution and is unchanged by this design). Conflating the two flags would parse partition assignments as endpoint pairs and route to the wrong leader; keeping them separate is the reason §5 introduces a dedicated flag rather than overloading the existing one. + +For deployments that don't want one Raft group per partition (e.g. a small cluster with limited shard capacity), partitions can co-locate on the same group. The choice is operator-driven via `--sqsFifoPartitionMap`; it does not affect correctness, only throughput scaling. + +--- + +## 4. Request Flows + +### 4.1 SendMessage on a partitioned FIFO + +```text +1. Decode → sqsSendMessageInput (existing). +2. validateSendFIFOParams: same as today (MessageGroupId required). +3. partitionIndex := partitionFor(meta, in.MessageGroupId). +4. Resolve the leader for (queue, partitionIndex) via shard_router. + - Today's queue-per-shard router becomes (queue, partition)-per-shard. +5. Build the OCC OperationGroup with the right keyspace + constructor for this queue's PartitionCount (named constructors + per §3.1; no variadic): + if meta.PartitionCount > 1 { + dataKey = partitionedMsgDataKey(queue, partitionIndex, gen, msgID) + visKey = partitionedMsgVisKey(queue, partitionIndex, gen, ...) + groupKey = partitionedMsgGroupKey(queue, partitionIndex, gen, MessageGroupId) + } else { + dataKey = legacyMsgDataKey(queue, gen, msgID) + visKey = legacyMsgVisKey(queue, gen, ...) + groupKey = legacyMsgGroupKey(queue, gen, MessageGroupId) + } +6. Dispatch through the leader of the resolved partition (existing + leader-proxy path, unchanged). +``` + +Steps 1–2 are unchanged; step 3 is the new routing call (~10 lines); steps 4–6 are the existing send path with `partitionIndex` threaded through the key constructors. The dedup record written by step 6 keys on `(queue, partition, MessageGroupId, dedupID)` — when `DeduplicationScope = messageGroup`, this is correct by construction; the `{DeduplicationScope = queue, PartitionCount > 1}` combination cannot reach this code path because the §3.2 control-plane validator rejects it at `CreateQueue` / `SetQueueAttributes` time (see "Cross-attribute validation" in §3.2 below). Rejecting here at `SendMessage` time would mean an operator could create a queue and only discover the misconfiguration on the first send — a created-but-unserviceable state with no recovery short of `DeleteQueue`+`CreateQueue`. The control-plane gate makes the misconfiguration impossible. + +### 4.2 ReceiveMessage on a partitioned FIFO + +ReceiveMessage today scans `sqsMsgVisPrefixForQueue(queue, gen)` once. Under partitioning that becomes a scan **per partition** with **leader proxying for the partitions whose leader lives on a different node**: + +```text +1. Decode → sqsReceiveMessageInput (existing). +2. Compute partitionOrder := starting offset chosen by hashing the + request's RequestId (or random when absent) so successive calls + from the same consumer rotate which partition they hit first. + This avoids head-of-line bias toward partition 0 under load. +3. Set deadline := start + WaitTimeSeconds (capped at the AWS-defined + maximum of 20s). All sub-calls share this deadline. +4. For each partitionIndex in partitionOrder, until MaxNumberOfMessages + are collected or every partition has been tried: + a. Compute remainingWait := max(0, deadline - now()). Pass it as + WaitTimeSeconds to the sub-call (so the per-partition long-poll + is bounded by the remaining global budget). When remainingWait + reaches 0, **continue iterating remaining unvisited partitions + with WaitTimeSeconds=0 (non-blocking point-read)** rather than + breaking immediately — a consumer must not miss a message that + is already visible on an unvisited partition simply because the + long-poll budget ran out mid-loop. The loop terminates only on + the two conditions named above (MaxNumberOfMessages collected, + or every partition tried even non-blockingly). The "deadline + expires" condition therefore degrades long-poll calls to point + reads, it does not cut the fanout short. + b. Resolve the leader for (queue, partitionIndex). + c. If this node is the leader: scan locally, deliver candidates, + long-polling for at most remainingWait. + d. Otherwise: forward the request to the leader-of-partition via + the existing leader-proxy machinery (proxyToLeader, extended + to accept a partition argument so the proxy target is the + right shard, not just "the queue's leader"). The proxied call + carries an `X-Elastickv-Receive-Partition: ` header so the + downstream handler knows to skip its own partition fanout and + scan only partition k. The remainingWait value is passed as + the proxied call's WaitTimeSeconds. +5. Aggregate the per-partition results, cap at MaxNumberOfMessages. +``` + +The point is that a consumer pinned to a single endpoint **must still see messages from every partition**, even partitions whose leader is elsewhere — otherwise the SDK's "ReceiveMessage returned nothing, sleep and retry" assumption silently leaks messages forever (Codex P1 + Gemini medium on PR #664). The cost is one extra hop per non-local-leader partition; for the common deployment where partitions are co-located on one Raft group, every partition's leader is the same node and there is no fanout. For deployments that spread partitions across nodes, the proxy fanout is exactly what AWS does internally — clients see uniform behaviour regardless of topology. + +**`X-Elastickv-Receive-Partition` is an internal hop hint only** (Claude medium on PR #664 round 12). The header is set by `proxyToLeader` when the local node forwards a `ReceiveMessage` call to a partition leader; it tells the downstream handler "skip your own fan-out, scan only partition k". It is **never trusted from a client request**: the ingress handler MUST strip (or unconditionally ignore) any client-supplied `X-Elastickv-Receive-Partition` value before the partition fan-out runs. Without this guard a malicious client could send `X-Elastickv-Receive-Partition: ` directly to the public endpoint to force single-partition reads, observe false-empty responses for groups whose `MessageGroupId` hashes to a different partition, or repeatedly drain a targeted partition while leaving the others untouched — exactly the forwarding-semantics gap the fan-out was designed to close. The implementation PR (Phase 3.D PR 5) lands the strip as part of the same handler change that adds the fan-out, and the integration-test plan in §9 covers a "client-set internal header is ignored" case. + +The proxy fanout is bounded: `partitionOrder` short-circuits as soon as `MaxNumberOfMessages` are collected, so a consumer asking for 1 message touches at most 1 remote leader (the one for the first non-empty partition in their rotation order). A FIFO with no in-flight messages costs at most N proxy round-trips to confirm empty. + +**Why the shared deadline matters (Claude P1 on PR #664 fifth-round review).** Without step 3, a naive implementation would pass the *original* `WaitTimeSeconds` to every sub-call, so an empty queue with `WaitTimeSeconds = 20` and `PartitionCount = 8` would hold the connection for up to **160 s** before returning empty — well past any reasonable client or load-balancer idle timeout, and a behaviour shift the SDK does not expect. With the shared deadline, the total wall-clock wait is bounded by the original `WaitTimeSeconds`: each sub-call takes whatever budget remains, the long-poll can finish early on the first non-empty partition, and the response always comes back within the AWS-documented bound. (Alternative considered: probe all partitions in parallel with a shared cancellation context, returning on the first non-empty result. Rejected for the proof PR because it changes the per-partition polling order — long-polled partitions see traffic in `partitionOrder` rotation today, and parallel probing would erase that ordering. Sequential with a shared deadline is simpler, preserves rotation, and matches AWS's internal fanout semantics.) + +### 4.3 PurgeQueue / DeleteQueue on a partitioned FIFO + +Both verbs become **partition-iterative**: the handler loops over `[0, PartitionCount)`, dispatching the same OCC operation against each partition's leader. PurgeQueue's tombstone (Phase 2 from PR #638) is per-partition, keyed by the partition index in addition to the generation. The reaper already enumerates tombstones; the partition prefix becomes part of the tombstone key. + +Per-call atomicity for the entire queue is **not** preserved — a Purge that fails on partition 2 of 8 leaves partitions 0 and 1 purged. AWS itself does not promise atomicity here either; `PurgeQueue` is documented as a deletion that "may take up to 60 seconds" and is best-effort across partitions. The handler retries with exponential backoff on each partition independently and only reports success when all partitions succeed. + +### 4.4 ChangeMessageVisibility / DeleteMessage + +These take a `ReceiptHandle` which already encodes the partition index. The receipt-handle codec adds a **4-byte big-endian segment** for `partitionIndex` after the existing version byte — `partitionIndex` is `uint32` everywhere in this design (`partitionFor` returns `uint32`, `PartitionCount` is `uint32`), so 4 bytes is the natural fixed-width encoding (an 8-byte encoding would waste 4 bytes per handle and double this field's contribution to the base64-encoded handle length the SDK exposes to operators). The handler decodes the partition from the handle and dispatches against the right shard. No fanout — these are single-partition operations. + +--- + +## 5. Routing Layer Changes + +`kv/shard_router.go` today routes by queue name. With partitions, the routing key becomes `(queueName, partitionIndex)`. The partition-to-Raft-group assignment lands on a **new dedicated flag** rather than overloading the existing `--raftSqsMap` (Claude P1 on PR #664 caught the original proposal to extend `--raftSqsMap` syntax — that flag already maps `raftAddr=sqsAddr` for `proxyToLeader`'s endpoint resolution, and overloading the same parser with partition assignments creates a parsing ambiguity that could silently produce the wrong proxy target in §4.2's fanout): + +```text +--sqsFifoPartitionMap "orders.fifo:8=group-7,group-8,group-9,group-10,group-11,group-12,group-13,group-14" +``` + +Reads as: queue `orders.fifo` has `8` partitions, mapped to Raft groups `group-7` through `group-14` in partition order. The existing `--raftSqsMap` keeps doing what it does today — endpoint mapping for `proxyToLeader` — and is unchanged by this design. + +Backward compatibility: queues without an entry in `--sqsFifoPartitionMap` keep the single-partition layout. A queue whose `PartitionCount` in meta does not match the partition-map's entry count is a configuration error: the CreateQueue handler resolves the count from the `Attributes` first, then verifies the partition map agrees; mismatch returns 400 `InvalidParameterValue`. A queue with `PartitionCount > 1` and no entry in `--sqsFifoPartitionMap` is also rejected (the routing layer has no Raft-group mapping to use). + +--- + +## 6. Reaper Implications + +The retention reaper (`adapter/sqs_reaper.go` from PR #638) walks `sqsMsgByAgePrefix(queue, gen)`. With partitioning, the prefix becomes `sqsMsgByAgePrefix(queue, partitionIndex, gen)` and the reaper iterates partitions. + +The per-queue scan budget (`sqsReaperPerQueueBudget`) becomes a per-partition budget — otherwise a queue with 32 partitions starves every other queue. Practical effect: a partitioned queue's reaper completes in `partitions × budget` time per cycle, scaling linearly. Acceptable because the reaper runs every 30s and the budget is sized so a single partition completes in well under that. + +Tombstones written by `DeleteQueue` and `PurgeQueue` are per-partition (Section 4.3). The reaper enumerates tombstones across all partitions identically. + +--- + +## 7. Migration + +### 7.1 Existing queues stay single-partition + +Every queue created before this feature has `PartitionCount = 0` (zero value); the routing function treats that as 1 (Section 3.3). No code path changes for those queues; their key layout is byte-identical. + +### 7.2 New queues opt in + +`CreateQueue` accepts the AWS-style attribute `FifoThroughputLimit = perMessageGroupId` plus a non-AWS `PartitionCount` attribute. **`PartitionCount` is required** whenever multi-partition FIFO is requested — a `CreateQueue` that carries `FifoThroughputLimit = perMessageGroupId` or `DeduplicationScope = messageGroup` but omits `PartitionCount` is rejected with `InvalidAttributeValue`. There is no server-side default: allowing omission would make `CreateQueue` idempotency depend on hidden deployment state (an operator who changes the server's default between two identical `CreateQueue` calls gets a different queue shape each time), and `PartitionCount` is immutable from creation onward (§3.2), so a wrong inferred value requires a `DeleteQueue`+`CreateQueue` cycle to fix. + +### 7.3 No live re-partitioning + +Per §2.2 #5: changing a queue's partition count is not supported. A future migration tool could: + +1. Create a new queue with the new partition count. +2. Set the old queue's `Attributes.RedrivePolicy` to point at the new queue. +3. Drain by consuming from old, redriving to new. +4. Cut over producers. +5. Delete old. + +This is out of scope here. + +--- + +## 8. Failure Modes and Edge Cases + +1. **Proxy RTT under spread deployment**: ReceiveMessage on a queue whose partitions are spread across multiple Raft groups pays one extra round-trip per non-local-leader partition (§4.2 proxies them server-side, so a consumer pinned to one endpoint still sees every partition's messages — no false-empty failure). The cost is bounded: a request for `MaxNumberOfMessages = 1` short-circuits as soon as the first non-empty partition responds, so the *typical* extra hop count is one. The pathological case is a queue with N partitions where the consumer is asking "is anything here?" against an empty queue — that costs at most N proxy round-trips before returning empty, and the **wall-clock wait is bounded by the original `WaitTimeSeconds`** (§4.2 step 3: a shared deadline is threaded through the fanout). Mitigation: latency-sensitive deployments can co-locate partitions on fewer Raft groups (at the cost of less throughput parallelism); a single-partition or co-located deployment pays nothing. + +2. **Partition-leader churn**: a leader change on partition 3 causes that partition's ReceiveMessage to fail-over while partitions 0–2 and 4–7 keep serving. Existing `proxyToLeader` machinery handles the transition. + +3. **Hot partition**: an unbalanced `MessageGroupId` distribution (e.g. 90% of traffic on group "user-1") makes one partition the bottleneck. This is fundamental to any hash-partitioned FIFO; the answer is operator-side group-ID rebalancing, not server-side magic. AWS's HT-FIFO has the same property. + +4. **Receipt-handle from old version**: existing receipt handles encode no partition. When this feature lands, the receipt-handle codec gains a version byte distinguishing v1 (no partition) from v2 (with partition). v1 handles still work for single-partition queues forever; partitioned queues issue v2 only. ChangeMessageVisibility / DeleteMessage check the version before decoding the partition field. + +5. **Mixed-version cluster**: a rolling upgrade where some nodes have HT-FIFO and others don't. The new feature gates on the queue's `PartitionCount > 1` field, which is set at create time; old nodes that try to scan a partitioned queue's keyspace will simply not find anything (the prefix has changed). The catalog rejects `CreateQueue` with `PartitionCount > 1` until every node in the cluster reports the new feature flag. + + **The capability advertisement mechanism**: each node's existing `/sqs_health` endpoint (`adapter/sqs.go: serveSQSHealthz`) gains a new field in its JSON body — `capabilities: ["htfifo"]` once this PR's code is in the binary. The catalog's CreateQueue handler reads the live node set from the distribution layer's node registry (the same registry used by `proxyToLeader` to locate leaders), polls `/sqs_health` on each, and gates `PartitionCount > 1` on every node reporting the `htfifo` capability. Nodes that don't respond within a short timeout are treated as not-yet-upgraded — a deliberate fail-closed default so a network blip does not let a partitioned queue land in a partially-upgraded cluster. This mirrors the §3.3.2 admin-forwarding upgrade gate from the admin dashboard design (PR #644), which uses the same "all-nodes-must-report" pattern for `AdminForward`. + + **Runtime safeguard for downgraded leaders** (Codex P1 on PR #664 sixth-round review). The create-time gate prevents *new* partitioned queues from being created in a mixed-version cluster, but does not protect against the following sequence: (1) all nodes have `htfifo`, (2) a partitioned queue `orders.fifo` is created, (3) node A is rolled back to a pre-`htfifo` binary and rejoins, (4) node A is elected leader for one of the partition Raft groups, (5) node A's `ReceiveMessage` scans the old single-prefix keyspace (`!sqs|msg|data|orders.fifo|...`) and finds nothing — false-empty reads — and (6) any `SendMessage` it accepts is written under the old key format, so messages effectively land in a key-prefix the reaper does not enumerate. + + The runtime safeguard is a node-admission check that complements the create-time gate: on startup *and* on every leadership acquisition for an SQS Raft group, the node enumerates the catalog for queues whose `PartitionCount > 1` that map to the local shard. If any such queue exists and the binary does not advertise `htfifo`, the node refuses leadership for that group with an explicit log line (`sqs: refusing leadership of group %s — partitioned queue %s requires htfifo capability`) and steps down (etcd/raft `TransferLeadership` to any peer; if no peer is willing, the group becomes leaderless until an `htfifo`-capable node joins, which is the desired fail-closed behaviour). The check piggybacks on the existing leadership-acquisition hook in `kv/lease_state.go` so it costs nothing during steady-state operation. Implementations of Phase 3.D's PR set must include this safeguard before the rollout step that marks the binary `htfifo`-eligible (§11 PR 4). + +--- + +## 9. Testing Strategy + +1. **Unit tests** (`adapter/sqs_partition_test.go`): + - `partitionFor` distribution: 100k random group IDs across 8 partitions land within ±5% of equal share. + - `partitionFor` determinism: same group ID always returns same partition across runs / process restarts. + - Edge: `PartitionCount = 0` and `1` route to partition 0 unconditionally. + - Edge: empty `MessageGroupId` routes to partition 0 (defensive). + - Edge: `FifoThroughputLimit = "perQueue"` with `PartitionCount = 8` routes every group ID to partition 0 — the §3.3 short-circuit guard. Locks the fix down against regression; the perQueue branch is a one-line guard that could easily be dropped during a refactor. + +2. **End-to-end** (`adapter/sqs_partitioned_fifo_test.go`): + - Create a queue with `PartitionCount = 4`, send 1000 messages with random group IDs, confirm ordered delivery within each group, parallel delivery across groups. + - PurgeQueue iterates all partitions, leaves none orphaned. + - DeleteQueue similarly. + +3. **Receipt-handle round-trip**: v1 handle (legacy) on single-partition queue, v2 handle (with partition) on partitioned queue, both decode + ChangeMessageVisibility / DeleteMessage correctly. Cross-version handle rejection (v1 handle against partitioned queue → 400 `ReceiptHandleIsInvalid`). + +4. **`SetQueueAttributes` immutability enforcement** (`adapter/sqs_partitioned_immutability_test.go`): pins the §3.2 enforcement rule against silent regressions during a validator refactor. + - Create a partitioned queue with `PartitionCount = 4`, `FifoThroughputLimit = "perMessageGroupId"`, `DeduplicationScope = "messageGroup"`. + - For each of the three attributes, call `SetQueueAttributes` attempting to change just that attribute (e.g. `PartitionCount = 8`); each call must return 400 `InvalidAttributeValue` and the response message must name the attribute that was rejected so the operator sees the cause. + - Combined-change variant: a single `SetQueueAttributes` request that touches all three immutable attributes simultaneously rejects with the same error code (no partial application). + - Touching a *mutable* attribute alongside (e.g. `VisibilityTimeout` plus an attempted `PartitionCount` change) still rejects — no partial commit of the mutable changes when an immutable one is invalid. + - Same-value "no-op": `SetQueueAttributes` with `PartitionCount = 4` (matching the stored meta) succeeds; only differing values are rejected. + +5. **`WaitTimeSeconds` shared-deadline bound** (`adapter/sqs_partitioned_long_poll_timing_test.go`): pins the §4.2 step 3 contract that the total wall-clock wait is bounded by the original `WaitTimeSeconds`, not `PartitionCount × WaitTimeSeconds`. + - Create a partitioned queue with `PartitionCount = 4`. Send no messages. + - Call `ReceiveMessage(WaitTimeSeconds = 2)` and time the call. + - Assert it returns in **≤ 3 s** (2 s budget + reasonable overhead), not in 4 × 2 = 8 s. A naive implementation that drops the `remainingWait` threading and passes the original `WaitTimeSeconds` to every sub-call will time out at 8 s and the test will fail. + - Companion variant: send no messages, call `ReceiveMessage(WaitTimeSeconds = 0)` — the loop must still iterate every partition with a non-blocking point-read (per §4.2 step 4a) and return promptly with empty. + +6. **`X-Elastickv-Receive-Partition` ingress strip** (`adapter/sqs_partitioned_internal_header_test.go`): pins the §4.2 security invariant that the partition hop hint is internal-only. A direct client request must not be able to bypass the fan-out. + - Create a partitioned queue with `PartitionCount = 4`. Send 4 messages whose `MessageGroupId` values hash to 4 different partitions (one per partition). + - Call `ReceiveMessage(MaxNumberOfMessages = 4)` from a direct client with `X-Elastickv-Receive-Partition: 0` set in the request headers (i.e. simulating a malicious client trying to skip the fan-out). + - Assert the response carries messages from **all four partitions**, identical to a call without the header. A regression that trusts the client-supplied header would return at most one message (only the partition-0 entry) and a malicious client could observe false-empty results for the other three groups. + - Companion variant: same request without the header — the response must be byte-identical to the variant above (same message set), so the strip path cannot accidentally introduce a divergence. + +7. **Jepsen** (`jepsen/sqs/htfifo/`): a new workload that stresses cross-partition delivery — many groups, many consumers, network partition mid-burst — and verifies (a) within-group ordering and (b) no message loss. + +8. **Metrics / observability**: new `sqs_partition_messages_total{queue, partition, action}` counter so dashboards can spot hot partitions. + +--- + +## 10. Open Questions + +1. **Partition count limits**: 32 is the proposal's max. AWS HT-FIFO has no documented per-queue cap; 32 is enough for ~30,000 RPS per queue at the per-shard ~1,000 RPS limit. Higher would require larger per-queue meta records and more reaper cycles. Adjust later if operators demand more. + +2. **Hash function**: FNV-1a is fast and stable but not cryptographically strong. An attacker who can pick `MessageGroupId` values can pin all traffic to one partition. Mitigation options: + - Document that group IDs must be random / non-attacker-controlled. + - Switch to xxHash64 with a process-startup-random seed (defeats the targeted attack but breaks determinism across processes — bad for the "where did this message land" question). + - Accept the risk and document it. + + The proposal's working answer is **document and accept** — the feature is for cooperative operators, not adversarial multi-tenancy. + +3. **Cross-partition ordering for visibility**: does a consumer need to see messages from partition 0 *before* partition 1 within a single ReceiveMessage call? The answer is no (within-group ordering is the only contract), but the test plan should pin this so a future "fairness" tweak does not accidentally introduce ordering across partitions. + +(Two earlier "open questions" in this section have been **resolved** and are intentionally omitted: a default `PartitionCount` for AWS-shape callers, and a `--sqsDefaultFifoPartitionCount` flag making `PartitionCount` per-shard. §7.2 makes `PartitionCount` required on the wire whenever `FifoThroughputLimit = perMessageGroupId` is set, with no server-side default — both directions are blocked by the validator and the rationale (CreateQueue idempotency depends on hidden deployment state otherwise) is documented there. The resolution is enforced at the wire by `validatePartitionConfig` in `adapter/sqs_partitioning.go` per round-12 of the PR #664 review.) + +--- + +## 11. Rollout Plan (Multi-PR) + +| PR | Content | Reviewable in isolation? | +|---|---|---| +| 1 | This proposal doc lands. Operators have time to flag concerns. | Yes | +| 2 | Schema: `sqsQueueMeta.PartitionCount`, `DeduplicationScope`, `FifoThroughputLimit`. Routing function `partitionFor`. CreateQueue / SetQueueAttributes validation including the §3.2 cross-attribute rules. **Temporary feature gate** (see below): `CreateQueue` rejects `PartitionCount > 1` with `InvalidAttributeValue` ("PartitionCount > 1 requires HT-FIFO data plane — not yet enabled") so the schema field exists in the meta type but cannot land in production data. | Yes (catalog only) | +| 3 | Keyspace: thread `partitionIndex` through every `sqsMsg*Key` constructor, defaulting to 0 so existing queues stay byte-identical. Gate from PR 2 still in place — `PartitionCount > 1` remains rejected. | Yes (mechanical) | +| 4 | Routing layer: `kv/shard_router.go` accepts the `(queue, partition)` key. New `--sqsFifoPartitionMap` flag (separate from the existing `--raftSqsMap` endpoint-mapping flag). Mixed-version gate (§8.5 capability advertisement via `/sqs_health` + catalog polling for `CreateQueue` gating, **and** the §8 leadership-refusal hook in `kv/lease_state.go` that calls `TransferLeadership` when a non-`htfifo` binary discovers a partitioned queue in its shard on startup or leadership acquisition — both components are required before the binary is marked `htfifo`-eligible). PR 2's temporary `PartitionCount > 1` rejection still in place. | Yes (operator-config) | +| 5 | Send / Receive partition fanout. Receipt-handle v2 codec. **Removes the PR 2 `PartitionCount > 1` rejection** in the same commit that wires the data-plane fanout — the gate and its lift land atomically so a half-deployed cluster can never accept a partitioned queue without the data plane to serve it. | Yes (data-plane) | +| 6 | PurgeQueue / DeleteQueue partition iteration. Tombstone schema update. Reaper update. | Yes (control-plane) | +| 7 | Jepsen HT-FIFO workload. Metrics. | Yes (testing) | +| 8 | Partial-doc lifecycle bump: 3.D moves from TODO to Landed. Section 13 from §16.6 of the partial doc gets the as-built record. | Yes (docs) | + +**Why the temporary gate** (Codex P1 on PR #664 tenth-round Codex review): without it, a cluster running PR 2–4 would accept a `CreateQueue` with `PartitionCount = 4` (the schema is in place, the validator only checks per-attribute validity) and then dispatch every subsequent `SendMessage` against the **legacy single-partition keyspace** with `partitionIndex = 0` — silently writing all messages under `!sqs|msg|data||…` regardless of `PartitionCount`. When PR 5 lands and the new fanout reader looks for messages under the partitioned prefix `!sqs|msg|data|p|||…`, every message written during the PR 2–4 window is invisible to it and to the partition-aware reaper scan. The gate-and-lift pattern (PR 2 rejects, PR 5 lifts in the same commit as the data-plane fanout) makes it impossible to land data under the wrong layout: any cluster that accepts `PartitionCount > 1` is, by construction, also running the partition-aware send path. + +**Gate of no return**: PR 5 is the point where a partitioned FIFO queue can hold real data. Once any production cluster runs PR 5 and creates a partitioned queue, rolling back means draining and recreating the queue. PR 1–4 are reversible (no data layout change). Recorded in the PR descriptions. + +--- + +## 12. Alternatives Considered + +### 12.1 Skip HT-FIFO and document the per-queue cap + +Operators who need >300 TPS create multiple FIFO queues and shard at the application level. **Rejected**: this pushes the partitioning burden onto every consumer of elastickv's SQS surface, and clients that already use AWS HT-FIFO have to reimplement their topology. The whole point of an SQS-compatible adapter is to let producers stay AWS-shaped. + +### 12.2 Single Raft group, multiple visibility queues + +Keep one Raft group per queue but partition the visibility index inside it. **Rejected**: the bottleneck on a single FIFO queue is the Raft proposal pipeline, not the visibility scan. Partitioning the index without partitioning the consensus group does not unlock any throughput. + +### 12.3 Cross-partition transactions for PurgeQueue + +Use a coordinator transaction that touches all partitions atomically. **Rejected**: the cross-shard transaction primitive does not yet exist for SQS, and AWS itself does not promise PurgeQueue atomicity. Per-partition iteration with retry is the standard answer. + +### 12.4 Operator-configurable hash function + +Let operators plug in their own `partitionFor`. **Rejected** at this stage: a fixed function is much easier to reason about for cross-version compatibility. Pluggable hashing is a Phase 5+ concern, if ever.