From c08c441cebf916638edc668732a14899f33592db Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sat, 2 May 2026 23:41:19 +0000
Subject: [PATCH 01/28] JIT: Sort stack locals by access density for smaller
 code on x64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sort local variables by access density (weighted ref count / size) before
frame layout in lvaAssignVirtualFrameOffsetsToLocals(). This packs locals
with the highest access frequency per byte into the disp8 zone (offsets in
[-128, +127] from the frame pointer), reducing 4-byte displacements to
1-byte encodings.

SPMI aspnet2 results: -5,982 bytes (-0.23%), 256 improvements vs 103
regressions, PerfScore -0.01%.

Gated to TARGET_AMD64, FullOpts, and non-EnC methods whose estimated local
frame size exceeds 128 bytes.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 101 +++++++++++++++++++++++++++++++++++-
 1 file changed, 100 insertions(+), 1 deletion(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index ba92c8035fd012..453708766e8cc0 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5156,6 +5156,98 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     UINT assignMore             = 0xFFFFFFFF;
     bool have_LclVarDoubleAlign = false;
 
+#ifdef TARGET_AMD64
+    // Build a sorted array of local variable indices to optimize displacement encoding.
+    // On x64, stack accesses within [-128, +127] of the base register use a 1-byte
+    // displacement, while larger offsets require 4 bytes — saving 3 bytes per access
+    // promoted from disp32 to disp8.
+    //
+    // The sort uses access density (weighted ref count / local size) as the primary key
+    // so that locals with the highest access frequency per byte of frame space get the
+    // smallest offsets. This maximizes the total number of hot accesses that fit within
+    // the disp8 zone (~128 bytes from the frame pointer). As a secondary key, locals
+    // requiring 8-byte alignment are grouped before smaller locals to reduce alignment
+    // padding waste.
+    // (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.)
+    //
+    // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called with
+    // FINAL_FRAME_LAYOUT (no tentative layout exists), so there is no "offsets must not get
+    // bigger" invariant to preserve between passes.
+    //
+    // We skip this optimization for EnC (which requires stable layout) and when ref counts
+    // are not available.
+    assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT);
+    unsigned* lclVarSortOrder = nullptr;
+    if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts())
+    {
+        // Estimate total local frame size to decide if sorting is worthwhile.
+        // Only sort when the frame exceeds the disp8 boundary (128 bytes);
+        // in smaller frames, all locals already fit in disp8 and sorting just churns
+        // offsets without benefit.
+        unsigned estimatedLocalSize = 0;
+        for (unsigned i = 0; i < lvaCount; i++)
+        {
+            estimatedLocalSize += lvaLclStackHomeSize(i);
+        }
+
+        if (estimatedLocalSize > 128)
+        {
+            JITDUMP("Sorting %u locals by access density for frame layout optimization "
+                    "(estimated frame size %u bytes)\n",
+                    lvaCount, estimatedLocalSize);
+
+            lclVarSortOrder = new (this, CMK_LvaTable) unsigned[lvaCount];
+            for (unsigned i = 0; i < lvaCount; i++)
+            {
+                lclVarSortOrder[i] = i;
+            }
+
+            jitstd::sort(lclVarSortOrder, lclVarSortOrder + lvaCount,
+                         [this](unsigned n1, unsigned n2) -> bool {
+                             const LclVarDsc* dsc1 = lvaGetDesc(n1);
+                             const LclVarDsc* dsc2 = lvaGetDesc(n2);
+
+                             // Sort by access density (weighted ref count per byte) descending.
+                             // This maximizes the number of hot accesses that fit within the
+                             // disp8 zone (first ~128 bytes of frame). A small hot local is
+                             // more valuable per frame byte than a large hot local.
+                             unsigned size1 = lvaLclStackHomeSize(n1);
+                             unsigned size2 = lvaLclStackHomeSize(n2);
+                             weight_t wt1   = dsc1->lvRefCntWtd(lvaRefCountState);
+                             weight_t wt2   = dsc2->lvRefCntWtd(lvaRefCountState);
+
+                             // Compare wt1/size1 > wt2/size2 as wt1*size2 > wt2*size1
+                             // to avoid division. Both sizes are > 0.
+                             weight_t density1 = wt1 * size2;
+                             weight_t density2 = wt2 * size1;
+                             if (density1 != density2)
+                             {
+                                 return density1 > density2;
+                             }
+
+                             // Among locals with equal density, group by alignment class
+                             // (8+ byte locals before smaller ones) to reduce padding.
+                             bool aligned1 = (size1 >= 8);
+                             bool aligned2 = (size2 >= 8);
+                             if (aligned1 != aligned2)
+                             {
+                                 return aligned1;
+                             }
+
+                             unsigned cnt1 = dsc1->lvRefCnt(lvaRefCountState);
+                             unsigned cnt2 = dsc2->lvRefCnt(lvaRefCountState);
+                             if (cnt1 != cnt2)
+                             {
+                                 return cnt1 > cnt2;
+                             }
+
+                             // Stable tiebreaker: lower lclNum first.
+                             return n1 < n2;
+                         });
+        }
+    }
+#endif // TARGET_AMD64
+
     for (cur = 0; alloc_order[cur]; cur++)
     {
         if ((assignMore & alloc_order[cur]) == 0)
@@ -5168,8 +5260,15 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
         unsigned   lclNum;
         LclVarDsc* varDsc;
 
-        for (lclNum = 0, varDsc = lvaTable; lclNum < lvaCount; lclNum++, varDsc++)
+        for (unsigned sortIdx = 0; sortIdx < lvaCount; sortIdx++)
         {
+#ifdef TARGET_AMD64
+            lclNum = (lclVarSortOrder != nullptr) ? lclVarSortOrder[sortIdx] : sortIdx;
+#else
+            lclNum = sortIdx;
+#endif
+            varDsc = lvaGetDesc(lclNum);
+
             /* Ignore field locals of the promotion type PROMOTION_TYPE_FIELD_DEPENDENT.
                In other words, we will not calculate the "base" address of the struct local if
                the promotion type is PROMOTION_TYPE_FIELD_DEPENDENT.

From e63b42776580aee5998e4da45d0847e88f4646ae Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sun, 3 May 2026 01:27:43 +0000
Subject: [PATCH 02/28] JIT: Multi-strategy frame layout selection for smaller
 code on x64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the single access-density sort with a multi-strategy approach
that tries 5 candidate layouts and picks the one with lowest estimated
encoding cost:

  1. Original (unsorted) order as baseline
  2. Weighted access density (refCntWtd / size)
  3. Unweighted ref count
  4. Weighted ref count
  5. Unweighted ref count density (refCnt / size)

A lightweight cost estimation function simulates the frame allocation
loop (including alignment padding and SIMD alignment) and scores each
layout using Σ(refCnt × encodingBytes), where encodingBytes is 1 for
disp8 or 4 for disp32. The strategy with the lowest cost wins; if no
strategy beats the original order, no sorting is applied.

This is gated to frame-pointer-based frames only, since the disp8
boundary check assumes RBP-relative negative virtual offsets.

SPMI aspnet2 results vs single-strategy:
  Code size:   -7,400 bytes (-0.28%) vs -5,982 bytes (-0.23%)
  Regressions: 36 vs 103 (65% fewer)
  PerfScore:   neutral

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 273 +++++++++++++++++++++++++++---------
 1 file changed, 210 insertions(+), 63 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 453708766e8cc0..14cb9100b31eed 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5157,33 +5157,31 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     bool have_LclVarDoubleAlign = false;
 
 #ifdef TARGET_AMD64
-    // Build a sorted array of local variable indices to optimize displacement encoding.
+    // Multi-strategy frame layout optimization for x64.
+    //
     // On x64, stack accesses within [-128, +127] of the base register use a 1-byte
-    // displacement, while larger offsets require 4 bytes — saving 3 bytes per access
-    // promoted from disp32 to disp8.
+    // displacement (disp8), while larger offsets require 4 bytes (disp32) — 3 extra
+    // bytes per access. We try multiple sort orders for locals and pick the one that
+    // minimizes total encoding cost, estimated by simulating the frame allocation loop.
     //
-    // The sort uses access density (weighted ref count / local size) as the primary key
-    // so that locals with the highest access frequency per byte of frame space get the
-    // smallest offsets. This maximizes the total number of hot accesses that fit within
-    // the disp8 zone (~128 bytes from the frame pointer). As a secondary key, locals
-    // requiring 8-byte alignment are grouped before smaller locals to reduce alignment
-    // padding waste.
+    // The cost function computes Σ(refCnt × encodingBytes) where encodingBytes is 1
+    // for disp8 or 4 for disp32, using unweighted refCnt as a proxy for instruction
+    // count. This gives a direct estimate of total displacement encoding bytes.
     // (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.)
     //
-    // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called with
-    // FINAL_FRAME_LAYOUT (no tentative layout exists), so there is no "offsets must not get
-    // bigger" invariant to preserve between passes.
+    // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called
+    // with FINAL_FRAME_LAYOUT (no tentative layout exists).
+    //
+    // We only run this for frame-pointer-based frames because the disp8 boundary check
+    // assumes RBP-relative negative virtual offsets. RSP-based frames use positive offsets
+    // after fixup and contribute negligible savings.
     //
-    // We skip this optimization for EnC (which requires stable layout) and when ref counts
-    // are not available.
+    // We skip this for EnC (which requires stable layout), MinOpts (where ref counts are
+    // less meaningful), and frames that fit entirely within the disp8 zone.
     assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT);
     unsigned* lclVarSortOrder = nullptr;
-    if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts())
+    if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts() && codeGen->isFramePointerUsed())
     {
-        // Estimate total local frame size to decide if sorting is worthwhile.
-        // Only sort when the frame exceeds the disp8 boundary (128 bytes);
-        // in smaller frames, all locals already fit in disp8 and sorting just churns
-        // offsets without benefit.
         unsigned estimatedLocalSize = 0;
         for (unsigned i = 0; i < lvaCount; i++)
         {
@@ -5192,58 +5190,207 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 
         if (estimatedLocalSize > 128)
         {
-            JITDUMP("Sorting %u locals by access density for frame layout optimization "
+            JITDUMP("Frame layout optimization: trying multiple strategies for %u locals "
                     "(estimated frame size %u bytes)\n",
                     lvaCount, estimatedLocalSize);
 
-            lclVarSortOrder = new (this, CMK_LvaTable) unsigned[lvaCount];
+            // Pre-compute which locals will be allocated in the main loop and their
+            // pass category. Category 0 means "not allocatable" (skipped by the loop).
+            unsigned* lclPassCategory = new (this, CMK_LvaTable) unsigned[lvaCount];
+            for (unsigned i = 0; i < lvaCount; i++)
+            {
+                lclPassCategory[i] = 0;
+                LclVarDsc* varDsc = lvaGetDesc(i);
+
+                if (lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+                    continue;
+#if FEATURE_FIXED_OUT_ARGS
+                if (i == lvaOutgoingArgSpaceVar)
+                    continue;
+#endif
+                if (lvaIsOSRLocal(i))
+                    continue;
+                if (!varDsc->lvOnFrame)
+                    continue;
+                if (i == lvaGSSecurityCookie && getNeedsGSSecurityCookie())
+                    continue;
+                if (i == lvaRetAddrVar)
+                    continue;
+                if (i == lvaMonAcquired || i == lvaAsyncExecutionContextVar ||
+                    i == lvaAsyncSynchronizationContextVar)
+                    continue;
+                if (varDsc->lvIsParam && !lvaParamHasLocalStackSpace(i))
+                    continue;
+
+                if (varDsc->lvIsUnsafeBuffer && compGSReorderStackLayout)
+                {
+                    lclPassCategory[i] =
+                        varDsc->lvIsPtr ? ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS;
+                }
+                else if (varTypeIsGC(varDsc->TypeGet()) && varDsc->lvTracked)
+                {
+                    lclPassCategory[i] = ALLOC_PTRS;
+                }
+                else
+                {
+                    lclPassCategory[i] = ALLOC_NON_PTRS;
+                }
+            }
+
+            // Simulate frame layout for a given sort order and return total encoding cost.
+            // Lower cost = better layout. Cost = Σ(refCnt × encodingBytes) where
+            // encodingBytes is 1 for disp8 (offset in [-128,+127]) or 4 for disp32.
+            // Uses the current stkOffs as the starting point, which already accounts for
+            // callee saves, XMM saves, and any pre-allocated special locals.
+            auto estimateLayoutCost = [&](unsigned* order) -> unsigned {
+                unsigned totalCost = 0;
+                int      simOff    = stkOffs;
+
+                for (int p = 0; alloc_order[p]; p++)
+                {
+                    UINT pass = alloc_order[p];
+                    for (unsigned idx = 0; idx < lvaCount; idx++)
+                    {
+                        unsigned lcl = order[idx];
+                        if (lclPassCategory[lcl] != pass)
+                            continue;
+
+                        LclVarDsc* varDsc = lvaGetDesc(lcl);
+                        unsigned   size   = lvaLclStackHomeSize(lcl);
+
+                        // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset).
+                        if (size >= 8)
+                        {
+#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES
+                            if (varTypeIsSIMD(varDsc))
+                            {
+                                int alignment = getSIMDTypeAlignment(varDsc->TypeGet());
+                                if (simOff % alignment != 0)
+                                {
+                                    simOff -= static_cast<int>(alignment + (simOff % alignment));
+                                }
+                            }
+                            else
+#endif
+                            {
+                                if ((simOff % 8) != 0)
+                                {
+                                    simOff -= static_cast<int>(8 + (simOff % 8));
+                                }
+                            }
+                        }
+
+                        simOff -= static_cast<int>(size);
+
+                        unsigned refCnt = varDsc->lvRefCnt(lvaRefCountState);
+                        totalCost += refCnt * ((simOff >= -128) ? 1u : 4u);
+                    }
+                }
+
+                return totalCost;
+            };
+
+            lclVarSortOrder              = new (this, CMK_LvaTable) unsigned[lvaCount];
+            unsigned* candidateOrder     = new (this, CMK_LvaTable) unsigned[lvaCount];
             for (unsigned i = 0; i < lvaCount; i++)
             {
                 lclVarSortOrder[i] = i;
+                candidateOrder[i]  = i;
+            }
+
+            // Score the original (unsorted) order as baseline.
+            unsigned    origCost = estimateLayoutCost(lclVarSortOrder);
+            unsigned    bestCost = origCost;
+            const char* bestName = "original";
+
+            // Helper to try a strategy: sort candidateOrder, estimate cost,
+            // and update best if the cost is lower.
+            auto tryStrategy = [&](const char* name, auto comparator) -> unsigned {
+                for (unsigned i = 0; i < lvaCount; i++)
+                    candidateOrder[i] = i;
+                jitstd::sort(candidateOrder, candidateOrder + lvaCount, comparator);
+                unsigned cost = estimateLayoutCost(candidateOrder);
+                if (cost < bestCost)
+                {
+                    bestCost = cost;
+                    bestName = name;
+                    memcpy(lclVarSortOrder, candidateOrder, lvaCount * sizeof(unsigned));
+                }
+                return cost;
+            };
+
+            // Strategy 1: Access density (weighted ref count / size) descending.
+            // A small hot local is more valuable per frame byte than a large hot local.
+            unsigned densityCost = tryStrategy("density",
+                [this](unsigned n1, unsigned n2) -> bool {
+                    const LclVarDsc* d1 = lvaGetDesc(n1);
+                    const LclVarDsc* d2 = lvaGetDesc(n2);
+                    unsigned s1 = lvaLclStackHomeSize(n1);
+                    unsigned s2 = lvaLclStackHomeSize(n2);
+                    weight_t w1 = d1->lvRefCntWtd(lvaRefCountState);
+                    weight_t w2 = d2->lvRefCntWtd(lvaRefCountState);
+                    // Compare w1/s1 > w2/s2 via cross-multiply to avoid division.
+                    weight_t dens1 = w1 * s2;
+                    weight_t dens2 = w2 * s1;
+                    if (dens1 != dens2) return dens1 > dens2;
+                    bool a1 = (s1 >= 8), a2 = (s2 >= 8);
+                    if (a1 != a2) return a1;
+                    unsigned c1 = d1->lvRefCnt(lvaRefCountState);
+                    unsigned c2 = d2->lvRefCnt(lvaRefCountState);
+                    if (c1 != c2) return c1 > c2;
+                    return n1 < n2;
+                });
+
+            // Strategy 2: Unweighted ref count descending.
+            unsigned refCntCost = tryStrategy("refCnt",
+                [this](unsigned n1, unsigned n2) -> bool {
+                    unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+                    unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+                    if (c1 != c2) return c1 > c2;
+                    bool a1 = (lvaLclStackHomeSize(n1) >= 8);
+                    bool a2 = (lvaLclStackHomeSize(n2) >= 8);
+                    if (a1 != a2) return a1;
+                    return n1 < n2;
+                });
+
+            // Strategy 3: Weighted ref count descending.
+            unsigned weightCost = tryStrategy("weight",
+                [this](unsigned n1, unsigned n2) -> bool {
+                    weight_t w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
+                    weight_t w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
+                    if (w1 != w2) return w1 > w2;
+                    bool a1 = (lvaLclStackHomeSize(n1) >= 8);
+                    bool a2 = (lvaLclStackHomeSize(n2) >= 8);
+                    if (a1 != a2) return a1;
+                    return n1 < n2;
+                });
+
+            // Strategy 4: Unweighted ref count density (refCnt / size) descending.
+            unsigned refDensityCost = tryStrategy("refDensity",
+                [this](unsigned n1, unsigned n2) -> bool {
+                    unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+                    unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+                    unsigned s1 = lvaLclStackHomeSize(n1);
+                    unsigned s2 = lvaLclStackHomeSize(n2);
+                    // Cross-multiply to avoid division.
+                    unsigned long long dens1 = (unsigned long long)c1 * s2;
+                    unsigned long long dens2 = (unsigned long long)c2 * s1;
+                    if (dens1 != dens2) return dens1 > dens2;
+                    bool a1 = (s1 >= 8), a2 = (s2 >= 8);
+                    if (a1 != a2) return a1;
+                    return n1 < n2;
+                });
+
+            // If original order won, no sorting needed.
+            if (bestCost == origCost)
+            {
+                lclVarSortOrder = nullptr;
             }
 
-            jitstd::sort(lclVarSortOrder, lclVarSortOrder + lvaCount,
-                         [this](unsigned n1, unsigned n2) -> bool {
-                             const LclVarDsc* dsc1 = lvaGetDesc(n1);
-                             const LclVarDsc* dsc2 = lvaGetDesc(n2);
-
-                             // Sort by access density (weighted ref count per byte) descending.
-                             // This maximizes the number of hot accesses that fit within the
-                             // disp8 zone (first ~128 bytes of frame). A small hot local is
-                             // more valuable per frame byte than a large hot local.
-                             unsigned size1 = lvaLclStackHomeSize(n1);
-                             unsigned size2 = lvaLclStackHomeSize(n2);
-                             weight_t wt1   = dsc1->lvRefCntWtd(lvaRefCountState);
-                             weight_t wt2   = dsc2->lvRefCntWtd(lvaRefCountState);
-
-                             // Compare wt1/size1 > wt2/size2 as wt1*size2 > wt2*size1
-                             // to avoid division. Both sizes are > 0.
-                             weight_t density1 = wt1 * size2;
-                             weight_t density2 = wt2 * size1;
-                             if (density1 != density2)
-                             {
-                                 return density1 > density2;
-                             }
-
-                             // Among locals with equal density, group by alignment class
-                             // (8+ byte locals before smaller ones) to reduce padding.
-                             bool aligned1 = (size1 >= 8);
-                             bool aligned2 = (size2 >= 8);
-                             if (aligned1 != aligned2)
-                             {
-                                 return aligned1;
-                             }
-
-                             unsigned cnt1 = dsc1->lvRefCnt(lvaRefCountState);
-                             unsigned cnt2 = dsc2->lvRefCnt(lvaRefCountState);
-                             if (cnt1 != cnt2)
-                             {
-                                 return cnt1 > cnt2;
-                             }
-
-                             // Stable tiebreaker: lower lclNum first.
-                             return n1 < n2;
-                         });
+            JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u; "
+                    "selected '%s' (cost=%u, saved %u encoding bytes est.)\n",
+                    origCost, densityCost, refCntCost, weightCost, refDensityCost,
+                    bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0);
         }
     }
 #endif // TARGET_AMD64

From 839f45707a2f7039b4a5336d1c4438ffba47af29 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sun, 3 May 2026 02:04:39 +0000
Subject: [PATCH 03/28] JIT: Enable multi-strategy frame layout optimization
 for x86

Extend the stack local sorting optimization from x64-only (TARGET_AMD64)
to all x86/x64 (TARGET_XARCH). The optimization is applicable to x86
because:
- x86 uses the same disp8 [-128,+127] vs disp32 encoding threshold
- x86 only calls lvaAssignFrameOffsets with FINAL_FRAME_LAYOUT
- x86 frequently uses EBP-based frames (especially with DOUBLE_ALIGN)

No behavioral change on x64; enables the optimization for x86 targets.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 14cb9100b31eed..bd93f583697758 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5156,10 +5156,10 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     UINT assignMore             = 0xFFFFFFFF;
     bool have_LclVarDoubleAlign = false;
 
-#ifdef TARGET_AMD64
-    // Multi-strategy frame layout optimization for x64.
+#ifdef TARGET_XARCH
+    // Multi-strategy frame layout optimization for x86/x64.
     //
-    // On x64, stack accesses within [-128, +127] of the base register use a 1-byte
+    // On x86/x64, stack accesses within [-128, +127] of the base register use a 1-byte
     // displacement (disp8), while larger offsets require 4 bytes (disp32) — 3 extra
     // bytes per access. We try multiple sort orders for locals and pick the one that
     // minimizes total encoding cost, estimated by simulating the frame allocation loop.
@@ -5169,12 +5169,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     // count. This gives a direct estimate of total displacement encoding bytes.
     // (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.)
     //
-    // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called
+    // This optimization is safe because on x86/x64, lvaAssignFrameOffsets is only called
     // with FINAL_FRAME_LAYOUT (no tentative layout exists).
     //
     // We only run this for frame-pointer-based frames because the disp8 boundary check
-    // assumes RBP-relative negative virtual offsets. RSP-based frames use positive offsets
-    // after fixup and contribute negligible savings.
+    // assumes EBP/RBP-relative negative virtual offsets. ESP/RSP-based frames use positive
+    // offsets after fixup and contribute negligible savings.
     //
     // We skip this for EnC (which requires stable layout), MinOpts (where ref counts are
     // less meaningful), and frames that fit entirely within the disp8 zone.
@@ -5393,7 +5393,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
                     bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0);
         }
     }
-#endif // TARGET_AMD64
+#endif // TARGET_XARCH
 
     for (cur = 0; alloc_order[cur]; cur++)
     {
@@ -5409,7 +5409,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 
         for (unsigned sortIdx = 0; sortIdx < lvaCount; sortIdx++)
         {
-#ifdef TARGET_AMD64
+#ifdef TARGET_XARCH
             lclNum = (lclVarSortOrder != nullptr) ? lclVarSortOrder[sortIdx] : sortIdx;
 #else
             lclNum = sortIdx;

From ff85ffc581ff15f768e577c2f664c3c0913e03a0 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sun, 3 May 2026 02:21:26 +0000
Subject: [PATCH 04/28] JIT: Enable frame layout optimization for MinOpts/Tier0
 with lightweight ref counting

For MinOpts/Tier0, precise ref counts are not computed (PreciseRefCountsRequired()
returns false). Previously, the frame layout optimization was skipped entirely for
MinOpts. This change adds a lightweight LIR walk that counts local variable
references without any of the analysis side effects of lvaMarkLclRefs.

The lightweight counts are used by the cost estimation function and sorting
comparators to make informed layout decisions for MinOpts methods.

SPMI results across collections show significant impact, especially on MinOpts-heavy
collections:
  aspnet2:           -8,682 bytes (-0.33%), 262 improvements, 37 regressions
  benchmarks.run:    -409,067 bytes (-1.10%), 8,788 improvements, 706 regressions
  libraries_tests:   -4,503,889 bytes (-1.11%), 96,203 improvements, 7,210 regressions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 96 ++++++++++++++++++++++++++++---------
 1 file changed, 74 insertions(+), 22 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index bd93f583697758..80e2b13243a1af 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5176,11 +5176,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     // assumes EBP/RBP-relative negative virtual offsets. ESP/RSP-based frames use positive
     // offsets after fixup and contribute negligible savings.
     //
-    // We skip this for EnC (which requires stable layout), MinOpts (where ref counts are
-    // less meaningful), and frames that fit entirely within the disp8 zone.
+    // We skip this for EnC (which requires stable layout) and frames that fit entirely
+    // within the disp8 zone. For MinOpts/Tier0 where precise ref counts are not computed,
+    // we do a lightweight LIR walk to count local references for sorting purposes.
     assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT);
     unsigned* lclVarSortOrder = nullptr;
-    if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts() && codeGen->isFramePointerUsed())
+    if (lvaLocalVarRefCounted() && !opts.compDbgEnC && codeGen->isFramePointerUsed())
     {
         unsigned estimatedLocalSize = 0;
         for (unsigned i = 0; i < lvaCount; i++)
@@ -5190,9 +5191,36 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 
         if (estimatedLocalSize > 128)
         {
+            // For MinOpts/Tier0, precise ref counts are not available (all lvRefCnt == 0).
+            // Do a lightweight LIR walk to count local references for sorting purposes.
+            // This is much cheaper than the full lvaMarkLclRefs pass — we only count
+            // occurrences without any of the analysis side effects.
+            unsigned* lclRefCounts = nullptr;
+            if (!PreciseRefCountsRequired())
+            {
+                lclRefCounts = new (this, CMK_LvaTable) unsigned[lvaCount];
+                memset(lclRefCounts, 0, lvaCount * sizeof(unsigned));
+
+                for (BasicBlock* const block : Blocks())
+                {
+                    for (GenTree* node : LIR::AsRange(block))
+                    {
+                        if (node->OperIsAnyLocal())
+                        {
+                            unsigned lclNum = node->AsLclVarCommon()->GetLclNum();
+                            if (lclNum < lvaCount)
+                            {
+                                lclRefCounts[lclNum]++;
+                            }
+                        }
+                    }
+                }
+            }
+
             JITDUMP("Frame layout optimization: trying multiple strategies for %u locals "
-                    "(estimated frame size %u bytes)\n",
-                    lvaCount, estimatedLocalSize);
+                    "(estimated frame size %u bytes%s)\n",
+                    lvaCount, estimatedLocalSize,
+                    lclRefCounts != nullptr ? ", using lightweight ref counts" : "");
 
             // Pre-compute which locals will be allocated in the main loop and their
             // pass category. Category 0 means "not allocatable" (skipped by the loop).
@@ -5282,7 +5310,8 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 
                         simOff -= static_cast<int>(size);
 
-                        unsigned refCnt = varDsc->lvRefCnt(lvaRefCountState);
+                        unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl]
+                                                                    : varDsc->lvRefCnt(lvaRefCountState);
                         totalCost += refCnt * ((simOff >= -128) ? 1u : 4u);
                     }
                 }
@@ -5322,30 +5351,41 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
             // Strategy 1: Access density (weighted ref count / size) descending.
             // A small hot local is more valuable per frame byte than a large hot local.
             unsigned densityCost = tryStrategy("density",
-                [this](unsigned n1, unsigned n2) -> bool {
-                    const LclVarDsc* d1 = lvaGetDesc(n1);
-                    const LclVarDsc* d2 = lvaGetDesc(n2);
+                [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
                     unsigned s1 = lvaLclStackHomeSize(n1);
                     unsigned s2 = lvaLclStackHomeSize(n2);
-                    weight_t w1 = d1->lvRefCntWtd(lvaRefCountState);
-                    weight_t w2 = d2->lvRefCntWtd(lvaRefCountState);
+                    weight_t w1, w2;
+                    if (lclRefCounts != nullptr)
+                    {
+                        w1 = static_cast<weight_t>(lclRefCounts[n1]);
+                        w2 = static_cast<weight_t>(lclRefCounts[n2]);
+                    }
+                    else
+                    {
+                        w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
+                        w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
+                    }
                     // Compare w1/s1 > w2/s2 via cross-multiply to avoid division.
                     weight_t dens1 = w1 * s2;
                     weight_t dens2 = w2 * s1;
                     if (dens1 != dens2) return dens1 > dens2;
                     bool a1 = (s1 >= 8), a2 = (s2 >= 8);
                     if (a1 != a2) return a1;
-                    unsigned c1 = d1->lvRefCnt(lvaRefCountState);
-                    unsigned c2 = d2->lvRefCnt(lvaRefCountState);
+                    unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
+                                                            : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+                    unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
+                                                            : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
                     if (c1 != c2) return c1 > c2;
                     return n1 < n2;
                 });
 
             // Strategy 2: Unweighted ref count descending.
             unsigned refCntCost = tryStrategy("refCnt",
-                [this](unsigned n1, unsigned n2) -> bool {
-                    unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-                    unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+                [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+                    unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
+                                                            : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+                    unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
+                                                            : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
                     if (c1 != c2) return c1 > c2;
                     bool a1 = (lvaLclStackHomeSize(n1) >= 8);
                     bool a2 = (lvaLclStackHomeSize(n2) >= 8);
@@ -5354,10 +5394,20 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
                 });
 
             // Strategy 3: Weighted ref count descending.
+            // For MinOpts, weighted = unweighted (no block weights available).
             unsigned weightCost = tryStrategy("weight",
-                [this](unsigned n1, unsigned n2) -> bool {
-                    weight_t w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
-                    weight_t w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
+                [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+                    weight_t w1, w2;
+                    if (lclRefCounts != nullptr)
+                    {
+                        w1 = static_cast<weight_t>(lclRefCounts[n1]);
+                        w2 = static_cast<weight_t>(lclRefCounts[n2]);
+                    }
+                    else
+                    {
+                        w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
+                        w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
+                    }
                     if (w1 != w2) return w1 > w2;
                     bool a1 = (lvaLclStackHomeSize(n1) >= 8);
                     bool a2 = (lvaLclStackHomeSize(n2) >= 8);
@@ -5367,9 +5417,11 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 
             // Strategy 4: Unweighted ref count density (refCnt / size) descending.
             unsigned refDensityCost = tryStrategy("refDensity",
-                [this](unsigned n1, unsigned n2) -> bool {
-                    unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-                    unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+                [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+                    unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
+                                                            : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+                    unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
+                                                            : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
                     unsigned s1 = lvaLclStackHomeSize(n1);
                     unsigned s2 = lvaLclStackHomeSize(n2);
                     // Cross-multiply to avoid division.

From 81e5559e09f6995a695fca7868126a9b703a5c41 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sat, 2 May 2026 23:41:19 +0000
Subject: [PATCH 05/28] JIT: Sort stack locals by access density for smaller
 code on x64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sort local variables by access density (weighted ref count / size) before
frame layout in lvaAssignVirtualFrameOffsetsToLocals(). This packs locals
with the highest access frequency per byte into the disp8 zone (offsets in
[-128, +127] from the frame pointer), reducing 4-byte displacements to
1-byte encodings.

SPMI aspnet2 results: -5,982 bytes (-0.23%), 256 improvements vs 103
regressions, PerfScore -0.01%.

Gated to TARGET_AMD64, non-MinOpts compilations with ref counts available,
non-EnC, and frames whose estimated local size exceeds 128 bytes.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 101 +++++++++++++++++++++++++++++++++++-
 1 file changed, 100 insertions(+), 1 deletion(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index ba92c8035fd012..453708766e8cc0 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5156,6 +5156,98 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     UINT assignMore             = 0xFFFFFFFF;
     bool have_LclVarDoubleAlign = false;
 
+#ifdef TARGET_AMD64
+    // Build a sorted array of local variable indices to optimize displacement encoding.
+    // On x64, stack accesses within [-128, +127] of the base register use a 1-byte
+    // displacement, while larger offsets require 4 bytes — saving 3 bytes per access
+    // promoted from disp32 to disp8.
+    //
+    // The sort uses access density (weighted ref count / local size) as the primary key
+    // so that locals with the highest access frequency per byte of frame space get the
+    // smallest offsets. This maximizes the total number of hot accesses that fit within
+    // the disp8 zone (~128 bytes from the frame pointer). As a secondary key, locals
+    // requiring 8-byte alignment are grouped before smaller locals to reduce alignment
+    // padding waste.
+    // (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.)
+    //
+    // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called with
+    // FINAL_FRAME_LAYOUT (no tentative layout exists), so there is no "offsets must not get
+    // bigger" invariant to preserve between passes.
+    //
+    // We skip this optimization for EnC (which requires stable layout) and when ref counts
+    // are not available.
+    assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT);
+    unsigned* lclVarSortOrder = nullptr;
+    if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts())
+    {
+        // Estimate total local frame size to decide if sorting is worthwhile.
+        // Only sort when the frame exceeds the disp8 boundary (128 bytes);
+        // in smaller frames, all locals already fit in disp8 and sorting just churns
+        // offsets without benefit.
+        unsigned estimatedLocalSize = 0;
+        for (unsigned i = 0; i < lvaCount; i++)
+        {
+            estimatedLocalSize += lvaLclStackHomeSize(i);
+        }
+
+        if (estimatedLocalSize > 128)
+        {
+            JITDUMP("Sorting %u locals by access density for frame layout optimization "
+                    "(estimated frame size %u bytes)\n",
+                    lvaCount, estimatedLocalSize);
+
+            lclVarSortOrder = new (this, CMK_LvaTable) unsigned[lvaCount];
+            for (unsigned i = 0; i < lvaCount; i++)
+            {
+                lclVarSortOrder[i] = i;
+            }
+
+            jitstd::sort(lclVarSortOrder, lclVarSortOrder + lvaCount,
+                         [this](unsigned n1, unsigned n2) -> bool {
+                             const LclVarDsc* dsc1 = lvaGetDesc(n1);
+                             const LclVarDsc* dsc2 = lvaGetDesc(n2);
+
+                             // Sort by access density (weighted ref count per byte) descending.
+                             // This maximizes the number of hot accesses that fit within the
+                             // disp8 zone (first ~128 bytes of frame). A small hot local is
+                             // more valuable per frame byte than a large hot local.
+                             unsigned size1 = lvaLclStackHomeSize(n1);
+                             unsigned size2 = lvaLclStackHomeSize(n2);
+                             weight_t wt1   = dsc1->lvRefCntWtd(lvaRefCountState);
+                             weight_t wt2   = dsc2->lvRefCntWtd(lvaRefCountState);
+
+                             // Compare wt1/size1 > wt2/size2 as wt1*size2 > wt2*size1
+                             // to avoid division. Both sizes are > 0.
+                             weight_t density1 = wt1 * size2;
+                             weight_t density2 = wt2 * size1;
+                             if (density1 != density2)
+                             {
+                                 return density1 > density2;
+                             }
+
+                             // Among locals with equal density, group by alignment class
+                             // (8+ byte locals before smaller ones) to reduce padding.
+                             bool aligned1 = (size1 >= 8);
+                             bool aligned2 = (size2 >= 8);
+                             if (aligned1 != aligned2)
+                             {
+                                 return aligned1;
+                             }
+
+                             unsigned cnt1 = dsc1->lvRefCnt(lvaRefCountState);
+                             unsigned cnt2 = dsc2->lvRefCnt(lvaRefCountState);
+                             if (cnt1 != cnt2)
+                             {
+                                 return cnt1 > cnt2;
+                             }
+
+                             // Stable tiebreaker: lower lclNum first.
+                             return n1 < n2;
+                         });
+        }
+    }
+#endif // TARGET_AMD64
+
     for (cur = 0; alloc_order[cur]; cur++)
     {
         if ((assignMore & alloc_order[cur]) == 0)
@@ -5168,8 +5260,15 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
         unsigned   lclNum;
         LclVarDsc* varDsc;
 
-        for (lclNum = 0, varDsc = lvaTable; lclNum < lvaCount; lclNum++, varDsc++)
+        for (unsigned sortIdx = 0; sortIdx < lvaCount; sortIdx++)
         {
+#ifdef TARGET_AMD64
+            lclNum = (lclVarSortOrder != nullptr) ? lclVarSortOrder[sortIdx] : sortIdx;
+#else
+            lclNum = sortIdx;
+#endif
+            varDsc = lvaGetDesc(lclNum);
+
             /* Ignore field locals of the promotion type PROMOTION_TYPE_FIELD_DEPENDENT.
                In other words, we will not calculate the "base" address of the struct local if
                the promotion type is PROMOTION_TYPE_FIELD_DEPENDENT.

From 113262f8869470f645ca5bd2b146b0b695eea412 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sun, 3 May 2026 01:27:43 +0000
Subject: [PATCH 06/28] JIT: Multi-strategy frame layout selection for smaller
 code on x64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the single access-density sort with a multi-strategy approach
that tries 5 candidate layouts and picks the one with the lowest estimated
encoding cost:

  1. Original (unsorted) order as baseline
  2. Weighted access density (refCntWtd / size)
  3. Unweighted ref count
  4. Weighted ref count
  5. Unweighted ref count density (refCnt / size)

A lightweight cost estimation function simulates the frame allocation
loop (including alignment padding and SIMD alignment) and scores each
layout using Σ(refCnt × encodingBytes), where encodingBytes is 1 for
disp8 or 4 for disp32. The strategy with the lowest cost wins; if no
strategy beats the original order, no sorting is applied.

This is gated to frame-pointer-based frames only, since the disp8
boundary check assumes RBP-relative negative virtual offsets.

SPMI aspnet2 results vs single-strategy:
  Code size:   -7,400 bytes (-0.28%) vs -5,982 bytes (-0.23%)
  Regressions: 36 vs 103 (65% fewer)
  PerfScore:   neutral

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 273 +++++++++++++++++++++++++++---------
 1 file changed, 210 insertions(+), 63 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 453708766e8cc0..14cb9100b31eed 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5157,33 +5157,31 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     bool have_LclVarDoubleAlign = false;
 
 #ifdef TARGET_AMD64
-    // Build a sorted array of local variable indices to optimize displacement encoding.
+    // Multi-strategy frame layout optimization for x64.
+    //
     // On x64, stack accesses within [-128, +127] of the base register use a 1-byte
-    // displacement, while larger offsets require 4 bytes — saving 3 bytes per access
-    // promoted from disp32 to disp8.
+    // displacement (disp8), while larger offsets require 4 bytes (disp32) — 3 extra
+    // bytes per access. We try multiple sort orders for locals and pick the one that
+    // minimizes total encoding cost, estimated by simulating the frame allocation loop.
     //
-    // The sort uses access density (weighted ref count / local size) as the primary key
-    // so that locals with the highest access frequency per byte of frame space get the
-    // smallest offsets. This maximizes the total number of hot accesses that fit within
-    // the disp8 zone (~128 bytes from the frame pointer). As a secondary key, locals
-    // requiring 8-byte alignment are grouped before smaller locals to reduce alignment
-    // padding waste.
+    // The cost function computes Σ(refCnt × encodingBytes) where encodingBytes is 1
+    // for disp8 or 4 for disp32, using unweighted refCnt as a proxy for instruction
+    // count. This gives a direct estimate of total displacement encoding bytes.
     // (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.)
     //
-    // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called with
-    // FINAL_FRAME_LAYOUT (no tentative layout exists), so there is no "offsets must not get
-    // bigger" invariant to preserve between passes.
+    // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called
+    // with FINAL_FRAME_LAYOUT (no tentative layout exists).
+    //
+    // We only run this for frame-pointer-based frames because the disp8 boundary check
+    // assumes RBP-relative negative virtual offsets. RSP-based frames use positive offsets
+    // after fixup and contribute negligible savings.
     //
-    // We skip this optimization for EnC (which requires stable layout) and when ref counts
-    // are not available.
+    // We skip this for EnC (which requires stable layout), MinOpts (where ref counts are
+    // less meaningful), and frames that fit entirely within the disp8 zone.
     assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT);
     unsigned* lclVarSortOrder = nullptr;
-    if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts())
+    if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts() && codeGen->isFramePointerUsed())
     {
-        // Estimate total local frame size to decide if sorting is worthwhile.
-        // Only sort when the frame exceeds the disp8 boundary (128 bytes);
-        // in smaller frames, all locals already fit in disp8 and sorting just churns
-        // offsets without benefit.
         unsigned estimatedLocalSize = 0;
         for (unsigned i = 0; i < lvaCount; i++)
         {
@@ -5192,58 +5190,207 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 
         if (estimatedLocalSize > 128)
         {
-            JITDUMP("Sorting %u locals by access density for frame layout optimization "
+            JITDUMP("Frame layout optimization: trying multiple strategies for %u locals "
                     "(estimated frame size %u bytes)\n",
                     lvaCount, estimatedLocalSize);
 
-            lclVarSortOrder = new (this, CMK_LvaTable) unsigned[lvaCount];
+            // Pre-compute which locals will be allocated in the main loop and their
+            // pass category. Category 0 means "not allocatable" (skipped by the loop).
+            unsigned* lclPassCategory = new (this, CMK_LvaTable) unsigned[lvaCount];
+            for (unsigned i = 0; i < lvaCount; i++)
+            {
+                lclPassCategory[i] = 0;
+                LclVarDsc* varDsc = lvaGetDesc(i);
+
+                if (lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+                    continue;
+#if FEATURE_FIXED_OUT_ARGS
+                if (i == lvaOutgoingArgSpaceVar)
+                    continue;
+#endif
+                if (lvaIsOSRLocal(i))
+                    continue;
+                if (!varDsc->lvOnFrame)
+                    continue;
+                if (i == lvaGSSecurityCookie && getNeedsGSSecurityCookie())
+                    continue;
+                if (i == lvaRetAddrVar)
+                    continue;
+                if (i == lvaMonAcquired || i == lvaAsyncExecutionContextVar ||
+                    i == lvaAsyncSynchronizationContextVar)
+                    continue;
+                if (varDsc->lvIsParam && !lvaParamHasLocalStackSpace(i))
+                    continue;
+
+                if (varDsc->lvIsUnsafeBuffer && compGSReorderStackLayout)
+                {
+                    lclPassCategory[i] =
+                        varDsc->lvIsPtr ? ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS;
+                }
+                else if (varTypeIsGC(varDsc->TypeGet()) && varDsc->lvTracked)
+                {
+                    lclPassCategory[i] = ALLOC_PTRS;
+                }
+                else
+                {
+                    lclPassCategory[i] = ALLOC_NON_PTRS;
+                }
+            }
+
+            // Simulate frame layout for a given sort order and return total encoding cost.
+            // Lower cost = better layout. Cost = Σ(refCnt × encodingBytes) where
+            // encodingBytes is 1 for disp8 (offset in [-128,+127]) or 4 for disp32.
+            // Uses the current stkOffs as the starting point, which already accounts for
+            // callee saves, XMM saves, and any pre-allocated special locals.
+            auto estimateLayoutCost = [&](unsigned* order) -> unsigned {
+                unsigned totalCost = 0;
+                int      simOff    = stkOffs;
+
+                for (int p = 0; alloc_order[p]; p++)
+                {
+                    UINT pass = alloc_order[p];
+                    for (unsigned idx = 0; idx < lvaCount; idx++)
+                    {
+                        unsigned lcl = order[idx];
+                        if (lclPassCategory[lcl] != pass)
+                            continue;
+
+                        LclVarDsc* varDsc = lvaGetDesc(lcl);
+                        unsigned   size   = lvaLclStackHomeSize(lcl);
+
+                        // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset).
+                        if (size >= 8)
+                        {
+#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES
+                            if (varTypeIsSIMD(varDsc))
+                            {
+                                int alignment = getSIMDTypeAlignment(varDsc->TypeGet());
+                                if (simOff % alignment != 0)
+                                {
+                                    simOff -= static_cast<int>(alignment + (simOff % alignment));
+                                }
+                            }
+                            else
+#endif
+                            {
+                                if ((simOff % 8) != 0)
+                                {
+                                    simOff -= static_cast<int>(8 + (simOff % 8));
+                                }
+                            }
+                        }
+
+                        simOff -= static_cast<int>(size);
+
+                        unsigned refCnt = varDsc->lvRefCnt(lvaRefCountState);
+                        totalCost += refCnt * ((simOff >= -128) ? 1u : 4u);
+                    }
+                }
+
+                return totalCost;
+            };
+
+            lclVarSortOrder              = new (this, CMK_LvaTable) unsigned[lvaCount];
+            unsigned* candidateOrder     = new (this, CMK_LvaTable) unsigned[lvaCount];
             for (unsigned i = 0; i < lvaCount; i++)
             {
                 lclVarSortOrder[i] = i;
+                candidateOrder[i]  = i;
+            }
+
+            // Score the original (unsorted) order as baseline.
+            unsigned    origCost = estimateLayoutCost(lclVarSortOrder);
+            unsigned    bestCost = origCost;
+            const char* bestName = "original";
+
+            // Helper to try a strategy: sort candidateOrder, estimate cost,
+            // and update best if the cost is lower.
+            auto tryStrategy = [&](const char* name, auto comparator) -> unsigned {
+                for (unsigned i = 0; i < lvaCount; i++)
+                    candidateOrder[i] = i;
+                jitstd::sort(candidateOrder, candidateOrder + lvaCount, comparator);
+                unsigned cost = estimateLayoutCost(candidateOrder);
+                if (cost < bestCost)
+                {
+                    bestCost = cost;
+                    bestName = name;
+                    memcpy(lclVarSortOrder, candidateOrder, lvaCount * sizeof(unsigned));
+                }
+                return cost;
+            };
+
+            // Strategy 1: Access density (weighted ref count / size) descending.
+            // A small hot local is more valuable per frame byte than a large hot local.
+            unsigned densityCost = tryStrategy("density",
+                [this](unsigned n1, unsigned n2) -> bool {
+                    const LclVarDsc* d1 = lvaGetDesc(n1);
+                    const LclVarDsc* d2 = lvaGetDesc(n2);
+                    unsigned s1 = lvaLclStackHomeSize(n1);
+                    unsigned s2 = lvaLclStackHomeSize(n2);
+                    weight_t w1 = d1->lvRefCntWtd(lvaRefCountState);
+                    weight_t w2 = d2->lvRefCntWtd(lvaRefCountState);
+                    // Compare w1/s1 > w2/s2 via cross-multiply to avoid division.
+                    weight_t dens1 = w1 * s2;
+                    weight_t dens2 = w2 * s1;
+                    if (dens1 != dens2) return dens1 > dens2;
+                    bool a1 = (s1 >= 8), a2 = (s2 >= 8);
+                    if (a1 != a2) return a1;
+                    unsigned c1 = d1->lvRefCnt(lvaRefCountState);
+                    unsigned c2 = d2->lvRefCnt(lvaRefCountState);
+                    if (c1 != c2) return c1 > c2;
+                    return n1 < n2;
+                });
+
+            // Strategy 2: Unweighted ref count descending.
+            unsigned refCntCost = tryStrategy("refCnt",
+                [this](unsigned n1, unsigned n2) -> bool {
+                    unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+                    unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+                    if (c1 != c2) return c1 > c2;
+                    bool a1 = (lvaLclStackHomeSize(n1) >= 8);
+                    bool a2 = (lvaLclStackHomeSize(n2) >= 8);
+                    if (a1 != a2) return a1;
+                    return n1 < n2;
+                });
+
+            // Strategy 3: Weighted ref count descending.
+            unsigned weightCost = tryStrategy("weight",
+                [this](unsigned n1, unsigned n2) -> bool {
+                    weight_t w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
+                    weight_t w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
+                    if (w1 != w2) return w1 > w2;
+                    bool a1 = (lvaLclStackHomeSize(n1) >= 8);
+                    bool a2 = (lvaLclStackHomeSize(n2) >= 8);
+                    if (a1 != a2) return a1;
+                    return n1 < n2;
+                });
+
+            // Strategy 4: Unweighted ref count density (refCnt / size) descending.
+            unsigned refDensityCost = tryStrategy("refDensity",
+                [this](unsigned n1, unsigned n2) -> bool {
+                    unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+                    unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+                    unsigned s1 = lvaLclStackHomeSize(n1);
+                    unsigned s2 = lvaLclStackHomeSize(n2);
+                    // Cross-multiply to avoid division.
+                    unsigned long long dens1 = (unsigned long long)c1 * s2;
+                    unsigned long long dens2 = (unsigned long long)c2 * s1;
+                    if (dens1 != dens2) return dens1 > dens2;
+                    bool a1 = (s1 >= 8), a2 = (s2 >= 8);
+                    if (a1 != a2) return a1;
+                    return n1 < n2;
+                });
+
+            // If original order won, no sorting needed.
+            if (bestCost == origCost)
+            {
+                lclVarSortOrder = nullptr;
             }
 
-            jitstd::sort(lclVarSortOrder, lclVarSortOrder + lvaCount,
-                         [this](unsigned n1, unsigned n2) -> bool {
-                             const LclVarDsc* dsc1 = lvaGetDesc(n1);
-                             const LclVarDsc* dsc2 = lvaGetDesc(n2);
-
-                             // Sort by access density (weighted ref count per byte) descending.
-                             // This maximizes the number of hot accesses that fit within the
-                             // disp8 zone (first ~128 bytes of frame). A small hot local is
-                             // more valuable per frame byte than a large hot local.
-                             unsigned size1 = lvaLclStackHomeSize(n1);
-                             unsigned size2 = lvaLclStackHomeSize(n2);
-                             weight_t wt1   = dsc1->lvRefCntWtd(lvaRefCountState);
-                             weight_t wt2   = dsc2->lvRefCntWtd(lvaRefCountState);
-
-                             // Compare wt1/size1 > wt2/size2 as wt1*size2 > wt2*size1
-                             // to avoid division. Both sizes are > 0.
-                             weight_t density1 = wt1 * size2;
-                             weight_t density2 = wt2 * size1;
-                             if (density1 != density2)
-                             {
-                                 return density1 > density2;
-                             }
-
-                             // Among locals with equal density, group by alignment class
-                             // (8+ byte locals before smaller ones) to reduce padding.
-                             bool aligned1 = (size1 >= 8);
-                             bool aligned2 = (size2 >= 8);
-                             if (aligned1 != aligned2)
-                             {
-                                 return aligned1;
-                             }
-
-                             unsigned cnt1 = dsc1->lvRefCnt(lvaRefCountState);
-                             unsigned cnt2 = dsc2->lvRefCnt(lvaRefCountState);
-                             if (cnt1 != cnt2)
-                             {
-                                 return cnt1 > cnt2;
-                             }
-
-                             // Stable tiebreaker: lower lclNum first.
-                             return n1 < n2;
-                         });
+            JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u; "
+                    "selected '%s' (cost=%u, saved %u encoding bytes est.)\n",
+                    origCost, densityCost, refCntCost, weightCost, refDensityCost,
+                    bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0);
         }
     }
 #endif // TARGET_AMD64

From 086034ddbccc223b9a14737c64b89432f99e14b1 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sun, 3 May 2026 02:04:39 +0000
Subject: [PATCH 07/28] JIT: Enable multi-strategy frame layout optimization
 for x86

Extend the stack local sorting optimization from x64-only (TARGET_AMD64)
to all x86/x64 (TARGET_XARCH). The optimization is applicable to x86
because:
- x86 uses the same disp8 [-128,+127] vs disp32 encoding threshold
- x86 calls lvaAssignFrameOffsets only with FINAL_FRAME_LAYOUT
- x86 frequently uses EBP-based frames (especially with DOUBLE_ALIGN)

No behavioral change on x64; enables the optimization for x86 targets.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 14cb9100b31eed..bd93f583697758 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5156,10 +5156,10 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     UINT assignMore             = 0xFFFFFFFF;
     bool have_LclVarDoubleAlign = false;
 
-#ifdef TARGET_AMD64
-    // Multi-strategy frame layout optimization for x64.
+#ifdef TARGET_XARCH
+    // Multi-strategy frame layout optimization for x86/x64.
     //
-    // On x64, stack accesses within [-128, +127] of the base register use a 1-byte
+    // On x86/x64, stack accesses within [-128, +127] of the base register use a 1-byte
     // displacement (disp8), while larger offsets require 4 bytes (disp32) — 3 extra
     // bytes per access. We try multiple sort orders for locals and pick the one that
     // minimizes total encoding cost, estimated by simulating the frame allocation loop.
@@ -5169,12 +5169,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     // count. This gives a direct estimate of total displacement encoding bytes.
     // (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.)
     //
-    // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called
+    // This optimization is safe because on x86/x64, lvaAssignFrameOffsets is only called
     // with FINAL_FRAME_LAYOUT (no tentative layout exists).
     //
     // We only run this for frame-pointer-based frames because the disp8 boundary check
-    // assumes RBP-relative negative virtual offsets. RSP-based frames use positive offsets
-    // after fixup and contribute negligible savings.
+    // assumes EBP/RBP-relative negative virtual offsets. ESP/RSP-based frames use positive
+    // offsets after fixup and contribute negligible savings.
     //
     // We skip this for EnC (which requires stable layout), MinOpts (where ref counts are
     // less meaningful), and frames that fit entirely within the disp8 zone.
@@ -5393,7 +5393,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
                     bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0);
         }
     }
-#endif // TARGET_AMD64
+#endif // TARGET_XARCH
 
     for (cur = 0; alloc_order[cur]; cur++)
     {
@@ -5409,7 +5409,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 
         for (unsigned sortIdx = 0; sortIdx < lvaCount; sortIdx++)
         {
-#ifdef TARGET_AMD64
+#ifdef TARGET_XARCH
             lclNum = (lclVarSortOrder != nullptr) ? lclVarSortOrder[sortIdx] : sortIdx;
 #else
             lclNum = sortIdx;

From 59aab9d4b551b3a1eb8cf8a00175381e78ee0548 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sun, 3 May 2026 02:21:26 +0000
Subject: [PATCH 08/28] JIT: Enable frame layout optimization for MinOpts/Tier0
 with lightweight ref counting

For MinOpts/Tier0, precise ref counts are not computed (PreciseRefCountsRequired()
returns false). Previously, the frame layout optimization was skipped entirely for
MinOpts. This change adds a lightweight LIR walk that counts local variable
references without any of the analysis side effects of lvaMarkLclRefs.

The lightweight counts are used by the cost estimation function and sorting
comparators to make informed layout decisions for MinOpts methods.

SPMI results show code size reductions across collections, largest on the
MinOpts-heavy collections:
  aspnet2:           -8,682 bytes (-0.33%), 262 improvements, 37 regressions
  benchmarks.run:    -409,067 bytes (-1.10%), 8,788 improvements, 706 regressions
  libraries_tests:   -4,503,889 bytes (-1.11%), 96,203 improvements, 7,210 regressions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 96 ++++++++++++++++++++++++++++---------
 1 file changed, 74 insertions(+), 22 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index bd93f583697758..80e2b13243a1af 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5176,11 +5176,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     // assumes EBP/RBP-relative negative virtual offsets. ESP/RSP-based frames use positive
     // offsets after fixup and contribute negligible savings.
     //
-    // We skip this for EnC (which requires stable layout), MinOpts (where ref counts are
-    // less meaningful), and frames that fit entirely within the disp8 zone.
+    // We skip this for EnC (which requires stable layout) and frames that fit entirely
+    // within the disp8 zone. For MinOpts/Tier0 where precise ref counts are not computed,
+    // we do a lightweight LIR walk to count local references for sorting purposes.
     assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT);
     unsigned* lclVarSortOrder = nullptr;
-    if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts() && codeGen->isFramePointerUsed())
+    if (lvaLocalVarRefCounted() && !opts.compDbgEnC && codeGen->isFramePointerUsed())
     {
         unsigned estimatedLocalSize = 0;
         for (unsigned i = 0; i < lvaCount; i++)
@@ -5190,9 +5191,36 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 
         if (estimatedLocalSize > 128)
         {
+            // For MinOpts/Tier0, precise ref counts are not available (all lvRefCnt == 0).
+            // Do a lightweight LIR walk to count local references for sorting purposes.
+            // This is much cheaper than the full lvaMarkLclRefs pass — we only count
+            // occurrences without any of the analysis side effects.
+            unsigned* lclRefCounts = nullptr;
+            if (!PreciseRefCountsRequired())
+            {
+                lclRefCounts = new (this, CMK_LvaTable) unsigned[lvaCount];
+                memset(lclRefCounts, 0, lvaCount * sizeof(unsigned));
+
+                for (BasicBlock* const block : Blocks())
+                {
+                    for (GenTree* node : LIR::AsRange(block))
+                    {
+                        if (node->OperIsAnyLocal())
+                        {
+                            unsigned lclNum = node->AsLclVarCommon()->GetLclNum();
+                            if (lclNum < lvaCount)
+                            {
+                                lclRefCounts[lclNum]++;
+                            }
+                        }
+                    }
+                }
+            }
+
             JITDUMP("Frame layout optimization: trying multiple strategies for %u locals "
-                    "(estimated frame size %u bytes)\n",
-                    lvaCount, estimatedLocalSize);
+                    "(estimated frame size %u bytes%s)\n",
+                    lvaCount, estimatedLocalSize,
+                    lclRefCounts != nullptr ? ", using lightweight ref counts" : "");
 
             // Pre-compute which locals will be allocated in the main loop and their
             // pass category. Category 0 means "not allocatable" (skipped by the loop).
@@ -5282,7 +5310,8 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 
                         simOff -= static_cast<int>(size);
 
-                        unsigned refCnt = varDsc->lvRefCnt(lvaRefCountState);
+                        unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl]
+                                                                    : varDsc->lvRefCnt(lvaRefCountState);
                         totalCost += refCnt * ((simOff >= -128) ? 1u : 4u);
                     }
                 }
@@ -5322,30 +5351,41 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
             // Strategy 1: Access density (weighted ref count / size) descending.
             // A small hot local is more valuable per frame byte than a large hot local.
             unsigned densityCost = tryStrategy("density",
-                [this](unsigned n1, unsigned n2) -> bool {
-                    const LclVarDsc* d1 = lvaGetDesc(n1);
-                    const LclVarDsc* d2 = lvaGetDesc(n2);
+                [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
                     unsigned s1 = lvaLclStackHomeSize(n1);
                     unsigned s2 = lvaLclStackHomeSize(n2);
-                    weight_t w1 = d1->lvRefCntWtd(lvaRefCountState);
-                    weight_t w2 = d2->lvRefCntWtd(lvaRefCountState);
+                    weight_t w1, w2;
+                    if (lclRefCounts != nullptr)
+                    {
+                        w1 = static_cast<weight_t>(lclRefCounts[n1]);
+                        w2 = static_cast<weight_t>(lclRefCounts[n2]);
+                    }
+                    else
+                    {
+                        w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
+                        w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
+                    }
                     // Compare w1/s1 > w2/s2 via cross-multiply to avoid division.
                     weight_t dens1 = w1 * s2;
                     weight_t dens2 = w2 * s1;
                     if (dens1 != dens2) return dens1 > dens2;
                     bool a1 = (s1 >= 8), a2 = (s2 >= 8);
                     if (a1 != a2) return a1;
-                    unsigned c1 = d1->lvRefCnt(lvaRefCountState);
-                    unsigned c2 = d2->lvRefCnt(lvaRefCountState);
+                    unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
+                                                            : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+                    unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
+                                                            : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
                     if (c1 != c2) return c1 > c2;
                     return n1 < n2;
                 });
 
             // Strategy 2: Unweighted ref count descending.
             unsigned refCntCost = tryStrategy("refCnt",
-                [this](unsigned n1, unsigned n2) -> bool {
-                    unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-                    unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+                [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+                    unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
+                                                            : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+                    unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
+                                                            : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
                     if (c1 != c2) return c1 > c2;
                     bool a1 = (lvaLclStackHomeSize(n1) >= 8);
                     bool a2 = (lvaLclStackHomeSize(n2) >= 8);
@@ -5354,10 +5394,20 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
                 });
 
             // Strategy 3: Weighted ref count descending.
+            // For MinOpts, weighted = unweighted (no block weights available).
             unsigned weightCost = tryStrategy("weight",
-                [this](unsigned n1, unsigned n2) -> bool {
-                    weight_t w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
-                    weight_t w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
+                [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+                    weight_t w1, w2;
+                    if (lclRefCounts != nullptr)
+                    {
+                        w1 = static_cast<weight_t>(lclRefCounts[n1]);
+                        w2 = static_cast<weight_t>(lclRefCounts[n2]);
+                    }
+                    else
+                    {
+                        w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
+                        w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
+                    }
                     if (w1 != w2) return w1 > w2;
                     bool a1 = (lvaLclStackHomeSize(n1) >= 8);
                     bool a2 = (lvaLclStackHomeSize(n2) >= 8);
@@ -5367,9 +5417,11 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 
             // Strategy 4: Unweighted ref count density (refCnt / size) descending.
             unsigned refDensityCost = tryStrategy("refDensity",
-                [this](unsigned n1, unsigned n2) -> bool {
-                    unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-                    unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+                [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+                    unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
+                                                            : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+                    unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
+                                                            : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
                     unsigned s1 = lvaLclStackHomeSize(n1);
                     unsigned s2 = lvaLclStackHomeSize(n2);
                     // Cross-multiply to avoid division.

From bbac4fb8dbd3533ed0d7d4422ec841917454bd1b Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sun, 3 May 2026 16:55:31 +0000
Subject: [PATCH 09/28] JIT: Extract frame layout optimization into
 lvaComputeOptimalFrameLayoutOrder
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the multi-strategy frame layout optimization code from
lvaAssignVirtualFrameOffsetsToLocals into a separate method
lvaComputeOptimalFrameLayoutOrder for better readability.

Also move the Allocation enum to file scope (as LclAllocCategory)
so it can be shared between both methods.

No functional change — SPMI results are identical.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/compiler.h  |   3 +
 src/coreclr/jit/lclvars.cpp | 604 +++++++++++++++++++-----------------
 2 files changed, 318 insertions(+), 289 deletions(-)

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 35be79978cc0e3..feff52b799f7cb 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -4277,6 +4277,9 @@ class Compiler
     void lvaAssignVirtualFrameOffsetsToArgs();
     bool lvaGetRelativeOffsetToCallerAllocatedSpaceForParameter(unsigned lclNum, int* offset);
     void lvaAssignVirtualFrameOffsetsToLocals();
+#ifdef TARGET_XARCH
+    unsigned* lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* allocOrder);
+#endif
     bool lvaParamHasLocalStackSpace(unsigned lclNum);
     int lvaAllocLocalAndSetVirtualOffset(unsigned lclNum, unsigned size, int stkOffs);
     int lvaAllocAsyncContexts(int stkOffs);
diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 80e2b13243a1af..12dd36b818eb24 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -4778,6 +4778,319 @@ bool Compiler::lvaGetRelativeOffsetToCallerAllocatedSpaceForParameter(unsigned l
     return false;
 }
 
+// Allocation pass categories used by lvaAssignVirtualFrameOffsetsToLocals
+// and lvaComputeOptimalFrameLayoutOrder to classify locals by type.
+enum LclAllocCategory : UINT
+{
+    ALLOC_NON_PTRS                 = 0x1, // assign offsets to non-ptr
+    ALLOC_PTRS                     = 0x2, // Second pass, assign offsets to tracked ptrs
+    ALLOC_UNSAFE_BUFFERS           = 0x4,
+    ALLOC_UNSAFE_BUFFERS_WITH_PTRS = 0x8
+};
+
+#ifdef TARGET_XARCH
+//------------------------------------------------------------------------
+// lvaComputeOptimalFrameLayoutOrder: try multiple sort orders for locals and
+// pick the one that minimizes total displacement encoding cost.
+//
+// On x86/x64, stack accesses within [-128, +127] of the base register use a 1-byte
+// displacement (disp8), while larger offsets require 4 bytes (disp32) — 3 extra
+// bytes per access. We try multiple sort orders for locals and pick the one that
+// minimizes total encoding cost, estimated by simulating the frame allocation loop.
+//
+// The cost function computes Σ(refCnt × encodingBytes) where encodingBytes is 1
+// for disp8 or 4 for disp32, using unweighted refCnt as a proxy for instruction
+// count. This gives a direct estimate of total displacement encoding bytes.
+// (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.)
+//
+// This optimization is safe because on x86/x64, lvaAssignFrameOffsets is only called
+// with FINAL_FRAME_LAYOUT (no tentative layout exists).
+//
+// We only run this for frame-pointer-based frames because the disp8 boundary check
+// assumes EBP/RBP-relative negative virtual offsets. ESP/RSP-based frames use positive
+// offsets after fixup and contribute negligible savings.
+//
+// We skip frames that fit entirely within the disp8 zone. For MinOpts/Tier0 where
+// precise ref counts are not computed, we do a lightweight LIR walk to count local
+// references for sorting purposes.
+//
+// Arguments:
+//   stkOffs    - current stack offset (after callee saves, XMM saves, and pre-allocated
+//                special locals)
+//   allocOrder - null-terminated array of allocation pass flags (ALLOC_NON_PTRS, etc.)
+//
+// Returns:
+//   An array of lclNum indices representing the optimal sort order, or nullptr if the
+//   original order is already optimal.
+//
+unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* allocOrder)
+{
+    unsigned estimatedLocalSize = 0;
+    for (unsigned i = 0; i < lvaCount; i++)
+    {
+        estimatedLocalSize += lvaLclStackHomeSize(i);
+    }
+
+    if (estimatedLocalSize <= 128)
+    {
+        return nullptr;
+    }
+
+    // For MinOpts/Tier0, precise ref counts are not available (all lvRefCnt == 0).
+    // Do a lightweight LIR walk to count local references for sorting purposes.
+    // This is much cheaper than the full lvaMarkLclRefs pass — we only count
+    // occurrences without any of the analysis side effects.
+    unsigned* lclRefCounts = nullptr;
+    if (!PreciseRefCountsRequired())
+    {
+        lclRefCounts = new (this, CMK_LvaTable) unsigned[lvaCount];
+        memset(lclRefCounts, 0, lvaCount * sizeof(unsigned));
+
+        for (BasicBlock* const block : Blocks())
+        {
+            for (GenTree* node : LIR::AsRange(block))
+            {
+                if (node->OperIsAnyLocal())
+                {
+                    unsigned lclNum = node->AsLclVarCommon()->GetLclNum();
+                    if (lclNum < lvaCount)
+                    {
+                        lclRefCounts[lclNum]++;
+                    }
+                }
+            }
+        }
+    }
+
+    JITDUMP("Frame layout optimization: trying multiple strategies for %u locals "
+            "(estimated frame size %u bytes%s)\n",
+            lvaCount, estimatedLocalSize,
+            lclRefCounts != nullptr ? ", using lightweight ref counts" : "");
+
+    // Pre-compute which locals will be allocated in the main loop and their
+    // pass category. Category 0 means "not allocatable" (skipped by the loop).
+    unsigned* lclPassCategory = new (this, CMK_LvaTable) unsigned[lvaCount];
+    for (unsigned i = 0; i < lvaCount; i++)
+    {
+        lclPassCategory[i] = 0;
+        LclVarDsc* varDsc = lvaGetDesc(i);
+
+        if (lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+            continue;
+#if FEATURE_FIXED_OUT_ARGS
+        if (i == lvaOutgoingArgSpaceVar)
+            continue;
+#endif
+        if (lvaIsOSRLocal(i))
+            continue;
+        if (!varDsc->lvOnFrame)
+            continue;
+        if (i == lvaGSSecurityCookie && getNeedsGSSecurityCookie())
+            continue;
+        if (i == lvaRetAddrVar)
+            continue;
+        if (i == lvaMonAcquired || i == lvaAsyncExecutionContextVar ||
+            i == lvaAsyncSynchronizationContextVar)
+            continue;
+        if (varDsc->lvIsParam && !lvaParamHasLocalStackSpace(i))
+            continue;
+
+        if (varDsc->lvIsUnsafeBuffer && compGSReorderStackLayout)
+        {
+            lclPassCategory[i] =
+                varDsc->lvIsPtr ? ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS;
+        }
+        else if (varTypeIsGC(varDsc->TypeGet()) && varDsc->lvTracked)
+        {
+            lclPassCategory[i] = ALLOC_PTRS;
+        }
+        else
+        {
+            lclPassCategory[i] = ALLOC_NON_PTRS;
+        }
+    }
+
+    // Simulate frame layout for a given sort order and return total encoding cost.
+    // Lower cost = better layout. Cost = Σ(refCnt × encodingBytes) where
+    // encodingBytes is 1 for disp8 (offset in [-128,+127]) or 4 for disp32.
+    auto estimateLayoutCost = [&](unsigned* order) -> unsigned {
+        unsigned totalCost = 0;
+        int      simOff    = stkOffs;
+
+        for (int p = 0; allocOrder[p]; p++)
+        {
+            UINT pass = allocOrder[p];
+            for (unsigned idx = 0; idx < lvaCount; idx++)
+            {
+                unsigned lcl = order[idx];
+                if (lclPassCategory[lcl] != pass)
+                    continue;
+
+                LclVarDsc* varDsc = lvaGetDesc(lcl);
+                unsigned   size   = lvaLclStackHomeSize(lcl);
+
+                // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset).
+                if (size >= 8)
+                {
+#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES
+                    if (varTypeIsSIMD(varDsc))
+                    {
+                        int alignment = getSIMDTypeAlignment(varDsc->TypeGet());
+                        if (simOff % alignment != 0)
+                        {
+                            simOff -= static_cast<int>(alignment + (simOff % alignment));
+                        }
+                    }
+                    else
+#endif
+                    {
+                        if ((simOff % 8) != 0)
+                        {
+                            simOff -= static_cast<int>(8 + (simOff % 8));
+                        }
+                    }
+                }
+
+                simOff -= static_cast<int>(size);
+
+                unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl]
+                                                            : varDsc->lvRefCnt(lvaRefCountState);
+                totalCost += refCnt * ((simOff >= -128) ? 1u : 4u);
+            }
+        }
+
+        return totalCost;
+    };
+
+    unsigned* sortOrder      = new (this, CMK_LvaTable) unsigned[lvaCount];
+    unsigned* candidateOrder = new (this, CMK_LvaTable) unsigned[lvaCount];
+    for (unsigned i = 0; i < lvaCount; i++)
+    {
+        sortOrder[i]      = i;
+        candidateOrder[i] = i;
+    }
+
+    // Score the original (unsorted) order as baseline.
+    unsigned    origCost = estimateLayoutCost(sortOrder);
+    unsigned    bestCost = origCost;
+    const char* bestName = "original";
+
+    // Helper to try a strategy: sort candidateOrder, estimate cost,
+    // and update best if the cost is lower.
+    auto tryStrategy = [&](const char* name, auto comparator) -> unsigned {
+        for (unsigned i = 0; i < lvaCount; i++)
+            candidateOrder[i] = i;
+        jitstd::sort(candidateOrder, candidateOrder + lvaCount, comparator);
+        unsigned cost = estimateLayoutCost(candidateOrder);
+        if (cost < bestCost)
+        {
+            bestCost = cost;
+            bestName = name;
+            memcpy(sortOrder, candidateOrder, lvaCount * sizeof(unsigned));
+        }
+        return cost;
+    };
+
+    // Strategy 1: Access density (weighted ref count / size) descending.
+    // A small hot local is more valuable per frame byte than a large hot local.
+    unsigned densityCost = tryStrategy("density",
+        [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+            unsigned s1 = lvaLclStackHomeSize(n1);
+            unsigned s2 = lvaLclStackHomeSize(n2);
+            weight_t w1, w2;
+            if (lclRefCounts != nullptr)
+            {
+                w1 = static_cast<weight_t>(lclRefCounts[n1]);
+                w2 = static_cast<weight_t>(lclRefCounts[n2]);
+            }
+            else
+            {
+                w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
+                w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
+            }
+            // Compare w1/s1 > w2/s2 via cross-multiply to avoid division.
+            weight_t dens1 = w1 * s2;
+            weight_t dens2 = w2 * s1;
+            if (dens1 != dens2) return dens1 > dens2;
+            bool a1 = (s1 >= 8), a2 = (s2 >= 8);
+            if (a1 != a2) return a1;
+            unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
+                                                    : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+            unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
+                                                    : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+            if (c1 != c2) return c1 > c2;
+            return n1 < n2;
+        });
+
+    // Strategy 2: Unweighted ref count descending.
+    unsigned refCntCost = tryStrategy("refCnt",
+        [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+            unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
+                                                    : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+            unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
+                                                    : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+            if (c1 != c2) return c1 > c2;
+            bool a1 = (lvaLclStackHomeSize(n1) >= 8);
+            bool a2 = (lvaLclStackHomeSize(n2) >= 8);
+            if (a1 != a2) return a1;
+            return n1 < n2;
+        });
+
+    // Strategy 3: Weighted ref count descending.
+    // For MinOpts, weighted = unweighted (no block weights available).
+    unsigned weightCost = tryStrategy("weight",
+        [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+            weight_t w1, w2;
+            if (lclRefCounts != nullptr)
+            {
+                w1 = static_cast<weight_t>(lclRefCounts[n1]);
+                w2 = static_cast<weight_t>(lclRefCounts[n2]);
+            }
+            else
+            {
+                w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
+                w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
+            }
+            if (w1 != w2) return w1 > w2;
+            bool a1 = (lvaLclStackHomeSize(n1) >= 8);
+            bool a2 = (lvaLclStackHomeSize(n2) >= 8);
+            if (a1 != a2) return a1;
+            return n1 < n2;
+        });
+
+    // Strategy 4: Unweighted ref count density (refCnt / size) descending.
+    unsigned refDensityCost = tryStrategy("refDensity",
+        [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+            unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
+                                                    : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+            unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
+                                                    : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+            unsigned s1 = lvaLclStackHomeSize(n1);
+            unsigned s2 = lvaLclStackHomeSize(n2);
+            // Cross-multiply to avoid division.
+            unsigned long long dens1 = (unsigned long long)c1 * s2;
+            unsigned long long dens2 = (unsigned long long)c2 * s1;
+            if (dens1 != dens2) return dens1 > dens2;
+            bool a1 = (s1 >= 8), a2 = (s2 >= 8);
+            if (a1 != a2) return a1;
+            return n1 < n2;
+        });
+
+    // If original order won, no sorting needed.
+    if (bestCost == origCost)
+    {
+        sortOrder = nullptr;
+    }
+
+    JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u; "
+            "selected '%s' (cost=%u, saved %u encoding bytes est.)\n",
+            origCost, densityCost, refCntCost, weightCost, refDensityCost,
+            bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0);
+
+    return sortOrder;
+}
+#endif // TARGET_XARCH
+
 //-----------------------------------------------------------------------------
 // lvaAssignVirtualFrameOffsetsToLocals: compute the virtual stack offsets for
 //  all elements on the stackframe.
@@ -5094,13 +5407,6 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
             non-pointer temps
      */
 
-    enum Allocation
-    {
-        ALLOC_NON_PTRS                 = 0x1, // assign offsets to non-ptr
-        ALLOC_PTRS                     = 0x2, // Second pass, assign offsets to tracked ptrs
-        ALLOC_UNSAFE_BUFFERS           = 0x4,
-        ALLOC_UNSAFE_BUFFERS_WITH_PTRS = 0x8
-    };
     UINT alloc_order[5];
 
     unsigned int cur = 0;
@@ -5158,292 +5464,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 
 #ifdef TARGET_XARCH
     // Multi-strategy frame layout optimization for x86/x64.
-    //
-    // On x86/x64, stack accesses within [-128, +127] of the base register use a 1-byte
-    // displacement (disp8), while larger offsets require 4 bytes (disp32) — 3 extra
-    // bytes per access. We try multiple sort orders for locals and pick the one that
-    // minimizes total encoding cost, estimated by simulating the frame allocation loop.
-    //
-    // The cost function computes Σ(refCnt × encodingBytes) where encodingBytes is 1
-    // for disp8 or 4 for disp32, using unweighted refCnt as a proxy for instruction
-    // count. This gives a direct estimate of total displacement encoding bytes.
-    // (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.)
-    //
-    // This optimization is safe because on x86/x64, lvaAssignFrameOffsets is only called
-    // with FINAL_FRAME_LAYOUT (no tentative layout exists).
-    //
-    // We only run this for frame-pointer-based frames because the disp8 boundary check
-    // assumes EBP/RBP-relative negative virtual offsets. ESP/RSP-based frames use positive
-    // offsets after fixup and contribute negligible savings.
-    //
-    // We skip this for EnC (which requires stable layout) and frames that fit entirely
-    // within the disp8 zone. For MinOpts/Tier0 where precise ref counts are not computed,
-    // we do a lightweight LIR walk to count local references for sorting purposes.
+    // See lvaComputeOptimalFrameLayoutOrder for details.
     assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT);
     unsigned* lclVarSortOrder = nullptr;
     if (lvaLocalVarRefCounted() && !opts.compDbgEnC && codeGen->isFramePointerUsed())
     {
-        unsigned estimatedLocalSize = 0;
-        for (unsigned i = 0; i < lvaCount; i++)
-        {
-            estimatedLocalSize += lvaLclStackHomeSize(i);
-        }
-
-        if (estimatedLocalSize > 128)
-        {
-            // For MinOpts/Tier0, precise ref counts are not available (all lvRefCnt == 0).
-            // Do a lightweight LIR walk to count local references for sorting purposes.
-            // This is much cheaper than the full lvaMarkLclRefs pass — we only count
-            // occurrences without any of the analysis side effects.
-            unsigned* lclRefCounts = nullptr;
-            if (!PreciseRefCountsRequired())
-            {
-                lclRefCounts = new (this, CMK_LvaTable) unsigned[lvaCount];
-                memset(lclRefCounts, 0, lvaCount * sizeof(unsigned));
-
-                for (BasicBlock* const block : Blocks())
-                {
-                    for (GenTree* node : LIR::AsRange(block))
-                    {
-                        if (node->OperIsAnyLocal())
-                        {
-                            unsigned lclNum = node->AsLclVarCommon()->GetLclNum();
-                            if (lclNum < lvaCount)
-                            {
-                                lclRefCounts[lclNum]++;
-                            }
-                        }
-                    }
-                }
-            }
-
-            JITDUMP("Frame layout optimization: trying multiple strategies for %u locals "
-                    "(estimated frame size %u bytes%s)\n",
-                    lvaCount, estimatedLocalSize,
-                    lclRefCounts != nullptr ? ", using lightweight ref counts" : "");
-
-            // Pre-compute which locals will be allocated in the main loop and their
-            // pass category. Category 0 means "not allocatable" (skipped by the loop).
-            unsigned* lclPassCategory = new (this, CMK_LvaTable) unsigned[lvaCount];
-            for (unsigned i = 0; i < lvaCount; i++)
-            {
-                lclPassCategory[i] = 0;
-                LclVarDsc* varDsc = lvaGetDesc(i);
-
-                if (lvaIsFieldOfDependentlyPromotedStruct(varDsc))
-                    continue;
-#if FEATURE_FIXED_OUT_ARGS
-                if (i == lvaOutgoingArgSpaceVar)
-                    continue;
-#endif
-                if (lvaIsOSRLocal(i))
-                    continue;
-                if (!varDsc->lvOnFrame)
-                    continue;
-                if (i == lvaGSSecurityCookie && getNeedsGSSecurityCookie())
-                    continue;
-                if (i == lvaRetAddrVar)
-                    continue;
-                if (i == lvaMonAcquired || i == lvaAsyncExecutionContextVar ||
-                    i == lvaAsyncSynchronizationContextVar)
-                    continue;
-                if (varDsc->lvIsParam && !lvaParamHasLocalStackSpace(i))
-                    continue;
-
-                if (varDsc->lvIsUnsafeBuffer && compGSReorderStackLayout)
-                {
-                    lclPassCategory[i] =
-                        varDsc->lvIsPtr ? ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS;
-                }
-                else if (varTypeIsGC(varDsc->TypeGet()) && varDsc->lvTracked)
-                {
-                    lclPassCategory[i] = ALLOC_PTRS;
-                }
-                else
-                {
-                    lclPassCategory[i] = ALLOC_NON_PTRS;
-                }
-            }
-
-            // Simulate frame layout for a given sort order and return total encoding cost.
-            // Lower cost = better layout. Cost = Σ(refCnt × encodingBytes) where
-            // encodingBytes is 1 for disp8 (offset in [-128,+127]) or 4 for disp32.
-            // Uses the current stkOffs as the starting point, which already accounts for
-            // callee saves, XMM saves, and any pre-allocated special locals.
-            auto estimateLayoutCost = [&](unsigned* order) -> unsigned {
-                unsigned totalCost = 0;
-                int      simOff    = stkOffs;
-
-                for (int p = 0; alloc_order[p]; p++)
-                {
-                    UINT pass = alloc_order[p];
-                    for (unsigned idx = 0; idx < lvaCount; idx++)
-                    {
-                        unsigned lcl = order[idx];
-                        if (lclPassCategory[lcl] != pass)
-                            continue;
-
-                        LclVarDsc* varDsc = lvaGetDesc(lcl);
-                        unsigned   size   = lvaLclStackHomeSize(lcl);
-
-                        // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset).
-                        if (size >= 8)
-                        {
-#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES
-                            if (varTypeIsSIMD(varDsc))
-                            {
-                                int alignment = getSIMDTypeAlignment(varDsc->TypeGet());
-                                if (simOff % alignment != 0)
-                                {
-                                    simOff -= static_cast<int>(alignment + (simOff % alignment));
-                                }
-                            }
-                            else
-#endif
-                            {
-                                if ((simOff % 8) != 0)
-                                {
-                                    simOff -= static_cast<int>(8 + (simOff % 8));
-                                }
-                            }
-                        }
-
-                        simOff -= static_cast<int>(size);
-
-                        unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl]
-                                                                    : varDsc->lvRefCnt(lvaRefCountState);
-                        totalCost += refCnt * ((simOff >= -128) ? 1u : 4u);
-                    }
-                }
-
-                return totalCost;
-            };
-
-            lclVarSortOrder              = new (this, CMK_LvaTable) unsigned[lvaCount];
-            unsigned* candidateOrder     = new (this, CMK_LvaTable) unsigned[lvaCount];
-            for (unsigned i = 0; i < lvaCount; i++)
-            {
-                lclVarSortOrder[i] = i;
-                candidateOrder[i]  = i;
-            }
-
-            // Score the original (unsorted) order as baseline.
-            unsigned    origCost = estimateLayoutCost(lclVarSortOrder);
-            unsigned    bestCost = origCost;
-            const char* bestName = "original";
-
-            // Helper to try a strategy: sort candidateOrder, estimate cost,
-            // and update best if the cost is lower.
-            auto tryStrategy = [&](const char* name, auto comparator) -> unsigned {
-                for (unsigned i = 0; i < lvaCount; i++)
-                    candidateOrder[i] = i;
-                jitstd::sort(candidateOrder, candidateOrder + lvaCount, comparator);
-                unsigned cost = estimateLayoutCost(candidateOrder);
-                if (cost < bestCost)
-                {
-                    bestCost = cost;
-                    bestName = name;
-                    memcpy(lclVarSortOrder, candidateOrder, lvaCount * sizeof(unsigned));
-                }
-                return cost;
-            };
-
-            // Strategy 1: Access density (weighted ref count / size) descending.
-            // A small hot local is more valuable per frame byte than a large hot local.
-            unsigned densityCost = tryStrategy("density",
-                [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-                    unsigned s1 = lvaLclStackHomeSize(n1);
-                    unsigned s2 = lvaLclStackHomeSize(n2);
-                    weight_t w1, w2;
-                    if (lclRefCounts != nullptr)
-                    {
-                        w1 = static_cast<weight_t>(lclRefCounts[n1]);
-                        w2 = static_cast<weight_t>(lclRefCounts[n2]);
-                    }
-                    else
-                    {
-                        w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
-                        w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
-                    }
-                    // Compare w1/s1 > w2/s2 via cross-multiply to avoid division.
-                    weight_t dens1 = w1 * s2;
-                    weight_t dens2 = w2 * s1;
-                    if (dens1 != dens2) return dens1 > dens2;
-                    bool a1 = (s1 >= 8), a2 = (s2 >= 8);
-                    if (a1 != a2) return a1;
-                    unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
-                                                            : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-                    unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
-                                                            : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
-                    if (c1 != c2) return c1 > c2;
-                    return n1 < n2;
-                });
-
-            // Strategy 2: Unweighted ref count descending.
-            unsigned refCntCost = tryStrategy("refCnt",
-                [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-                    unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
-                                                            : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-                    unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
-                                                            : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
-                    if (c1 != c2) return c1 > c2;
-                    bool a1 = (lvaLclStackHomeSize(n1) >= 8);
-                    bool a2 = (lvaLclStackHomeSize(n2) >= 8);
-                    if (a1 != a2) return a1;
-                    return n1 < n2;
-                });
-
-            // Strategy 3: Weighted ref count descending.
-            // For MinOpts, weighted = unweighted (no block weights available).
-            unsigned weightCost = tryStrategy("weight",
-                [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-                    weight_t w1, w2;
-                    if (lclRefCounts != nullptr)
-                    {
-                        w1 = static_cast<weight_t>(lclRefCounts[n1]);
-                        w2 = static_cast<weight_t>(lclRefCounts[n2]);
-                    }
-                    else
-                    {
-                        w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
-                        w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
-                    }
-                    if (w1 != w2) return w1 > w2;
-                    bool a1 = (lvaLclStackHomeSize(n1) >= 8);
-                    bool a2 = (lvaLclStackHomeSize(n2) >= 8);
-                    if (a1 != a2) return a1;
-                    return n1 < n2;
-                });
-
-            // Strategy 4: Unweighted ref count density (refCnt / size) descending.
-            unsigned refDensityCost = tryStrategy("refDensity",
-                [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-                    unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
-                                                            : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-                    unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
-                                                            : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
-                    unsigned s1 = lvaLclStackHomeSize(n1);
-                    unsigned s2 = lvaLclStackHomeSize(n2);
-                    // Cross-multiply to avoid division.
-                    unsigned long long dens1 = (unsigned long long)c1 * s2;
-                    unsigned long long dens2 = (unsigned long long)c2 * s1;
-                    if (dens1 != dens2) return dens1 > dens2;
-                    bool a1 = (s1 >= 8), a2 = (s2 >= 8);
-                    if (a1 != a2) return a1;
-                    return n1 < n2;
-                });
-
-            // If original order won, no sorting needed.
-            if (bestCost == origCost)
-            {
-                lclVarSortOrder = nullptr;
-            }
-
-            JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u; "
-                    "selected '%s' (cost=%u, saved %u encoding bytes est.)\n",
-                    origCost, densityCost, refCntCost, weightCost, refDensityCost,
-                    bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0);
-        }
+        lclVarSortOrder = lvaComputeOptimalFrameLayoutOrder(stkOffs, alloc_order);
     }
 #endif // TARGET_XARCH
 

From 8e09f3fead01b6cb09a731bf6db1a754c04303ff Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sun, 3 May 2026 17:00:26 +0000
Subject: [PATCH 10/28] JIT: Lower frame layout optimization threshold from 128
 to 64 bytes

The estimatedLocalSize threshold did not account for alignment padding,
which can significantly inflate the actual frame size. Methods with raw
local sizes between 64 and 128 bytes can have actual frames exceeding
128 bytes after alignment, making them candidates for optimization.

Experimentally verified: threshold=64 captures all additional benefit
(identical results to threshold=0). Below 64 bytes, even worst-case
alignment keeps frames within the disp8 range.

SPMI aspnet2: -8,934 bytes (was -8,682), 277 improvements (was 262).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 12dd36b818eb24..2d22e946ec4878 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -4831,7 +4831,10 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         estimatedLocalSize += lvaLclStackHomeSize(i);
     }
 
-    if (estimatedLocalSize <= 128)
+    // Skip frames where even with alignment padding, all locals will fit in disp8 range.
+    // We use 64 rather than 128 because alignment padding can inflate the actual frame
+    // size significantly beyond the raw sum of local sizes.
+    if (estimatedLocalSize <= 64)
     {
         return nullptr;
     }

From 0fadd1c39bae6a4fefe7c84d6186d0854a786fd9 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sun, 3 May 2026 17:03:39 +0000
Subject: [PATCH 11/28] JIT: Use single array for frame layout strategy
 evaluation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace two lvaCount-sized arrays (sortOrder + candidateOrder) with a single
array. Each strategy sorts the same array for scoring, then a final sort
with the winning comparator produces the result. Eliminates one allocation
and the memcpy on each winning strategy.

No functional change — SPMI results are identical.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 214 +++++++++++++++++++-----------------
 1 file changed, 114 insertions(+), 100 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 2d22e946ec4878..a75713ce488bb2 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -4965,125 +4965,139 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         return totalCost;
     };
 
-    unsigned* sortOrder      = new (this, CMK_LvaTable) unsigned[lvaCount];
-    unsigned* candidateOrder = new (this, CMK_LvaTable) unsigned[lvaCount];
+    unsigned* sortOrder = new (this, CMK_LvaTable) unsigned[lvaCount];
     for (unsigned i = 0; i < lvaCount; i++)
     {
-        sortOrder[i]      = i;
-        candidateOrder[i] = i;
+        sortOrder[i] = i;
     }
 
     // Score the original (unsorted) order as baseline.
-    unsigned    origCost = estimateLayoutCost(sortOrder);
-    unsigned    bestCost = origCost;
-    const char* bestName = "original";
-
-    // Helper to try a strategy: sort candidateOrder, estimate cost,
-    // and update best if the cost is lower.
-    auto tryStrategy = [&](const char* name, auto comparator) -> unsigned {
+    unsigned    origCost   = estimateLayoutCost(sortOrder);
+    unsigned    bestCost   = origCost;
+    int         bestStrategy = -1; // -1 = original order
+    const char* bestName   = "original";
+
+    // Helper to try a strategy: sort sortOrder, estimate cost, track if best.
+    // The array is re-sorted for each strategy; after all strategies are
+    // evaluated, we do one final sort with the winner.
+    auto tryStrategy = [&](int strategyIdx, const char* name, auto comparator) -> unsigned {
         for (unsigned i = 0; i < lvaCount; i++)
-            candidateOrder[i] = i;
-        jitstd::sort(candidateOrder, candidateOrder + lvaCount, comparator);
-        unsigned cost = estimateLayoutCost(candidateOrder);
+            sortOrder[i] = i;
+        jitstd::sort(sortOrder, sortOrder + lvaCount, comparator);
+        unsigned cost = estimateLayoutCost(sortOrder);
         if (cost < bestCost)
         {
-            bestCost = cost;
-            bestName = name;
-            memcpy(sortOrder, candidateOrder, lvaCount * sizeof(unsigned));
+            bestCost     = cost;
+            bestStrategy = strategyIdx;
+            bestName     = name;
         }
         return cost;
     };
 
-    // Strategy 1: Access density (weighted ref count / size) descending.
+    // Strategy 0: Access density (weighted ref count / size) descending.
     // A small hot local is more valuable per frame byte than a large hot local.
-    unsigned densityCost = tryStrategy("density",
-        [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-            unsigned s1 = lvaLclStackHomeSize(n1);
-            unsigned s2 = lvaLclStackHomeSize(n2);
-            weight_t w1, w2;
-            if (lclRefCounts != nullptr)
-            {
-                w1 = static_cast<weight_t>(lclRefCounts[n1]);
-                w2 = static_cast<weight_t>(lclRefCounts[n2]);
-            }
-            else
-            {
-                w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
-                w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
-            }
-            // Compare w1/s1 > w2/s2 via cross-multiply to avoid division.
-            weight_t dens1 = w1 * s2;
-            weight_t dens2 = w2 * s1;
-            if (dens1 != dens2) return dens1 > dens2;
-            bool a1 = (s1 >= 8), a2 = (s2 >= 8);
-            if (a1 != a2) return a1;
-            unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
-                                                    : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-            unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
-                                                    : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
-            if (c1 != c2) return c1 > c2;
-            return n1 < n2;
-        });
-
-    // Strategy 2: Unweighted ref count descending.
-    unsigned refCntCost = tryStrategy("refCnt",
-        [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-            unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
-                                                    : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-            unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
-                                                    : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
-            if (c1 != c2) return c1 > c2;
-            bool a1 = (lvaLclStackHomeSize(n1) >= 8);
-            bool a2 = (lvaLclStackHomeSize(n2) >= 8);
-            if (a1 != a2) return a1;
-            return n1 < n2;
-        });
+    auto densityCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+        unsigned s1 = lvaLclStackHomeSize(n1);
+        unsigned s2 = lvaLclStackHomeSize(n2);
+        weight_t w1, w2;
+        if (lclRefCounts != nullptr)
+        {
+            w1 = static_cast<weight_t>(lclRefCounts[n1]);
+            w2 = static_cast<weight_t>(lclRefCounts[n2]);
+        }
+        else
+        {
+            w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
+            w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
+        }
+        // Compare w1/s1 > w2/s2 via cross-multiply to avoid division.
+        weight_t dens1 = w1 * s2;
+        weight_t dens2 = w2 * s1;
+        if (dens1 != dens2) return dens1 > dens2;
+        bool a1 = (s1 >= 8), a2 = (s2 >= 8);
+        if (a1 != a2) return a1;
+        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
+                                                : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
+                                                : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+        if (c1 != c2) return c1 > c2;
+        return n1 < n2;
+    };
+    unsigned densityCost = tryStrategy(0, "density", densityCompare);
+
+    // Strategy 1: Unweighted ref count descending.
+    auto refCntCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
+                                                : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
+                                                : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+        if (c1 != c2) return c1 > c2;
+        bool a1 = (lvaLclStackHomeSize(n1) >= 8);
+        bool a2 = (lvaLclStackHomeSize(n2) >= 8);
+        if (a1 != a2) return a1;
+        return n1 < n2;
+    };
+    unsigned refCntCost = tryStrategy(1, "refCnt", refCntCompare);
 
-    // Strategy 3: Weighted ref count descending.
+    // Strategy 2: Weighted ref count descending.
     // For MinOpts, weighted = unweighted (no block weights available).
-    unsigned weightCost = tryStrategy("weight",
-        [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-            weight_t w1, w2;
-            if (lclRefCounts != nullptr)
-            {
-                w1 = static_cast<weight_t>(lclRefCounts[n1]);
-                w2 = static_cast<weight_t>(lclRefCounts[n2]);
-            }
-            else
-            {
-                w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
-                w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
-            }
-            if (w1 != w2) return w1 > w2;
-            bool a1 = (lvaLclStackHomeSize(n1) >= 8);
-            bool a2 = (lvaLclStackHomeSize(n2) >= 8);
-            if (a1 != a2) return a1;
-            return n1 < n2;
-        });
-
-    // Strategy 4: Unweighted ref count density (refCnt / size) descending.
-    unsigned refDensityCost = tryStrategy("refDensity",
-        [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-            unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
-                                                    : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-            unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
-                                                    : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
-            unsigned s1 = lvaLclStackHomeSize(n1);
-            unsigned s2 = lvaLclStackHomeSize(n2);
-            // Cross-multiply to avoid division.
-            unsigned long long dens1 = (unsigned long long)c1 * s2;
-            unsigned long long dens2 = (unsigned long long)c2 * s1;
-            if (dens1 != dens2) return dens1 > dens2;
-            bool a1 = (s1 >= 8), a2 = (s2 >= 8);
-            if (a1 != a2) return a1;
-            return n1 < n2;
-        });
+    auto weightCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+        weight_t w1, w2;
+        if (lclRefCounts != nullptr)
+        {
+            w1 = static_cast<weight_t>(lclRefCounts[n1]);
+            w2 = static_cast<weight_t>(lclRefCounts[n2]);
+        }
+        else
+        {
+            w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
+            w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
+        }
+        if (w1 != w2) return w1 > w2;
+        bool a1 = (lvaLclStackHomeSize(n1) >= 8);
+        bool a2 = (lvaLclStackHomeSize(n2) >= 8);
+        if (a1 != a2) return a1;
+        return n1 < n2;
+    };
+    unsigned weightCost = tryStrategy(2, "weight", weightCompare);
+
+    // Strategy 3: Unweighted ref count density (refCnt / size) descending.
+    auto refDensityCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
+                                                : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
+                                                : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+        unsigned s1 = lvaLclStackHomeSize(n1);
+        unsigned s2 = lvaLclStackHomeSize(n2);
+        // Cross-multiply to avoid division.
+        unsigned long long dens1 = (unsigned long long)c1 * s2;
+        unsigned long long dens2 = (unsigned long long)c2 * s1;
+        if (dens1 != dens2) return dens1 > dens2;
+        bool a1 = (s1 >= 8), a2 = (s2 >= 8);
+        if (a1 != a2) return a1;
+        return n1 < n2;
+    };
+    unsigned refDensityCost = tryStrategy(3, "refDensity", refDensityCompare);
 
-    // If original order won, no sorting needed.
-    if (bestCost == origCost)
+    // Apply the winning strategy's sort order (or return nullptr if original won).
+    if (bestStrategy < 0)
     {
         sortOrder = nullptr;
     }
+    else
+    {
+        // Re-sort with the winning comparator to produce the final order.
+        for (unsigned i = 0; i < lvaCount; i++)
+            sortOrder[i] = i;
+        switch (bestStrategy)
+        {
+            case 0: jitstd::sort(sortOrder, sortOrder + lvaCount, densityCompare); break;
+            case 1: jitstd::sort(sortOrder, sortOrder + lvaCount, refCntCompare); break;
+            case 2: jitstd::sort(sortOrder, sortOrder + lvaCount, weightCompare); break;
+            case 3: jitstd::sort(sortOrder, sortOrder + lvaCount, refDensityCompare); break;
+            default: unreached();
+        }
+    }
 
     JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u; "
             "selected '%s' (cost=%u, saved %u encoding bytes est.)\n",

From 83a333ef360c03c89a7d0ec28a58c4ea48e28070 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sun, 3 May 2026 17:23:02 +0000
Subject: [PATCH 12/28] Add zero-init span cost to frame layout estimator

When block-init is used for zero-initialization, the JIT zeros a
contiguous range of stack memory using SIMD stores. If the sort
reorders locals such that must-init locals are scattered, the
contiguous span grows, requiring more SIMD stores in the prolog.

Pre-compute which locals need zero-init (approximating
genCheckUseBlockInit logic) and track the init span during layout
simulation. Add a small penalty of 2 bytes per 16-byte chunk to
favor layouts that keep the init span tight without overwhelming
the main encoding cost.

SPMI aspnet2 results vs no zero-init model:
  Code delta: -3812 bytes (was -3726)
  Improvements: 47 (was 45)
  Regressions: 23/+199 bytes (was 26/+225 bytes)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 69 +++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index a75713ce488bb2..d5843e92b8c25e 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -4913,12 +4913,52 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         }
     }
 
+    // Pre-compute which locals will likely need zero-initialization in the prolog.
+    // This approximates the logic in genCheckUseBlockInit (codegencommon.cpp).
+    // When block init is used, the JIT zeros a contiguous range [untrLclLo, untrLclHi]
+    // using SIMD stores. The code size depends on the span and alignment, so layouts
+    // that scatter init-requiring locals produce larger prologs.
+    bool* lclNeedsInit  = new (this, CMK_LvaTable) bool[lvaCount];
+    unsigned initSlotCount = 0;
+    for (unsigned i = 0; i < lvaCount; i++)
+    {
+        lclNeedsInit[i] = false;
+        if (lclPassCategory[i] == 0)
+            continue;
+
+        LclVarDsc* varDsc = lvaGetDesc(i);
+
+        if (fgVarIsNeverZeroInitializedInProlog(i))
+            continue;
+        if (lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+            continue;
+        if (varDsc->lvHasExplicitInit)
+            continue;
+        if (varDsc->lvIsTemp && !varDsc->HasGCPtr())
+            continue;
+
+        if (info.compInitMem || varDsc->HasGCPtr() || varDsc->lvMustInit)
+        {
+            lclNeedsInit[i] = true;
+            initSlotCount += (lvaLclStackHomeSize(i) + sizeof(int) - 1) / sizeof(int);
+        }
+    }
+
+    // On AMD64, block init is used when initSlotCount > 4; on x86 when > 4.
+    // Block init zeros a contiguous range, so the code size depends on span.
+    // Individual init zeros each local separately, cost is independent of layout.
+    bool useBlockInit = (initSlotCount > 4);
+
     // Simulate frame layout for a given sort order and return total encoding cost.
     // Lower cost = better layout. Cost = Σ(refCnt × encodingBytes) where
     // encodingBytes is 1 for disp8 (offset in [-128,+127]) or 4 for disp32.
+    // When block init is used, we also add a zero-init cost proportional to the
+    // span of init-requiring locals (larger span = more SIMD stores in the prolog).
     auto estimateLayoutCost = [&](unsigned* order) -> unsigned {
         unsigned totalCost = 0;
         int      simOff    = stkOffs;
+        int      initLo    = 0;
+        int      initHi    = 0;
 
         for (int p = 0; allocOrder[p]; p++)
         {
@@ -4959,9 +4999,38 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
                 unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl]
                                                             : varDsc->lvRefCnt(lvaRefCountState);
                 totalCost += refCnt * ((simOff >= -128) ? 1u : 4u);
+
+                // Track the zero-init span for block-init cost estimation.
+                if (useBlockInit && lclNeedsInit[lcl])
+                {
+                    int loOffs = simOff;
+                    int hiOffs = simOff + static_cast<int>(size);
+                    if (initLo == 0 && initHi == 0)
+                    {
+                        initLo = loOffs;
+                        initHi = hiOffs;
+                    }
+                    else
+                    {
+                        initLo = min(initLo, loOffs);
+                        initHi = max(initHi, hiOffs);
+                    }
+                }
             }
         }
 
+        // Add zero-init prolog cost when block init will be used.
+        // The JIT zeros the contiguous range [initLo, initHi) using SIMD stores.
+        // Each 16-byte chunk requires one SIMD store instruction. We add a
+        // small penalty per chunk to favor layouts that keep the init span tight,
+        // without overwhelming the main encoding cost.
+        if (useBlockInit && initHi > initLo)
+        {
+            unsigned initSpan = static_cast<unsigned>(initHi - initLo);
+            unsigned initCost = ((initSpan + 15) / 16) * 2;
+            totalCost += initCost;
+        }
+
         return totalCost;
     };
 

From 2f253157bf9294fcbcafae0f3e3b2fec6b262738 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sun, 3 May 2026 19:22:05 +0000
Subject: [PATCH 13/28] Add sizeAsc frame layout strategy

Add a size-ascending sort strategy that maximizes the count of locals
fitting within the disp8 encoding range by packing smallest locals
first. This complements the existing density-based strategies which
optimize for hottest-first but may place a hot large struct ahead of
several moderately-hot small locals.

SPMI results (benchmarks.run_pgo.linux):
  Before: -443,754 bytes, 11,221 improvements
  After:  -483,664 bytes, 11,582 improvements (+9% more savings)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index d5843e92b8c25e..843d9b0935f71f 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5148,6 +5148,23 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     };
     unsigned refDensityCost = tryStrategy(3, "refDensity", refDensityCompare);
 
+    // Strategy 4: Size ascending — maximize count of locals in disp8 range.
+    // Small locals consume less of the disp8 budget, so packing them first
+    // maximizes how many locals get short encodings.
+    auto sizeAscCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
+        unsigned s1 = lvaLclStackHomeSize(n1);
+        unsigned s2 = lvaLclStackHomeSize(n2);
+        if (s1 != s2) return s1 < s2;
+        // Within same size, prefer hotter locals first.
+        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
+                                                : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
+                                                : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+        if (c1 != c2) return c1 > c2;
+        return n1 < n2;
+    };
+    unsigned sizeAscCost = tryStrategy(4, "sizeAsc", sizeAscCompare);
+
     // Apply the winning strategy's sort order (or return nullptr if original won).
     if (bestStrategy < 0)
     {
@@ -5164,13 +5181,16 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
             case 1: jitstd::sort(sortOrder, sortOrder + lvaCount, refCntCompare); break;
             case 2: jitstd::sort(sortOrder, sortOrder + lvaCount, weightCompare); break;
             case 3: jitstd::sort(sortOrder, sortOrder + lvaCount, refDensityCompare); break;
+            case 4: jitstd::sort(sortOrder, sortOrder + lvaCount, sizeAscCompare); break;
             default: unreached();
         }
     }
 
-    JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u; "
+    JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u "
+            "sizeAsc=%u; "
             "selected '%s' (cost=%u, saved %u encoding bytes est.)\n",
             origCost, densityCost, refCntCost, weightCost, refDensityCost,
+            sizeAscCost,
             bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0);
 
     return sortOrder;

From 656b74a79d148e81edd2e3a7672602d974b9b7c8 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sun, 3 May 2026 19:30:56 +0000
Subject: [PATCH 14/28] Add initGroupedDensity frame layout strategy

Group init-needing locals first (sorted by density), then non-init
locals by density. This keeps the zero-init span tight, reducing
prolog code size from SIMD block-init while still prioritizing hot
locals within each group.

SPMI results (benchmarks.run_pgo.linux):
  Before: -483,664 bytes, 11,582 improvements (5 strategies)
  After:  -489,753 bytes, 12,253 improvements (6 strategies)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 843d9b0935f71f..478839fd58c6ae 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5165,6 +5165,34 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     };
     unsigned sizeAscCost = tryStrategy(4, "sizeAsc", sizeAscCompare);
 
+    // Strategy 5: Density with init-grouping — init-needing locals sorted by
+    // density first, then non-init locals by density. Keeps the zero-init span
+    // tight while still prioritizing hot locals within each group.
+    auto initGroupedDensityCompare = [this, lclRefCounts, lclNeedsInit](unsigned n1, unsigned n2) -> bool {
+        bool init1 = lclNeedsInit[n1], init2 = lclNeedsInit[n2];
+        if (init1 != init2) return init1; // init-needing first
+        unsigned s1 = lvaLclStackHomeSize(n1);
+        unsigned s2 = lvaLclStackHomeSize(n2);
+        weight_t w1, w2;
+        if (lclRefCounts != nullptr)
+        {
+            w1 = static_cast<weight_t>(lclRefCounts[n1]);
+            w2 = static_cast<weight_t>(lclRefCounts[n2]);
+        }
+        else
+        {
+            w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
+            w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
+        }
+        weight_t dens1 = w1 * s2;
+        weight_t dens2 = w2 * s1;
+        if (dens1 != dens2) return dens1 > dens2;
+        bool a1 = (s1 >= 8), a2 = (s2 >= 8);
+        if (a1 != a2) return a1;
+        return n1 < n2;
+    };
+    unsigned initGroupedDensityCost = tryStrategy(5, "initGroupedDensity", initGroupedDensityCompare);
+
     // Apply the winning strategy's sort order (or return nullptr if original won).
     if (bestStrategy < 0)
     {
@@ -5182,15 +5210,16 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
             case 2: jitstd::sort(sortOrder, sortOrder + lvaCount, weightCompare); break;
             case 3: jitstd::sort(sortOrder, sortOrder + lvaCount, refDensityCompare); break;
             case 4: jitstd::sort(sortOrder, sortOrder + lvaCount, sizeAscCompare); break;
+            case 5: jitstd::sort(sortOrder, sortOrder + lvaCount, initGroupedDensityCompare); break;
             default: unreached();
         }
     }
 
     JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u "
-            "sizeAsc=%u; "
+            "sizeAsc=%u initGroupedDensity=%u; "
             "selected '%s' (cost=%u, saved %u encoding bytes est.)\n",
             origCost, densityCost, refCntCost, weightCost, refDensityCost,
-            sizeAscCost,
+            sizeAscCost, initGroupedDensityCost,
             bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0);
 
     return sortOrder;

From df5fb05228a85fa7f121afdf516a89704c83ea33 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Sun, 3 May 2026 21:40:34 +0000
Subject: [PATCH 15/28] Format lvaComputeOptimalFrameLayoutOrder

Whitespace and line-wrapping changes only; no functional change.

---
 src/coreclr/jit/lclvars.cpp | 118 ++++++++++++++++++++----------------
 1 file changed, 66 insertions(+), 52 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 478839fd58c6ae..fa53bb9fda507f 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -4867,8 +4867,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
 
     JITDUMP("Frame layout optimization: trying multiple strategies for %u locals "
             "(estimated frame size %u bytes%s)\n",
-            lvaCount, estimatedLocalSize,
-            lclRefCounts != nullptr ? ", using lightweight ref counts" : "");
+            lvaCount, estimatedLocalSize, lclRefCounts != nullptr ? ", using lightweight ref counts" : "");
 
     // Pre-compute which locals will be allocated in the main loop and their
     // pass category. Category 0 means "not allocatable" (skipped by the loop).
@@ -4876,7 +4875,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     for (unsigned i = 0; i < lvaCount; i++)
     {
         lclPassCategory[i] = 0;
-        LclVarDsc* varDsc = lvaGetDesc(i);
+        LclVarDsc* varDsc  = lvaGetDesc(i);
 
         if (lvaIsFieldOfDependentlyPromotedStruct(varDsc))
             continue;
@@ -4892,16 +4891,14 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
             continue;
         if (i == lvaRetAddrVar)
             continue;
-        if (i == lvaMonAcquired || i == lvaAsyncExecutionContextVar ||
-            i == lvaAsyncSynchronizationContextVar)
+        if (i == lvaMonAcquired || i == lvaAsyncExecutionContextVar || i == lvaAsyncSynchronizationContextVar)
             continue;
         if (varDsc->lvIsParam && !lvaParamHasLocalStackSpace(i))
             continue;
 
         if (varDsc->lvIsUnsafeBuffer && compGSReorderStackLayout)
         {
-            lclPassCategory[i] =
-                varDsc->lvIsPtr ? ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS;
+            lclPassCategory[i] = varDsc->lvIsPtr ? ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS;
         }
         else if (varTypeIsGC(varDsc->TypeGet()) && varDsc->lvTracked)
         {
@@ -4918,7 +4915,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     // When block init is used, the JIT zeros a contiguous range [untrLclLo, untrLclHi]
     // using SIMD stores. The code size depends on the span and alignment, so layouts
     // that scatter init-requiring locals produce larger prologs.
-    bool* lclNeedsInit  = new (this, CMK_LvaTable) bool[lvaCount];
+    bool*    lclNeedsInit  = new (this, CMK_LvaTable) bool[lvaCount];
     unsigned initSlotCount = 0;
     for (unsigned i = 0; i < lvaCount; i++)
     {
@@ -4996,8 +4993,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
 
                 simOff -= static_cast<int>(size);
 
-                unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl]
-                                                            : varDsc->lvRefCnt(lvaRefCountState);
+                unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl] : varDsc->lvRefCnt(lvaRefCountState);
                 totalCost += refCnt * ((simOff >= -128) ? 1u : 4u);
 
                 // Track the zero-init span for block-init cost estimation.
@@ -5041,10 +5037,10 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     }
 
     // Score the original (unsorted) order as baseline.
-    unsigned    origCost   = estimateLayoutCost(sortOrder);
-    unsigned    bestCost   = origCost;
+    unsigned    origCost     = estimateLayoutCost(sortOrder);
+    unsigned    bestCost     = origCost;
     int         bestStrategy = -1; // -1 = original order
-    const char* bestName   = "original";
+    const char* bestName     = "original";
 
     // Helper to try a strategy: sort sortOrder, estimate cost, track if best.
     // The array is re-sorted for each strategy; after all strategies are
@@ -5082,28 +5078,29 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         // Compare w1/s1 > w2/s2 via cross-multiply to avoid division.
         weight_t dens1 = w1 * s2;
         weight_t dens2 = w2 * s1;
-        if (dens1 != dens2) return dens1 > dens2;
+        if (dens1 != dens2)
+            return dens1 > dens2;
         bool a1 = (s1 >= 8), a2 = (s2 >= 8);
-        if (a1 != a2) return a1;
-        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
-                                                : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
-                                                : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
-        if (c1 != c2) return c1 > c2;
+        if (a1 != a2)
+            return a1;
+        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+        if (c1 != c2)
+            return c1 > c2;
         return n1 < n2;
     };
     unsigned densityCost = tryStrategy(0, "density", densityCompare);
 
     // Strategy 1: Unweighted ref count descending.
     auto refCntCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
-                                                : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
-                                                : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
-        if (c1 != c2) return c1 > c2;
+        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+        if (c1 != c2)
+            return c1 > c2;
         bool a1 = (lvaLclStackHomeSize(n1) >= 8);
         bool a2 = (lvaLclStackHomeSize(n2) >= 8);
-        if (a1 != a2) return a1;
+        if (a1 != a2)
+            return a1;
         return n1 < n2;
     };
     unsigned refCntCost = tryStrategy(1, "refCnt", refCntCompare);
@@ -5122,28 +5119,30 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
             w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
             w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
         }
-        if (w1 != w2) return w1 > w2;
+        if (w1 != w2)
+            return w1 > w2;
         bool a1 = (lvaLclStackHomeSize(n1) >= 8);
         bool a2 = (lvaLclStackHomeSize(n2) >= 8);
-        if (a1 != a2) return a1;
+        if (a1 != a2)
+            return a1;
         return n1 < n2;
     };
     unsigned weightCost = tryStrategy(2, "weight", weightCompare);
 
     // Strategy 3: Unweighted ref count density (refCnt / size) descending.
     auto refDensityCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
-                                                : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
-                                                : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
         unsigned s1 = lvaLclStackHomeSize(n1);
         unsigned s2 = lvaLclStackHomeSize(n2);
         // Cross-multiply to avoid division.
         unsigned long long dens1 = (unsigned long long)c1 * s2;
         unsigned long long dens2 = (unsigned long long)c2 * s1;
-        if (dens1 != dens2) return dens1 > dens2;
+        if (dens1 != dens2)
+            return dens1 > dens2;
         bool a1 = (s1 >= 8), a2 = (s2 >= 8);
-        if (a1 != a2) return a1;
+        if (a1 != a2)
+            return a1;
         return n1 < n2;
     };
     unsigned refDensityCost = tryStrategy(3, "refDensity", refDensityCompare);
@@ -5154,13 +5153,13 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     auto sizeAscCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
         unsigned s1 = lvaLclStackHomeSize(n1);
         unsigned s2 = lvaLclStackHomeSize(n2);
-        if (s1 != s2) return s1 < s2;
+        if (s1 != s2)
+            return s1 < s2;
         // Within same size, prefer hotter locals first.
-        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1]
-                                                : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2]
-                                                : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
-        if (c1 != c2) return c1 > c2;
+        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
+        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
+        if (c1 != c2)
+            return c1 > c2;
         return n1 < n2;
     };
     unsigned sizeAscCost = tryStrategy(4, "sizeAsc", sizeAscCompare);
@@ -5170,7 +5169,8 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     // tight while still prioritizing hot locals within each group.
     auto initGroupedDensityCompare = [this, lclRefCounts, lclNeedsInit](unsigned n1, unsigned n2) -> bool {
         bool init1 = lclNeedsInit[n1], init2 = lclNeedsInit[n2];
-        if (init1 != init2) return init1; // init-needing first
+        if (init1 != init2)
+            return init1; // init-needing first
         unsigned s1 = lvaLclStackHomeSize(n1);
         unsigned s2 = lvaLclStackHomeSize(n2);
         weight_t w1, w2;
@@ -5186,9 +5186,11 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         }
         weight_t dens1 = w1 * s2;
         weight_t dens2 = w2 * s1;
-        if (dens1 != dens2) return dens1 > dens2;
+        if (dens1 != dens2)
+            return dens1 > dens2;
         bool a1 = (s1 >= 8), a2 = (s2 >= 8);
-        if (a1 != a2) return a1;
+        if (a1 != a2)
+            return a1;
         return n1 < n2;
     };
     unsigned initGroupedDensityCost = tryStrategy(5, "initGroupedDensity", initGroupedDensityCompare);
@@ -5205,21 +5207,33 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
             sortOrder[i] = i;
         switch (bestStrategy)
         {
-            case 0: jitstd::sort(sortOrder, sortOrder + lvaCount, densityCompare); break;
-            case 1: jitstd::sort(sortOrder, sortOrder + lvaCount, refCntCompare); break;
-            case 2: jitstd::sort(sortOrder, sortOrder + lvaCount, weightCompare); break;
-            case 3: jitstd::sort(sortOrder, sortOrder + lvaCount, refDensityCompare); break;
-            case 4: jitstd::sort(sortOrder, sortOrder + lvaCount, sizeAscCompare); break;
-            case 5: jitstd::sort(sortOrder, sortOrder + lvaCount, initGroupedDensityCompare); break;
-            default: unreached();
+            case 0:
+                jitstd::sort(sortOrder, sortOrder + lvaCount, densityCompare);
+                break;
+            case 1:
+                jitstd::sort(sortOrder, sortOrder + lvaCount, refCntCompare);
+                break;
+            case 2:
+                jitstd::sort(sortOrder, sortOrder + lvaCount, weightCompare);
+                break;
+            case 3:
+                jitstd::sort(sortOrder, sortOrder + lvaCount, refDensityCompare);
+                break;
+            case 4:
+                jitstd::sort(sortOrder, sortOrder + lvaCount, sizeAscCompare);
+                break;
+            case 5:
+                jitstd::sort(sortOrder, sortOrder + lvaCount, initGroupedDensityCompare);
+                break;
+            default:
+                unreached();
         }
     }
 
     JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u "
             "sizeAsc=%u initGroupedDensity=%u; "
             "selected '%s' (cost=%u, saved %u encoding bytes est.)\n",
-            origCost, densityCost, refCntCost, weightCost, refDensityCost,
-            sizeAscCost, initGroupedDensityCost,
+            origCost, densityCost, refCntCost, weightCost, refDensityCost, sizeAscCost, initGroupedDensityCost,
             bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0);
 
     return sortOrder;

From 1108871298b51cd0a13dc6d3b3ae5867493aecc0 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Mon, 4 May 2026 14:35:29 +0000
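Subject: [PATCH 16/28] Speed up frame layout strategy evaluation

Pre-compute each local's stack home size, ref count, weight and
alignment into flat arrays indexed by lclNum, so the sort comparators
and the layout cost simulation read from those arrays instead of
calling lvaLclStackHomeSize/lvaGetDesc repeatedly. When a strategy
improves on the current best, save its permutation into bestOrder to
avoid a redundant re-sort at the end.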

---
 src/coreclr/jit/lclvars.cpp | 333 +++++++++++++++---------------------
 1 file changed, 141 insertions(+), 192 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index fa53bb9fda507f..a98a1423f9b1e2 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -4825,10 +4825,15 @@ enum LclAllocCategory : UINT
 //
 unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* allocOrder)
 {
-    unsigned estimatedLocalSize = 0;
+    // Pre-compute local sizes and total estimated frame size in one pass.
+    // These arrays are indexed by lclNum and used throughout to avoid repeated
+    // function calls in sort comparators and cost estimation.
+    unsigned* lclSize            = new (this, CMK_LvaTable) unsigned[lvaCount];
+    unsigned  estimatedLocalSize = 0;
     for (unsigned i = 0; i < lvaCount; i++)
     {
-        estimatedLocalSize += lvaLclStackHomeSize(i);
+        lclSize[i] = lvaLclStackHomeSize(i);
+        estimatedLocalSize += lclSize[i];
     }
 
     // Skip frames where even with alignment padding, all locals will fit in disp8 range.
@@ -4839,15 +4844,16 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         return nullptr;
     }
 
-    // For MinOpts/Tier0, precise ref counts are not available (all lvRefCnt == 0).
-    // Do a lightweight LIR walk to count local references for sorting purposes.
-    // This is much cheaper than the full lvaMarkLclRefs pass — we only count
-    // occurrences without any of the analysis side effects.
-    unsigned* lclRefCounts = nullptr;
-    if (!PreciseRefCountsRequired())
+    // Pre-compute ref counts and weights into flat arrays for fast comparator access.
+    // For MinOpts/Tier0, precise ref counts are not available (all lvRefCnt == 0),
+    // so we do a lightweight LIR walk to count local references.
+    unsigned* lclRefCnt = new (this, CMK_LvaTable) unsigned[lvaCount];
+    weight_t* lclWeight = new (this, CMK_LvaTable) weight_t[lvaCount];
+    bool      isMinOpts = !PreciseRefCountsRequired();
+
+    if (isMinOpts)
     {
-        lclRefCounts = new (this, CMK_LvaTable) unsigned[lvaCount];
-        memset(lclRefCounts, 0, lvaCount * sizeof(unsigned));
+        memset(lclRefCnt, 0, lvaCount * sizeof(unsigned));
 
         for (BasicBlock* const block : Blocks())
         {
@@ -4858,16 +4864,56 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
                     unsigned lclNum = node->AsLclVarCommon()->GetLclNum();
                     if (lclNum < lvaCount)
                     {
-                        lclRefCounts[lclNum]++;
+                        lclRefCnt[lclNum]++;
                     }
                 }
             }
         }
+
+        // For MinOpts, weighted = unweighted (no block weights available).
+        for (unsigned i = 0; i < lvaCount; i++)
+        {
+            lclWeight[i] = static_cast<weight_t>(lclRefCnt[i]);
+        }
+    }
+    else
+    {
+        for (unsigned i = 0; i < lvaCount; i++)
+        {
+            LclVarDsc* varDsc = lvaGetDesc(i);
+            lclRefCnt[i]      = varDsc->lvRefCnt(lvaRefCountState);
+            lclWeight[i]      = varDsc->lvRefCntWtd(lvaRefCountState);
+        }
     }
 
-    JITDUMP("Frame layout optimization: trying multiple strategies for %u locals "
+    // Pre-compute alignment requirements for each local.
+    // 0 = no alignment needed, otherwise the required alignment in bytes.
+    unsigned* lclAlignTo = new (this, CMK_LvaTable) unsigned[lvaCount];
+    for (unsigned i = 0; i < lvaCount; i++)
+    {
+        if (lclSize[i] < 8)
+        {
+            lclAlignTo[i] = 0;
+        }
+        else
+        {
+#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES
+            LclVarDsc* varDsc = lvaGetDesc(i);
+            if (varTypeIsSIMD(varDsc))
+            {
+                lclAlignTo[i] = static_cast<unsigned>(getSIMDTypeAlignment(varDsc->TypeGet()));
+            }
+            else
+#endif
+            {
+                lclAlignTo[i] = 8;
+            }
+        }
+    }
+
+    JITDUMP("Frame layout optimization: trying strategies for %u locals "
             "(estimated frame size %u bytes%s)\n",
-            lvaCount, estimatedLocalSize, lclRefCounts != nullptr ? ", using lightweight ref counts" : "");
+            lvaCount, estimatedLocalSize, isMinOpts ? ", using lightweight ref counts" : "");
 
     // Pre-compute which locals will be allocated in the main loop and their
     // pass category. Category 0 means "not allocatable" (skipped by the loop).
@@ -4937,7 +4983,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         if (info.compInitMem || varDsc->HasGCPtr() || varDsc->lvMustInit)
         {
             lclNeedsInit[i] = true;
-            initSlotCount += (lvaLclStackHomeSize(i) + sizeof(int) - 1) / sizeof(int);
+            initSlotCount += (lclSize[i] + sizeof(int) - 1) / sizeof(int);
         }
     }
 
@@ -4966,35 +5012,18 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
                 if (lclPassCategory[lcl] != pass)
                     continue;
 
-                LclVarDsc* varDsc = lvaGetDesc(lcl);
-                unsigned   size   = lvaLclStackHomeSize(lcl);
+                unsigned size    = lclSize[lcl];
+                unsigned alignTo = lclAlignTo[lcl];
 
                 // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset).
-                if (size >= 8)
+                if (alignTo != 0 && (simOff % static_cast<int>(alignTo)) != 0)
                 {
-#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES
-                    if (varTypeIsSIMD(varDsc))
-                    {
-                        int alignment = getSIMDTypeAlignment(varDsc->TypeGet());
-                        if (simOff % alignment != 0)
-                        {
-                            simOff -= static_cast<int>(alignment + (simOff % alignment));
-                        }
-                    }
-                    else
-#endif
-                    {
-                        if ((simOff % 8) != 0)
-                        {
-                            simOff -= static_cast<int>(8 + (simOff % 8));
-                        }
-                    }
+                    simOff -= static_cast<int>(alignTo + (simOff % static_cast<int>(alignTo)));
                 }
 
                 simOff -= static_cast<int>(size);
 
-                unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl] : varDsc->lvRefCnt(lvaRefCountState);
-                totalCost += refCnt * ((simOff >= -128) ? 1u : 4u);
+                totalCost += lclRefCnt[lcl] * ((simOff >= -128) ? 1u : 4u);
 
                 // Track the zero-init span for block-init cost estimation.
                 if (useBlockInit && lclNeedsInit[lcl])
@@ -5031,6 +5060,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     };
 
     unsigned* sortOrder = new (this, CMK_LvaTable) unsigned[lvaCount];
+    unsigned* bestOrder = new (this, CMK_LvaTable) unsigned[lvaCount];
     for (unsigned i = 0; i < lvaCount; i++)
     {
         sortOrder[i] = i;
@@ -5043,8 +5073,8 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     const char* bestName     = "original";
 
     // Helper to try a strategy: sort sortOrder, estimate cost, track if best.
-    // The array is re-sorted for each strategy; after all strategies are
-    // evaluated, we do one final sort with the winner.
+    // When a strategy improves on the current best, we save its permutation
+    // into bestOrder to avoid a redundant re-sort at the end.
     auto tryStrategy = [&](int strategyIdx, const char* name, auto comparator) -> unsigned {
         for (unsigned i = 0; i < lvaCount; i++)
             sortOrder[i] = i;
@@ -5055,188 +5085,107 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
             bestCost     = cost;
             bestStrategy = strategyIdx;
             bestName     = name;
+            memcpy(bestOrder, sortOrder, lvaCount * sizeof(unsigned));
         }
         return cost;
     };
 
     // Strategy 0: Access density (weighted ref count / size) descending.
     // A small hot local is more valuable per frame byte than a large hot local.
-    auto densityCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-        unsigned s1 = lvaLclStackHomeSize(n1);
-        unsigned s2 = lvaLclStackHomeSize(n2);
-        weight_t w1, w2;
-        if (lclRefCounts != nullptr)
-        {
-            w1 = static_cast<weight_t>(lclRefCounts[n1]);
-            w2 = static_cast<weight_t>(lclRefCounts[n2]);
-        }
-        else
-        {
-            w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
-            w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
-        }
-        // Compare w1/s1 > w2/s2 via cross-multiply to avoid division.
-        weight_t dens1 = w1 * s2;
-        weight_t dens2 = w2 * s1;
+    auto densityCompare = [lclSize, lclWeight, lclRefCnt](unsigned n1, unsigned n2) -> bool {
+        weight_t dens1 = lclWeight[n1] * lclSize[n2];
+        weight_t dens2 = lclWeight[n2] * lclSize[n1];
         if (dens1 != dens2)
             return dens1 > dens2;
-        bool a1 = (s1 >= 8), a2 = (s2 >= 8);
+        bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8);
         if (a1 != a2)
             return a1;
-        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
-        if (c1 != c2)
-            return c1 > c2;
+        if (lclRefCnt[n1] != lclRefCnt[n2])
+            return lclRefCnt[n1] > lclRefCnt[n2];
         return n1 < n2;
     };
     unsigned densityCost = tryStrategy(0, "density", densityCompare);
 
-    // Strategy 1: Unweighted ref count descending.
-    auto refCntCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
-        if (c1 != c2)
-            return c1 > c2;
-        bool a1 = (lvaLclStackHomeSize(n1) >= 8);
-        bool a2 = (lvaLclStackHomeSize(n2) >= 8);
-        if (a1 != a2)
-            return a1;
-        return n1 < n2;
-    };
-    unsigned refCntCost = tryStrategy(1, "refCnt", refCntCompare);
-
-    // Strategy 2: Weighted ref count descending.
-    // For MinOpts, weighted = unweighted (no block weights available).
-    auto weightCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-        weight_t w1, w2;
-        if (lclRefCounts != nullptr)
-        {
-            w1 = static_cast<weight_t>(lclRefCounts[n1]);
-            w2 = static_cast<weight_t>(lclRefCounts[n2]);
-        }
-        else
-        {
-            w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
-            w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
-        }
-        if (w1 != w2)
-            return w1 > w2;
-        bool a1 = (lvaLclStackHomeSize(n1) >= 8);
-        bool a2 = (lvaLclStackHomeSize(n2) >= 8);
-        if (a1 != a2)
-            return a1;
-        return n1 < n2;
-    };
-    unsigned weightCost = tryStrategy(2, "weight", weightCompare);
-
-    // Strategy 3: Unweighted ref count density (refCnt / size) descending.
-    auto refDensityCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
-        unsigned s1 = lvaLclStackHomeSize(n1);
-        unsigned s2 = lvaLclStackHomeSize(n2);
-        // Cross-multiply to avoid division.
-        unsigned long long dens1 = (unsigned long long)c1 * s2;
-        unsigned long long dens2 = (unsigned long long)c2 * s1;
-        if (dens1 != dens2)
-            return dens1 > dens2;
-        bool a1 = (s1 >= 8), a2 = (s2 >= 8);
-        if (a1 != a2)
-            return a1;
-        return n1 < n2;
-    };
-    unsigned refDensityCost = tryStrategy(3, "refDensity", refDensityCompare);
-
-    // Strategy 4: Size ascending — maximize count of locals in disp8 range.
+    // Strategy 1: Size ascending — maximize count of locals in disp8 range.
     // Small locals consume less of the disp8 budget, so packing them first
     // maximizes how many locals get short encodings.
-    auto sizeAscCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool {
-        unsigned s1 = lvaLclStackHomeSize(n1);
-        unsigned s2 = lvaLclStackHomeSize(n2);
-        if (s1 != s2)
-            return s1 < s2;
-        // Within same size, prefer hotter locals first.
-        unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState);
-        unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState);
-        if (c1 != c2)
-            return c1 > c2;
+    auto sizeAscCompare = [lclSize, lclRefCnt](unsigned n1, unsigned n2) -> bool {
+        if (lclSize[n1] != lclSize[n2])
+            return lclSize[n1] < lclSize[n2];
+        if (lclRefCnt[n1] != lclRefCnt[n2])
+            return lclRefCnt[n1] > lclRefCnt[n2];
         return n1 < n2;
     };
-    unsigned sizeAscCost = tryStrategy(4, "sizeAsc", sizeAscCompare);
+    unsigned sizeAscCost = tryStrategy(1, "sizeAsc", sizeAscCompare);
+
+    // Strategies 2-3: Weight-based sorts that differ from density/sizeAsc only when
+    // PGO block weights are available (FullOpts). For MinOpts, weighted == unweighted
+    // and refDensity == density, so these are redundant and skipped.
+    unsigned weightCost     = 0;
+    unsigned refDensityCost = 0;
+    if (!isMinOpts)
+    {
+        // Strategy 2: Weighted ref count descending.
+        auto weightCompare = [lclSize, lclWeight](unsigned n1, unsigned n2) -> bool {
+            if (lclWeight[n1] != lclWeight[n2])
+                return lclWeight[n1] > lclWeight[n2];
+            bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8);
+            if (a1 != a2)
+                return a1;
+            return n1 < n2;
+        };
+        weightCost = tryStrategy(2, "weight", weightCompare);
+
+        // Strategy 3: Unweighted ref count density (refCnt / size) descending.
+        auto refDensityCompare = [lclSize, lclRefCnt](unsigned n1, unsigned n2) -> bool {
+            unsigned long long dens1 = (unsigned long long)lclRefCnt[n1] * lclSize[n2];
+            unsigned long long dens2 = (unsigned long long)lclRefCnt[n2] * lclSize[n1];
+            if (dens1 != dens2)
+                return dens1 > dens2;
+            bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8);
+            if (a1 != a2)
+                return a1;
+            return n1 < n2;
+        };
+        refDensityCost = tryStrategy(3, "refDensity", refDensityCompare);
+    }
 
-    // Strategy 5: Density with init-grouping — init-needing locals sorted by
+    // Strategy 4: Density with init-grouping — init-needing locals sorted by
     // density first, then non-init locals by density. Keeps the zero-init span
     // tight while still prioritizing hot locals within each group.
-    auto initGroupedDensityCompare = [this, lclRefCounts, lclNeedsInit](unsigned n1, unsigned n2) -> bool {
-        bool init1 = lclNeedsInit[n1], init2 = lclNeedsInit[n2];
-        if (init1 != init2)
-            return init1; // init-needing first
-        unsigned s1 = lvaLclStackHomeSize(n1);
-        unsigned s2 = lvaLclStackHomeSize(n2);
-        weight_t w1, w2;
-        if (lclRefCounts != nullptr)
-        {
-            w1 = static_cast<weight_t>(lclRefCounts[n1]);
-            w2 = static_cast<weight_t>(lclRefCounts[n2]);
-        }
-        else
-        {
-            w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState);
-            w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState);
-        }
-        weight_t dens1 = w1 * s2;
-        weight_t dens2 = w2 * s1;
-        if (dens1 != dens2)
-            return dens1 > dens2;
-        bool a1 = (s1 >= 8), a2 = (s2 >= 8);
-        if (a1 != a2)
-            return a1;
-        return n1 < n2;
-    };
-    unsigned initGroupedDensityCost = tryStrategy(5, "initGroupedDensity", initGroupedDensityCompare);
+    // Only useful when block init will be used (otherwise identical to density).
+    unsigned initGroupedDensityCost = 0;
+    if (useBlockInit)
+    {
+        auto initGroupedDensityCompare = [lclSize, lclWeight, lclNeedsInit](unsigned n1, unsigned n2) -> bool {
+            bool init1 = lclNeedsInit[n1], init2 = lclNeedsInit[n2];
+            if (init1 != init2)
+                return init1; // init-needing first
+            weight_t dens1 = lclWeight[n1] * lclSize[n2];
+            weight_t dens2 = lclWeight[n2] * lclSize[n1];
+            if (dens1 != dens2)
+                return dens1 > dens2;
+            bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8);
+            if (a1 != a2)
+                return a1;
+            return n1 < n2;
+        };
+        initGroupedDensityCost = tryStrategy(4, "initGroupedDensity", initGroupedDensityCompare);
+    }
 
-    // Apply the winning strategy's sort order (or return nullptr if original won).
+    // Return the winning permutation (saved in bestOrder), or nullptr if original won.
     if (bestStrategy < 0)
     {
-        sortOrder = nullptr;
-    }
-    else
-    {
-        // Re-sort with the winning comparator to produce the final order.
-        for (unsigned i = 0; i < lvaCount; i++)
-            sortOrder[i] = i;
-        switch (bestStrategy)
-        {
-            case 0:
-                jitstd::sort(sortOrder, sortOrder + lvaCount, densityCompare);
-                break;
-            case 1:
-                jitstd::sort(sortOrder, sortOrder + lvaCount, refCntCompare);
-                break;
-            case 2:
-                jitstd::sort(sortOrder, sortOrder + lvaCount, weightCompare);
-                break;
-            case 3:
-                jitstd::sort(sortOrder, sortOrder + lvaCount, refDensityCompare);
-                break;
-            case 4:
-                jitstd::sort(sortOrder, sortOrder + lvaCount, sizeAscCompare);
-                break;
-            case 5:
-                jitstd::sort(sortOrder, sortOrder + lvaCount, initGroupedDensityCompare);
-                break;
-            default:
-                unreached();
-        }
+        bestOrder = nullptr;
     }
 
-    JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u "
-            "sizeAsc=%u initGroupedDensity=%u; "
+    JITDUMP("Frame layout costs: original=%u density=%u sizeAsc=%u weight=%u refDensity=%u "
+            "initGroupedDensity=%u; "
             "selected '%s' (cost=%u, saved %u encoding bytes est.)\n",
-            origCost, densityCost, refCntCost, weightCost, refDensityCost, sizeAscCost, initGroupedDensityCost,
-            bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0);
+            origCost, densityCost, sizeAscCost, weightCost, refDensityCost, initGroupedDensityCost, bestName, bestCost,
+            origCost > bestCost ? origCost - bestCost : 0);
 
-    return sortOrder;
+    return bestOrder;
 }
 #endif // TARGET_XARCH
 

From 9edbb980f23567602ba1aa932f199a049a36813e Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Mon, 4 May 2026 15:51:09 +0000
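Subject: [PATCH 17/28] cleanup: add braces and parenthesize conditions

Add braces around single-statement if/for bodies and parenthesize the
sub-expressions of compound conditions in
lvaComputeOptimalFrameLayoutOrder.

The one non-formatting change: refDensityCompare now computes its
cross-multiplied densities in double instead of unsigned long long.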

---
 src/coreclr/jit/lclvars.cpp | 80 +++++++++++++++++++++++++++++++------
 1 file changed, 67 insertions(+), 13 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index a98a1423f9b1e2..a5635979b28a58 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -4924,29 +4924,45 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         LclVarDsc* varDsc  = lvaGetDesc(i);
 
         if (lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+        {
             continue;
+        }
 #if FEATURE_FIXED_OUT_ARGS
         if (i == lvaOutgoingArgSpaceVar)
+        {
             continue;
+        }
 #endif
         if (lvaIsOSRLocal(i))
+        {
             continue;
+        }
         if (!varDsc->lvOnFrame)
+        {
             continue;
-        if (i == lvaGSSecurityCookie && getNeedsGSSecurityCookie())
+        }
+        if ((i == lvaGSSecurityCookie) && getNeedsGSSecurityCookie())
+        {
             continue;
+        }
         if (i == lvaRetAddrVar)
+        {
             continue;
-        if (i == lvaMonAcquired || i == lvaAsyncExecutionContextVar || i == lvaAsyncSynchronizationContextVar)
+        }
+        if ((i == lvaMonAcquired) || (i == lvaAsyncExecutionContextVar) || (i == lvaAsyncSynchronizationContextVar))
+        {
             continue;
-        if (varDsc->lvIsParam && !lvaParamHasLocalStackSpace(i))
+        }
+        if ((varDsc->lvIsParam) && !lvaParamHasLocalStackSpace(i))
+        {
             continue;
+        }
 
-        if (varDsc->lvIsUnsafeBuffer && compGSReorderStackLayout)
+        if ((varDsc->lvIsUnsafeBuffer) && compGSReorderStackLayout)
         {
             lclPassCategory[i] = varDsc->lvIsPtr ? ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS;
         }
-        else if (varTypeIsGC(varDsc->TypeGet()) && varDsc->lvTracked)
+        else if (varTypeIsGC(varDsc->TypeGet()) && (varDsc->lvTracked))
         {
             lclPassCategory[i] = ALLOC_PTRS;
         }
@@ -4967,20 +4983,30 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     {
         lclNeedsInit[i] = false;
         if (lclPassCategory[i] == 0)
+        {
             continue;
+        }
 
         LclVarDsc* varDsc = lvaGetDesc(i);
 
         if (fgVarIsNeverZeroInitializedInProlog(i))
+        {
             continue;
+        }
         if (lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+        {
             continue;
+        }
         if (varDsc->lvHasExplicitInit)
+        {
             continue;
-        if (varDsc->lvIsTemp && !varDsc->HasGCPtr())
+        }
+        if ((varDsc->lvIsTemp) && !varDsc->HasGCPtr())
+        {
             continue;
+        }
 
-        if (info.compInitMem || varDsc->HasGCPtr() || varDsc->lvMustInit)
+        if ((info.compInitMem) || varDsc->HasGCPtr() || (varDsc->lvMustInit))
         {
             lclNeedsInit[i] = true;
             initSlotCount += (lclSize[i] + sizeof(int) - 1) / sizeof(int);
@@ -5010,13 +5036,15 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
             {
                 unsigned lcl = order[idx];
                 if (lclPassCategory[lcl] != pass)
+                {
                     continue;
+                }
 
                 unsigned size    = lclSize[lcl];
                 unsigned alignTo = lclAlignTo[lcl];
 
                 // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset).
-                if (alignTo != 0 && (simOff % static_cast<int>(alignTo)) != 0)
+                if ((alignTo != 0) && ((simOff % static_cast<int>(alignTo)) != 0))
                 {
                     simOff -= static_cast<int>(alignTo + (simOff % static_cast<int>(alignTo)));
                 }
@@ -5026,11 +5054,11 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
                 totalCost += lclRefCnt[lcl] * ((simOff >= -128) ? 1u : 4u);
 
                 // Track the zero-init span for block-init cost estimation.
-                if (useBlockInit && lclNeedsInit[lcl])
+                if (useBlockInit && (lclNeedsInit[lcl]))
                 {
                     int loOffs = simOff;
                     int hiOffs = simOff + static_cast<int>(size);
-                    if (initLo == 0 && initHi == 0)
+                    if ((initLo == 0) && (initHi == 0))
                     {
                         initLo = loOffs;
                         initHi = hiOffs;
@@ -5049,7 +5077,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         // Each 16-byte chunk requires one SIMD store instruction. We add a
         // small penalty per chunk to favor layouts that keep the init span tight,
         // without overwhelming the main encoding cost.
-        if (useBlockInit && initHi > initLo)
+        if (useBlockInit && (initHi > initLo))
         {
             unsigned initSpan = static_cast<unsigned>(initHi - initLo);
             unsigned initCost = ((initSpan + 15) / 16) * 2;
@@ -5077,7 +5105,9 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     // into bestOrder to avoid a redundant re-sort at the end.
     auto tryStrategy = [&](int strategyIdx, const char* name, auto comparator) -> unsigned {
         for (unsigned i = 0; i < lvaCount; i++)
+        {
             sortOrder[i] = i;
+        }
         jitstd::sort(sortOrder, sortOrder + lvaCount, comparator);
         unsigned cost = estimateLayoutCost(sortOrder);
         if (cost < bestCost)
@@ -5096,12 +5126,18 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         weight_t dens1 = lclWeight[n1] * lclSize[n2];
         weight_t dens2 = lclWeight[n2] * lclSize[n1];
         if (dens1 != dens2)
+        {
             return dens1 > dens2;
+        }
         bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8);
         if (a1 != a2)
+        {
             return a1;
+        }
         if (lclRefCnt[n1] != lclRefCnt[n2])
+        {
             return lclRefCnt[n1] > lclRefCnt[n2];
+        }
         return n1 < n2;
     };
     unsigned densityCost = tryStrategy(0, "density", densityCompare);
@@ -5111,9 +5147,13 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     // maximizes how many locals get short encodings.
     auto sizeAscCompare = [lclSize, lclRefCnt](unsigned n1, unsigned n2) -> bool {
         if (lclSize[n1] != lclSize[n2])
+        {
             return lclSize[n1] < lclSize[n2];
+        }
         if (lclRefCnt[n1] != lclRefCnt[n2])
+        {
             return lclRefCnt[n1] > lclRefCnt[n2];
+        }
         return n1 < n2;
     };
     unsigned sizeAscCost = tryStrategy(1, "sizeAsc", sizeAscCompare);
@@ -5128,23 +5168,31 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         // Strategy 2: Weighted ref count descending.
         auto weightCompare = [lclSize, lclWeight](unsigned n1, unsigned n2) -> bool {
             if (lclWeight[n1] != lclWeight[n2])
+            {
                 return lclWeight[n1] > lclWeight[n2];
+            }
             bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8);
             if (a1 != a2)
+            {
                 return a1;
+            }
             return n1 < n2;
         };
         weightCost = tryStrategy(2, "weight", weightCompare);
 
         // Strategy 3: Unweighted ref count density (refCnt / size) descending.
         auto refDensityCompare = [lclSize, lclRefCnt](unsigned n1, unsigned n2) -> bool {
-            unsigned long long dens1 = (unsigned long long)lclRefCnt[n1] * lclSize[n2];
-            unsigned long long dens2 = (unsigned long long)lclRefCnt[n2] * lclSize[n1];
+            double dens1 = (double)lclRefCnt[n1] * lclSize[n2];
+            double dens2 = (double)lclRefCnt[n2] * lclSize[n1];
             if (dens1 != dens2)
+            {
                 return dens1 > dens2;
+            }
             bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8);
             if (a1 != a2)
+            {
                 return a1;
+            }
             return n1 < n2;
         };
         refDensityCost = tryStrategy(3, "refDensity", refDensityCompare);
@@ -5160,14 +5208,20 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         auto initGroupedDensityCompare = [lclSize, lclWeight, lclNeedsInit](unsigned n1, unsigned n2) -> bool {
             bool init1 = lclNeedsInit[n1], init2 = lclNeedsInit[n2];
             if (init1 != init2)
+            {
                 return init1; // init-needing first
+            }
             weight_t dens1 = lclWeight[n1] * lclSize[n2];
             weight_t dens2 = lclWeight[n2] * lclSize[n1];
             if (dens1 != dens2)
+            {
                 return dens1 > dens2;
+            }
             bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8);
             if (a1 != a2)
+            {
                 return a1;
+            }
             return n1 < n2;
         };
         initGroupedDensityCost = tryStrategy(4, "initGroupedDensity", initGroupedDensityCompare);

From f7ebd17c367fff65da666d4abe3cff4bcfb65781 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Mon, 4 May 2026 23:06:36 +0000
Subject: [PATCH 18/28] Add early maxSavings gate to skip optimization when
 benefit is small

Compute a quick upper bound on potential savings before the expensive
alignment/category/init pre-computation and sorting phases. The gate walks
the locals in default order (ignoring alignment padding and allocation
passes), counts the refs that fall beyond disp8 range at 3 bytes each, and
bails out if the maximum achievable code size reduction is <= 12 bytes.

Skips ~14% of MinOpts methods while retaining 98-100% of savings.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index a5635979b28a58..104c50e07237c3 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -4886,6 +4886,39 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         }
     }
 
+    // Quick upper bound on potential savings: walk locals in default order,
+    // accumulate frame size, and count weighted refs that fall beyond disp8 range.
+    // This approximation ignores alignment padding and allocation passes but is
+    // cheap to compute and lets us skip methods where reordering cannot help much.
+    {
+        int      simOff        = stkOffs;
+        unsigned refsInDisp32  = 0;
+
+        for (unsigned i = 0; i < lvaCount; i++)
+        {
+            if (lclSize[i] == 0)
+            {
+                continue;
+            }
+            simOff -= static_cast<int>(lclSize[i]);
+            if (simOff < -128)
+            {
+                refsInDisp32 += lclRefCnt[i];
+            }
+        }
+
+        // Each ref beyond disp8 costs 3 extra bytes (disp32 vs disp8 encoding).
+        // If even moving ALL those refs into disp8 range wouldn't save much, skip.
+        unsigned maxSavings = refsInDisp32 * 3;
+        if (maxSavings <= 12)
+        {
+            JITDUMP("Frame layout optimization: skipping — max possible savings is %u bytes "
+                    "(refsInDisp32=%u)\n",
+                    maxSavings, refsInDisp32);
+            return nullptr;
+        }
+    }
+
     // Pre-compute alignment requirements for each local.
     // 0 = no alignment needed, otherwise the required alignment in bytes.
     unsigned* lclAlignTo = new (this, CMK_LvaTable) unsigned[lvaCount];

From b4d6ee5378fdd6d87cb01da0e3ac6b249a18a5b0 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Mon, 4 May 2026 23:07:19 +0000
Subject: [PATCH 19/28] format

---
 src/coreclr/jit/lclvars.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 104c50e07237c3..4d7dbb486e96cb 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -4891,8 +4891,8 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     // This approximation ignores alignment padding and allocation passes but is
     // cheap to compute and lets us skip methods where reordering cannot help much.
     {
-        int      simOff        = stkOffs;
-        unsigned refsInDisp32  = 0;
+        int      simOff       = stkOffs;
+        unsigned refsInDisp32 = 0;
 
         for (unsigned i = 0; i < lvaCount; i++)
         {

From cf3f0c359ddaf691ed83371378292e87e47aa7f2 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Thu, 14 May 2026 09:48:33 -0700
Subject: [PATCH 20/28] Address PR review feedback

- Use signed arithmetic for alignment-pad simulation in estimateLayoutCost to
  avoid mixing unsigned alignTo with the (possibly negative) signed remainder.
- Replace the unconditional FINAL_FRAME_LAYOUT assert in
  lvaAssignVirtualFrameOffsetsToLocals with a guard so non-final layout
  passes (PRE_REGALLOC/REGALLOC/TENTATIVE) simply skip the optimization
  rather than asserting.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 68a88b80d6f125..3d5ca945af1571 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5115,9 +5115,15 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
                 unsigned alignTo = lclAlignTo[lcl];
 
                 // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset).
-                if ((alignTo != 0) && ((simOff % static_cast<int>(alignTo)) != 0))
+                // Use signed arithmetic throughout: simOff is negative, and the remainder
+                // (simOff % alignment) is non-positive, so pad = alignment + remainder
+                // yields a small positive value in [1, alignment-1]. Mixing unsigned alignTo
+                // with the signed remainder would convert the negative remainder to a huge
+                // unsigned value and corrupt simOff.
+                int signedAlign = static_cast<int>(alignTo);
+                if ((signedAlign != 0) && ((simOff % signedAlign) != 0))
                 {
-                    simOff -= static_cast<int>(alignTo + (simOff % static_cast<int>(alignTo)));
+                    simOff -= signedAlign + (simOff % signedAlign);
                 }
 
                 simOff -= static_cast<int>(size);
@@ -5688,9 +5694,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 #ifdef TARGET_XARCH
     // Multi-strategy frame layout optimization for x86/x64.
     // See lvaComputeOptimalFrameLayoutOrder for details.
-    assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT);
+    // Only attempt the optimization during the final layout pass; earlier passes
+    // (PRE_REGALLOC/REGALLOC/TENTATIVE) may invoke lvaAssignVirtualFrameOffsetsToLocals
+    // for size estimation, and the cost-model assumptions only hold for the final pass.
     unsigned* lclVarSortOrder = nullptr;
-    if (lvaLocalVarRefCounted() && !opts.compDbgEnC && codeGen->isFramePointerUsed())
+    if ((lvaDoneFrameLayout == FINAL_FRAME_LAYOUT) && lvaLocalVarRefCounted() && !opts.compDbgEnC &&
+        codeGen->isFramePointerUsed())
     {
         lclVarSortOrder = lvaComputeOptimalFrameLayoutOrder(stkOffs, alloc_order);
     }

From ac57311a9fdfd5fdb14e4aec90a55ed8f5ad7602 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Thu, 14 May 2026 12:41:19 -0700
Subject: [PATCH 21/28] Add JitFrameLayoutMaxSavingsThreshold config knob

Replace the hard-coded maxSavings <= 12 early-out in
lvaComputeOptimalFrameLayoutOrder with a tunable JIT config option,
keeping the existing value (12) as the default.

This makes it easy to experiment with the TP-vs-code-size tradeoff
without rebuilding the JIT.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/jitconfigvalues.h | 1 +
 src/coreclr/jit/lclvars.cpp       | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 88ac0fe83e4924..b091b2851bbff0 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -694,6 +694,7 @@ RELEASE_CONFIG_INTEGER(JitExtDefaultPolicyProfScale, "JitExtDefaultPolicyProfSca
 RELEASE_CONFIG_INTEGER(JitInlinePolicyModel, "JitInlinePolicyModel", 0)
 RELEASE_CONFIG_INTEGER(JitInlinePolicyProfile, "JitInlinePolicyProfile", 0)
 RELEASE_CONFIG_INTEGER(JitInlinePolicyProfileThreshold, "JitInlinePolicyProfileThreshold", 40)
+RELEASE_CONFIG_INTEGER(JitFrameLayoutMaxSavingsThreshold, "JitFrameLayoutMaxSavingsThreshold", 12)
 CONFIG_STRING(JitObjectStackAllocationRange, "JitObjectStackAllocationRange")
 RELEASE_CONFIG_INTEGER(JitObjectStackAllocation, "JitObjectStackAllocation", 1)
 RELEASE_CONFIG_INTEGER(JitObjectStackAllocationRefClass, "JitObjectStackAllocationRefClass", 1)
diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 3d5ca945af1571..1fe6e7d25c3b13 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -4948,7 +4948,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         // Each ref beyond disp8 costs 3 extra bytes (disp32 vs disp8 encoding).
         // If even moving ALL those refs into disp8 range wouldn't save much, skip.
         unsigned maxSavings = refsInDisp32 * 3;
-        if (maxSavings <= 12)
+        if (maxSavings <= (unsigned)JitConfig.JitFrameLayoutMaxSavingsThreshold())
         {
             JITDUMP("Frame layout optimization: skipping — max possible savings is %u bytes "
                     "(refsInDisp32=%u)\n",

From a2357533bd4c08dbb71e3da176258f9ccd9d6a61 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Thu, 14 May 2026 16:30:10 -0700
Subject: [PATCH 22/28] Skip initGroupedDensity (S4) strategy at MinOpts

Empirical analysis on libraries_tests_no_tiered_compilation showed that
S4 (init-grouped density) contributes only ~0.7% of the total code-size
wins at MinOpts while incurring the same per-strategy throughput cost as
the other sorts. Skipping it at MinOpts trims the MinOpts JIT throughput
(TP) cost by ~0.5 percentage points (from +1.98% to +1.47%) with
negligible code-size impact.

Also remove the experimental JitFrameLayoutStrategyMask config introduced
during the sweep, and refine the S2/S3 comment.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 1fe6e7d25c3b13..38cceae12b1d2b 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5236,8 +5236,9 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     unsigned sizeAscCost = tryStrategy(1, "sizeAsc", sizeAscCompare);
 
     // Strategies 2-3: Weight-based sorts that differ from density/sizeAsc only when
-    // PGO block weights are available (FullOpts). For MinOpts, weighted == unweighted
-    // and refDensity == density, so these are redundant and skipped.
+    // PGO block weights are available (FullOpts). For MinOpts, lclWeight == lclRefCnt,
+    // so S2 is redundant with density once D is in the set (empirically adds <1% of the
+    // code-size wins for the full per-strategy TP cost), and S3 == density. Skipped at MinOpts.
     unsigned weightCost     = 0;
     unsigned refDensityCost = 0;
     if (!isMinOpts)
@@ -5279,8 +5280,10 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     // density first, then non-init locals by density. Keeps the zero-init span
     // tight while still prioritizing hot locals within each group.
     // Only useful when block init will be used (otherwise identical to density).
+    // Skipped at MinOpts: empirically adds <1% of the code-size wins for the full
+    // per-strategy TP cost.
     unsigned initGroupedDensityCost = 0;
-    if (useBlockInit)
+    if (useBlockInit && !isMinOpts)
     {
         auto initGroupedDensityCompare = [lclSize, lclWeight, lclNeedsInit](unsigned n1, unsigned n2) -> bool {
             bool init1 = lclNeedsInit[n1], init2 = lclNeedsInit[n2];

From bba5fa393d038d9aa714df364529f40ab89f6502 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Wed, 27 May 2026 19:07:58 -0700
Subject: [PATCH 23/28] JIT: bucket frame layout candidates by alloc pass, sort
 only the straddling bucket

The frame layout heuristic in lvaComputeOptimalFrameLayoutOrder was scoring
every candidate strategy over all locals and sorting all locals up front. Most
of that work is wasted: when the locals are walked in allocation order, simOff
decreases monotonically, so at most one allocation bucket straddles the
disp8/disp32 boundary (simOff == -128). Buckets fully above it contribute a
fixed cost of refCnt*1; buckets fully below contribute refCnt*4. Only the
order of locals within the straddling bucket affects total cost.

The function now:

  * Buckets candidate locals by allocation pass before doing any cost work.
  * Walks the buckets once in allocation order to identify the (single)
    straddling bucket. If none exists, bails before doing the LIR ref-count
    walk.
  * Precomputes baseCost (and, at FullOpts, baseInitLo/baseInitHi) from non-
    straddling buckets.
  * Tightens the maxSavings short-circuit gate to use the actual straddling
    bucket walk (including alignment) instead of an over-approximation across
    all locals.
  * Sorts only the straddling-bucket slice per strategy and reuses the cost
    machinery just on that slice.
  * Skips the lclNeedsInit / useBlockInit / baseInit setup entirely at
    MinOpts (S4 is already disabled there).

Measured on libraries_tests_no_tiered_compilation.run.windows.x64.Release.mch:

  TP   Overall  +0.10% (was +0.28%)
       MinOpts  +1.25% (was +1.47%)
       FullOpts +0.07% (unchanged)

  Code size diffs preserved (slightly improved): -469,696/+29,555 bytes
  (was -459,519/+29,355).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 507 ++++++++++++++++++++++++------------
 1 file changed, 334 insertions(+), 173 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 30df90e50d9800..47054186f2d3b4 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -4867,81 +4867,6 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         return nullptr;
     }
 
-    // Pre-compute ref counts and weights into flat arrays for fast comparator access.
-    // For MinOpts/Tier0, precise ref counts are not available (all lvRefCnt == 0),
-    // so we do a lightweight LIR walk to count local references.
-    unsigned* lclRefCnt = new (this, CMK_LvaTable) unsigned[lvaCount];
-    weight_t* lclWeight = new (this, CMK_LvaTable) weight_t[lvaCount];
-    bool      isMinOpts = !PreciseRefCountsRequired();
-
-    if (isMinOpts)
-    {
-        memset(lclRefCnt, 0, lvaCount * sizeof(unsigned));
-
-        for (BasicBlock* const block : Blocks())
-        {
-            for (GenTree* node : LIR::AsRange(block))
-            {
-                if (node->OperIsAnyLocal())
-                {
-                    unsigned lclNum = node->AsLclVarCommon()->GetLclNum();
-                    if (lclNum < lvaCount)
-                    {
-                        lclRefCnt[lclNum]++;
-                    }
-                }
-            }
-        }
-
-        // For MinOpts, weighted = unweighted (no block weights available).
-        for (unsigned i = 0; i < lvaCount; i++)
-        {
-            lclWeight[i] = static_cast<weight_t>(lclRefCnt[i]);
-        }
-    }
-    else
-    {
-        for (unsigned i = 0; i < lvaCount; i++)
-        {
-            LclVarDsc* varDsc = lvaGetDesc(i);
-            lclRefCnt[i]      = varDsc->lvRefCnt(lvaRefCountState);
-            lclWeight[i]      = varDsc->lvRefCntWtd(lvaRefCountState);
-        }
-    }
-
-    // Quick upper bound on potential savings: walk locals in default order,
-    // accumulate frame size, and count weighted refs that fall beyond disp8 range.
-    // This approximation ignores alignment padding and allocation passes but is
-    // cheap to compute and lets us skip methods where reordering cannot help much.
-    {
-        int      simOff       = stkOffs;
-        unsigned refsInDisp32 = 0;
-
-        for (unsigned i = 0; i < lvaCount; i++)
-        {
-            if (lclSize[i] == 0)
-            {
-                continue;
-            }
-            simOff -= static_cast<int>(lclSize[i]);
-            if (simOff < -128)
-            {
-                refsInDisp32 += lclRefCnt[i];
-            }
-        }
-
-        // Each ref beyond disp8 costs 3 extra bytes (disp32 vs disp8 encoding).
-        // If even moving ALL those refs into disp8 range wouldn't save much, skip.
-        unsigned maxSavings = refsInDisp32 * 3;
-        if (maxSavings <= (unsigned)JitConfig.JitFrameLayoutMaxSavingsThreshold())
-        {
-            JITDUMP("Frame layout optimization: skipping — max possible savings is %u bytes "
-                    "(refsInDisp32=%u)\n",
-                    maxSavings, refsInDisp32);
-            return nullptr;
-        }
-    }
-
     // Pre-compute alignment requirements for each local.
     // 0 = no alignment needed, otherwise the required alignment in bytes.
     unsigned* lclAlignTo = new (this, CMK_LvaTable) unsigned[lvaCount];
@@ -4967,10 +4892,6 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         }
     }
 
-    JITDUMP("Frame layout optimization: trying strategies for %u locals "
-            "(estimated frame size %u bytes%s)\n",
-            lvaCount, estimatedLocalSize, isMinOpts ? ", using lightweight ref counts" : "");
-
     // Pre-compute which locals will be allocated in the main loop and their
     // pass category. Category 0 means "not allocatable" (skipped by the loop).
     unsigned* lclPassCategory = new (this, CMK_LvaTable) unsigned[lvaCount];
@@ -5028,156 +4949,369 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         }
     }
 
-    // Pre-compute which locals will likely need zero-initialization in the prolog.
-    // This approximates the logic in genCheckUseBlockInit (codegencommon.cpp).
-    // When block init is used, the JIT zeros a contiguous range [untrLclLo, untrLclHi]
-    // using SIMD stores. The code size depends on the span and alignment, so layouts
-    // that scatter init-requiring locals produce larger prologs.
-    bool*    lclNeedsInit  = new (this, CMK_LvaTable) bool[lvaCount];
-    unsigned initSlotCount = 0;
+    // Build per-pass buckets in allocOrder order. ALLOC_* values are powers of two;
+    // their bit indices (0..3) map to positions in allocOrder via passBitToAllocIdx.
+    const unsigned MAX_PASS_BITS = 4;
+    unsigned       passBitToAllocIdx[MAX_PASS_BITS];
+    for (unsigned k = 0; k < MAX_PASS_BITS; k++)
+    {
+        passBitToAllocIdx[k] = UINT_MAX;
+    }
+
+    unsigned allocOrderLen = 0;
+    for (unsigned p = 0; allocOrder[p] != 0; p++)
+    {
+        // The optimization is gated off for compDbgEnC (which merges ALLOC_PTRS into the
+        // previous pass), so each allocOrder entry here is a single ALLOC_* bit.
+        unsigned bit = BitOperations::Log2((unsigned)allocOrder[p]);
+        assert(bit < MAX_PASS_BITS);
+        assert(((unsigned)allocOrder[p] & ((unsigned)allocOrder[p] - 1)) == 0);
+        passBitToAllocIdx[bit] = p;
+        allocOrderLen++;
+    }
+    assert(allocOrderLen <= MAX_PASS_BITS);
+
+    unsigned passCount[MAX_PASS_BITS + 1] = {0};
     for (unsigned i = 0; i < lvaCount; i++)
     {
-        lclNeedsInit[i] = false;
         if (lclPassCategory[i] == 0)
         {
             continue;
         }
+        unsigned bit = BitOperations::Log2(lclPassCategory[i]);
+        unsigned p   = passBitToAllocIdx[bit];
+        assert(p != UINT_MAX);
+        passCount[p]++;
+    }
+
+    unsigned passStart[MAX_PASS_BITS + 1];
+    passStart[0] = 0;
+    for (unsigned p = 0; p < allocOrderLen; p++)
+    {
+        passStart[p + 1] = passStart[p] + passCount[p];
+    }
+    const unsigned numAllocatable = passStart[allocOrderLen];
 
-        LclVarDsc* varDsc = lvaGetDesc(i);
+    if (numAllocatable == 0)
+    {
+        return nullptr;
+    }
 
-        if (fgVarIsNeverZeroInitializedInProlog(i))
+    // Concatenated bucket array: allocatable locals first (grouped by allocOrder pass, in
+    // ascending lclNum within each pass), then non-allocatable locals at the tail.
+    unsigned* bucketLcls = new (this, CMK_LvaTable) unsigned[lvaCount];
+    unsigned  writePos[MAX_PASS_BITS];
+    for (unsigned p = 0; p < allocOrderLen; p++)
+    {
+        writePos[p] = passStart[p];
+    }
+    unsigned tailPos = numAllocatable;
+    for (unsigned i = 0; i < lvaCount; i++)
+    {
+        if (lclPassCategory[i] == 0)
         {
+            bucketLcls[tailPos++] = i;
             continue;
         }
-        if (lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+        unsigned bit              = BitOperations::Log2(lclPassCategory[i]);
+        unsigned p                = passBitToAllocIdx[bit];
+        bucketLcls[writePos[p]++] = i;
+    }
+
+    // Walk buckets in their original order to determine where the disp8/disp32 boundary
+    // (simOff == -128) falls. simOff decreases monotonically across the walk, so at most
+    // ONE bucket can straddle the boundary. Buckets entirely above -128 contribute fixed
+    // cost = refCnt * 1; buckets entirely below contribute fixed cost = refCnt * 4. Only
+    // the straddling bucket's internal order affects total cost.
+    int      bucketSimOffStart[MAX_PASS_BITS];
+    int      bucketSimOffEnd[MAX_PASS_BITS];
+    int      simOff         = stkOffs;
+    unsigned straddleBucket = UINT_MAX;
+    for (unsigned p = 0; p < allocOrderLen; p++)
+    {
+        bucketSimOffStart[p] = simOff;
+        for (unsigned k = passStart[p]; k < passStart[p + 1]; k++)
         {
-            continue;
+            unsigned lcl         = bucketLcls[k];
+            int      signedAlign = static_cast<int>(lclAlignTo[lcl]);
+            if ((signedAlign != 0) && ((simOff % signedAlign) != 0))
+            {
+                simOff -= signedAlign + (simOff % signedAlign);
+            }
+            simOff -= static_cast<int>(lclSize[lcl]);
         }
-        if (varDsc->lvHasExplicitInit)
+        bucketSimOffEnd[p] = simOff;
+
+        if ((bucketSimOffStart[p] > -128) && (bucketSimOffEnd[p] <= -128))
         {
-            continue;
+            straddleBucket = p;
         }
-        if ((varDsc->lvIsTemp) && !varDsc->HasGCPtr())
+    }
+
+    if (straddleBucket == UINT_MAX)
+    {
+        // The frame either fits entirely in disp8 (nothing to optimize) or every
+        // allocated bucket already starts past disp8 (reordering within a bucket
+        // can't pull refs into disp8 range). Bail.
+        JITDUMP("Frame layout optimization: skipping — no straddling bucket "
+                "(simOff at end = %d)\n",
+                simOff);
+        return nullptr;
+    }
+
+    // Pre-compute ref counts and weights. For MinOpts/Tier0, precise ref counts are not
+    // available (all lvRefCnt == 0), so we do a lightweight LIR walk to count local refs.
+    unsigned* lclRefCnt = new (this, CMK_LvaTable) unsigned[lvaCount];
+    weight_t* lclWeight = new (this, CMK_LvaTable) weight_t[lvaCount];
+    bool      isMinOpts = !PreciseRefCountsRequired();
+
+    if (isMinOpts)
+    {
+        memset(lclRefCnt, 0, lvaCount * sizeof(unsigned));
+
+        for (BasicBlock* const block : Blocks())
         {
-            continue;
+            for (GenTree* node : LIR::AsRange(block))
+            {
+                if (node->OperIsAnyLocal())
+                {
+                    unsigned lclNum = node->AsLclVarCommon()->GetLclNum();
+                    if (lclNum < lvaCount)
+                    {
+                        lclRefCnt[lclNum]++;
+                    }
+                }
+            }
         }
 
-        if ((info.compInitMem) || varDsc->HasGCPtr() || (varDsc->lvMustInit))
+        // For MinOpts, weighted = unweighted (no block weights available).
+        for (unsigned i = 0; i < lvaCount; i++)
+        {
+            lclWeight[i] = static_cast<weight_t>(lclRefCnt[i]);
+        }
+    }
+    else
+    {
+        for (unsigned i = 0; i < lvaCount; i++)
         {
-            lclNeedsInit[i] = true;
-            initSlotCount += (lclSize[i] + sizeof(int) - 1) / sizeof(int);
+            LclVarDsc* varDsc = lvaGetDesc(i);
+            lclRefCnt[i]      = varDsc->lvRefCnt(lvaRefCountState);
+            lclWeight[i]      = varDsc->lvRefCntWtd(lvaRefCountState);
         }
     }
 
-    // On AMD64, block init is used when initSlotCount > 4; on x86 when > 4.
-    // Block init zeros a contiguous range, so the code size depends on span.
-    // Individual init zeros each local separately, cost is independent of layout.
-    bool useBlockInit = (initSlotCount > 4);
+    // Precise upper bound on savings: walk the straddling bucket in its original order,
+    // count refs that land past disp8. Each such ref could save at most 3 bytes by being
+    // moved into disp8 range. Bail if even moving them all wouldn't help much.
+    unsigned maxSavings;
+    {
+        int      so           = bucketSimOffStart[straddleBucket];
+        unsigned refsInDisp32 = 0;
+        for (unsigned k = passStart[straddleBucket]; k < passStart[straddleBucket + 1]; k++)
+        {
+            unsigned lcl         = bucketLcls[k];
+            int      signedAlign = static_cast<int>(lclAlignTo[lcl]);
+            if ((signedAlign != 0) && ((so % signedAlign) != 0))
+            {
+                so -= signedAlign + (so % signedAlign);
+            }
+            so -= static_cast<int>(lclSize[lcl]);
+            if (so < -128)
+            {
+                refsInDisp32 += lclRefCnt[lcl];
+            }
+        }
+        maxSavings = refsInDisp32 * 3;
+        if (maxSavings <= (unsigned)JitConfig.JitFrameLayoutMaxSavingsThreshold())
+        {
+            JITDUMP("Frame layout optimization: skipping — max possible savings is %u bytes "
+                    "(refsInDisp32=%u)\n",
+                    maxSavings, refsInDisp32);
+            return nullptr;
+        }
+    }
 
-    // Simulate frame layout for a given sort order and return total encoding cost.
-    // Lower cost = better layout. Cost = Σ(refCnt × encodingBytes) where
-    // encodingBytes is 1 for disp8 (offset in [-128,+127]) or 4 for disp32.
-    // When block init is used, we also add a zero-init cost proportional to the
-    // span of init-requiring locals (larger span = more SIMD stores in the prolog).
-    auto estimateLayoutCost = [&](unsigned* order) -> unsigned {
-        unsigned totalCost = 0;
-        int      simOff    = stkOffs;
-        int      initLo    = 0;
-        int      initHi    = 0;
+    // Compute baseline cost from non-straddling buckets. These contributions are
+    // independent of any intra-bucket sort order, so they cancel out when comparing
+    // strategies. We still include them in totalCost for accurate JITDUMP output.
+    unsigned baseCost = 0;
+    for (unsigned p = 0; p < allocOrderLen; p++)
+    {
+        if (p == straddleBucket)
+        {
+            continue;
+        }
+        unsigned encoding = (bucketSimOffStart[p] <= -128) ? 4u : 1u;
+        for (unsigned k = passStart[p]; k < passStart[p + 1]; k++)
+        {
+            baseCost += lclRefCnt[bucketLcls[k]] * encoding;
+        }
+    }
 
-        for (int p = 0; allocOrder[p]; p++)
+    // Pre-compute zero-init data for the cost model. Only used at FullOpts where S4
+    // (initGroupedDensity) is active. At MinOpts S4 is skipped and the init-span term
+    // is small relative to the encoding-cost term, so we omit it.
+    bool* lclNeedsInit  = nullptr;
+    bool  useBlockInit  = false;
+    int   baseInitLo    = 0;
+    int   baseInitHi    = 0;
+    if (!isMinOpts)
+    {
+        lclNeedsInit          = new (this, CMK_LvaTable) bool[lvaCount];
+        unsigned initSlotCount = 0;
+        for (unsigned i = 0; i < lvaCount; i++)
         {
-            UINT pass = allocOrder[p];
-            for (unsigned idx = 0; idx < lvaCount; idx++)
+            lclNeedsInit[i] = false;
+            if (lclPassCategory[i] == 0)
             {
-                unsigned lcl = order[idx];
-                if (lclPassCategory[lcl] != pass)
-                {
-                    continue;
-                }
+                continue;
+            }
 
-                unsigned size    = lclSize[lcl];
-                unsigned alignTo = lclAlignTo[lcl];
-
-                // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset).
-                // Use signed arithmetic throughout: simOff is negative, and the remainder
-                // (simOff % alignment) is non-positive, so pad = alignment + remainder
-                // yields a small positive value in [1, alignment-1]. Mixing unsigned alignTo
-                // with the signed remainder would convert the negative remainder to a huge
-                // unsigned value and corrupt simOff.
-                int signedAlign = static_cast<int>(alignTo);
-                if ((signedAlign != 0) && ((simOff % signedAlign) != 0))
-                {
-                    simOff -= signedAlign + (simOff % signedAlign);
-                }
+            LclVarDsc* varDsc = lvaGetDesc(i);
 
-                simOff -= static_cast<int>(size);
+            if (fgVarIsNeverZeroInitializedInProlog(i))
+            {
+                continue;
+            }
+            if (lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+            {
+                continue;
+            }
+            if (varDsc->lvHasExplicitInit)
+            {
+                continue;
+            }
+            if ((varDsc->lvIsTemp) && !varDsc->HasGCPtr())
+            {
+                continue;
+            }
 
-                totalCost += lclRefCnt[lcl] * ((simOff >= -128) ? 1u : 4u);
+            if ((info.compInitMem) || varDsc->HasGCPtr() || (varDsc->lvMustInit))
+            {
+                lclNeedsInit[i] = true;
+                initSlotCount += (lclSize[i] + sizeof(int) - 1) / sizeof(int);
+            }
+        }
+        useBlockInit = (initSlotCount > 4);
 
-                // Track the zero-init span for block-init cost estimation.
-                if (useBlockInit && (lclNeedsInit[lcl]))
+        // If block init applies, precompute the init-span contribution from non-straddling
+        // buckets. The straddling bucket's contribution is folded in by walkStraddle.
+        if (useBlockInit)
+        {
+            int so = stkOffs;
+            for (unsigned p = 0; p < allocOrderLen; p++)
+            {
+                if (p == straddleBucket)
+                {
+                    so = bucketSimOffEnd[p];
+                    continue;
+                }
+                for (unsigned k = passStart[p]; k < passStart[p + 1]; k++)
                 {
-                    int loOffs = simOff;
-                    int hiOffs = simOff + static_cast<int>(size);
-                    if ((initLo == 0) && (initHi == 0))
+                    unsigned lcl         = bucketLcls[k];
+                    int      signedAlign = static_cast<int>(lclAlignTo[lcl]);
+                    if ((signedAlign != 0) && ((so % signedAlign) != 0))
                     {
-                        initLo = loOffs;
-                        initHi = hiOffs;
+                        so -= signedAlign + (so % signedAlign);
                     }
-                    else
+                    so -= static_cast<int>(lclSize[lcl]);
+                    if (lclNeedsInit[lcl])
                     {
-                        initLo = min(initLo, loOffs);
-                        initHi = max(initHi, hiOffs);
+                        int loOffs = so;
+                        int hiOffs = so + static_cast<int>(lclSize[lcl]);
+                        if ((baseInitLo == 0) && (baseInitHi == 0))
+                        {
+                            baseInitLo = loOffs;
+                            baseInitHi = hiOffs;
+                        }
+                        else
+                        {
+                            baseInitLo = min(baseInitLo, loOffs);
+                            baseInitHi = max(baseInitHi, hiOffs);
+                        }
                     }
                 }
             }
         }
+    }
+
+    JITDUMP("Frame layout optimization: trying strategies for %u locals "
+            "(estimated frame size %u bytes%s, straddle bucket=%u of %u, baseCost=%u, "
+            "maxSavings=%u)\n",
+            lvaCount, estimatedLocalSize, isMinOpts ? ", using lightweight ref counts" : "",
+            straddleBucket, allocOrderLen, baseCost, maxSavings);
+
+    const unsigned straddleStart       = passStart[straddleBucket];
+    const unsigned straddleCount       = passStart[straddleBucket + 1] - straddleStart;
+    const int      straddleSimOffEntry = bucketSimOffStart[straddleBucket];
+
+    // Compute the straddling bucket's cost contribution given a particular intra-bucket order.
+    // Folds in the init-span penalty when block init is in use.
+    auto walkStraddle = [&](unsigned* order) -> unsigned {
+        unsigned cost   = 0;
+        int      so     = straddleSimOffEntry;
+        int      initLo = baseInitLo;
+        int      initHi = baseInitHi;
+        for (unsigned k = 0; k < straddleCount; k++)
+        {
+            unsigned lcl         = order[k];
+            int      signedAlign = static_cast<int>(lclAlignTo[lcl]);
+            if ((signedAlign != 0) && ((so % signedAlign) != 0))
+            {
+                so -= signedAlign + (so % signedAlign);
+            }
+            so -= static_cast<int>(lclSize[lcl]);
+            cost += lclRefCnt[lcl] * ((so >= -128) ? 1u : 4u);
+
+            if (useBlockInit && (lclNeedsInit[lcl]))
+            {
+                int loOffs = so;
+                int hiOffs = so + static_cast<int>(lclSize[lcl]);
+                if ((initLo == 0) && (initHi == 0))
+                {
+                    initLo = loOffs;
+                    initHi = hiOffs;
+                }
+                else
+                {
+                    initLo = min(initLo, loOffs);
+                    initHi = max(initHi, hiOffs);
+                }
+            }
+        }
 
-        // Add zero-init prolog cost when block init will be used.
-        // The JIT zeros the contiguous range [initLo, initHi) using SIMD stores.
-        // Each 16-byte chunk requires one SIMD store instruction. We add a
-        // small penalty per chunk to favor layouts that keep the init span tight,
-        // without overwhelming the main encoding cost.
         if (useBlockInit && (initHi > initLo))
         {
             unsigned initSpan = static_cast<unsigned>(initHi - initLo);
             unsigned initCost = ((initSpan + 15) / 16) * 2;
-            totalCost += initCost;
+            cost += initCost;
         }
 
-        return totalCost;
+        return cost;
     };
 
-    unsigned* sortOrder = new (this, CMK_LvaTable) unsigned[lvaCount];
-    unsigned* bestOrder = new (this, CMK_LvaTable) unsigned[lvaCount];
-    for (unsigned i = 0; i < lvaCount; i++)
-    {
-        sortOrder[i] = i;
-    }
+    unsigned* straddleOrder     = new (this, CMK_LvaTable) unsigned[straddleCount];
+    unsigned* bestStraddleOrder = new (this, CMK_LvaTable) unsigned[straddleCount];
+    memcpy(straddleOrder, &bucketLcls[straddleStart], straddleCount * sizeof(unsigned));
+    memcpy(bestStraddleOrder, straddleOrder, straddleCount * sizeof(unsigned));
 
     // Score the original (unsorted) order as baseline.
-    unsigned    origCost     = estimateLayoutCost(sortOrder);
+    unsigned    origCost     = baseCost + walkStraddle(straddleOrder);
     unsigned    bestCost     = origCost;
-    int         bestStrategy = -1; // -1 = original order
+    int         bestStrategy = -1;
     const char* bestName     = "original";
 
-    // Helper to try a strategy: sort sortOrder, estimate cost, track if best.
-    // When a strategy improves on the current best, we save its permutation
-    // into bestOrder to avoid a redundant re-sort at the end.
+    // Helper to try a strategy: sort straddleOrder, score, track if best.
     auto tryStrategy = [&](int strategyIdx, const char* name, auto comparator) -> unsigned {
-        for (unsigned i = 0; i < lvaCount; i++)
-        {
-            sortOrder[i] = i;
-        }
-        jitstd::sort(sortOrder, sortOrder + lvaCount, comparator);
-        unsigned cost = estimateLayoutCost(sortOrder);
+        memcpy(straddleOrder, &bucketLcls[straddleStart], straddleCount * sizeof(unsigned));
+        jitstd::sort(straddleOrder, straddleOrder + straddleCount, comparator);
+        unsigned cost = baseCost + walkStraddle(straddleOrder);
         if (cost < bestCost)
         {
             bestCost     = cost;
             bestStrategy = strategyIdx;
             bestName     = name;
-            memcpy(bestOrder, sortOrder, lvaCount * sizeof(unsigned));
+            memcpy(bestStraddleOrder, straddleOrder, straddleCount * sizeof(unsigned));
         }
         return cost;
     };
@@ -5292,10 +5426,37 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         initGroupedDensityCost = tryStrategy(4, "initGroupedDensity", initGroupedDensityCompare);
     }
 
-    // Return the winning permutation (saved in bestOrder), or nullptr if original won.
     if (bestStrategy < 0)
     {
-        bestOrder = nullptr;
+        JITDUMP("Frame layout costs: original=%u density=%u sizeAsc=%u weight=%u refDensity=%u "
+                "initGroupedDensity=%u; original order is best, no change\n",
+                origCost, densityCost, sizeAscCost, weightCost, refDensityCost, initGroupedDensityCost);
+        return nullptr;
+    }
+
+    // Assemble the final permutation: each bucket in its original order, EXCEPT the
+    // straddling bucket, which uses the best strategy's intra-bucket sort; followed by
+    // non-allocatable locals (the caller filters those out anyway).
+    unsigned* bestOrder = new (this, CMK_LvaTable) unsigned[lvaCount];
+    unsigned  outIdx    = 0;
+    for (unsigned p = 0; p < allocOrderLen; p++)
+    {
+        if (p == straddleBucket)
+        {
+            memcpy(&bestOrder[outIdx], bestStraddleOrder, straddleCount * sizeof(unsigned));
+            outIdx += straddleCount;
+        }
+        else
+        {
+            unsigned bucketSize = passStart[p + 1] - passStart[p];
+            memcpy(&bestOrder[outIdx], &bucketLcls[passStart[p]], bucketSize * sizeof(unsigned));
+            outIdx += bucketSize;
+        }
+    }
+    if (outIdx < lvaCount)
+    {
+        memcpy(&bestOrder[outIdx], &bucketLcls[numAllocatable],
+               (lvaCount - numAllocatable) * sizeof(unsigned));
     }
 
     JITDUMP("Frame layout costs: original=%u density=%u sizeAsc=%u weight=%u refDensity=%u "

From 32b6b75e81d909c02fc55c50366aee70732d132a Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Thu, 28 May 2026 06:53:15 -0700
Subject: [PATCH 24/28] JIT: tighten straddle-bucket end check, run jit-format

Review feedback: when bucketSimOffEnd[p] == -128 the last local in the bucket
sits exactly at -128 (still disp8) and the bucket is fully disp8, not
straddling. Tighten the end-side check from <= -128 to < -128 so we don't run
the straddler sort over a bucket whose internal order can't change cost.

Also apply jit-format whitespace adjustments.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 47054186f2d3b4..d6b714c6786951 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5042,7 +5042,12 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         }
         bucketSimOffEnd[p] = simOff;
 
-        if ((bucketSimOffStart[p] > -128) && (bucketSimOffEnd[p] <= -128))
+        // A bucket straddles the disp8/disp32 boundary when its first local can land in disp8
+        // (simOff at entry strictly above -128, so at least 1 byte of disp8 budget remains) and
+        // its last local lands in disp32 (simOff after allocation strictly below -128).
+        // simOffEnd == -128 means the last local sits exactly at -128 (still disp8), so the
+        // bucket is fully disp8 and not a straddler.
+        if ((bucketSimOffStart[p] > -128) && (bucketSimOffEnd[p] < -128))
         {
             straddleBucket = p;
         }
@@ -5151,13 +5156,13 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     // Pre-compute zero-init data for the cost model. Only used at FullOpts where S4
     // (initGroupedDensity) is active. At MinOpts S4 is skipped and the init-span term
     // is small relative to the encoding-cost term, so we omit it.
-    bool* lclNeedsInit  = nullptr;
-    bool  useBlockInit  = false;
-    int   baseInitLo    = 0;
-    int   baseInitHi    = 0;
+    bool* lclNeedsInit = nullptr;
+    bool  useBlockInit = false;
+    int   baseInitLo   = 0;
+    int   baseInitHi   = 0;
     if (!isMinOpts)
     {
-        lclNeedsInit          = new (this, CMK_LvaTable) bool[lvaCount];
+        lclNeedsInit           = new (this, CMK_LvaTable) bool[lvaCount];
         unsigned initSlotCount = 0;
         for (unsigned i = 0; i < lvaCount; i++)
         {
@@ -5238,8 +5243,8 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     JITDUMP("Frame layout optimization: trying strategies for %u locals "
             "(estimated frame size %u bytes%s, straddle bucket=%u of %u, baseCost=%u, "
             "maxSavings=%u)\n",
-            lvaCount, estimatedLocalSize, isMinOpts ? ", using lightweight ref counts" : "",
-            straddleBucket, allocOrderLen, baseCost, maxSavings);
+            lvaCount, estimatedLocalSize, isMinOpts ? ", using lightweight ref counts" : "", straddleBucket,
+            allocOrderLen, baseCost, maxSavings);
 
     const unsigned straddleStart       = passStart[straddleBucket];
     const unsigned straddleCount       = passStart[straddleBucket + 1] - straddleStart;
@@ -5455,8 +5460,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     }
     if (outIdx < lvaCount)
     {
-        memcpy(&bestOrder[outIdx], &bucketLcls[numAllocatable],
-               (lvaCount - numAllocatable) * sizeof(unsigned));
+        memcpy(&bestOrder[outIdx], &bucketLcls[numAllocatable], (lvaCount - numAllocatable) * sizeof(unsigned));
     }
 
     JITDUMP("Frame layout costs: original=%u density=%u sizeAsc=%u weight=%u refDensity=%u "

From 2ad09144ccb368b66a179d6b78a4ad925925fae9 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Thu, 28 May 2026 07:36:27 -0700
Subject: [PATCH 25/28] JIT: address PR review feedback for frame layout
 heuristic

* Mirror skip cases from lvaAssignVirtualFrameOffsetsToLocals more fully:
  also exclude lvaIsUnknownSizeLocal (handled by lvaAllocUnknownSizeLocal),
  lvaAsyncThreadObjectVar, and lvaLocAllocSPvar (JIT32_GCENCODER). Without
  these exclusions, the simulated simOff walk could diverge from the real
  layout for methods using those locals.

* Replace the (lo == 0 && hi == 0) 'empty init span' sentinel with an explicit
  hasInit flag in both the non-straddling-bucket precompute and walkStraddle.
  Offset 0 is a legitimate hiOffs value (e.g. first init-needing local of size
  s placed at so == -s), so the prior sentinel could spuriously reset the
  recorded init span and produce an inaccurate FullOpts cost estimate.

* Document the alignment-modeling approximation: lclAlignTo does not model
  the x86-only DOUBLE_ALIGN / mustDoubleAlign / have_LclVarDoubleAlign rules,
  so the simulated simOff can drift by a pointer-sized slot on x86 double-
  aligned frames. The real allocator still runs unchanged.

Code size on libraries_tests_no_tiered_compilation.run.windows.x64.Release:
  -469,888 / +29,504 bytes  (was -469,696 / +29,555).
TP unchanged within noise (MinOpts +1.26%, FullOpts +0.07%, Overall +0.10%).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 54 ++++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 12 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index d6b714c6786951..6ce64bbab16496 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -4869,6 +4869,18 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
 
     // Pre-compute alignment requirements for each local.
     // 0 = no alignment needed, otherwise the required alignment in bytes.
+    //
+    // NOTE: This is an approximation of the alignment behavior in
+    // lvaAssignVirtualFrameOffsetsToLocals + lvaAllocLocalAndSetVirtualOffset.
+    // In particular it does NOT model the x86-only DOUBLE_ALIGN /
+    // mustDoubleAlign / lvaIncrementFrameSize path for TYP_DOUBLE / TYP_LONG /
+    // lvStructDoubleAlign, nor the cross-bucket have_LclVarDoubleAlign
+    // pre-reservation. Frames where those apply may see the simulated simOff
+    // drift from the real layout by a pointer-sized slot, causing the
+    // straddler boundary and per-strategy cost estimate to be slightly
+    // imprecise. This is a heuristic, not a correctness path: the real
+    // allocator still runs unchanged and only the chosen sort permutation
+    // is affected.
     unsigned* lclAlignTo = new (this, CMK_LvaTable) unsigned[lvaCount];
     for (unsigned i = 0; i < lvaCount; i++)
     {
@@ -4922,11 +4934,25 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         {
             continue;
         }
+        if (lvaIsUnknownSizeLocal(i))
+        {
+            // The real loop calls lvaAllocUnknownSizeLocal for these; their stack home
+            // size is not modeled by lvaLclStackHomeSize so simulating them would skew
+            // the simOff walk.
+            continue;
+        }
         if (i == lvaRetAddrVar)
         {
             continue;
         }
-        if ((i == lvaMonAcquired) || (i == lvaAsyncExecutionContextVar) || (i == lvaAsyncSynchronizationContextVar))
+#ifdef JIT32_GCENCODER
+        if (i == lvaLocAllocSPvar)
+        {
+            continue;
+        }
+#endif
+        if ((i == lvaMonAcquired) || (i == lvaAsyncThreadObjectVar) || (i == lvaAsyncExecutionContextVar) ||
+            (i == lvaAsyncSynchronizationContextVar))
         {
             continue;
         }
@@ -5160,6 +5186,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     bool  useBlockInit = false;
     int   baseInitLo   = 0;
     int   baseInitHi   = 0;
+    bool  baseHasInit  = false;
     if (!isMinOpts)
     {
         lclNeedsInit           = new (this, CMK_LvaTable) bool[lvaCount];
@@ -5224,10 +5251,11 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
                     {
                         int loOffs = so;
                         int hiOffs = so + static_cast<int>(lclSize[lcl]);
-                        if ((baseInitLo == 0) && (baseInitHi == 0))
+                        if (!baseHasInit)
                         {
-                            baseInitLo = loOffs;
-                            baseInitHi = hiOffs;
+                            baseInitLo  = loOffs;
+                            baseInitHi  = hiOffs;
+                            baseHasInit = true;
                         }
                         else
                         {
@@ -5253,10 +5281,11 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
     // Compute the straddling bucket's cost contribution given a particular intra-bucket order.
     // Folds in the init-span penalty when block init is in use.
     auto walkStraddle = [&](unsigned* order) -> unsigned {
-        unsigned cost   = 0;
-        int      so     = straddleSimOffEntry;
-        int      initLo = baseInitLo;
-        int      initHi = baseInitHi;
+        unsigned cost    = 0;
+        int      so      = straddleSimOffEntry;
+        int      initLo  = baseInitLo;
+        int      initHi  = baseInitHi;
+        bool     hasInit = baseHasInit;
         for (unsigned k = 0; k < straddleCount; k++)
         {
             unsigned lcl         = order[k];
@@ -5272,10 +5301,11 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
             {
                 int loOffs = so;
                 int hiOffs = so + static_cast<int>(lclSize[lcl]);
-                if ((initLo == 0) && (initHi == 0))
+                if (!hasInit)
                 {
-                    initLo = loOffs;
-                    initHi = hiOffs;
+                    initLo  = loOffs;
+                    initHi  = hiOffs;
+                    hasInit = true;
                 }
                 else
                 {
@@ -5285,7 +5315,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
             }
         }
 
-        if (useBlockInit && (initHi > initLo))
+        if (useBlockInit && hasInit && (initHi > initLo))
         {
             unsigned initSpan = static_cast<unsigned>(initHi - initLo);
             unsigned initCost = ((initSpan + 15) / 16) * 2;

From aef42a711872e15676ae7044c0493ff66aa09eda Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Thu, 28 May 2026 16:57:03 -0700
Subject: [PATCH 26/28] JIT: extend frame layout sort to all pre-straddler
 buckets

The bucket-and-straddle layout previously sorted only the straddling
bucket's locals. Intra-bucket ordering of the non-straddling buckets
that precede the straddler still affects alignment padding, which
shifts the straddler's entry offset and can let more refs fit in disp8.

Extend the cost search to also re-order locals within each pre-straddler
bucket (preserving bucket boundaries), using the same comparator as the
straddler. Post-straddler buckets remain in canonical order; they are
fully disp32 and their cost is invariant under reordering.

Recovers ~49% of the code-size savings lost relative to the pre-rewrite
layout, with no measurable throughput cost.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 128 ++++++++++++++++++++++++++----------
 1 file changed, 95 insertions(+), 33 deletions(-)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 6ce64bbab16496..8257f40f417dc1 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5226,18 +5226,16 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         }
         useBlockInit = (initSlotCount > 4);
 
-        // If block init applies, precompute the init-span contribution from non-straddling
-        // buckets. The straddling bucket's contribution is folded in by walkStraddle.
+        // If block init applies, precompute the init-span contribution from POST-straddler
+        // buckets only. Pre-straddler buckets contribute order-dependent simOffs (since each
+        // strategy re-sorts them), so their init-span contribution is folded in per-strategy
+        // by walkLayout. The straddler's contribution is also folded in by walkLayout.
+        // Post-straddler buckets use canonical simOffs computed from bucketSimOffEnd[straddleBucket].
         if (useBlockInit)
         {
-            int so = stkOffs;
-            for (unsigned p = 0; p < allocOrderLen; p++)
+            int so = bucketSimOffEnd[straddleBucket];
+            for (unsigned p = straddleBucket + 1; p < allocOrderLen; p++)
             {
-                if (p == straddleBucket)
-                {
-                    so = bucketSimOffEnd[p];
-                    continue;
-                }
                 for (unsigned k = passStart[p]; k < passStart[p + 1]; k++)
                 {
                     unsigned lcl         = bucketLcls[k];
@@ -5274,21 +5272,64 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
             lvaCount, estimatedLocalSize, isMinOpts ? ", using lightweight ref counts" : "", straddleBucket,
             allocOrderLen, baseCost, maxSavings);
 
-    const unsigned straddleStart       = passStart[straddleBucket];
-    const unsigned straddleCount       = passStart[straddleBucket + 1] - straddleStart;
-    const int      straddleSimOffEntry = bucketSimOffStart[straddleBucket];
+    const unsigned straddleStart    = passStart[straddleBucket];
+    const unsigned straddleCount    = passStart[straddleBucket + 1] - straddleStart;
+    const unsigned preStraddleCount = straddleStart;
+
+    // Pre-straddler buckets (when present) participate in each strategy's sort.
+    // Reordering inside a pre-straddler bucket can change alignment padding,
+    // which shifts the straddler's entry simOff and can pull more refs into disp8.
+    unsigned* preStraddleOrder = (preStraddleCount > 0) ? new (this, CMK_LvaTable) unsigned[preStraddleCount] : nullptr;
+    unsigned* bestPreStraddleOrder =
+        (preStraddleCount > 0) ? new (this, CMK_LvaTable) unsigned[preStraddleCount] : nullptr;
+    unsigned* straddleOrder     = new (this, CMK_LvaTable) unsigned[straddleCount];
+    unsigned* bestStraddleOrder = new (this, CMK_LvaTable) unsigned[straddleCount];
 
-    // Compute the straddling bucket's cost contribution given a particular intra-bucket order.
-    // Folds in the init-span penalty when block init is in use.
-    auto walkStraddle = [&](unsigned* order) -> unsigned {
+    // Walk the layout from the start of frame through the end of the straddler,
+    // using the given pre-straddler and straddler orders. Returns the variable
+    // part of the cost: the straddler's encoding cost plus the init-span penalty.
+    // (Non-straddler bucket encoding costs are invariant w.r.t. order and live in baseCost.)
+    auto walkLayout = [&](unsigned* preOrder, unsigned* strOrder) -> unsigned {
         unsigned cost    = 0;
-        int      so      = straddleSimOffEntry;
+        int      so      = stkOffs;
         int      initLo  = baseInitLo;
         int      initHi  = baseInitHi;
         bool     hasInit = baseHasInit;
+
+        // Pre-straddler buckets: walk for alignment padding (which shifts the straddler
+        // entry simOff) and for init-span contribution. Encoding cost is invariant.
+        for (unsigned k = 0; k < preStraddleCount; k++)
+        {
+            unsigned lcl         = preOrder[k];
+            int      signedAlign = static_cast<int>(lclAlignTo[lcl]);
+            if ((signedAlign != 0) && ((so % signedAlign) != 0))
+            {
+                so -= signedAlign + (so % signedAlign);
+            }
+            so -= static_cast<int>(lclSize[lcl]);
+
+            if (useBlockInit && (lclNeedsInit[lcl]))
+            {
+                int loOffs = so;
+                int hiOffs = so + static_cast<int>(lclSize[lcl]);
+                if (!hasInit)
+                {
+                    initLo  = loOffs;
+                    initHi  = hiOffs;
+                    hasInit = true;
+                }
+                else
+                {
+                    initLo = min(initLo, loOffs);
+                    initHi = max(initHi, hiOffs);
+                }
+            }
+        }
+
+        // Straddler bucket: encoding cost varies with order; init span continues to accumulate.
         for (unsigned k = 0; k < straddleCount; k++)
         {
-            unsigned lcl         = order[k];
+            unsigned lcl         = strOrder[k];
             int      signedAlign = static_cast<int>(lclAlignTo[lcl]);
             if ((signedAlign != 0) && ((so % signedAlign) != 0))
             {
@@ -5325,27 +5366,42 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         return cost;
     };
 
-    unsigned* straddleOrder     = new (this, CMK_LvaTable) unsigned[straddleCount];
-    unsigned* bestStraddleOrder = new (this, CMK_LvaTable) unsigned[straddleCount];
+    // Score the original (unsorted) order as baseline.
+    if (preStraddleCount > 0)
+    {
+        memcpy(preStraddleOrder, bucketLcls, preStraddleCount * sizeof(unsigned));
+        memcpy(bestPreStraddleOrder, preStraddleOrder, preStraddleCount * sizeof(unsigned));
+    }
     memcpy(straddleOrder, &bucketLcls[straddleStart], straddleCount * sizeof(unsigned));
     memcpy(bestStraddleOrder, straddleOrder, straddleCount * sizeof(unsigned));
-
-    // Score the original (unsorted) order as baseline.
-    unsigned    origCost     = baseCost + walkStraddle(straddleOrder);
+    unsigned    origCost     = baseCost + walkLayout(preStraddleOrder, straddleOrder);
     unsigned    bestCost     = origCost;
     int         bestStrategy = -1;
     const char* bestName     = "original";
 
-    // Helper to try a strategy: sort straddleOrder, score, track if best.
+    // Helper to try a strategy: sort each pre-straddler bucket and the straddler
+    // independently with the comparator (bucket boundaries are preserved), then
+    // score with walkLayout and track if best.
     auto tryStrategy = [&](int strategyIdx, const char* name, auto comparator) -> unsigned {
+        for (unsigned p = 0; p < straddleBucket; p++)
+        {
+            unsigned bStart = passStart[p];
+            unsigned bEnd   = passStart[p + 1];
+            memcpy(&preStraddleOrder[bStart], &bucketLcls[bStart], (bEnd - bStart) * sizeof(unsigned));
+            jitstd::sort(&preStraddleOrder[bStart], &preStraddleOrder[bEnd], comparator);
+        }
         memcpy(straddleOrder, &bucketLcls[straddleStart], straddleCount * sizeof(unsigned));
         jitstd::sort(straddleOrder, straddleOrder + straddleCount, comparator);
-        unsigned cost = baseCost + walkStraddle(straddleOrder);
+        unsigned cost = baseCost + walkLayout(preStraddleOrder, straddleOrder);
         if (cost < bestCost)
         {
             bestCost     = cost;
             bestStrategy = strategyIdx;
             bestName     = name;
+            if (preStraddleCount > 0)
+            {
+                memcpy(bestPreStraddleOrder, preStraddleOrder, preStraddleCount * sizeof(unsigned));
+            }
             memcpy(bestStraddleOrder, straddleOrder, straddleCount * sizeof(unsigned));
         }
         return cost;
@@ -5469,24 +5525,30 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a
         return nullptr;
     }
 
-    // Assemble the final permutation: each bucket in its original order, EXCEPT the
-    // straddling bucket, which uses the best strategy's intra-bucket sort; followed by
-    // non-allocatable locals (the caller filters those out anyway).
+    // Assemble the final permutation:
+    //   - Pre-straddler buckets use the best strategy's intra-bucket sort.
+    //   - The straddler bucket uses the best strategy's intra-bucket sort.
+    //   - Post-straddler buckets stay in canonical order.
+    //   - Non-allocatable locals follow as the tail (the caller filters those out anyway).
     unsigned* bestOrder = new (this, CMK_LvaTable) unsigned[lvaCount];
     unsigned  outIdx    = 0;
     for (unsigned p = 0; p < allocOrderLen; p++)
     {
-        if (p == straddleBucket)
+        unsigned bStart     = passStart[p];
+        unsigned bucketSize = passStart[p + 1] - bStart;
+        if (p < straddleBucket)
+        {
+            memcpy(&bestOrder[outIdx], &bestPreStraddleOrder[bStart], bucketSize * sizeof(unsigned));
+        }
+        else if (p == straddleBucket)
         {
-            memcpy(&bestOrder[outIdx], bestStraddleOrder, straddleCount * sizeof(unsigned));
-            outIdx += straddleCount;
+            memcpy(&bestOrder[outIdx], bestStraddleOrder, bucketSize * sizeof(unsigned));
         }
         else
         {
-            unsigned bucketSize = passStart[p + 1] - passStart[p];
-            memcpy(&bestOrder[outIdx], &bucketLcls[passStart[p]], bucketSize * sizeof(unsigned));
-            outIdx += bucketSize;
+            memcpy(&bestOrder[outIdx], &bucketLcls[bStart], bucketSize * sizeof(unsigned));
         }
+        outIdx += bucketSize;
     }
     if (outIdx < lvaCount)
     {

From 149dc418038bc70b55cf42c5d568fdb139c72930 Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Thu, 28 May 2026 18:05:14 -0700
Subject: [PATCH 27/28] JIT: lower JitFrameLayoutMaxSavingsThreshold default
 from 12 to 0

The threshold pruned strategies that could save at most N bytes of
encoding. With the bucket-and-straddle search, the underlying sort is
cheap enough that the prune buys almost no throughput while it gives up
real code-size opportunities (~68K bytes across linux-x64 SPMI collections).
Drop the bound to 0 so the search is attempted whenever any saving is
possible.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/jitconfigvalues.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 75df1430b9e6f7..1dea92af8fb938 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -697,7 +697,7 @@ RELEASE_CONFIG_INTEGER(JitExtDefaultPolicyProfScale, "JitExtDefaultPolicyProfSca
 RELEASE_CONFIG_INTEGER(JitInlinePolicyModel, "JitInlinePolicyModel", 0)
 RELEASE_CONFIG_INTEGER(JitInlinePolicyProfile, "JitInlinePolicyProfile", 0)
 RELEASE_CONFIG_INTEGER(JitInlinePolicyProfileThreshold, "JitInlinePolicyProfileThreshold", 40)
-RELEASE_CONFIG_INTEGER(JitFrameLayoutMaxSavingsThreshold, "JitFrameLayoutMaxSavingsThreshold", 12)
+RELEASE_CONFIG_INTEGER(JitFrameLayoutMaxSavingsThreshold, "JitFrameLayoutMaxSavingsThreshold", 0)
 CONFIG_STRING(JitObjectStackAllocationRange, "JitObjectStackAllocationRange")
 RELEASE_CONFIG_INTEGER(JitObjectStackAllocation, "JitObjectStackAllocation", 1)
 RELEASE_CONFIG_INTEGER(JitObjectStackAllocationRefClass, "JitObjectStackAllocationRefClass", 1)

From 5550568a086b479d7fd99c9b08a941ff6bf7effd Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Thu, 28 May 2026 19:38:16 -0700
Subject: [PATCH 28/28] JIT: bail from lvaComputeOptimalFrameLayoutOrder when
 lvaCount == 0

Methods with no locals (e.g. small leaf wrappers around helper calls)
were tripping the arenaAllocator 'size != 0' assert via the leading
'new unsigned[lvaCount]' allocation. Bail out at the function entry
when lvaCount is zero so we never make a zero-sized arena allocation.

This fixes a checked-build crossgen2 CoreLib failure on x86 surfaced
by 'Build linux-x86 checked CoreCLR' and the Windows x86 checked legs.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/lclvars.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index 8257f40f417dc1..b408c2544549a9 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -4848,6 +4848,13 @@ enum LclAllocCategory : UINT
 //
 unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* allocOrder)
 {
+    // No locals at all -- nothing to lay out, and we mustn't make zero-sized arena
+    // allocations below.
+    if (lvaCount == 0)
+    {
+        return nullptr;
+    }
+
     // Pre-compute local sizes and total estimated frame size in one pass.
     // These arrays are indexed by lclNum and used throughout to avoid repeated
     // function calls in sort comparators and cost estimation.