From c08c441cebf916638edc668732a14899f33592db Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sat, 2 May 2026 23:41:19 +0000 Subject: [PATCH 01/28] JIT: Sort stack locals by access density for smaller code on x64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sort local variables by access density (weighted ref count / size) before frame layout in lvaAssignVirtualFrameOffsetsToLocals(). This packs locals with the highest access frequency per byte into the disp8 zone (±128 bytes from the frame pointer), reducing 4-byte displacements to 1-byte encodings. SPMI aspnet2 results: -5,982 bytes (-0.23%), 256 improvements vs 103 regressions, PerfScore -0.01%. Gated to TARGET_AMD64, FullOpts, non-EnC, frames > 128 bytes estimated size. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 101 +++++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index ba92c8035fd012..453708766e8cc0 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5156,6 +5156,98 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() UINT assignMore = 0xFFFFFFFF; bool have_LclVarDoubleAlign = false; +#ifdef TARGET_AMD64 + // Build a sorted array of local variable indices to optimize displacement encoding. + // On x64, stack accesses within [-128, +127] of the base register use a 1-byte + // displacement, while larger offsets require 4 bytes — saving 3 bytes per access + // promoted from disp32 to disp8. + // + // The sort uses access density (weighted ref count / local size) as the primary key + // so that locals with the highest access frequency per byte of frame space get the + // smallest offsets. This maximizes the total number of hot accesses that fit within + // the disp8 zone (~128 bytes from the frame pointer). 
As a secondary key, locals + // requiring 8-byte alignment are grouped before smaller locals to reduce alignment + // padding waste. + // (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.) + // + // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called with + // FINAL_FRAME_LAYOUT (no tentative layout exists), so there is no "offsets must not get + // bigger" invariant to preserve between passes. + // + // We skip this optimization for EnC (which requires stable layout) and when ref counts + // are not available. + assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT); + unsigned* lclVarSortOrder = nullptr; + if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts()) + { + // Estimate total local frame size to decide if sorting is worthwhile. + // Only sort when the frame exceeds the disp8 boundary (128 bytes); + // in smaller frames, all locals already fit in disp8 and sorting just churns + // offsets without benefit. + unsigned estimatedLocalSize = 0; + for (unsigned i = 0; i < lvaCount; i++) + { + estimatedLocalSize += lvaLclStackHomeSize(i); + } + + if (estimatedLocalSize > 128) + { + JITDUMP("Sorting %u locals by access density for frame layout optimization " + "(estimated frame size %u bytes)\n", + lvaCount, estimatedLocalSize); + + lclVarSortOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; + for (unsigned i = 0; i < lvaCount; i++) + { + lclVarSortOrder[i] = i; + } + + jitstd::sort(lclVarSortOrder, lclVarSortOrder + lvaCount, + [this](unsigned n1, unsigned n2) -> bool { + const LclVarDsc* dsc1 = lvaGetDesc(n1); + const LclVarDsc* dsc2 = lvaGetDesc(n2); + + // Sort by access density (weighted ref count per byte) descending. + // This maximizes the number of hot accesses that fit within the + // disp8 zone (first ~128 bytes of frame). A small hot local is + // more valuable per frame byte than a large hot local. 
+ unsigned size1 = lvaLclStackHomeSize(n1); + unsigned size2 = lvaLclStackHomeSize(n2); + weight_t wt1 = dsc1->lvRefCntWtd(lvaRefCountState); + weight_t wt2 = dsc2->lvRefCntWtd(lvaRefCountState); + + // Compare wt1/size1 > wt2/size2 as wt1*size2 > wt2*size1 + // to avoid division. Both sizes are > 0. + weight_t density1 = wt1 * size2; + weight_t density2 = wt2 * size1; + if (density1 != density2) + { + return density1 > density2; + } + + // Among locals with equal density, group by alignment class + // (8+ byte locals before smaller ones) to reduce padding. + bool aligned1 = (size1 >= 8); + bool aligned2 = (size2 >= 8); + if (aligned1 != aligned2) + { + return aligned1; + } + + unsigned cnt1 = dsc1->lvRefCnt(lvaRefCountState); + unsigned cnt2 = dsc2->lvRefCnt(lvaRefCountState); + if (cnt1 != cnt2) + { + return cnt1 > cnt2; + } + + // Stable tiebreaker: lower lclNum first. + return n1 < n2; + }); + } + } +#endif // TARGET_AMD64 + for (cur = 0; alloc_order[cur]; cur++) { if ((assignMore & alloc_order[cur]) == 0) @@ -5168,8 +5260,15 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() unsigned lclNum; LclVarDsc* varDsc; - for (lclNum = 0, varDsc = lvaTable; lclNum < lvaCount; lclNum++, varDsc++) + for (unsigned sortIdx = 0; sortIdx < lvaCount; sortIdx++) { +#ifdef TARGET_AMD64 + lclNum = (lclVarSortOrder != nullptr) ? lclVarSortOrder[sortIdx] : sortIdx; +#else + lclNum = sortIdx; +#endif + varDsc = lvaGetDesc(lclNum); + /* Ignore field locals of the promotion type PROMOTION_TYPE_FIELD_DEPENDENT. In other words, we will not calculate the "base" address of the struct local if the promotion type is PROMOTION_TYPE_FIELD_DEPENDENT. 
From e63b42776580aee5998e4da45d0847e88f4646ae Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 3 May 2026 01:27:43 +0000 Subject: [PATCH 02/28] JIT: Multi-strategy frame layout selection for smaller code on x64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the single access-density sort with a multi-strategy approach that tries 5 candidate layouts and picks the one with lowest estimated encoding cost: 1. Original (unsorted) order as baseline 2. Weighted access density (refCntWtd / size) 3. Unweighted ref count 4. Weighted ref count 5. Unweighted ref count density (refCnt / size) A lightweight cost estimation function simulates the frame allocation loop (including alignment padding and SIMD alignment) and scores each layout using Σ(refCnt × encodingBytes), where encodingBytes is 1 for disp8 or 4 for disp32. The strategy with the lowest cost wins; if no strategy beats the original order, no sorting is applied. This is gated to frame-pointer-based frames only, since the disp8 boundary check assumes RBP-relative negative virtual offsets. SPMI aspnet2 results vs single-strategy: Code size: -7,400 bytes (-0.28%) vs -5,982 bytes (-0.23%) Regressions: 36 vs 103 (65% fewer) PerfScore: neutral Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 273 +++++++++++++++++++++++++++--------- 1 file changed, 210 insertions(+), 63 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 453708766e8cc0..14cb9100b31eed 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5157,33 +5157,31 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() bool have_LclVarDoubleAlign = false; #ifdef TARGET_AMD64 - // Build a sorted array of local variable indices to optimize displacement encoding. + // Multi-strategy frame layout optimization for x64. 
+ // // On x64, stack accesses within [-128, +127] of the base register use a 1-byte - // displacement, while larger offsets require 4 bytes — saving 3 bytes per access - // promoted from disp32 to disp8. + // displacement (disp8), while larger offsets require 4 bytes (disp32) — 3 extra + // bytes per access. We try multiple sort orders for locals and pick the one that + // minimizes total encoding cost, estimated by simulating the frame allocation loop. // - // The sort uses access density (weighted ref count / local size) as the primary key - // so that locals with the highest access frequency per byte of frame space get the - // smallest offsets. This maximizes the total number of hot accesses that fit within - // the disp8 zone (~128 bytes from the frame pointer). As a secondary key, locals - // requiring 8-byte alignment are grouped before smaller locals to reduce alignment - // padding waste. + // The cost function computes Σ(refCnt × encodingBytes) where encodingBytes is 1 + // for disp8 or 4 for disp32, using unweighted refCnt as a proxy for instruction + // count. This gives a direct estimate of total displacement encoding bytes. // (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.) // - // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called with - // FINAL_FRAME_LAYOUT (no tentative layout exists), so there is no "offsets must not get - // bigger" invariant to preserve between passes. + // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called + // with FINAL_FRAME_LAYOUT (no tentative layout exists). + // + // We only run this for frame-pointer-based frames because the disp8 boundary check + // assumes RBP-relative negative virtual offsets. RSP-based frames use positive offsets + // after fixup and contribute negligible savings. // - // We skip this optimization for EnC (which requires stable layout) and when ref counts - // are not available. 
+ // We skip this for EnC (which requires stable layout), MinOpts (where ref counts are + // less meaningful), and frames that fit entirely within the disp8 zone. assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT); unsigned* lclVarSortOrder = nullptr; - if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts()) + if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts() && codeGen->isFramePointerUsed()) { - // Estimate total local frame size to decide if sorting is worthwhile. - // Only sort when the frame exceeds the disp8 boundary (128 bytes); - // in smaller frames, all locals already fit in disp8 and sorting just churns - // offsets without benefit. unsigned estimatedLocalSize = 0; for (unsigned i = 0; i < lvaCount; i++) { @@ -5192,58 +5190,207 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() if (estimatedLocalSize > 128) { - JITDUMP("Sorting %u locals by access density for frame layout optimization " + JITDUMP("Frame layout optimization: trying multiple strategies for %u locals " "(estimated frame size %u bytes)\n", lvaCount, estimatedLocalSize); - lclVarSortOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; + // Pre-compute which locals will be allocated in the main loop and their + // pass category. Category 0 means "not allocatable" (skipped by the loop). 
+ unsigned* lclPassCategory = new (this, CMK_LvaTable) unsigned[lvaCount]; + for (unsigned i = 0; i < lvaCount; i++) + { + lclPassCategory[i] = 0; + LclVarDsc* varDsc = lvaGetDesc(i); + + if (lvaIsFieldOfDependentlyPromotedStruct(varDsc)) + continue; +#if FEATURE_FIXED_OUT_ARGS + if (i == lvaOutgoingArgSpaceVar) + continue; +#endif + if (lvaIsOSRLocal(i)) + continue; + if (!varDsc->lvOnFrame) + continue; + if (i == lvaGSSecurityCookie && getNeedsGSSecurityCookie()) + continue; + if (i == lvaRetAddrVar) + continue; + if (i == lvaMonAcquired || i == lvaAsyncExecutionContextVar || + i == lvaAsyncSynchronizationContextVar) + continue; + if (varDsc->lvIsParam && !lvaParamHasLocalStackSpace(i)) + continue; + + if (varDsc->lvIsUnsafeBuffer && compGSReorderStackLayout) + { + lclPassCategory[i] = + varDsc->lvIsPtr ? ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS; + } + else if (varTypeIsGC(varDsc->TypeGet()) && varDsc->lvTracked) + { + lclPassCategory[i] = ALLOC_PTRS; + } + else + { + lclPassCategory[i] = ALLOC_NON_PTRS; + } + } + + // Simulate frame layout for a given sort order and return total encoding cost. + // Lower cost = better layout. Cost = Σ(refCnt × encodingBytes) where + // encodingBytes is 1 for disp8 (offset in [-128,+127]) or 4 for disp32. + // Uses the current stkOffs as the starting point, which already accounts for + // callee saves, XMM saves, and any pre-allocated special locals. + auto estimateLayoutCost = [&](unsigned* order) -> unsigned { + unsigned totalCost = 0; + int simOff = stkOffs; + + for (int p = 0; alloc_order[p]; p++) + { + UINT pass = alloc_order[p]; + for (unsigned idx = 0; idx < lvaCount; idx++) + { + unsigned lcl = order[idx]; + if (lclPassCategory[lcl] != pass) + continue; + + LclVarDsc* varDsc = lvaGetDesc(lcl); + unsigned size = lvaLclStackHomeSize(lcl); + + // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset). 
+ if (size >= 8) + { +#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES + if (varTypeIsSIMD(varDsc)) + { + int alignment = getSIMDTypeAlignment(varDsc->TypeGet()); + if (simOff % alignment != 0) + { + simOff -= static_cast<int>(alignment + (simOff % alignment)); + } + } + else +#endif + { + if ((simOff % 8) != 0) + { + simOff -= static_cast<int>(8 + (simOff % 8)); + } + } + } + + simOff -= static_cast<int>(size); + + unsigned refCnt = varDsc->lvRefCnt(lvaRefCountState); + totalCost += refCnt * ((simOff >= -128) ? 1u : 4u); + } + } + + return totalCost; + }; + + lclVarSortOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; + unsigned* candidateOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; for (unsigned i = 0; i < lvaCount; i++) { lclVarSortOrder[i] = i; + candidateOrder[i] = i; + } + + // Score the original (unsorted) order as baseline. + unsigned origCost = estimateLayoutCost(lclVarSortOrder); + unsigned bestCost = origCost; + const char* bestName = "original"; + + // Helper to try a strategy: sort candidateOrder, estimate cost, + // and update best if the cost is lower. + auto tryStrategy = [&](const char* name, auto comparator) -> unsigned { + for (unsigned i = 0; i < lvaCount; i++) + candidateOrder[i] = i; + jitstd::sort(candidateOrder, candidateOrder + lvaCount, comparator); + unsigned cost = estimateLayoutCost(candidateOrder); + if (cost < bestCost) + { + bestCost = cost; + bestName = name; + memcpy(lclVarSortOrder, candidateOrder, lvaCount * sizeof(unsigned)); + } + return cost; + }; + + // Strategy 1: Access density (weighted ref count / size) descending. + // A small hot local is more valuable per frame byte than a large hot local.
+ unsigned densityCost = tryStrategy("density", + [this](unsigned n1, unsigned n2) -> bool { + const LclVarDsc* d1 = lvaGetDesc(n1); + const LclVarDsc* d2 = lvaGetDesc(n2); + unsigned s1 = lvaLclStackHomeSize(n1); + unsigned s2 = lvaLclStackHomeSize(n2); + weight_t w1 = d1->lvRefCntWtd(lvaRefCountState); + weight_t w2 = d2->lvRefCntWtd(lvaRefCountState); + // Compare w1/s1 > w2/s2 via cross-multiply to avoid division. + weight_t dens1 = w1 * s2; + weight_t dens2 = w2 * s1; + if (dens1 != dens2) return dens1 > dens2; + bool a1 = (s1 >= 8), a2 = (s2 >= 8); + if (a1 != a2) return a1; + unsigned c1 = d1->lvRefCnt(lvaRefCountState); + unsigned c2 = d2->lvRefCnt(lvaRefCountState); + if (c1 != c2) return c1 > c2; + return n1 < n2; + }); + + // Strategy 2: Unweighted ref count descending. + unsigned refCntCost = tryStrategy("refCnt", + [this](unsigned n1, unsigned n2) -> bool { + unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + if (c1 != c2) return c1 > c2; + bool a1 = (lvaLclStackHomeSize(n1) >= 8); + bool a2 = (lvaLclStackHomeSize(n2) >= 8); + if (a1 != a2) return a1; + return n1 < n2; + }); + + // Strategy 3: Weighted ref count descending. + unsigned weightCost = tryStrategy("weight", + [this](unsigned n1, unsigned n2) -> bool { + weight_t w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); + weight_t w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); + if (w1 != w2) return w1 > w2; + bool a1 = (lvaLclStackHomeSize(n1) >= 8); + bool a2 = (lvaLclStackHomeSize(n2) >= 8); + if (a1 != a2) return a1; + return n1 < n2; + }); + + // Strategy 4: Unweighted ref count density (refCnt / size) descending. 
+ unsigned refDensityCost = tryStrategy("refDensity", + [this](unsigned n1, unsigned n2) -> bool { + unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + unsigned s1 = lvaLclStackHomeSize(n1); + unsigned s2 = lvaLclStackHomeSize(n2); + // Cross-multiply to avoid division. + unsigned long long dens1 = (unsigned long long)c1 * s2; + unsigned long long dens2 = (unsigned long long)c2 * s1; + if (dens1 != dens2) return dens1 > dens2; + bool a1 = (s1 >= 8), a2 = (s2 >= 8); + if (a1 != a2) return a1; + return n1 < n2; + }); + + // If original order won, no sorting needed. + if (bestCost == origCost) + { + lclVarSortOrder = nullptr; } - jitstd::sort(lclVarSortOrder, lclVarSortOrder + lvaCount, - [this](unsigned n1, unsigned n2) -> bool { - const LclVarDsc* dsc1 = lvaGetDesc(n1); - const LclVarDsc* dsc2 = lvaGetDesc(n2); - - // Sort by access density (weighted ref count per byte) descending. - // This maximizes the number of hot accesses that fit within the - // disp8 zone (first ~128 bytes of frame). A small hot local is - // more valuable per frame byte than a large hot local. - unsigned size1 = lvaLclStackHomeSize(n1); - unsigned size2 = lvaLclStackHomeSize(n2); - weight_t wt1 = dsc1->lvRefCntWtd(lvaRefCountState); - weight_t wt2 = dsc2->lvRefCntWtd(lvaRefCountState); - - // Compare wt1/size1 > wt2/size2 as wt1*size2 > wt2*size1 - // to avoid division. Both sizes are > 0. - weight_t density1 = wt1 * size2; - weight_t density2 = wt2 * size1; - if (density1 != density2) - { - return density1 > density2; - } - - // Among locals with equal density, group by alignment class - // (8+ byte locals before smaller ones) to reduce padding. 
- bool aligned1 = (size1 >= 8); - bool aligned2 = (size2 >= 8); - if (aligned1 != aligned2) - { - return aligned1; - } - - unsigned cnt1 = dsc1->lvRefCnt(lvaRefCountState); - unsigned cnt2 = dsc2->lvRefCnt(lvaRefCountState); - if (cnt1 != cnt2) - { - return cnt1 > cnt2; - } - - // Stable tiebreaker: lower lclNum first. - return n1 < n2; - }); + JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u; " + "selected '%s' (cost=%u, saved %u encoding bytes est.)\n", + origCost, densityCost, refCntCost, weightCost, refDensityCost, + bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0); } } #endif // TARGET_AMD64 From 839f45707a2f7039b4a5336d1c4438ffba47af29 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 3 May 2026 02:04:39 +0000 Subject: [PATCH 03/28] JIT: Enable multi-strategy frame layout optimization for x86 Extend the stack local sorting optimization from x64-only (TARGET_AMD64) to all x86/x64 (TARGET_XARCH). The optimization is applicable to x86 because: - x86 uses the same disp8 [-128,+127] vs disp32 encoding threshold - x86 only calls lvaAssignFrameOffsets with FINAL_FRAME_LAYOUT - x86 frequently uses EBP-based frames (especially with DOUBLE_ALIGN) No behavioral change on x64; enables the optimization for x86 targets. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 14cb9100b31eed..bd93f583697758 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5156,10 +5156,10 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() UINT assignMore = 0xFFFFFFFF; bool have_LclVarDoubleAlign = false; -#ifdef TARGET_AMD64 - // Multi-strategy frame layout optimization for x64. +#ifdef TARGET_XARCH + // Multi-strategy frame layout optimization for x86/x64. 
// - // On x64, stack accesses within [-128, +127] of the base register use a 1-byte + // On x86/x64, stack accesses within [-128, +127] of the base register use a 1-byte // displacement (disp8), while larger offsets require 4 bytes (disp32) — 3 extra // bytes per access. We try multiple sort orders for locals and pick the one that // minimizes total encoding cost, estimated by simulating the frame allocation loop. @@ -5169,12 +5169,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() // count. This gives a direct estimate of total displacement encoding bytes. // (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.) // - // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called + // This optimization is safe because on x86/x64, lvaAssignFrameOffsets is only called // with FINAL_FRAME_LAYOUT (no tentative layout exists). // // We only run this for frame-pointer-based frames because the disp8 boundary check - // assumes RBP-relative negative virtual offsets. RSP-based frames use positive offsets - // after fixup and contribute negligible savings. + // assumes EBP/RBP-relative negative virtual offsets. ESP/RSP-based frames use positive + // offsets after fixup and contribute negligible savings. // // We skip this for EnC (which requires stable layout), MinOpts (where ref counts are // less meaningful), and frames that fit entirely within the disp8 zone. @@ -5393,7 +5393,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0); } } -#endif // TARGET_AMD64 +#endif // TARGET_XARCH for (cur = 0; alloc_order[cur]; cur++) { @@ -5409,7 +5409,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() for (unsigned sortIdx = 0; sortIdx < lvaCount; sortIdx++) { -#ifdef TARGET_AMD64 +#ifdef TARGET_XARCH lclNum = (lclVarSortOrder != nullptr) ? 
lclVarSortOrder[sortIdx] : sortIdx; #else lclNum = sortIdx; From ff85ffc581ff15f768e577c2f664c3c0913e03a0 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 3 May 2026 02:21:26 +0000 Subject: [PATCH 04/28] JIT: Enable frame layout optimization for MinOpts/Tier0 with lightweight ref counting For MinOpts/Tier0, precise ref counts are not computed (PreciseRefCountsRequired() returns false). Previously, the frame layout optimization was skipped entirely for MinOpts. This change adds a lightweight LIR walk that counts local variable references without any of the analysis side effects of lvaMarkLclRefs. The lightweight counts are used by the cost estimation function and sorting comparators to make informed layout decisions for MinOpts methods. SPMI results across collections show significant impact, especially on MinOpts-heavy collections: aspnet2: -8,682 bytes (-0.33%), 262 improvements, 37 regressions benchmarks.run: -409,067 bytes (-1.10%), 8,788 improvements, 706 regressions libraries_tests: -4,503,889 bytes (-1.11%), 96,203 improvements, 7,210 regressions Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 96 ++++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 22 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index bd93f583697758..80e2b13243a1af 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5176,11 +5176,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() // assumes EBP/RBP-relative negative virtual offsets. ESP/RSP-based frames use positive // offsets after fixup and contribute negligible savings. // - // We skip this for EnC (which requires stable layout), MinOpts (where ref counts are - // less meaningful), and frames that fit entirely within the disp8 zone. + // We skip this for EnC (which requires stable layout) and frames that fit entirely + // within the disp8 zone. 
For MinOpts/Tier0 where precise ref counts are not computed, + // we do a lightweight LIR walk to count local references for sorting purposes. assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT); unsigned* lclVarSortOrder = nullptr; - if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts() && codeGen->isFramePointerUsed()) + if (lvaLocalVarRefCounted() && !opts.compDbgEnC && codeGen->isFramePointerUsed()) { unsigned estimatedLocalSize = 0; for (unsigned i = 0; i < lvaCount; i++) @@ -5190,9 +5191,36 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() if (estimatedLocalSize > 128) { + // For MinOpts/Tier0, precise ref counts are not available (all lvRefCnt == 0). + // Do a lightweight LIR walk to count local references for sorting purposes. + // This is much cheaper than the full lvaMarkLclRefs pass — we only count + // occurrences without any of the analysis side effects. + unsigned* lclRefCounts = nullptr; + if (!PreciseRefCountsRequired()) + { + lclRefCounts = new (this, CMK_LvaTable) unsigned[lvaCount]; + memset(lclRefCounts, 0, lvaCount * sizeof(unsigned)); + + for (BasicBlock* const block : Blocks()) + { + for (GenTree* node : LIR::AsRange(block)) + { + if (node->OperIsAnyLocal()) + { + unsigned lclNum = node->AsLclVarCommon()->GetLclNum(); + if (lclNum < lvaCount) + { + lclRefCounts[lclNum]++; + } + } + } + } + } + JITDUMP("Frame layout optimization: trying multiple strategies for %u locals " - "(estimated frame size %u bytes)\n", - lvaCount, estimatedLocalSize); + "(estimated frame size %u bytes%s)\n", + lvaCount, estimatedLocalSize, + lclRefCounts != nullptr ? ", using lightweight ref counts" : ""); // Pre-compute which locals will be allocated in the main loop and their // pass category. Category 0 means "not allocatable" (skipped by the loop). 
@@ -5282,7 +5310,8 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() simOff -= static_cast<int>(size); - unsigned refCnt = varDsc->lvRefCnt(lvaRefCountState); + unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl] + : varDsc->lvRefCnt(lvaRefCountState); totalCost += refCnt * ((simOff >= -128) ? 1u : 4u); } } @@ -5322,30 +5351,41 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() // Strategy 1: Access density (weighted ref count / size) descending. // A small hot local is more valuable per frame byte than a large hot local. unsigned densityCost = tryStrategy("density", - [this](unsigned n1, unsigned n2) -> bool { - const LclVarDsc* d1 = lvaGetDesc(n1); - const LclVarDsc* d2 = lvaGetDesc(n2); + [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { unsigned s1 = lvaLclStackHomeSize(n1); unsigned s2 = lvaLclStackHomeSize(n2); - weight_t w1 = d1->lvRefCntWtd(lvaRefCountState); - weight_t w2 = d2->lvRefCntWtd(lvaRefCountState); + weight_t w1, w2; + if (lclRefCounts != nullptr) + { + w1 = static_cast<weight_t>(lclRefCounts[n1]); + w2 = static_cast<weight_t>(lclRefCounts[n2]); + } + else + { + w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); + w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); + } // Compare w1/s1 > w2/s2 via cross-multiply to avoid division. weight_t dens1 = w1 * s2; weight_t dens2 = w2 * s1; if (dens1 != dens2) return dens1 > dens2; bool a1 = (s1 >= 8), a2 = (s2 >= 8); if (a1 != a2) return a1; - unsigned c1 = d1->lvRefCnt(lvaRefCountState); - unsigned c2 = d2->lvRefCnt(lvaRefCountState); + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] + : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] + : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); if (c1 != c2) return c1 > c2; return n1 < n2; }); // Strategy 2: Unweighted ref count descending.
unsigned refCntCost = tryStrategy("refCnt", - [this](unsigned n1, unsigned n2) -> bool { - unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] + : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] + : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); if (c1 != c2) return c1 > c2; bool a1 = (lvaLclStackHomeSize(n1) >= 8); bool a2 = (lvaLclStackHomeSize(n2) >= 8); @@ -5354,10 +5394,20 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() }); // Strategy 3: Weighted ref count descending. + // For MinOpts, weighted = unweighted (no block weights available). unsigned weightCost = tryStrategy("weight", - [this](unsigned n1, unsigned n2) -> bool { - weight_t w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); - weight_t w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); + [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + weight_t w1, w2; + if (lclRefCounts != nullptr) + { + w1 = static_cast<weight_t>(lclRefCounts[n1]); + w2 = static_cast<weight_t>(lclRefCounts[n2]); + } + else + { + w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); + w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); + } if (w1 != w2) return w1 > w2; bool a1 = (lvaLclStackHomeSize(n1) >= 8); bool a2 = (lvaLclStackHomeSize(n2) >= 8); @@ -5367,9 +5417,11 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() // Strategy 4: Unweighted ref count density (refCnt / size) descending. unsigned refDensityCost = tryStrategy("refDensity", - [this](unsigned n1, unsigned n2) -> bool { - unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + unsigned c1 = (lclRefCounts != nullptr) ?
lclRefCounts[n1] + : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] + : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); unsigned s1 = lvaLclStackHomeSize(n1); unsigned s2 = lvaLclStackHomeSize(n2); // Cross-multiply to avoid division. From 81e5559e09f6995a695fca7868126a9b703a5c41 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sat, 2 May 2026 23:41:19 +0000 Subject: [PATCH 05/28] JIT: Sort stack locals by access density for smaller code on x64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sort local variables by access density (weighted ref count / size) before frame layout in lvaAssignVirtualFrameOffsetsToLocals(). This packs locals with the highest access frequency per byte into the disp8 zone (±128 bytes from the frame pointer), reducing 4-byte displacements to 1-byte encodings. SPMI aspnet2 results: -5,982 bytes (-0.23%), 256 improvements vs 103 regressions, PerfScore -0.01%. Gated to TARGET_AMD64, FullOpts, non-EnC, frames > 128 bytes estimated size. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 101 +++++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index ba92c8035fd012..453708766e8cc0 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5156,6 +5156,98 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() UINT assignMore = 0xFFFFFFFF; bool have_LclVarDoubleAlign = false; +#ifdef TARGET_AMD64 + // Build a sorted array of local variable indices to optimize displacement encoding. + // On x64, stack accesses within [-128, +127] of the base register use a 1-byte + // displacement, while larger offsets require 4 bytes — saving 3 bytes per access + // promoted from disp32 to disp8. 
+ // + // The sort uses access density (weighted ref count / local size) as the primary key + // so that locals with the highest access frequency per byte of frame space get the + // smallest offsets. This maximizes the total number of hot accesses that fit within + // the disp8 zone (~128 bytes from the frame pointer). As a secondary key, locals + // requiring 8-byte alignment are grouped before smaller locals to reduce alignment + // padding waste. + // (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.) + // + // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called with + // FINAL_FRAME_LAYOUT (no tentative layout exists), so there is no "offsets must not get + // bigger" invariant to preserve between passes. + // + // We skip this optimization for EnC (which requires stable layout) and when ref counts + // are not available. + assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT); + unsigned* lclVarSortOrder = nullptr; + if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts()) + { + // Estimate total local frame size to decide if sorting is worthwhile. + // Only sort when the frame exceeds the disp8 boundary (128 bytes); + // in smaller frames, all locals already fit in disp8 and sorting just churns + // offsets without benefit. 
+ unsigned estimatedLocalSize = 0; + for (unsigned i = 0; i < lvaCount; i++) + { + estimatedLocalSize += lvaLclStackHomeSize(i); + } + + if (estimatedLocalSize > 128) + { + JITDUMP("Sorting %u locals by access density for frame layout optimization " + "(estimated frame size %u bytes)\n", + lvaCount, estimatedLocalSize); + + lclVarSortOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; + for (unsigned i = 0; i < lvaCount; i++) + { + lclVarSortOrder[i] = i; + } + + jitstd::sort(lclVarSortOrder, lclVarSortOrder + lvaCount, + [this](unsigned n1, unsigned n2) -> bool { + const LclVarDsc* dsc1 = lvaGetDesc(n1); + const LclVarDsc* dsc2 = lvaGetDesc(n2); + + // Sort by access density (weighted ref count per byte) descending. + // This maximizes the number of hot accesses that fit within the + // disp8 zone (first ~128 bytes of frame). A small hot local is + // more valuable per frame byte than a large hot local. + unsigned size1 = lvaLclStackHomeSize(n1); + unsigned size2 = lvaLclStackHomeSize(n2); + weight_t wt1 = dsc1->lvRefCntWtd(lvaRefCountState); + weight_t wt2 = dsc2->lvRefCntWtd(lvaRefCountState); + + // Compare wt1/size1 > wt2/size2 as wt1*size2 > wt2*size1 + // to avoid division. Both sizes are > 0. + weight_t density1 = wt1 * size2; + weight_t density2 = wt2 * size1; + if (density1 != density2) + { + return density1 > density2; + } + + // Among locals with equal density, group by alignment class + // (8+ byte locals before smaller ones) to reduce padding. + bool aligned1 = (size1 >= 8); + bool aligned2 = (size2 >= 8); + if (aligned1 != aligned2) + { + return aligned1; + } + + unsigned cnt1 = dsc1->lvRefCnt(lvaRefCountState); + unsigned cnt2 = dsc2->lvRefCnt(lvaRefCountState); + if (cnt1 != cnt2) + { + return cnt1 > cnt2; + } + + // Stable tiebreaker: lower lclNum first. 
+ return n1 < n2; + }); + } + } +#endif // TARGET_AMD64 + for (cur = 0; alloc_order[cur]; cur++) { if ((assignMore & alloc_order[cur]) == 0) @@ -5168,8 +5260,15 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() unsigned lclNum; LclVarDsc* varDsc; - for (lclNum = 0, varDsc = lvaTable; lclNum < lvaCount; lclNum++, varDsc++) + for (unsigned sortIdx = 0; sortIdx < lvaCount; sortIdx++) { +#ifdef TARGET_AMD64 + lclNum = (lclVarSortOrder != nullptr) ? lclVarSortOrder[sortIdx] : sortIdx; +#else + lclNum = sortIdx; +#endif + varDsc = lvaGetDesc(lclNum); + /* Ignore field locals of the promotion type PROMOTION_TYPE_FIELD_DEPENDENT. In other words, we will not calculate the "base" address of the struct local if the promotion type is PROMOTION_TYPE_FIELD_DEPENDENT. From 113262f8869470f645ca5bd2b146b0b695eea412 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 3 May 2026 01:27:43 +0000 Subject: [PATCH 06/28] JIT: Multi-strategy frame layout selection for smaller code on x64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the single access-density sort with a multi-strategy approach that tries 5 candidate layouts and picks the one with lowest estimated encoding cost: 1. Original (unsorted) order as baseline 2. Weighted access density (refCntWtd / size) 3. Unweighted ref count 4. Weighted ref count 5. Unweighted ref count density (refCnt / size) A lightweight cost estimation function simulates the frame allocation loop (including alignment padding and SIMD alignment) and scores each layout using Σ(refCnt × encodingBytes), where encodingBytes is 1 for disp8 or 4 for disp32. The strategy with the lowest cost wins; if no strategy beats the original order, no sorting is applied. This is gated to frame-pointer-based frames only, since the disp8 boundary check assumes RBP-relative negative virtual offsets. 
SPMI aspnet2 results vs single-strategy: Code size: -7,400 bytes (-0.28%) vs -5,982 bytes (-0.23%) Regressions: 36 vs 103 (65% fewer) PerfScore: neutral Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 273 +++++++++++++++++++++++++++--------- 1 file changed, 210 insertions(+), 63 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 453708766e8cc0..14cb9100b31eed 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5157,33 +5157,31 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() bool have_LclVarDoubleAlign = false; #ifdef TARGET_AMD64 - // Build a sorted array of local variable indices to optimize displacement encoding. + // Multi-strategy frame layout optimization for x64. + // // On x64, stack accesses within [-128, +127] of the base register use a 1-byte - // displacement, while larger offsets require 4 bytes — saving 3 bytes per access - // promoted from disp32 to disp8. + // displacement (disp8), while larger offsets require 4 bytes (disp32) — 3 extra + // bytes per access. We try multiple sort orders for locals and pick the one that + // minimizes total encoding cost, estimated by simulating the frame allocation loop. // - // The sort uses access density (weighted ref count / local size) as the primary key - // so that locals with the highest access frequency per byte of frame space get the - // smallest offsets. This maximizes the total number of hot accesses that fit within - // the disp8 zone (~128 bytes from the frame pointer). As a secondary key, locals - // requiring 8-byte alignment are grouped before smaller locals to reduce alignment - // padding waste. + // The cost function computes Σ(refCnt × encodingBytes) where encodingBytes is 1 + // for disp8 or 4 for disp32, using unweighted refCnt as a proxy for instruction + // count. This gives a direct estimate of total displacement encoding bytes. 
// (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.) // - // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called with - // FINAL_FRAME_LAYOUT (no tentative layout exists), so there is no "offsets must not get - // bigger" invariant to preserve between passes. + // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called + // with FINAL_FRAME_LAYOUT (no tentative layout exists). + // + // We only run this for frame-pointer-based frames because the disp8 boundary check + // assumes RBP-relative negative virtual offsets. RSP-based frames use positive offsets + // after fixup and contribute negligible savings. // - // We skip this optimization for EnC (which requires stable layout) and when ref counts - // are not available. + // We skip this for EnC (which requires stable layout), MinOpts (where ref counts are + // less meaningful), and frames that fit entirely within the disp8 zone. assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT); unsigned* lclVarSortOrder = nullptr; - if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts()) + if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts() && codeGen->isFramePointerUsed()) { - // Estimate total local frame size to decide if sorting is worthwhile. - // Only sort when the frame exceeds the disp8 boundary (128 bytes); - // in smaller frames, all locals already fit in disp8 and sorting just churns - // offsets without benefit. 
unsigned estimatedLocalSize = 0; for (unsigned i = 0; i < lvaCount; i++) { @@ -5192,58 +5190,207 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() if (estimatedLocalSize > 128) { - JITDUMP("Sorting %u locals by access density for frame layout optimization " + JITDUMP("Frame layout optimization: trying multiple strategies for %u locals " "(estimated frame size %u bytes)\n", lvaCount, estimatedLocalSize); - lclVarSortOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; + // Pre-compute which locals will be allocated in the main loop and their + // pass category. Category 0 means "not allocatable" (skipped by the loop). + unsigned* lclPassCategory = new (this, CMK_LvaTable) unsigned[lvaCount]; + for (unsigned i = 0; i < lvaCount; i++) + { + lclPassCategory[i] = 0; + LclVarDsc* varDsc = lvaGetDesc(i); + + if (lvaIsFieldOfDependentlyPromotedStruct(varDsc)) + continue; +#if FEATURE_FIXED_OUT_ARGS + if (i == lvaOutgoingArgSpaceVar) + continue; +#endif + if (lvaIsOSRLocal(i)) + continue; + if (!varDsc->lvOnFrame) + continue; + if (i == lvaGSSecurityCookie && getNeedsGSSecurityCookie()) + continue; + if (i == lvaRetAddrVar) + continue; + if (i == lvaMonAcquired || i == lvaAsyncExecutionContextVar || + i == lvaAsyncSynchronizationContextVar) + continue; + if (varDsc->lvIsParam && !lvaParamHasLocalStackSpace(i)) + continue; + + if (varDsc->lvIsUnsafeBuffer && compGSReorderStackLayout) + { + lclPassCategory[i] = + varDsc->lvIsPtr ? ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS; + } + else if (varTypeIsGC(varDsc->TypeGet()) && varDsc->lvTracked) + { + lclPassCategory[i] = ALLOC_PTRS; + } + else + { + lclPassCategory[i] = ALLOC_NON_PTRS; + } + } + + // Simulate frame layout for a given sort order and return total encoding cost. + // Lower cost = better layout. Cost = Σ(refCnt × encodingBytes) where + // encodingBytes is 1 for disp8 (offset in [-128,+127]) or 4 for disp32. 
+ // Uses the current stkOffs as the starting point, which already accounts for + // callee saves, XMM saves, and any pre-allocated special locals. + auto estimateLayoutCost = [&](unsigned* order) -> unsigned { + unsigned totalCost = 0; + int simOff = stkOffs; + + for (int p = 0; alloc_order[p]; p++) + { + UINT pass = alloc_order[p]; + for (unsigned idx = 0; idx < lvaCount; idx++) + { + unsigned lcl = order[idx]; + if (lclPassCategory[lcl] != pass) + continue; + + LclVarDsc* varDsc = lvaGetDesc(lcl); + unsigned size = lvaLclStackHomeSize(lcl); + + // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset). + if (size >= 8) + { +#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES + if (varTypeIsSIMD(varDsc)) + { + int alignment = getSIMDTypeAlignment(varDsc->TypeGet()); + if (simOff % alignment != 0) + { + simOff -= static_cast<int>(alignment + (simOff % alignment)); + } + } + else +#endif + { + if ((simOff % 8) != 0) + { + simOff -= static_cast<int>(8 + (simOff % 8)); + } + } + } + + simOff -= static_cast<int>(size); + + unsigned refCnt = varDsc->lvRefCnt(lvaRefCountState); + totalCost += refCnt * ((simOff >= -128) ? 1u : 4u); + } + } + + return totalCost; + }; + + lclVarSortOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; + unsigned* candidateOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; for (unsigned i = 0; i < lvaCount; i++) { lclVarSortOrder[i] = i; + candidateOrder[i] = i; + } + + // Score the original (unsorted) order as baseline. + unsigned origCost = estimateLayoutCost(lclVarSortOrder); + unsigned bestCost = origCost; + const char* bestName = "original"; + + // Helper to try a strategy: sort candidateOrder, estimate cost, + // and update best if the cost is lower.
+ auto tryStrategy = [&](const char* name, auto comparator) -> unsigned { + for (unsigned i = 0; i < lvaCount; i++) + candidateOrder[i] = i; + jitstd::sort(candidateOrder, candidateOrder + lvaCount, comparator); + unsigned cost = estimateLayoutCost(candidateOrder); + if (cost < bestCost) + { + bestCost = cost; + bestName = name; + memcpy(lclVarSortOrder, candidateOrder, lvaCount * sizeof(unsigned)); + } + return cost; + }; + + // Strategy 1: Access density (weighted ref count / size) descending. + // A small hot local is more valuable per frame byte than a large hot local. + unsigned densityCost = tryStrategy("density", + [this](unsigned n1, unsigned n2) -> bool { + const LclVarDsc* d1 = lvaGetDesc(n1); + const LclVarDsc* d2 = lvaGetDesc(n2); + unsigned s1 = lvaLclStackHomeSize(n1); + unsigned s2 = lvaLclStackHomeSize(n2); + weight_t w1 = d1->lvRefCntWtd(lvaRefCountState); + weight_t w2 = d2->lvRefCntWtd(lvaRefCountState); + // Compare w1/s1 > w2/s2 via cross-multiply to avoid division. + weight_t dens1 = w1 * s2; + weight_t dens2 = w2 * s1; + if (dens1 != dens2) return dens1 > dens2; + bool a1 = (s1 >= 8), a2 = (s2 >= 8); + if (a1 != a2) return a1; + unsigned c1 = d1->lvRefCnt(lvaRefCountState); + unsigned c2 = d2->lvRefCnt(lvaRefCountState); + if (c1 != c2) return c1 > c2; + return n1 < n2; + }); + + // Strategy 2: Unweighted ref count descending. + unsigned refCntCost = tryStrategy("refCnt", + [this](unsigned n1, unsigned n2) -> bool { + unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + if (c1 != c2) return c1 > c2; + bool a1 = (lvaLclStackHomeSize(n1) >= 8); + bool a2 = (lvaLclStackHomeSize(n2) >= 8); + if (a1 != a2) return a1; + return n1 < n2; + }); + + // Strategy 3: Weighted ref count descending. 
+ unsigned weightCost = tryStrategy("weight", + [this](unsigned n1, unsigned n2) -> bool { + weight_t w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); + weight_t w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); + if (w1 != w2) return w1 > w2; + bool a1 = (lvaLclStackHomeSize(n1) >= 8); + bool a2 = (lvaLclStackHomeSize(n2) >= 8); + if (a1 != a2) return a1; + return n1 < n2; + }); + + // Strategy 4: Unweighted ref count density (refCnt / size) descending. + unsigned refDensityCost = tryStrategy("refDensity", + [this](unsigned n1, unsigned n2) -> bool { + unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + unsigned s1 = lvaLclStackHomeSize(n1); + unsigned s2 = lvaLclStackHomeSize(n2); + // Cross-multiply to avoid division. + unsigned long long dens1 = (unsigned long long)c1 * s2; + unsigned long long dens2 = (unsigned long long)c2 * s1; + if (dens1 != dens2) return dens1 > dens2; + bool a1 = (s1 >= 8), a2 = (s2 >= 8); + if (a1 != a2) return a1; + return n1 < n2; + }); + + // If original order won, no sorting needed. + if (bestCost == origCost) + { + lclVarSortOrder = nullptr; } - jitstd::sort(lclVarSortOrder, lclVarSortOrder + lvaCount, - [this](unsigned n1, unsigned n2) -> bool { - const LclVarDsc* dsc1 = lvaGetDesc(n1); - const LclVarDsc* dsc2 = lvaGetDesc(n2); - - // Sort by access density (weighted ref count per byte) descending. - // This maximizes the number of hot accesses that fit within the - // disp8 zone (first ~128 bytes of frame). A small hot local is - // more valuable per frame byte than a large hot local. - unsigned size1 = lvaLclStackHomeSize(n1); - unsigned size2 = lvaLclStackHomeSize(n2); - weight_t wt1 = dsc1->lvRefCntWtd(lvaRefCountState); - weight_t wt2 = dsc2->lvRefCntWtd(lvaRefCountState); - - // Compare wt1/size1 > wt2/size2 as wt1*size2 > wt2*size1 - // to avoid division. Both sizes are > 0. 
- weight_t density1 = wt1 * size2; - weight_t density2 = wt2 * size1; - if (density1 != density2) - { - return density1 > density2; - } - - // Among locals with equal density, group by alignment class - // (8+ byte locals before smaller ones) to reduce padding. - bool aligned1 = (size1 >= 8); - bool aligned2 = (size2 >= 8); - if (aligned1 != aligned2) - { - return aligned1; - } - - unsigned cnt1 = dsc1->lvRefCnt(lvaRefCountState); - unsigned cnt2 = dsc2->lvRefCnt(lvaRefCountState); - if (cnt1 != cnt2) - { - return cnt1 > cnt2; - } - - // Stable tiebreaker: lower lclNum first. - return n1 < n2; - }); + JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u; " + "selected '%s' (cost=%u, saved %u encoding bytes est.)\n", + origCost, densityCost, refCntCost, weightCost, refDensityCost, + bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0); } } #endif // TARGET_AMD64 From 086034ddbccc223b9a14737c64b89432f99e14b1 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 3 May 2026 02:04:39 +0000 Subject: [PATCH 07/28] JIT: Enable multi-strategy frame layout optimization for x86 Extend the stack local sorting optimization from x64-only (TARGET_AMD64) to all x86/x64 (TARGET_XARCH). The optimization is applicable to x86 because: - x86 uses the same disp8 [-128,+127] vs disp32 encoding threshold - x86 only calls lvaAssignFrameOffsets with FINAL_FRAME_LAYOUT - x86 frequently uses EBP-based frames (especially with DOUBLE_ALIGN) No behavioral change on x64; enables the optimization for x86 targets. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 14cb9100b31eed..bd93f583697758 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5156,10 +5156,10 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() UINT assignMore = 0xFFFFFFFF; bool have_LclVarDoubleAlign = false; -#ifdef TARGET_AMD64 - // Multi-strategy frame layout optimization for x64. +#ifdef TARGET_XARCH + // Multi-strategy frame layout optimization for x86/x64. // - // On x64, stack accesses within [-128, +127] of the base register use a 1-byte + // On x86/x64, stack accesses within [-128, +127] of the base register use a 1-byte // displacement (disp8), while larger offsets require 4 bytes (disp32) — 3 extra // bytes per access. We try multiple sort orders for locals and pick the one that // minimizes total encoding cost, estimated by simulating the frame allocation loop. @@ -5169,12 +5169,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() // count. This gives a direct estimate of total displacement encoding bytes. // (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.) // - // This optimization is safe because on AMD64, lvaAssignFrameOffsets is only called + // This optimization is safe because on x86/x64, lvaAssignFrameOffsets is only called // with FINAL_FRAME_LAYOUT (no tentative layout exists). // // We only run this for frame-pointer-based frames because the disp8 boundary check - // assumes RBP-relative negative virtual offsets. RSP-based frames use positive offsets - // after fixup and contribute negligible savings. + // assumes EBP/RBP-relative negative virtual offsets. ESP/RSP-based frames use positive + // offsets after fixup and contribute negligible savings. 
// // We skip this for EnC (which requires stable layout), MinOpts (where ref counts are // less meaningful), and frames that fit entirely within the disp8 zone. @@ -5393,7 +5393,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0); } } -#endif // TARGET_AMD64 +#endif // TARGET_XARCH for (cur = 0; alloc_order[cur]; cur++) { @@ -5409,7 +5409,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() for (unsigned sortIdx = 0; sortIdx < lvaCount; sortIdx++) { -#ifdef TARGET_AMD64 +#ifdef TARGET_XARCH lclNum = (lclVarSortOrder != nullptr) ? lclVarSortOrder[sortIdx] : sortIdx; #else lclNum = sortIdx; From 59aab9d4b551b3a1eb8cf8a00175381e78ee0548 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 3 May 2026 02:21:26 +0000 Subject: [PATCH 08/28] JIT: Enable frame layout optimization for MinOpts/Tier0 with lightweight ref counting For MinOpts/Tier0, precise ref counts are not computed (PreciseRefCountsRequired() returns false). Previously, the frame layout optimization was skipped entirely for MinOpts. This change adds a lightweight LIR walk that counts local variable references without any of the analysis side effects of lvaMarkLclRefs. The lightweight counts are used by the cost estimation function and sorting comparators to make informed layout decisions for MinOpts methods. 
SPMI results across collections show significant impact, especially on MinOpts-heavy collections: aspnet2: -8,682 bytes (-0.33%), 262 improvements, 37 regressions benchmarks.run: -409,067 bytes (-1.10%), 8,788 improvements, 706 regressions libraries_tests: -4,503,889 bytes (-1.11%), 96,203 improvements, 7,210 regressions Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 96 ++++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 22 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index bd93f583697758..80e2b13243a1af 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5176,11 +5176,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() // assumes EBP/RBP-relative negative virtual offsets. ESP/RSP-based frames use positive // offsets after fixup and contribute negligible savings. // - // We skip this for EnC (which requires stable layout), MinOpts (where ref counts are - // less meaningful), and frames that fit entirely within the disp8 zone. + // We skip this for EnC (which requires stable layout) and frames that fit entirely + // within the disp8 zone. For MinOpts/Tier0 where precise ref counts are not computed, + // we do a lightweight LIR walk to count local references for sorting purposes. assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT); unsigned* lclVarSortOrder = nullptr; - if (lvaLocalVarRefCounted() && !opts.compDbgEnC && !opts.MinOpts() && codeGen->isFramePointerUsed()) + if (lvaLocalVarRefCounted() && !opts.compDbgEnC && codeGen->isFramePointerUsed()) { unsigned estimatedLocalSize = 0; for (unsigned i = 0; i < lvaCount; i++) @@ -5190,9 +5191,36 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() if (estimatedLocalSize > 128) { + // For MinOpts/Tier0, precise ref counts are not available (all lvRefCnt == 0). + // Do a lightweight LIR walk to count local references for sorting purposes. 
+ // This is much cheaper than the full lvaMarkLclRefs pass — we only count + // occurrences without any of the analysis side effects. + unsigned* lclRefCounts = nullptr; + if (!PreciseRefCountsRequired()) + { + lclRefCounts = new (this, CMK_LvaTable) unsigned[lvaCount]; + memset(lclRefCounts, 0, lvaCount * sizeof(unsigned)); + + for (BasicBlock* const block : Blocks()) + { + for (GenTree* node : LIR::AsRange(block)) + { + if (node->OperIsAnyLocal()) + { + unsigned lclNum = node->AsLclVarCommon()->GetLclNum(); + if (lclNum < lvaCount) + { + lclRefCounts[lclNum]++; + } + } + } + } + } + JITDUMP("Frame layout optimization: trying multiple strategies for %u locals " - "(estimated frame size %u bytes)\n", - lvaCount, estimatedLocalSize); + "(estimated frame size %u bytes%s)\n", + lvaCount, estimatedLocalSize, + lclRefCounts != nullptr ? ", using lightweight ref counts" : ""); // Pre-compute which locals will be allocated in the main loop and their // pass category. Category 0 means "not allocatable" (skipped by the loop). @@ -5282,7 +5310,8 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() simOff -= static_cast<int>(size); - unsigned refCnt = varDsc->lvRefCnt(lvaRefCountState); + unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl] + : varDsc->lvRefCnt(lvaRefCountState); totalCost += refCnt * ((simOff >= -128) ? 1u : 4u); } } @@ -5322,30 +5351,41 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() // Strategy 1: Access density (weighted ref count / size) descending. // A small hot local is more valuable per frame byte than a large hot local.
unsigned densityCost = tryStrategy("density", - [this](unsigned n1, unsigned n2) -> bool { - const LclVarDsc* d1 = lvaGetDesc(n1); - const LclVarDsc* d2 = lvaGetDesc(n2); + [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { unsigned s1 = lvaLclStackHomeSize(n1); unsigned s2 = lvaLclStackHomeSize(n2); - weight_t w1 = d1->lvRefCntWtd(lvaRefCountState); - weight_t w2 = d2->lvRefCntWtd(lvaRefCountState); + weight_t w1, w2; + if (lclRefCounts != nullptr) + { + w1 = static_cast(lclRefCounts[n1]); + w2 = static_cast(lclRefCounts[n2]); + } + else + { + w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); + w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); + } // Compare w1/s1 > w2/s2 via cross-multiply to avoid division. weight_t dens1 = w1 * s2; weight_t dens2 = w2 * s1; if (dens1 != dens2) return dens1 > dens2; bool a1 = (s1 >= 8), a2 = (s2 >= 8); if (a1 != a2) return a1; - unsigned c1 = d1->lvRefCnt(lvaRefCountState); - unsigned c2 = d2->lvRefCnt(lvaRefCountState); + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] + : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] + : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); if (c1 != c2) return c1 > c2; return n1 < n2; }); // Strategy 2: Unweighted ref count descending. unsigned refCntCost = tryStrategy("refCnt", - [this](unsigned n1, unsigned n2) -> bool { - unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] + : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? 
lclRefCounts[n2] + : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); if (c1 != c2) return c1 > c2; bool a1 = (lvaLclStackHomeSize(n1) >= 8); bool a2 = (lvaLclStackHomeSize(n2) >= 8); @@ -5354,10 +5394,20 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() }); // Strategy 3: Weighted ref count descending. + // For MinOpts, weighted = unweighted (no block weights available). unsigned weightCost = tryStrategy("weight", - [this](unsigned n1, unsigned n2) -> bool { - weight_t w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); - weight_t w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); + [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + weight_t w1, w2; + if (lclRefCounts != nullptr) + { + w1 = static_cast(lclRefCounts[n1]); + w2 = static_cast(lclRefCounts[n2]); + } + else + { + w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); + w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); + } if (w1 != w2) return w1 > w2; bool a1 = (lvaLclStackHomeSize(n1) >= 8); bool a2 = (lvaLclStackHomeSize(n2) >= 8); @@ -5367,9 +5417,11 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() // Strategy 4: Unweighted ref count density (refCnt / size) descending. unsigned refDensityCost = tryStrategy("refDensity", - [this](unsigned n1, unsigned n2) -> bool { - unsigned c1 = lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] + : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] + : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); unsigned s1 = lvaLclStackHomeSize(n1); unsigned s2 = lvaLclStackHomeSize(n2); // Cross-multiply to avoid division. 
From bbac4fb8dbd3533ed0d7d4422ec841917454bd1b Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 3 May 2026 16:55:31 +0000 Subject: [PATCH 09/28] JIT: Extract frame layout optimization into lvaComputeOptimalFrameLayoutOrder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the multi-strategy frame layout optimization code from lvaAssignVirtualFrameOffsetsToLocals into a separate method lvaComputeOptimalFrameLayoutOrder for better readability. Also move the Allocation enum to file scope (as LclAllocCategory) so it can be shared between both methods. No functional change — SPMI results are identical. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/compiler.h | 3 + src/coreclr/jit/lclvars.cpp | 604 +++++++++++++++++++----------------- 2 files changed, 318 insertions(+), 289 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 35be79978cc0e3..feff52b799f7cb 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -4277,6 +4277,9 @@ class Compiler void lvaAssignVirtualFrameOffsetsToArgs(); bool lvaGetRelativeOffsetToCallerAllocatedSpaceForParameter(unsigned lclNum, int* offset); void lvaAssignVirtualFrameOffsetsToLocals(); +#ifdef TARGET_XARCH + unsigned* lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* allocOrder); +#endif bool lvaParamHasLocalStackSpace(unsigned lclNum); int lvaAllocLocalAndSetVirtualOffset(unsigned lclNum, unsigned size, int stkOffs); int lvaAllocAsyncContexts(int stkOffs); diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 80e2b13243a1af..12dd36b818eb24 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4778,6 +4778,319 @@ bool Compiler::lvaGetRelativeOffsetToCallerAllocatedSpaceForParameter(unsigned l return false; } +// Allocation pass categories used by lvaAssignVirtualFrameOffsetsToLocals +// and lvaComputeOptimalFrameLayoutOrder to classify 
locals by type. +enum LclAllocCategory : UINT +{ + ALLOC_NON_PTRS = 0x1, // assign offsets to non-ptr + ALLOC_PTRS = 0x2, // Second pass, assign offsets to tracked ptrs + ALLOC_UNSAFE_BUFFERS = 0x4, + ALLOC_UNSAFE_BUFFERS_WITH_PTRS = 0x8 +}; + +#ifdef TARGET_XARCH +//------------------------------------------------------------------------ +// lvaComputeOptimalFrameLayoutOrder: try multiple sort orders for locals and +// pick the one that minimizes total displacement encoding cost. +// +// On x86/x64, stack accesses within [-128, +127] of the base register use a 1-byte +// displacement (disp8), while larger offsets require 4 bytes (disp32) — 3 extra +// bytes per access. We try multiple sort orders for locals and pick the one that +// minimizes total encoding cost, estimated by simulating the frame allocation loop. +// +// The cost function computes Σ(refCnt × encodingBytes) where encodingBytes is 1 +// for disp8 or 4 for disp32, using unweighted refCnt as a proxy for instruction +// count. This gives a direct estimate of total displacement encoding bytes. +// (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.) +// +// This optimization is safe because on x86/x64, lvaAssignFrameOffsets is only called +// with FINAL_FRAME_LAYOUT (no tentative layout exists). +// +// We only run this for frame-pointer-based frames because the disp8 boundary check +// assumes EBP/RBP-relative negative virtual offsets. ESP/RSP-based frames use positive +// offsets after fixup and contribute negligible savings. +// +// We skip frames that fit entirely within the disp8 zone. For MinOpts/Tier0 where +// precise ref counts are not computed, we do a lightweight LIR walk to count local +// references for sorting purposes. +// +// Arguments: +// stkOffs - current stack offset (after callee saves, XMM saves, and pre-allocated +// special locals) +// allocOrder - null-terminated array of allocation pass flags (ALLOC_NON_PTRS, etc.) 
+// +// Returns: +// An array of lclNum indices representing the optimal sort order, or nullptr if the +// original order is already optimal. +// +unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* allocOrder) +{ + unsigned estimatedLocalSize = 0; + for (unsigned i = 0; i < lvaCount; i++) + { + estimatedLocalSize += lvaLclStackHomeSize(i); + } + + if (estimatedLocalSize <= 128) + { + return nullptr; + } + + // For MinOpts/Tier0, precise ref counts are not available (all lvRefCnt == 0). + // Do a lightweight LIR walk to count local references for sorting purposes. + // This is much cheaper than the full lvaMarkLclRefs pass — we only count + // occurrences without any of the analysis side effects. + unsigned* lclRefCounts = nullptr; + if (!PreciseRefCountsRequired()) + { + lclRefCounts = new (this, CMK_LvaTable) unsigned[lvaCount]; + memset(lclRefCounts, 0, lvaCount * sizeof(unsigned)); + + for (BasicBlock* const block : Blocks()) + { + for (GenTree* node : LIR::AsRange(block)) + { + if (node->OperIsAnyLocal()) + { + unsigned lclNum = node->AsLclVarCommon()->GetLclNum(); + if (lclNum < lvaCount) + { + lclRefCounts[lclNum]++; + } + } + } + } + } + + JITDUMP("Frame layout optimization: trying multiple strategies for %u locals " + "(estimated frame size %u bytes%s)\n", + lvaCount, estimatedLocalSize, + lclRefCounts != nullptr ? ", using lightweight ref counts" : ""); + + // Pre-compute which locals will be allocated in the main loop and their + // pass category. Category 0 means "not allocatable" (skipped by the loop). 
+ unsigned* lclPassCategory = new (this, CMK_LvaTable) unsigned[lvaCount]; + for (unsigned i = 0; i < lvaCount; i++) + { + lclPassCategory[i] = 0; + LclVarDsc* varDsc = lvaGetDesc(i); + + if (lvaIsFieldOfDependentlyPromotedStruct(varDsc)) + continue; +#if FEATURE_FIXED_OUT_ARGS + if (i == lvaOutgoingArgSpaceVar) + continue; +#endif + if (lvaIsOSRLocal(i)) + continue; + if (!varDsc->lvOnFrame) + continue; + if (i == lvaGSSecurityCookie && getNeedsGSSecurityCookie()) + continue; + if (i == lvaRetAddrVar) + continue; + if (i == lvaMonAcquired || i == lvaAsyncExecutionContextVar || + i == lvaAsyncSynchronizationContextVar) + continue; + if (varDsc->lvIsParam && !lvaParamHasLocalStackSpace(i)) + continue; + + if (varDsc->lvIsUnsafeBuffer && compGSReorderStackLayout) + { + lclPassCategory[i] = + varDsc->lvIsPtr ? ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS; + } + else if (varTypeIsGC(varDsc->TypeGet()) && varDsc->lvTracked) + { + lclPassCategory[i] = ALLOC_PTRS; + } + else + { + lclPassCategory[i] = ALLOC_NON_PTRS; + } + } + + // Simulate frame layout for a given sort order and return total encoding cost. + // Lower cost = better layout. Cost = Σ(refCnt × encodingBytes) where + // encodingBytes is 1 for disp8 (offset in [-128,+127]) or 4 for disp32. + auto estimateLayoutCost = [&](unsigned* order) -> unsigned { + unsigned totalCost = 0; + int simOff = stkOffs; + + for (int p = 0; allocOrder[p]; p++) + { + UINT pass = allocOrder[p]; + for (unsigned idx = 0; idx < lvaCount; idx++) + { + unsigned lcl = order[idx]; + if (lclPassCategory[lcl] != pass) + continue; + + LclVarDsc* varDsc = lvaGetDesc(lcl); + unsigned size = lvaLclStackHomeSize(lcl); + + // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset). 
+ if (size >= 8) + { +#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES + if (varTypeIsSIMD(varDsc)) + { + int alignment = getSIMDTypeAlignment(varDsc->TypeGet()); + if (simOff % alignment != 0) + { + simOff -= static_cast(alignment + (simOff % alignment)); + } + } + else +#endif + { + if ((simOff % 8) != 0) + { + simOff -= static_cast(8 + (simOff % 8)); + } + } + } + + simOff -= static_cast(size); + + unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl] + : varDsc->lvRefCnt(lvaRefCountState); + totalCost += refCnt * ((simOff >= -128) ? 1u : 4u); + } + } + + return totalCost; + }; + + unsigned* sortOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; + unsigned* candidateOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; + for (unsigned i = 0; i < lvaCount; i++) + { + sortOrder[i] = i; + candidateOrder[i] = i; + } + + // Score the original (unsorted) order as baseline. + unsigned origCost = estimateLayoutCost(sortOrder); + unsigned bestCost = origCost; + const char* bestName = "original"; + + // Helper to try a strategy: sort candidateOrder, estimate cost, + // and update best if the cost is lower. + auto tryStrategy = [&](const char* name, auto comparator) -> unsigned { + for (unsigned i = 0; i < lvaCount; i++) + candidateOrder[i] = i; + jitstd::sort(candidateOrder, candidateOrder + lvaCount, comparator); + unsigned cost = estimateLayoutCost(candidateOrder); + if (cost < bestCost) + { + bestCost = cost; + bestName = name; + memcpy(sortOrder, candidateOrder, lvaCount * sizeof(unsigned)); + } + return cost; + }; + + // Strategy 1: Access density (weighted ref count / size) descending. + // A small hot local is more valuable per frame byte than a large hot local. 
+ unsigned densityCost = tryStrategy("density", + [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + unsigned s1 = lvaLclStackHomeSize(n1); + unsigned s2 = lvaLclStackHomeSize(n2); + weight_t w1, w2; + if (lclRefCounts != nullptr) + { + w1 = static_cast(lclRefCounts[n1]); + w2 = static_cast(lclRefCounts[n2]); + } + else + { + w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); + w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); + } + // Compare w1/s1 > w2/s2 via cross-multiply to avoid division. + weight_t dens1 = w1 * s2; + weight_t dens2 = w2 * s1; + if (dens1 != dens2) return dens1 > dens2; + bool a1 = (s1 >= 8), a2 = (s2 >= 8); + if (a1 != a2) return a1; + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] + : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] + : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + if (c1 != c2) return c1 > c2; + return n1 < n2; + }); + + // Strategy 2: Unweighted ref count descending. + unsigned refCntCost = tryStrategy("refCnt", + [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] + : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] + : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + if (c1 != c2) return c1 > c2; + bool a1 = (lvaLclStackHomeSize(n1) >= 8); + bool a2 = (lvaLclStackHomeSize(n2) >= 8); + if (a1 != a2) return a1; + return n1 < n2; + }); + + // Strategy 3: Weighted ref count descending. + // For MinOpts, weighted = unweighted (no block weights available). 
+ unsigned weightCost = tryStrategy("weight", + [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + weight_t w1, w2; + if (lclRefCounts != nullptr) + { + w1 = static_cast(lclRefCounts[n1]); + w2 = static_cast(lclRefCounts[n2]); + } + else + { + w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); + w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); + } + if (w1 != w2) return w1 > w2; + bool a1 = (lvaLclStackHomeSize(n1) >= 8); + bool a2 = (lvaLclStackHomeSize(n2) >= 8); + if (a1 != a2) return a1; + return n1 < n2; + }); + + // Strategy 4: Unweighted ref count density (refCnt / size) descending. + unsigned refDensityCost = tryStrategy("refDensity", + [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] + : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] + : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + unsigned s1 = lvaLclStackHomeSize(n1); + unsigned s2 = lvaLclStackHomeSize(n2); + // Cross-multiply to avoid division. + unsigned long long dens1 = (unsigned long long)c1 * s2; + unsigned long long dens2 = (unsigned long long)c2 * s1; + if (dens1 != dens2) return dens1 > dens2; + bool a1 = (s1 >= 8), a2 = (s2 >= 8); + if (a1 != a2) return a1; + return n1 < n2; + }); + + // If original order won, no sorting needed. + if (bestCost == origCost) + { + sortOrder = nullptr; + } + + JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u; " + "selected '%s' (cost=%u, saved %u encoding bytes est.)\n", + origCost, densityCost, refCntCost, weightCost, refDensityCost, + bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0); + + return sortOrder; +} +#endif // TARGET_XARCH + //----------------------------------------------------------------------------- // lvaAssignVirtualFrameOffsetsToLocals: compute the virtual stack offsets for // all elements on the stackframe. 
@@ -5094,13 +5407,6 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() non-pointer temps */ - enum Allocation - { - ALLOC_NON_PTRS = 0x1, // assign offsets to non-ptr - ALLOC_PTRS = 0x2, // Second pass, assign offsets to tracked ptrs - ALLOC_UNSAFE_BUFFERS = 0x4, - ALLOC_UNSAFE_BUFFERS_WITH_PTRS = 0x8 - }; UINT alloc_order[5]; unsigned int cur = 0; @@ -5158,292 +5464,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() #ifdef TARGET_XARCH // Multi-strategy frame layout optimization for x86/x64. - // - // On x86/x64, stack accesses within [-128, +127] of the base register use a 1-byte - // displacement (disp8), while larger offsets require 4 bytes (disp32) — 3 extra - // bytes per access. We try multiple sort orders for locals and pick the one that - // minimizes total encoding cost, estimated by simulating the frame allocation loop. - // - // The cost function computes Σ(refCnt × encodingBytes) where encodingBytes is 1 - // for disp8 or 4 for disp32, using unweighted refCnt as a proxy for instruction - // count. This gives a direct estimate of total displacement encoding bytes. - // (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.) - // - // This optimization is safe because on x86/x64, lvaAssignFrameOffsets is only called - // with FINAL_FRAME_LAYOUT (no tentative layout exists). - // - // We only run this for frame-pointer-based frames because the disp8 boundary check - // assumes EBP/RBP-relative negative virtual offsets. ESP/RSP-based frames use positive - // offsets after fixup and contribute negligible savings. - // - // We skip this for EnC (which requires stable layout) and frames that fit entirely - // within the disp8 zone. For MinOpts/Tier0 where precise ref counts are not computed, - // we do a lightweight LIR walk to count local references for sorting purposes. + // See lvaComputeOptimalFrameLayoutOrder for details. 
assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT); unsigned* lclVarSortOrder = nullptr; if (lvaLocalVarRefCounted() && !opts.compDbgEnC && codeGen->isFramePointerUsed()) { - unsigned estimatedLocalSize = 0; - for (unsigned i = 0; i < lvaCount; i++) - { - estimatedLocalSize += lvaLclStackHomeSize(i); - } - - if (estimatedLocalSize > 128) - { - // For MinOpts/Tier0, precise ref counts are not available (all lvRefCnt == 0). - // Do a lightweight LIR walk to count local references for sorting purposes. - // This is much cheaper than the full lvaMarkLclRefs pass — we only count - // occurrences without any of the analysis side effects. - unsigned* lclRefCounts = nullptr; - if (!PreciseRefCountsRequired()) - { - lclRefCounts = new (this, CMK_LvaTable) unsigned[lvaCount]; - memset(lclRefCounts, 0, lvaCount * sizeof(unsigned)); - - for (BasicBlock* const block : Blocks()) - { - for (GenTree* node : LIR::AsRange(block)) - { - if (node->OperIsAnyLocal()) - { - unsigned lclNum = node->AsLclVarCommon()->GetLclNum(); - if (lclNum < lvaCount) - { - lclRefCounts[lclNum]++; - } - } - } - } - } - - JITDUMP("Frame layout optimization: trying multiple strategies for %u locals " - "(estimated frame size %u bytes%s)\n", - lvaCount, estimatedLocalSize, - lclRefCounts != nullptr ? ", using lightweight ref counts" : ""); - - // Pre-compute which locals will be allocated in the main loop and their - // pass category. Category 0 means "not allocatable" (skipped by the loop). 
- unsigned* lclPassCategory = new (this, CMK_LvaTable) unsigned[lvaCount]; - for (unsigned i = 0; i < lvaCount; i++) - { - lclPassCategory[i] = 0; - LclVarDsc* varDsc = lvaGetDesc(i); - - if (lvaIsFieldOfDependentlyPromotedStruct(varDsc)) - continue; -#if FEATURE_FIXED_OUT_ARGS - if (i == lvaOutgoingArgSpaceVar) - continue; -#endif - if (lvaIsOSRLocal(i)) - continue; - if (!varDsc->lvOnFrame) - continue; - if (i == lvaGSSecurityCookie && getNeedsGSSecurityCookie()) - continue; - if (i == lvaRetAddrVar) - continue; - if (i == lvaMonAcquired || i == lvaAsyncExecutionContextVar || - i == lvaAsyncSynchronizationContextVar) - continue; - if (varDsc->lvIsParam && !lvaParamHasLocalStackSpace(i)) - continue; - - if (varDsc->lvIsUnsafeBuffer && compGSReorderStackLayout) - { - lclPassCategory[i] = - varDsc->lvIsPtr ? ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS; - } - else if (varTypeIsGC(varDsc->TypeGet()) && varDsc->lvTracked) - { - lclPassCategory[i] = ALLOC_PTRS; - } - else - { - lclPassCategory[i] = ALLOC_NON_PTRS; - } - } - - // Simulate frame layout for a given sort order and return total encoding cost. - // Lower cost = better layout. Cost = Σ(refCnt × encodingBytes) where - // encodingBytes is 1 for disp8 (offset in [-128,+127]) or 4 for disp32. - // Uses the current stkOffs as the starting point, which already accounts for - // callee saves, XMM saves, and any pre-allocated special locals. - auto estimateLayoutCost = [&](unsigned* order) -> unsigned { - unsigned totalCost = 0; - int simOff = stkOffs; - - for (int p = 0; alloc_order[p]; p++) - { - UINT pass = alloc_order[p]; - for (unsigned idx = 0; idx < lvaCount; idx++) - { - unsigned lcl = order[idx]; - if (lclPassCategory[lcl] != pass) - continue; - - LclVarDsc* varDsc = lvaGetDesc(lcl); - unsigned size = lvaLclStackHomeSize(lcl); - - // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset). 
- if (size >= 8) - { -#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES - if (varTypeIsSIMD(varDsc)) - { - int alignment = getSIMDTypeAlignment(varDsc->TypeGet()); - if (simOff % alignment != 0) - { - simOff -= static_cast(alignment + (simOff % alignment)); - } - } - else -#endif - { - if ((simOff % 8) != 0) - { - simOff -= static_cast(8 + (simOff % 8)); - } - } - } - - simOff -= static_cast(size); - - unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl] - : varDsc->lvRefCnt(lvaRefCountState); - totalCost += refCnt * ((simOff >= -128) ? 1u : 4u); - } - } - - return totalCost; - }; - - lclVarSortOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; - unsigned* candidateOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; - for (unsigned i = 0; i < lvaCount; i++) - { - lclVarSortOrder[i] = i; - candidateOrder[i] = i; - } - - // Score the original (unsorted) order as baseline. - unsigned origCost = estimateLayoutCost(lclVarSortOrder); - unsigned bestCost = origCost; - const char* bestName = "original"; - - // Helper to try a strategy: sort candidateOrder, estimate cost, - // and update best if the cost is lower. - auto tryStrategy = [&](const char* name, auto comparator) -> unsigned { - for (unsigned i = 0; i < lvaCount; i++) - candidateOrder[i] = i; - jitstd::sort(candidateOrder, candidateOrder + lvaCount, comparator); - unsigned cost = estimateLayoutCost(candidateOrder); - if (cost < bestCost) - { - bestCost = cost; - bestName = name; - memcpy(lclVarSortOrder, candidateOrder, lvaCount * sizeof(unsigned)); - } - return cost; - }; - - // Strategy 1: Access density (weighted ref count / size) descending. - // A small hot local is more valuable per frame byte than a large hot local. 
- unsigned densityCost = tryStrategy("density", - [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - unsigned s1 = lvaLclStackHomeSize(n1); - unsigned s2 = lvaLclStackHomeSize(n2); - weight_t w1, w2; - if (lclRefCounts != nullptr) - { - w1 = static_cast(lclRefCounts[n1]); - w2 = static_cast(lclRefCounts[n2]); - } - else - { - w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); - w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); - } - // Compare w1/s1 > w2/s2 via cross-multiply to avoid division. - weight_t dens1 = w1 * s2; - weight_t dens2 = w2 * s1; - if (dens1 != dens2) return dens1 > dens2; - bool a1 = (s1 >= 8), a2 = (s2 >= 8); - if (a1 != a2) return a1; - unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] - : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] - : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); - if (c1 != c2) return c1 > c2; - return n1 < n2; - }); - - // Strategy 2: Unweighted ref count descending. - unsigned refCntCost = tryStrategy("refCnt", - [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] - : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] - : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); - if (c1 != c2) return c1 > c2; - bool a1 = (lvaLclStackHomeSize(n1) >= 8); - bool a2 = (lvaLclStackHomeSize(n2) >= 8); - if (a1 != a2) return a1; - return n1 < n2; - }); - - // Strategy 3: Weighted ref count descending. - // For MinOpts, weighted = unweighted (no block weights available). 
- unsigned weightCost = tryStrategy("weight", - [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - weight_t w1, w2; - if (lclRefCounts != nullptr) - { - w1 = static_cast(lclRefCounts[n1]); - w2 = static_cast(lclRefCounts[n2]); - } - else - { - w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); - w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); - } - if (w1 != w2) return w1 > w2; - bool a1 = (lvaLclStackHomeSize(n1) >= 8); - bool a2 = (lvaLclStackHomeSize(n2) >= 8); - if (a1 != a2) return a1; - return n1 < n2; - }); - - // Strategy 4: Unweighted ref count density (refCnt / size) descending. - unsigned refDensityCost = tryStrategy("refDensity", - [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] - : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] - : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); - unsigned s1 = lvaLclStackHomeSize(n1); - unsigned s2 = lvaLclStackHomeSize(n2); - // Cross-multiply to avoid division. - unsigned long long dens1 = (unsigned long long)c1 * s2; - unsigned long long dens2 = (unsigned long long)c2 * s1; - if (dens1 != dens2) return dens1 > dens2; - bool a1 = (s1 >= 8), a2 = (s2 >= 8); - if (a1 != a2) return a1; - return n1 < n2; - }); - - // If original order won, no sorting needed. - if (bestCost == origCost) - { - lclVarSortOrder = nullptr; - } - - JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u; " - "selected '%s' (cost=%u, saved %u encoding bytes est.)\n", - origCost, densityCost, refCntCost, weightCost, refDensityCost, - bestName, bestCost, origCost > bestCost ? 
origCost - bestCost : 0); - } + lclVarSortOrder = lvaComputeOptimalFrameLayoutOrder(stkOffs, alloc_order); } #endif // TARGET_XARCH From 8e09f3fead01b6cb09a731bf6db1a754c04303ff Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 3 May 2026 17:00:26 +0000 Subject: [PATCH 10/28] JIT: Lower frame layout optimization threshold from 128 to 64 bytes The estimatedLocalSize threshold did not account for alignment padding, which can significantly inflate the actual frame size. Methods with raw local sizes between 64-128 bytes can have actual frames exceeding 128 bytes after alignment, making them candidates for optimization. Experimentally verified: threshold=64 captures all additional benefit (identical results to threshold=0). Below 64 bytes, even worst-case alignment keeps frames within the disp8 range. SPMI aspnet2: -8,934 bytes (was -8,682), 277 improvements (was 262). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 12dd36b818eb24..2d22e946ec4878 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4831,7 +4831,10 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a estimatedLocalSize += lvaLclStackHomeSize(i); } - if (estimatedLocalSize <= 128) + // Skip frames where even with alignment padding, all locals will fit in disp8 range. + // We use 64 rather than 128 because alignment padding can inflate the actual frame + // size significantly beyond the raw sum of local sizes. 
+ if (estimatedLocalSize <= 64) { return nullptr; } From 0fadd1c39bae6a4fefe7c84d6186d0854a786fd9 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 3 May 2026 17:03:39 +0000 Subject: [PATCH 11/28] JIT: Use single array for frame layout strategy evaluation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace two lvaCount-sized arrays (sortOrder + candidateOrder) with a single array. Each strategy sorts the same array for scoring, then a final sort with the winning comparator produces the result. Eliminates one allocation and the memcpy on each winning strategy. No functional change — SPMI results are identical. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 214 +++++++++++++++++++----------------- 1 file changed, 114 insertions(+), 100 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 2d22e946ec4878..a75713ce488bb2 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4965,125 +4965,139 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a return totalCost; }; - unsigned* sortOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; - unsigned* candidateOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; + unsigned* sortOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; for (unsigned i = 0; i < lvaCount; i++) { - sortOrder[i] = i; - candidateOrder[i] = i; + sortOrder[i] = i; } // Score the original (unsorted) order as baseline. - unsigned origCost = estimateLayoutCost(sortOrder); - unsigned bestCost = origCost; - const char* bestName = "original"; - - // Helper to try a strategy: sort candidateOrder, estimate cost, - // and update best if the cost is lower. 
- auto tryStrategy = [&](const char* name, auto comparator) -> unsigned { + unsigned origCost = estimateLayoutCost(sortOrder); + unsigned bestCost = origCost; + int bestStrategy = -1; // -1 = original order + const char* bestName = "original"; + + // Helper to try a strategy: sort sortOrder, estimate cost, track if best. + // The array is re-sorted for each strategy; after all strategies are + // evaluated, we do one final sort with the winner. + auto tryStrategy = [&](int strategyIdx, const char* name, auto comparator) -> unsigned { for (unsigned i = 0; i < lvaCount; i++) - candidateOrder[i] = i; - jitstd::sort(candidateOrder, candidateOrder + lvaCount, comparator); - unsigned cost = estimateLayoutCost(candidateOrder); + sortOrder[i] = i; + jitstd::sort(sortOrder, sortOrder + lvaCount, comparator); + unsigned cost = estimateLayoutCost(sortOrder); if (cost < bestCost) { - bestCost = cost; - bestName = name; - memcpy(sortOrder, candidateOrder, lvaCount * sizeof(unsigned)); + bestCost = cost; + bestStrategy = strategyIdx; + bestName = name; } return cost; }; - // Strategy 1: Access density (weighted ref count / size) descending. + // Strategy 0: Access density (weighted ref count / size) descending. // A small hot local is more valuable per frame byte than a large hot local. - unsigned densityCost = tryStrategy("density", - [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - unsigned s1 = lvaLclStackHomeSize(n1); - unsigned s2 = lvaLclStackHomeSize(n2); - weight_t w1, w2; - if (lclRefCounts != nullptr) - { - w1 = static_cast(lclRefCounts[n1]); - w2 = static_cast(lclRefCounts[n2]); - } - else - { - w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); - w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); - } - // Compare w1/s1 > w2/s2 via cross-multiply to avoid division. 
- weight_t dens1 = w1 * s2; - weight_t dens2 = w2 * s1; - if (dens1 != dens2) return dens1 > dens2; - bool a1 = (s1 >= 8), a2 = (s2 >= 8); - if (a1 != a2) return a1; - unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] - : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] - : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); - if (c1 != c2) return c1 > c2; - return n1 < n2; - }); - - // Strategy 2: Unweighted ref count descending. - unsigned refCntCost = tryStrategy("refCnt", - [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] - : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] - : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); - if (c1 != c2) return c1 > c2; - bool a1 = (lvaLclStackHomeSize(n1) >= 8); - bool a2 = (lvaLclStackHomeSize(n2) >= 8); - if (a1 != a2) return a1; - return n1 < n2; - }); + auto densityCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + unsigned s1 = lvaLclStackHomeSize(n1); + unsigned s2 = lvaLclStackHomeSize(n2); + weight_t w1, w2; + if (lclRefCounts != nullptr) + { + w1 = static_cast(lclRefCounts[n1]); + w2 = static_cast(lclRefCounts[n2]); + } + else + { + w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); + w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); + } + // Compare w1/s1 > w2/s2 via cross-multiply to avoid division. + weight_t dens1 = w1 * s2; + weight_t dens2 = w2 * s1; + if (dens1 != dens2) return dens1 > dens2; + bool a1 = (s1 >= 8), a2 = (s2 >= 8); + if (a1 != a2) return a1; + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] + : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? 
lclRefCounts[n2] + : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + if (c1 != c2) return c1 > c2; + return n1 < n2; + }; + unsigned densityCost = tryStrategy(0, "density", densityCompare); + + // Strategy 1: Unweighted ref count descending. + auto refCntCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] + : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] + : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + if (c1 != c2) return c1 > c2; + bool a1 = (lvaLclStackHomeSize(n1) >= 8); + bool a2 = (lvaLclStackHomeSize(n2) >= 8); + if (a1 != a2) return a1; + return n1 < n2; + }; + unsigned refCntCost = tryStrategy(1, "refCnt", refCntCompare); - // Strategy 3: Weighted ref count descending. + // Strategy 2: Weighted ref count descending. // For MinOpts, weighted = unweighted (no block weights available). - unsigned weightCost = tryStrategy("weight", - [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - weight_t w1, w2; - if (lclRefCounts != nullptr) - { - w1 = static_cast(lclRefCounts[n1]); - w2 = static_cast(lclRefCounts[n2]); - } - else - { - w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); - w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); - } - if (w1 != w2) return w1 > w2; - bool a1 = (lvaLclStackHomeSize(n1) >= 8); - bool a2 = (lvaLclStackHomeSize(n2) >= 8); - if (a1 != a2) return a1; - return n1 < n2; - }); - - // Strategy 4: Unweighted ref count density (refCnt / size) descending. - unsigned refDensityCost = tryStrategy("refDensity", - [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] - : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] - : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); - unsigned s1 = lvaLclStackHomeSize(n1); - unsigned s2 = lvaLclStackHomeSize(n2); - // Cross-multiply to avoid division. 
- unsigned long long dens1 = (unsigned long long)c1 * s2; - unsigned long long dens2 = (unsigned long long)c2 * s1; - if (dens1 != dens2) return dens1 > dens2; - bool a1 = (s1 >= 8), a2 = (s2 >= 8); - if (a1 != a2) return a1; - return n1 < n2; - }); + auto weightCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + weight_t w1, w2; + if (lclRefCounts != nullptr) + { + w1 = static_cast(lclRefCounts[n1]); + w2 = static_cast(lclRefCounts[n2]); + } + else + { + w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); + w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); + } + if (w1 != w2) return w1 > w2; + bool a1 = (lvaLclStackHomeSize(n1) >= 8); + bool a2 = (lvaLclStackHomeSize(n2) >= 8); + if (a1 != a2) return a1; + return n1 < n2; + }; + unsigned weightCost = tryStrategy(2, "weight", weightCompare); + + // Strategy 3: Unweighted ref count density (refCnt / size) descending. + auto refDensityCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] + : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] + : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + unsigned s1 = lvaLclStackHomeSize(n1); + unsigned s2 = lvaLclStackHomeSize(n2); + // Cross-multiply to avoid division. + unsigned long long dens1 = (unsigned long long)c1 * s2; + unsigned long long dens2 = (unsigned long long)c2 * s1; + if (dens1 != dens2) return dens1 > dens2; + bool a1 = (s1 >= 8), a2 = (s2 >= 8); + if (a1 != a2) return a1; + return n1 < n2; + }; + unsigned refDensityCost = tryStrategy(3, "refDensity", refDensityCompare); - // If original order won, no sorting needed. - if (bestCost == origCost) + // Apply the winning strategy's sort order (or return nullptr if original won). + if (bestStrategy < 0) { sortOrder = nullptr; } + else + { + // Re-sort with the winning comparator to produce the final order. 
+ for (unsigned i = 0; i < lvaCount; i++) + sortOrder[i] = i; + switch (bestStrategy) + { + case 0: jitstd::sort(sortOrder, sortOrder + lvaCount, densityCompare); break; + case 1: jitstd::sort(sortOrder, sortOrder + lvaCount, refCntCompare); break; + case 2: jitstd::sort(sortOrder, sortOrder + lvaCount, weightCompare); break; + case 3: jitstd::sort(sortOrder, sortOrder + lvaCount, refDensityCompare); break; + default: unreached(); + } + } JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u; " "selected '%s' (cost=%u, saved %u encoding bytes est.)\n", From 83a333ef360c03c89a7d0ec28a58c4ea48e28070 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 3 May 2026 17:23:02 +0000 Subject: [PATCH 12/28] Add zero-init span cost to frame layout estimator When block-init is used for zero-initialization, the JIT zeros a contiguous range of stack memory using SIMD stores. If the sort reorders locals such that must-init locals are scattered, the contiguous span grows, requiring more SIMD stores in the prolog. Pre-compute which locals need zero-init (approximating genCheckUseBlockInit logic) and track the init span during layout simulation. Add a small penalty of 2 bytes per 16-byte chunk to favor layouts that keep the init span tight without overwhelming the main encoding cost. 
SPMI aspnet2 results vs no zero-init model: Code delta: -3812 bytes (was -3726) Improvements: 47 (was 45) Regressions: 23/+199 bytes (was 26/+225 bytes) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 69 +++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index a75713ce488bb2..d5843e92b8c25e 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4913,12 +4913,52 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a } } + // Pre-compute which locals will likely need zero-initialization in the prolog. + // This approximates the logic in genCheckUseBlockInit (codegencommon.cpp). + // When block init is used, the JIT zeros a contiguous range [untrLclLo, untrLclHi] + // using SIMD stores. The code size depends on the span and alignment, so layouts + // that scatter init-requiring locals produce larger prologs. + bool* lclNeedsInit = new (this, CMK_LvaTable) bool[lvaCount]; + unsigned initSlotCount = 0; + for (unsigned i = 0; i < lvaCount; i++) + { + lclNeedsInit[i] = false; + if (lclPassCategory[i] == 0) + continue; + + LclVarDsc* varDsc = lvaGetDesc(i); + + if (fgVarIsNeverZeroInitializedInProlog(i)) + continue; + if (lvaIsFieldOfDependentlyPromotedStruct(varDsc)) + continue; + if (varDsc->lvHasExplicitInit) + continue; + if (varDsc->lvIsTemp && !varDsc->HasGCPtr()) + continue; + + if (info.compInitMem || varDsc->HasGCPtr() || varDsc->lvMustInit) + { + lclNeedsInit[i] = true; + initSlotCount += (lvaLclStackHomeSize(i) + sizeof(int) - 1) / sizeof(int); + } + } + + // On AMD64, block init is used when initSlotCount > 4; on x86 when > 4. + // Block init zeros a contiguous range, so the code size depends on span. + // Individual init zeros each local separately, cost is independent of layout. 
+ bool useBlockInit = (initSlotCount > 4); + // Simulate frame layout for a given sort order and return total encoding cost. // Lower cost = better layout. Cost = Σ(refCnt × encodingBytes) where // encodingBytes is 1 for disp8 (offset in [-128,+127]) or 4 for disp32. + // When block init is used, we also add a zero-init cost proportional to the + // span of init-requiring locals (larger span = more SIMD stores in the prolog). auto estimateLayoutCost = [&](unsigned* order) -> unsigned { unsigned totalCost = 0; int simOff = stkOffs; + int initLo = 0; + int initHi = 0; for (int p = 0; allocOrder[p]; p++) { @@ -4959,9 +4999,38 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl] : varDsc->lvRefCnt(lvaRefCountState); totalCost += refCnt * ((simOff >= -128) ? 1u : 4u); + + // Track the zero-init span for block-init cost estimation. + if (useBlockInit && lclNeedsInit[lcl]) + { + int loOffs = simOff; + int hiOffs = simOff + static_cast(size); + if (initLo == 0 && initHi == 0) + { + initLo = loOffs; + initHi = hiOffs; + } + else + { + initLo = min(initLo, loOffs); + initHi = max(initHi, hiOffs); + } + } } } + // Add zero-init prolog cost when block init will be used. + // The JIT zeros the contiguous range [initLo, initHi) using SIMD stores. + // Each 16-byte chunk requires one SIMD store instruction. We add a + // small penalty per chunk to favor layouts that keep the init span tight, + // without overwhelming the main encoding cost. 
+ if (useBlockInit && initHi > initLo) + { + unsigned initSpan = static_cast(initHi - initLo); + unsigned initCost = ((initSpan + 15) / 16) * 2; + totalCost += initCost; + } + return totalCost; }; From 2f253157bf9294fcbcafae0f3e3b2fec6b262738 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 3 May 2026 19:22:05 +0000 Subject: [PATCH 13/28] Add sizeAsc frame layout strategy Add a size-ascending sort strategy that maximizes the count of locals fitting within the disp8 encoding range by packing smallest locals first. This complements the existing density-based strategies which optimize for hottest-first but may place a hot large struct ahead of several moderately-hot small locals. SPMI results (benchmarks.run_pgo.linux): Before: -443,754 bytes, 11,221 improvements After: -483,664 bytes, 11,582 improvements (+9% more savings) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index d5843e92b8c25e..843d9b0935f71f 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5148,6 +5148,23 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a }; unsigned refDensityCost = tryStrategy(3, "refDensity", refDensityCompare); + // Strategy 4: Size ascending — maximize count of locals in disp8 range. + // Small locals consume less of the disp8 budget, so packing them first + // maximizes how many locals get short encodings. + auto sizeAscCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { + unsigned s1 = lvaLclStackHomeSize(n1); + unsigned s2 = lvaLclStackHomeSize(n2); + if (s1 != s2) return s1 < s2; + // Within same size, prefer hotter locals first. + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] + : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? 
lclRefCounts[n2] + : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + if (c1 != c2) return c1 > c2; + return n1 < n2; + }; + unsigned sizeAscCost = tryStrategy(4, "sizeAsc", sizeAscCompare); + // Apply the winning strategy's sort order (or return nullptr if original won). if (bestStrategy < 0) { @@ -5164,13 +5181,16 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a case 1: jitstd::sort(sortOrder, sortOrder + lvaCount, refCntCompare); break; case 2: jitstd::sort(sortOrder, sortOrder + lvaCount, weightCompare); break; case 3: jitstd::sort(sortOrder, sortOrder + lvaCount, refDensityCompare); break; + case 4: jitstd::sort(sortOrder, sortOrder + lvaCount, sizeAscCompare); break; default: unreached(); } } - JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u; " + JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u " + "sizeAsc=%u; " "selected '%s' (cost=%u, saved %u encoding bytes est.)\n", origCost, densityCost, refCntCost, weightCost, refDensityCost, + sizeAscCost, bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0); return sortOrder; From 656b74a79d148e81edd2e3a7672602d974b9b7c8 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 3 May 2026 19:30:56 +0000 Subject: [PATCH 14/28] Add initGroupedDensity frame layout strategy Group init-needing locals first (sorted by density), then non-init locals by density. This keeps the zero-init span tight, reducing prolog code size from SIMD block-init while still prioritizing hot locals within each group. 
SPMI results (benchmarks.run_pgo.linux): Before: -483,664 bytes, 11,582 improvements (5 strategies) After: -489,753 bytes, 12,253 improvements (6 strategies) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 843d9b0935f71f..478839fd58c6ae 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5165,6 +5165,34 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a }; unsigned sizeAscCost = tryStrategy(4, "sizeAsc", sizeAscCompare); + // Strategy 5: Density with init-grouping — init-needing locals sorted by + // density first, then non-init locals by density. Keeps the zero-init span + // tight while still prioritizing hot locals within each group. + auto initGroupedDensityCompare = [this, lclRefCounts, lclNeedsInit](unsigned n1, unsigned n2) -> bool { + bool init1 = lclNeedsInit[n1], init2 = lclNeedsInit[n2]; + if (init1 != init2) return init1; // init-needing first + unsigned s1 = lvaLclStackHomeSize(n1); + unsigned s2 = lvaLclStackHomeSize(n2); + weight_t w1, w2; + if (lclRefCounts != nullptr) + { + w1 = static_cast(lclRefCounts[n1]); + w2 = static_cast(lclRefCounts[n2]); + } + else + { + w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); + w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); + } + weight_t dens1 = w1 * s2; + weight_t dens2 = w2 * s1; + if (dens1 != dens2) return dens1 > dens2; + bool a1 = (s1 >= 8), a2 = (s2 >= 8); + if (a1 != a2) return a1; + return n1 < n2; + }; + unsigned initGroupedDensityCost = tryStrategy(5, "initGroupedDensity", initGroupedDensityCompare); + // Apply the winning strategy's sort order (or return nullptr if original won). 
if (bestStrategy < 0) { @@ -5182,15 +5210,16 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a case 2: jitstd::sort(sortOrder, sortOrder + lvaCount, weightCompare); break; case 3: jitstd::sort(sortOrder, sortOrder + lvaCount, refDensityCompare); break; case 4: jitstd::sort(sortOrder, sortOrder + lvaCount, sizeAscCompare); break; + case 5: jitstd::sort(sortOrder, sortOrder + lvaCount, initGroupedDensityCompare); break; default: unreached(); } } JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u " - "sizeAsc=%u; " + "sizeAsc=%u initGroupedDensity=%u; " "selected '%s' (cost=%u, saved %u encoding bytes est.)\n", origCost, densityCost, refCntCost, weightCost, refDensityCost, - sizeAscCost, + sizeAscCost, initGroupedDensityCost, bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0); return sortOrder; From df5fb05228a85fa7f121afdf516a89704c83ea33 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 3 May 2026 21:40:34 +0000 Subject: [PATCH 15/28] format --- src/coreclr/jit/lclvars.cpp | 118 ++++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 52 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 478839fd58c6ae..fa53bb9fda507f 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4867,8 +4867,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a JITDUMP("Frame layout optimization: trying multiple strategies for %u locals " "(estimated frame size %u bytes%s)\n", - lvaCount, estimatedLocalSize, - lclRefCounts != nullptr ? ", using lightweight ref counts" : ""); + lvaCount, estimatedLocalSize, lclRefCounts != nullptr ? ", using lightweight ref counts" : ""); // Pre-compute which locals will be allocated in the main loop and their // pass category. Category 0 means "not allocatable" (skipped by the loop). 
@@ -4876,7 +4875,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a for (unsigned i = 0; i < lvaCount; i++) { lclPassCategory[i] = 0; - LclVarDsc* varDsc = lvaGetDesc(i); + LclVarDsc* varDsc = lvaGetDesc(i); if (lvaIsFieldOfDependentlyPromotedStruct(varDsc)) continue; @@ -4892,16 +4891,14 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a continue; if (i == lvaRetAddrVar) continue; - if (i == lvaMonAcquired || i == lvaAsyncExecutionContextVar || - i == lvaAsyncSynchronizationContextVar) + if (i == lvaMonAcquired || i == lvaAsyncExecutionContextVar || i == lvaAsyncSynchronizationContextVar) continue; if (varDsc->lvIsParam && !lvaParamHasLocalStackSpace(i)) continue; if (varDsc->lvIsUnsafeBuffer && compGSReorderStackLayout) { - lclPassCategory[i] = - varDsc->lvIsPtr ? ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS; + lclPassCategory[i] = varDsc->lvIsPtr ? ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS; } else if (varTypeIsGC(varDsc->TypeGet()) && varDsc->lvTracked) { @@ -4918,7 +4915,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a // When block init is used, the JIT zeros a contiguous range [untrLclLo, untrLclHi] // using SIMD stores. The code size depends on the span and alignment, so layouts // that scatter init-requiring locals produce larger prologs. - bool* lclNeedsInit = new (this, CMK_LvaTable) bool[lvaCount]; + bool* lclNeedsInit = new (this, CMK_LvaTable) bool[lvaCount]; unsigned initSlotCount = 0; for (unsigned i = 0; i < lvaCount; i++) { @@ -4996,8 +4993,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a simOff -= static_cast(size); - unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl] - : varDsc->lvRefCnt(lvaRefCountState); + unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl] : varDsc->lvRefCnt(lvaRefCountState); totalCost += refCnt * ((simOff >= -128) ? 
1u : 4u); // Track the zero-init span for block-init cost estimation. @@ -5041,10 +5037,10 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a } // Score the original (unsorted) order as baseline. - unsigned origCost = estimateLayoutCost(sortOrder); - unsigned bestCost = origCost; + unsigned origCost = estimateLayoutCost(sortOrder); + unsigned bestCost = origCost; int bestStrategy = -1; // -1 = original order - const char* bestName = "original"; + const char* bestName = "original"; // Helper to try a strategy: sort sortOrder, estimate cost, track if best. // The array is re-sorted for each strategy; after all strategies are @@ -5082,28 +5078,29 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a // Compare w1/s1 > w2/s2 via cross-multiply to avoid division. weight_t dens1 = w1 * s2; weight_t dens2 = w2 * s1; - if (dens1 != dens2) return dens1 > dens2; + if (dens1 != dens2) + return dens1 > dens2; bool a1 = (s1 >= 8), a2 = (s2 >= 8); - if (a1 != a2) return a1; - unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] - : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] - : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); - if (c1 != c2) return c1 > c2; + if (a1 != a2) + return a1; + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + if (c1 != c2) + return c1 > c2; return n1 < n2; }; unsigned densityCost = tryStrategy(0, "density", densityCompare); // Strategy 1: Unweighted ref count descending. auto refCntCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] - : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = (lclRefCounts != nullptr) ? 
lclRefCounts[n2] - : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); - if (c1 != c2) return c1 > c2; + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + if (c1 != c2) + return c1 > c2; bool a1 = (lvaLclStackHomeSize(n1) >= 8); bool a2 = (lvaLclStackHomeSize(n2) >= 8); - if (a1 != a2) return a1; + if (a1 != a2) + return a1; return n1 < n2; }; unsigned refCntCost = tryStrategy(1, "refCnt", refCntCompare); @@ -5122,28 +5119,30 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); } - if (w1 != w2) return w1 > w2; + if (w1 != w2) + return w1 > w2; bool a1 = (lvaLclStackHomeSize(n1) >= 8); bool a2 = (lvaLclStackHomeSize(n2) >= 8); - if (a1 != a2) return a1; + if (a1 != a2) + return a1; return n1 < n2; }; unsigned weightCost = tryStrategy(2, "weight", weightCompare); // Strategy 3: Unweighted ref count density (refCnt / size) descending. auto refDensityCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] - : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] - : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); unsigned s1 = lvaLclStackHomeSize(n1); unsigned s2 = lvaLclStackHomeSize(n2); // Cross-multiply to avoid division. 
unsigned long long dens1 = (unsigned long long)c1 * s2; unsigned long long dens2 = (unsigned long long)c2 * s1; - if (dens1 != dens2) return dens1 > dens2; + if (dens1 != dens2) + return dens1 > dens2; bool a1 = (s1 >= 8), a2 = (s2 >= 8); - if (a1 != a2) return a1; + if (a1 != a2) + return a1; return n1 < n2; }; unsigned refDensityCost = tryStrategy(3, "refDensity", refDensityCompare); @@ -5154,13 +5153,13 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a auto sizeAscCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { unsigned s1 = lvaLclStackHomeSize(n1); unsigned s2 = lvaLclStackHomeSize(n2); - if (s1 != s2) return s1 < s2; + if (s1 != s2) + return s1 < s2; // Within same size, prefer hotter locals first. - unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] - : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] - : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); - if (c1 != c2) return c1 > c2; + unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); + unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); + if (c1 != c2) + return c1 > c2; return n1 < n2; }; unsigned sizeAscCost = tryStrategy(4, "sizeAsc", sizeAscCompare); @@ -5170,7 +5169,8 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a // tight while still prioritizing hot locals within each group. 
auto initGroupedDensityCompare = [this, lclRefCounts, lclNeedsInit](unsigned n1, unsigned n2) -> bool { bool init1 = lclNeedsInit[n1], init2 = lclNeedsInit[n2]; - if (init1 != init2) return init1; // init-needing first + if (init1 != init2) + return init1; // init-needing first unsigned s1 = lvaLclStackHomeSize(n1); unsigned s2 = lvaLclStackHomeSize(n2); weight_t w1, w2; @@ -5186,9 +5186,11 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a } weight_t dens1 = w1 * s2; weight_t dens2 = w2 * s1; - if (dens1 != dens2) return dens1 > dens2; + if (dens1 != dens2) + return dens1 > dens2; bool a1 = (s1 >= 8), a2 = (s2 >= 8); - if (a1 != a2) return a1; + if (a1 != a2) + return a1; return n1 < n2; }; unsigned initGroupedDensityCost = tryStrategy(5, "initGroupedDensity", initGroupedDensityCompare); @@ -5205,21 +5207,33 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a sortOrder[i] = i; switch (bestStrategy) { - case 0: jitstd::sort(sortOrder, sortOrder + lvaCount, densityCompare); break; - case 1: jitstd::sort(sortOrder, sortOrder + lvaCount, refCntCompare); break; - case 2: jitstd::sort(sortOrder, sortOrder + lvaCount, weightCompare); break; - case 3: jitstd::sort(sortOrder, sortOrder + lvaCount, refDensityCompare); break; - case 4: jitstd::sort(sortOrder, sortOrder + lvaCount, sizeAscCompare); break; - case 5: jitstd::sort(sortOrder, sortOrder + lvaCount, initGroupedDensityCompare); break; - default: unreached(); + case 0: + jitstd::sort(sortOrder, sortOrder + lvaCount, densityCompare); + break; + case 1: + jitstd::sort(sortOrder, sortOrder + lvaCount, refCntCompare); + break; + case 2: + jitstd::sort(sortOrder, sortOrder + lvaCount, weightCompare); + break; + case 3: + jitstd::sort(sortOrder, sortOrder + lvaCount, refDensityCompare); + break; + case 4: + jitstd::sort(sortOrder, sortOrder + lvaCount, sizeAscCompare); + break; + case 5: + jitstd::sort(sortOrder, sortOrder + lvaCount, 
initGroupedDensityCompare); + break; + default: + unreached(); } } JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u " "sizeAsc=%u initGroupedDensity=%u; " "selected '%s' (cost=%u, saved %u encoding bytes est.)\n", - origCost, densityCost, refCntCost, weightCost, refDensityCost, - sizeAscCost, initGroupedDensityCost, + origCost, densityCost, refCntCost, weightCost, refDensityCost, sizeAscCost, initGroupedDensityCost, bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0); return sortOrder; From 1108871298b51cd0a13dc6d3b3ae5867493aecc0 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 4 May 2026 14:35:29 +0000 Subject: [PATCH 16/28] make it faster --- src/coreclr/jit/lclvars.cpp | 333 +++++++++++++++--------------------- 1 file changed, 141 insertions(+), 192 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index fa53bb9fda507f..a98a1423f9b1e2 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4825,10 +4825,15 @@ enum LclAllocCategory : UINT // unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* allocOrder) { - unsigned estimatedLocalSize = 0; + // Pre-compute local sizes and total estimated frame size in one pass. + // These arrays are indexed by lclNum and used throughout to avoid repeated + // function calls in sort comparators and cost estimation. + unsigned* lclSize = new (this, CMK_LvaTable) unsigned[lvaCount]; + unsigned estimatedLocalSize = 0; for (unsigned i = 0; i < lvaCount; i++) { - estimatedLocalSize += lvaLclStackHomeSize(i); + lclSize[i] = lvaLclStackHomeSize(i); + estimatedLocalSize += lclSize[i]; } // Skip frames where even with alignment padding, all locals will fit in disp8 range. @@ -4839,15 +4844,16 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a return nullptr; } - // For MinOpts/Tier0, precise ref counts are not available (all lvRefCnt == 0). 
- // Do a lightweight LIR walk to count local references for sorting purposes. - // This is much cheaper than the full lvaMarkLclRefs pass — we only count - // occurrences without any of the analysis side effects. - unsigned* lclRefCounts = nullptr; - if (!PreciseRefCountsRequired()) + // Pre-compute ref counts and weights into flat arrays for fast comparator access. + // For MinOpts/Tier0, precise ref counts are not available (all lvRefCnt == 0), + // so we do a lightweight LIR walk to count local references. + unsigned* lclRefCnt = new (this, CMK_LvaTable) unsigned[lvaCount]; + weight_t* lclWeight = new (this, CMK_LvaTable) weight_t[lvaCount]; + bool isMinOpts = !PreciseRefCountsRequired(); + + if (isMinOpts) { - lclRefCounts = new (this, CMK_LvaTable) unsigned[lvaCount]; - memset(lclRefCounts, 0, lvaCount * sizeof(unsigned)); + memset(lclRefCnt, 0, lvaCount * sizeof(unsigned)); for (BasicBlock* const block : Blocks()) { @@ -4858,16 +4864,56 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a unsigned lclNum = node->AsLclVarCommon()->GetLclNum(); if (lclNum < lvaCount) { - lclRefCounts[lclNum]++; + lclRefCnt[lclNum]++; } } } } + + // For MinOpts, weighted = unweighted (no block weights available). + for (unsigned i = 0; i < lvaCount; i++) + { + lclWeight[i] = static_cast(lclRefCnt[i]); + } + } + else + { + for (unsigned i = 0; i < lvaCount; i++) + { + LclVarDsc* varDsc = lvaGetDesc(i); + lclRefCnt[i] = varDsc->lvRefCnt(lvaRefCountState); + lclWeight[i] = varDsc->lvRefCntWtd(lvaRefCountState); + } } - JITDUMP("Frame layout optimization: trying multiple strategies for %u locals " + // Pre-compute alignment requirements for each local. + // 0 = no alignment needed, otherwise the required alignment in bytes. 
+ unsigned* lclAlignTo = new (this, CMK_LvaTable) unsigned[lvaCount]; + for (unsigned i = 0; i < lvaCount; i++) + { + if (lclSize[i] < 8) + { + lclAlignTo[i] = 0; + } + else + { +#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES + LclVarDsc* varDsc = lvaGetDesc(i); + if (varTypeIsSIMD(varDsc)) + { + lclAlignTo[i] = static_cast(getSIMDTypeAlignment(varDsc->TypeGet())); + } + else +#endif + { + lclAlignTo[i] = 8; + } + } + } + + JITDUMP("Frame layout optimization: trying strategies for %u locals " "(estimated frame size %u bytes%s)\n", - lvaCount, estimatedLocalSize, lclRefCounts != nullptr ? ", using lightweight ref counts" : ""); + lvaCount, estimatedLocalSize, isMinOpts ? ", using lightweight ref counts" : ""); // Pre-compute which locals will be allocated in the main loop and their // pass category. Category 0 means "not allocatable" (skipped by the loop). @@ -4937,7 +4983,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a if (info.compInitMem || varDsc->HasGCPtr() || varDsc->lvMustInit) { lclNeedsInit[i] = true; - initSlotCount += (lvaLclStackHomeSize(i) + sizeof(int) - 1) / sizeof(int); + initSlotCount += (lclSize[i] + sizeof(int) - 1) / sizeof(int); } } @@ -4966,35 +5012,18 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a if (lclPassCategory[lcl] != pass) continue; - LclVarDsc* varDsc = lvaGetDesc(lcl); - unsigned size = lvaLclStackHomeSize(lcl); + unsigned size = lclSize[lcl]; + unsigned alignTo = lclAlignTo[lcl]; // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset). 
- if (size >= 8) + if (alignTo != 0 && (simOff % static_cast(alignTo)) != 0) { -#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES - if (varTypeIsSIMD(varDsc)) - { - int alignment = getSIMDTypeAlignment(varDsc->TypeGet()); - if (simOff % alignment != 0) - { - simOff -= static_cast(alignment + (simOff % alignment)); - } - } - else -#endif - { - if ((simOff % 8) != 0) - { - simOff -= static_cast(8 + (simOff % 8)); - } - } + simOff -= static_cast(alignTo + (simOff % static_cast(alignTo))); } simOff -= static_cast(size); - unsigned refCnt = (lclRefCounts != nullptr) ? lclRefCounts[lcl] : varDsc->lvRefCnt(lvaRefCountState); - totalCost += refCnt * ((simOff >= -128) ? 1u : 4u); + totalCost += lclRefCnt[lcl] * ((simOff >= -128) ? 1u : 4u); // Track the zero-init span for block-init cost estimation. if (useBlockInit && lclNeedsInit[lcl]) @@ -5031,6 +5060,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a }; unsigned* sortOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; + unsigned* bestOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; for (unsigned i = 0; i < lvaCount; i++) { sortOrder[i] = i; @@ -5043,8 +5073,8 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a const char* bestName = "original"; // Helper to try a strategy: sort sortOrder, estimate cost, track if best. - // The array is re-sorted for each strategy; after all strategies are - // evaluated, we do one final sort with the winner. + // When a strategy improves on the current best, we save its permutation + // into bestOrder to avoid a redundant re-sort at the end. 
auto tryStrategy = [&](int strategyIdx, const char* name, auto comparator) -> unsigned { for (unsigned i = 0; i < lvaCount; i++) sortOrder[i] = i; @@ -5055,188 +5085,107 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a bestCost = cost; bestStrategy = strategyIdx; bestName = name; + memcpy(bestOrder, sortOrder, lvaCount * sizeof(unsigned)); } return cost; }; // Strategy 0: Access density (weighted ref count / size) descending. // A small hot local is more valuable per frame byte than a large hot local. - auto densityCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - unsigned s1 = lvaLclStackHomeSize(n1); - unsigned s2 = lvaLclStackHomeSize(n2); - weight_t w1, w2; - if (lclRefCounts != nullptr) - { - w1 = static_cast(lclRefCounts[n1]); - w2 = static_cast(lclRefCounts[n2]); - } - else - { - w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); - w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); - } - // Compare w1/s1 > w2/s2 via cross-multiply to avoid division. - weight_t dens1 = w1 * s2; - weight_t dens2 = w2 * s1; + auto densityCompare = [lclSize, lclWeight, lclRefCnt](unsigned n1, unsigned n2) -> bool { + weight_t dens1 = lclWeight[n1] * lclSize[n2]; + weight_t dens2 = lclWeight[n2] * lclSize[n1]; if (dens1 != dens2) return dens1 > dens2; - bool a1 = (s1 >= 8), a2 = (s2 >= 8); + bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8); if (a1 != a2) return a1; - unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); - if (c1 != c2) - return c1 > c2; + if (lclRefCnt[n1] != lclRefCnt[n2]) + return lclRefCnt[n1] > lclRefCnt[n2]; return n1 < n2; }; unsigned densityCost = tryStrategy(0, "density", densityCompare); - // Strategy 1: Unweighted ref count descending. 
- auto refCntCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); - if (c1 != c2) - return c1 > c2; - bool a1 = (lvaLclStackHomeSize(n1) >= 8); - bool a2 = (lvaLclStackHomeSize(n2) >= 8); - if (a1 != a2) - return a1; - return n1 < n2; - }; - unsigned refCntCost = tryStrategy(1, "refCnt", refCntCompare); - - // Strategy 2: Weighted ref count descending. - // For MinOpts, weighted = unweighted (no block weights available). - auto weightCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - weight_t w1, w2; - if (lclRefCounts != nullptr) - { - w1 = static_cast(lclRefCounts[n1]); - w2 = static_cast(lclRefCounts[n2]); - } - else - { - w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); - w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); - } - if (w1 != w2) - return w1 > w2; - bool a1 = (lvaLclStackHomeSize(n1) >= 8); - bool a2 = (lvaLclStackHomeSize(n2) >= 8); - if (a1 != a2) - return a1; - return n1 < n2; - }; - unsigned weightCost = tryStrategy(2, "weight", weightCompare); - - // Strategy 3: Unweighted ref count density (refCnt / size) descending. - auto refDensityCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); - unsigned s1 = lvaLclStackHomeSize(n1); - unsigned s2 = lvaLclStackHomeSize(n2); - // Cross-multiply to avoid division. 
- unsigned long long dens1 = (unsigned long long)c1 * s2; - unsigned long long dens2 = (unsigned long long)c2 * s1; - if (dens1 != dens2) - return dens1 > dens2; - bool a1 = (s1 >= 8), a2 = (s2 >= 8); - if (a1 != a2) - return a1; - return n1 < n2; - }; - unsigned refDensityCost = tryStrategy(3, "refDensity", refDensityCompare); - - // Strategy 4: Size ascending — maximize count of locals in disp8 range. + // Strategy 1: Size ascending — maximize count of locals in disp8 range. // Small locals consume less of the disp8 budget, so packing them first // maximizes how many locals get short encodings. - auto sizeAscCompare = [this, lclRefCounts](unsigned n1, unsigned n2) -> bool { - unsigned s1 = lvaLclStackHomeSize(n1); - unsigned s2 = lvaLclStackHomeSize(n2); - if (s1 != s2) - return s1 < s2; - // Within same size, prefer hotter locals first. - unsigned c1 = (lclRefCounts != nullptr) ? lclRefCounts[n1] : lvaGetDesc(n1)->lvRefCnt(lvaRefCountState); - unsigned c2 = (lclRefCounts != nullptr) ? lclRefCounts[n2] : lvaGetDesc(n2)->lvRefCnt(lvaRefCountState); - if (c1 != c2) - return c1 > c2; + auto sizeAscCompare = [lclSize, lclRefCnt](unsigned n1, unsigned n2) -> bool { + if (lclSize[n1] != lclSize[n2]) + return lclSize[n1] < lclSize[n2]; + if (lclRefCnt[n1] != lclRefCnt[n2]) + return lclRefCnt[n1] > lclRefCnt[n2]; return n1 < n2; }; - unsigned sizeAscCost = tryStrategy(4, "sizeAsc", sizeAscCompare); + unsigned sizeAscCost = tryStrategy(1, "sizeAsc", sizeAscCompare); + + // Strategies 2-3: Weight-based sorts that differ from density/sizeAsc only when + // PGO block weights are available (FullOpts). For MinOpts, weighted == unweighted + // and refDensity == density, so these are redundant and skipped. + unsigned weightCost = 0; + unsigned refDensityCost = 0; + if (!isMinOpts) + { + // Strategy 2: Weighted ref count descending. 
+ auto weightCompare = [lclSize, lclWeight](unsigned n1, unsigned n2) -> bool { + if (lclWeight[n1] != lclWeight[n2]) + return lclWeight[n1] > lclWeight[n2]; + bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8); + if (a1 != a2) + return a1; + return n1 < n2; + }; + weightCost = tryStrategy(2, "weight", weightCompare); + + // Strategy 3: Unweighted ref count density (refCnt / size) descending. + auto refDensityCompare = [lclSize, lclRefCnt](unsigned n1, unsigned n2) -> bool { + unsigned long long dens1 = (unsigned long long)lclRefCnt[n1] * lclSize[n2]; + unsigned long long dens2 = (unsigned long long)lclRefCnt[n2] * lclSize[n1]; + if (dens1 != dens2) + return dens1 > dens2; + bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8); + if (a1 != a2) + return a1; + return n1 < n2; + }; + refDensityCost = tryStrategy(3, "refDensity", refDensityCompare); + } - // Strategy 5: Density with init-grouping — init-needing locals sorted by + // Strategy 4: Density with init-grouping — init-needing locals sorted by // density first, then non-init locals by density. Keeps the zero-init span // tight while still prioritizing hot locals within each group. 
- auto initGroupedDensityCompare = [this, lclRefCounts, lclNeedsInit](unsigned n1, unsigned n2) -> bool { - bool init1 = lclNeedsInit[n1], init2 = lclNeedsInit[n2]; - if (init1 != init2) - return init1; // init-needing first - unsigned s1 = lvaLclStackHomeSize(n1); - unsigned s2 = lvaLclStackHomeSize(n2); - weight_t w1, w2; - if (lclRefCounts != nullptr) - { - w1 = static_cast(lclRefCounts[n1]); - w2 = static_cast(lclRefCounts[n2]); - } - else - { - w1 = lvaGetDesc(n1)->lvRefCntWtd(lvaRefCountState); - w2 = lvaGetDesc(n2)->lvRefCntWtd(lvaRefCountState); - } - weight_t dens1 = w1 * s2; - weight_t dens2 = w2 * s1; - if (dens1 != dens2) - return dens1 > dens2; - bool a1 = (s1 >= 8), a2 = (s2 >= 8); - if (a1 != a2) - return a1; - return n1 < n2; - }; - unsigned initGroupedDensityCost = tryStrategy(5, "initGroupedDensity", initGroupedDensityCompare); + // Only useful when block init will be used (otherwise identical to density). + unsigned initGroupedDensityCost = 0; + if (useBlockInit) + { + auto initGroupedDensityCompare = [lclSize, lclWeight, lclNeedsInit](unsigned n1, unsigned n2) -> bool { + bool init1 = lclNeedsInit[n1], init2 = lclNeedsInit[n2]; + if (init1 != init2) + return init1; // init-needing first + weight_t dens1 = lclWeight[n1] * lclSize[n2]; + weight_t dens2 = lclWeight[n2] * lclSize[n1]; + if (dens1 != dens2) + return dens1 > dens2; + bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8); + if (a1 != a2) + return a1; + return n1 < n2; + }; + initGroupedDensityCost = tryStrategy(4, "initGroupedDensity", initGroupedDensityCompare); + } - // Apply the winning strategy's sort order (or return nullptr if original won). + // Return the winning permutation (saved in bestOrder), or nullptr if original won. if (bestStrategy < 0) { - sortOrder = nullptr; - } - else - { - // Re-sort with the winning comparator to produce the final order. 
- for (unsigned i = 0; i < lvaCount; i++) - sortOrder[i] = i; - switch (bestStrategy) - { - case 0: - jitstd::sort(sortOrder, sortOrder + lvaCount, densityCompare); - break; - case 1: - jitstd::sort(sortOrder, sortOrder + lvaCount, refCntCompare); - break; - case 2: - jitstd::sort(sortOrder, sortOrder + lvaCount, weightCompare); - break; - case 3: - jitstd::sort(sortOrder, sortOrder + lvaCount, refDensityCompare); - break; - case 4: - jitstd::sort(sortOrder, sortOrder + lvaCount, sizeAscCompare); - break; - case 5: - jitstd::sort(sortOrder, sortOrder + lvaCount, initGroupedDensityCompare); - break; - default: - unreached(); - } + bestOrder = nullptr; } - JITDUMP("Frame layout costs: original=%u density=%u refCnt=%u weight=%u refDensity=%u " - "sizeAsc=%u initGroupedDensity=%u; " + JITDUMP("Frame layout costs: original=%u density=%u sizeAsc=%u weight=%u refDensity=%u " + "initGroupedDensity=%u; " "selected '%s' (cost=%u, saved %u encoding bytes est.)\n", - origCost, densityCost, refCntCost, weightCost, refDensityCost, sizeAscCost, initGroupedDensityCost, - bestName, bestCost, origCost > bestCost ? origCost - bestCost : 0); + origCost, densityCost, sizeAscCost, weightCost, refDensityCost, initGroupedDensityCost, bestName, bestCost, + origCost > bestCost ? 
origCost - bestCost : 0); - return sortOrder; + return bestOrder; } #endif // TARGET_XARCH From 9edbb980f23567602ba1aa932f199a049a36813e Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 4 May 2026 15:51:09 +0000 Subject: [PATCH 17/28] cleanup --- src/coreclr/jit/lclvars.cpp | 80 +++++++++++++++++++++++++++++++------ 1 file changed, 67 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index a98a1423f9b1e2..a5635979b28a58 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4924,29 +4924,45 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a LclVarDsc* varDsc = lvaGetDesc(i); if (lvaIsFieldOfDependentlyPromotedStruct(varDsc)) + { continue; + } #if FEATURE_FIXED_OUT_ARGS if (i == lvaOutgoingArgSpaceVar) + { continue; + } #endif if (lvaIsOSRLocal(i)) + { continue; + } if (!varDsc->lvOnFrame) + { continue; - if (i == lvaGSSecurityCookie && getNeedsGSSecurityCookie()) + } + if ((i == lvaGSSecurityCookie) && getNeedsGSSecurityCookie()) + { continue; + } if (i == lvaRetAddrVar) + { continue; - if (i == lvaMonAcquired || i == lvaAsyncExecutionContextVar || i == lvaAsyncSynchronizationContextVar) + } + if ((i == lvaMonAcquired) || (i == lvaAsyncExecutionContextVar) || (i == lvaAsyncSynchronizationContextVar)) + { continue; - if (varDsc->lvIsParam && !lvaParamHasLocalStackSpace(i)) + } + if ((varDsc->lvIsParam) && !lvaParamHasLocalStackSpace(i)) + { continue; + } - if (varDsc->lvIsUnsafeBuffer && compGSReorderStackLayout) + if ((varDsc->lvIsUnsafeBuffer) && compGSReorderStackLayout) { lclPassCategory[i] = varDsc->lvIsPtr ? 
ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS; } - else if (varTypeIsGC(varDsc->TypeGet()) && varDsc->lvTracked) + else if (varTypeIsGC(varDsc->TypeGet()) && (varDsc->lvTracked)) { lclPassCategory[i] = ALLOC_PTRS; } @@ -4967,20 +4983,30 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a { lclNeedsInit[i] = false; if (lclPassCategory[i] == 0) + { continue; + } LclVarDsc* varDsc = lvaGetDesc(i); if (fgVarIsNeverZeroInitializedInProlog(i)) + { continue; + } if (lvaIsFieldOfDependentlyPromotedStruct(varDsc)) + { continue; + } if (varDsc->lvHasExplicitInit) + { continue; - if (varDsc->lvIsTemp && !varDsc->HasGCPtr()) + } + if ((varDsc->lvIsTemp) && !varDsc->HasGCPtr()) + { continue; + } - if (info.compInitMem || varDsc->HasGCPtr() || varDsc->lvMustInit) + if ((info.compInitMem) || varDsc->HasGCPtr() || (varDsc->lvMustInit)) { lclNeedsInit[i] = true; initSlotCount += (lclSize[i] + sizeof(int) - 1) / sizeof(int); @@ -5010,13 +5036,15 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a { unsigned lcl = order[idx]; if (lclPassCategory[lcl] != pass) + { continue; + } unsigned size = lclSize[lcl]; unsigned alignTo = lclAlignTo[lcl]; // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset). - if (alignTo != 0 && (simOff % static_cast(alignTo)) != 0) + if ((alignTo != 0) && ((simOff % static_cast(alignTo)) != 0)) { simOff -= static_cast(alignTo + (simOff % static_cast(alignTo))); } @@ -5026,11 +5054,11 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a totalCost += lclRefCnt[lcl] * ((simOff >= -128) ? 1u : 4u); // Track the zero-init span for block-init cost estimation. 
- if (useBlockInit && lclNeedsInit[lcl]) + if (useBlockInit && (lclNeedsInit[lcl])) { int loOffs = simOff; int hiOffs = simOff + static_cast(size); - if (initLo == 0 && initHi == 0) + if ((initLo == 0) && (initHi == 0)) { initLo = loOffs; initHi = hiOffs; @@ -5049,7 +5077,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a // Each 16-byte chunk requires one SIMD store instruction. We add a // small penalty per chunk to favor layouts that keep the init span tight, // without overwhelming the main encoding cost. - if (useBlockInit && initHi > initLo) + if (useBlockInit && (initHi > initLo)) { unsigned initSpan = static_cast(initHi - initLo); unsigned initCost = ((initSpan + 15) / 16) * 2; @@ -5077,7 +5105,9 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a // into bestOrder to avoid a redundant re-sort at the end. auto tryStrategy = [&](int strategyIdx, const char* name, auto comparator) -> unsigned { for (unsigned i = 0; i < lvaCount; i++) + { sortOrder[i] = i; + } jitstd::sort(sortOrder, sortOrder + lvaCount, comparator); unsigned cost = estimateLayoutCost(sortOrder); if (cost < bestCost) @@ -5096,12 +5126,18 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a weight_t dens1 = lclWeight[n1] * lclSize[n2]; weight_t dens2 = lclWeight[n2] * lclSize[n1]; if (dens1 != dens2) + { return dens1 > dens2; + } bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8); if (a1 != a2) + { return a1; + } if (lclRefCnt[n1] != lclRefCnt[n2]) + { return lclRefCnt[n1] > lclRefCnt[n2]; + } return n1 < n2; }; unsigned densityCost = tryStrategy(0, "density", densityCompare); @@ -5111,9 +5147,13 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a // maximizes how many locals get short encodings. 
auto sizeAscCompare = [lclSize, lclRefCnt](unsigned n1, unsigned n2) -> bool { if (lclSize[n1] != lclSize[n2]) + { return lclSize[n1] < lclSize[n2]; + } if (lclRefCnt[n1] != lclRefCnt[n2]) + { return lclRefCnt[n1] > lclRefCnt[n2]; + } return n1 < n2; }; unsigned sizeAscCost = tryStrategy(1, "sizeAsc", sizeAscCompare); @@ -5128,23 +5168,31 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a // Strategy 2: Weighted ref count descending. auto weightCompare = [lclSize, lclWeight](unsigned n1, unsigned n2) -> bool { if (lclWeight[n1] != lclWeight[n2]) + { return lclWeight[n1] > lclWeight[n2]; + } bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8); if (a1 != a2) + { return a1; + } return n1 < n2; }; weightCost = tryStrategy(2, "weight", weightCompare); // Strategy 3: Unweighted ref count density (refCnt / size) descending. auto refDensityCompare = [lclSize, lclRefCnt](unsigned n1, unsigned n2) -> bool { - unsigned long long dens1 = (unsigned long long)lclRefCnt[n1] * lclSize[n2]; - unsigned long long dens2 = (unsigned long long)lclRefCnt[n2] * lclSize[n1]; + double dens1 = (double)lclRefCnt[n1] * lclSize[n2]; + double dens2 = (double)lclRefCnt[n2] * lclSize[n1]; if (dens1 != dens2) + { return dens1 > dens2; + } bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8); if (a1 != a2) + { return a1; + } return n1 < n2; }; refDensityCost = tryStrategy(3, "refDensity", refDensityCompare); @@ -5160,14 +5208,20 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a auto initGroupedDensityCompare = [lclSize, lclWeight, lclNeedsInit](unsigned n1, unsigned n2) -> bool { bool init1 = lclNeedsInit[n1], init2 = lclNeedsInit[n2]; if (init1 != init2) + { return init1; // init-needing first + } weight_t dens1 = lclWeight[n1] * lclSize[n2]; weight_t dens2 = lclWeight[n2] * lclSize[n1]; if (dens1 != dens2) + { return dens1 > dens2; + } bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8); if (a1 != a2) + { return a1; + } return 
n1 < n2; }; initGroupedDensityCost = tryStrategy(4, "initGroupedDensity", initGroupedDensityCompare); From f7ebd17c367fff65da666d4abe3cff4bcfb65781 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 4 May 2026 23:06:36 +0000 Subject: [PATCH 18/28] Add early maxSavings gate to skip optimization when benefit is small Compute a quick upper bound on potential savings before the expensive alignment/category/init pre-computation and sorting phases. Walks locals in default order, counts refs beyond disp8 range, and bails out if the maximum achievable code size reduction is <= 12 bytes. Skips ~14% of MinOpts methods while retaining 98-100% of savings. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index a5635979b28a58..104c50e07237c3 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4886,6 +4886,39 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a } } + // Quick upper bound on potential savings: walk locals in default order, + // accumulate frame size, and count weighted refs that fall beyond disp8 range. + // This approximation ignores alignment padding and allocation passes but is + // cheap to compute and lets us skip methods where reordering cannot help much. + { + int simOff = stkOffs; + unsigned refsInDisp32 = 0; + + for (unsigned i = 0; i < lvaCount; i++) + { + if (lclSize[i] == 0) + { + continue; + } + simOff -= static_cast(lclSize[i]); + if (simOff < -128) + { + refsInDisp32 += lclRefCnt[i]; + } + } + + // Each ref beyond disp8 costs 3 extra bytes (disp32 vs disp8 encoding). + // If even moving ALL those refs into disp8 range wouldn't save much, skip. 
+ unsigned maxSavings = refsInDisp32 * 3; + if (maxSavings <= 12) + { + JITDUMP("Frame layout optimization: skipping — max possible savings is %u bytes " + "(refsInDisp32=%u)\n", + maxSavings, refsInDisp32); + return nullptr; + } + } + // Pre-compute alignment requirements for each local. // 0 = no alignment needed, otherwise the required alignment in bytes. unsigned* lclAlignTo = new (this, CMK_LvaTable) unsigned[lvaCount]; From b4d6ee5378fdd6d87cb01da0e3ac6b249a18a5b0 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 4 May 2026 23:07:19 +0000 Subject: [PATCH 19/28] format --- src/coreclr/jit/lclvars.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 104c50e07237c3..4d7dbb486e96cb 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4891,8 +4891,8 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a // This approximation ignores alignment padding and allocation passes but is // cheap to compute and lets us skip methods where reordering cannot help much. { - int simOff = stkOffs; - unsigned refsInDisp32 = 0; + int simOff = stkOffs; + unsigned refsInDisp32 = 0; for (unsigned i = 0; i < lvaCount; i++) { From cf3f0c359ddaf691ed83371378292e87e47aa7f2 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 14 May 2026 09:48:33 -0700 Subject: [PATCH 20/28] Address PR review feedback - Use signed arithmetic for alignment-pad simulation in estimateLayoutCost to avoid mixing unsigned alignTo with the (possibly negative) signed remainder. - Replace the unconditional FINAL_FRAME_LAYOUT assert in lvaAssignVirtualFrameOffsetsToLocals with a guard so non-final layout passes (PRE_REGALLOC/REGALLOC/TENTATIVE) simply skip the optimization rather than asserting. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 68a88b80d6f125..3d5ca945af1571 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5115,9 +5115,15 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a unsigned alignTo = lclAlignTo[lcl]; // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset). - if ((alignTo != 0) && ((simOff % static_cast<int>(alignTo)) != 0)) + // Use signed arithmetic throughout: simOff is negative, and the remainder + // (simOff % alignment) is non-positive, so pad = alignment + remainder + // yields a small positive value in [1, alignment-1]. Mixing unsigned alignTo + // with the signed remainder would convert the negative remainder to a huge + // unsigned value and corrupt simOff. + int signedAlign = static_cast<int>(alignTo); + if ((signedAlign != 0) && ((simOff % signedAlign) != 0)) { - simOff -= static_cast<int>(alignTo + (simOff % static_cast<int>(alignTo))); + simOff -= signedAlign + (simOff % signedAlign); } simOff -= static_cast<int>(size); @@ -5688,9 +5694,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() #ifdef TARGET_XARCH // Multi-strategy frame layout optimization for x86/x64. // See lvaComputeOptimalFrameLayoutOrder for details. - assert(lvaDoneFrameLayout == FINAL_FRAME_LAYOUT); + // Only attempt the optimization during the final layout pass; earlier passes + // (PRE_REGALLOC/REGALLOC/TENTATIVE) may invoke lvaAssignVirtualFrameOffsetsToLocals + // for size estimation, and the cost-model assumptions only hold for the final pass.
unsigned* lclVarSortOrder = nullptr; - if (lvaLocalVarRefCounted() && !opts.compDbgEnC && codeGen->isFramePointerUsed()) + if ((lvaDoneFrameLayout == FINAL_FRAME_LAYOUT) && lvaLocalVarRefCounted() && !opts.compDbgEnC && + codeGen->isFramePointerUsed()) { lclVarSortOrder = lvaComputeOptimalFrameLayoutOrder(stkOffs, alloc_order); } From ac57311a9fdfd5fdb14e4aec90a55ed8f5ad7602 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 14 May 2026 12:41:19 -0700 Subject: [PATCH 21/28] Add JitFrameLayoutMaxSavingsThreshold config knob Replace the hard-coded maxSavings <= 12 early-out in lvaComputeOptimalFrameLayoutOrder with a tunable JIT config option, keeping the existing value (12) as the default. This makes it easy to experiment with the TP-vs-code-size tradeoff without rebuilding the JIT. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/jitconfigvalues.h | 1 + src/coreclr/jit/lclvars.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 88ac0fe83e4924..b091b2851bbff0 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -694,6 +694,7 @@ RELEASE_CONFIG_INTEGER(JitExtDefaultPolicyProfScale, "JitExtDefaultPolicyProfSca RELEASE_CONFIG_INTEGER(JitInlinePolicyModel, "JitInlinePolicyModel", 0) RELEASE_CONFIG_INTEGER(JitInlinePolicyProfile, "JitInlinePolicyProfile", 0) RELEASE_CONFIG_INTEGER(JitInlinePolicyProfileThreshold, "JitInlinePolicyProfileThreshold", 40) +RELEASE_CONFIG_INTEGER(JitFrameLayoutMaxSavingsThreshold, "JitFrameLayoutMaxSavingsThreshold", 12) CONFIG_STRING(JitObjectStackAllocationRange, "JitObjectStackAllocationRange") RELEASE_CONFIG_INTEGER(JitObjectStackAllocation, "JitObjectStackAllocation", 1) RELEASE_CONFIG_INTEGER(JitObjectStackAllocationRefClass, "JitObjectStackAllocationRefClass", 1) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 
3d5ca945af1571..1fe6e7d25c3b13 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4948,7 +4948,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a // Each ref beyond disp8 costs 3 extra bytes (disp32 vs disp8 encoding). // If even moving ALL those refs into disp8 range wouldn't save much, skip. unsigned maxSavings = refsInDisp32 * 3; - if (maxSavings <= 12) + if (maxSavings <= (unsigned)JitConfig.JitFrameLayoutMaxSavingsThreshold()) { JITDUMP("Frame layout optimization: skipping — max possible savings is %u bytes " "(refsInDisp32=%u)\n", From a2357533bd4c08dbb71e3da176258f9ccd9d6a61 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 14 May 2026 16:30:10 -0700 Subject: [PATCH 22/28] Skip initGroupedDensity (S4) strategy at MinOpts Empirical analysis on libraries_tests_no_tiered_compilation showed that S4 (init-grouped density) contributes only ~0.7% of the total code-size wins at MinOpts while incurring the same per-strategy throughput cost as the other sorts. Skipping it at MinOpts shaves ~0.5pp off MinOpts JIT TP (from +1.98% to +1.47%) at negligible code-size impact. Also remove the experimental JitFrameLayoutStrategyMask config introduced during the sweep, and refine the S2/S3 comment. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 1fe6e7d25c3b13..38cceae12b1d2b 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5236,8 +5236,9 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a unsigned sizeAscCost = tryStrategy(1, "sizeAsc", sizeAscCompare); // Strategies 2-3: Weight-based sorts that differ from density/sizeAsc only when - // PGO block weights are available (FullOpts). 
For MinOpts, weighted == unweighted - // and refDensity == density, so these are redundant and skipped. + // PGO block weights are available (FullOpts). For MinOpts, lclWeight == lclRefCnt, + // so S2 is redundant with density once D is in the set (empirically adds <1% of the + // code-size wins for the full per-strategy TP cost), and S3 == density. Skipped at MinOpts. unsigned weightCost = 0; unsigned refDensityCost = 0; if (!isMinOpts) @@ -5279,8 +5280,10 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a // density first, then non-init locals by density. Keeps the zero-init span // tight while still prioritizing hot locals within each group. // Only useful when block init will be used (otherwise identical to density). + // Skipped at MinOpts: empirically adds <1% of the code-size wins for the full + // per-strategy TP cost. unsigned initGroupedDensityCost = 0; - if (useBlockInit) + if (useBlockInit && !isMinOpts) { auto initGroupedDensityCompare = [lclSize, lclWeight, lclNeedsInit](unsigned n1, unsigned n2) -> bool { bool init1 = lclNeedsInit[n1], init2 = lclNeedsInit[n2]; From bba5fa393d038d9aa714df364529f40ab89f6502 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Wed, 27 May 2026 19:07:58 -0700 Subject: [PATCH 23/28] JIT: bucket frame layout candidates by alloc pass, sort only the straddling bucket The frame layout heuristic in lvaComputeOptimalFrameLayoutOrder was scoring every candidate strategy over all locals and sorting all locals up front. Most of that work is wasted: when the locals are walked in allocation order, simOff decreases monotonically, so at most one allocation bucket straddles the disp8/-128 boundary. Buckets fully above contribute fixed cost refCnt*1; buckets fully below contribute refCnt*4. Only the straddling bucket's intra- order affects total cost. The function now: * Buckets candidate locals by allocation pass before doing any cost work. 
* Walks the buckets once in allocation order to identify the (single) straddling bucket. If none exists, bails before doing the LIR ref-count walk. * Precomputes baseCost (and, at FullOpts, baseInitLo/baseInitHi) from non- straddling buckets. * Tightens the maxSavings short-circuit gate to use the actual straddling bucket walk (including alignment) instead of an over-approximation across all locals. * Sorts only the straddling-bucket slice per strategy and reuses the cost machinery just on that slice. * Skips the lclNeedsInit / useBlockInit / baseInit setup entirely at MinOpts (S4 is already disabled there). Measured on libraries_tests_no_tiered_compilation.run.windows.x64.Release.mch: TP Overall +0.10% (was +0.28%) MinOpts +1.25% (was +1.47%) FullOpts +0.07% (unchanged) Code size diffs preserved (slightly improved): -469,696/+29,555 bytes (was -459,519/+29,355). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 507 ++++++++++++++++++++++++------------ 1 file changed, 334 insertions(+), 173 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 30df90e50d9800..47054186f2d3b4 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4867,81 +4867,6 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a return nullptr; } - // Pre-compute ref counts and weights into flat arrays for fast comparator access. - // For MinOpts/Tier0, precise ref counts are not available (all lvRefCnt == 0), - // so we do a lightweight LIR walk to count local references. 
- unsigned* lclRefCnt = new (this, CMK_LvaTable) unsigned[lvaCount]; - weight_t* lclWeight = new (this, CMK_LvaTable) weight_t[lvaCount]; - bool isMinOpts = !PreciseRefCountsRequired(); - - if (isMinOpts) - { - memset(lclRefCnt, 0, lvaCount * sizeof(unsigned)); - - for (BasicBlock* const block : Blocks()) - { - for (GenTree* node : LIR::AsRange(block)) - { - if (node->OperIsAnyLocal()) - { - unsigned lclNum = node->AsLclVarCommon()->GetLclNum(); - if (lclNum < lvaCount) - { - lclRefCnt[lclNum]++; - } - } - } - } - - // For MinOpts, weighted = unweighted (no block weights available). - for (unsigned i = 0; i < lvaCount; i++) - { - lclWeight[i] = static_cast(lclRefCnt[i]); - } - } - else - { - for (unsigned i = 0; i < lvaCount; i++) - { - LclVarDsc* varDsc = lvaGetDesc(i); - lclRefCnt[i] = varDsc->lvRefCnt(lvaRefCountState); - lclWeight[i] = varDsc->lvRefCntWtd(lvaRefCountState); - } - } - - // Quick upper bound on potential savings: walk locals in default order, - // accumulate frame size, and count weighted refs that fall beyond disp8 range. - // This approximation ignores alignment padding and allocation passes but is - // cheap to compute and lets us skip methods where reordering cannot help much. - { - int simOff = stkOffs; - unsigned refsInDisp32 = 0; - - for (unsigned i = 0; i < lvaCount; i++) - { - if (lclSize[i] == 0) - { - continue; - } - simOff -= static_cast(lclSize[i]); - if (simOff < -128) - { - refsInDisp32 += lclRefCnt[i]; - } - } - - // Each ref beyond disp8 costs 3 extra bytes (disp32 vs disp8 encoding). - // If even moving ALL those refs into disp8 range wouldn't save much, skip. - unsigned maxSavings = refsInDisp32 * 3; - if (maxSavings <= (unsigned)JitConfig.JitFrameLayoutMaxSavingsThreshold()) - { - JITDUMP("Frame layout optimization: skipping — max possible savings is %u bytes " - "(refsInDisp32=%u)\n", - maxSavings, refsInDisp32); - return nullptr; - } - } - // Pre-compute alignment requirements for each local. 
// 0 = no alignment needed, otherwise the required alignment in bytes. unsigned* lclAlignTo = new (this, CMK_LvaTable) unsigned[lvaCount]; @@ -4967,10 +4892,6 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a } } - JITDUMP("Frame layout optimization: trying strategies for %u locals " - "(estimated frame size %u bytes%s)\n", - lvaCount, estimatedLocalSize, isMinOpts ? ", using lightweight ref counts" : ""); - // Pre-compute which locals will be allocated in the main loop and their // pass category. Category 0 means "not allocatable" (skipped by the loop). unsigned* lclPassCategory = new (this, CMK_LvaTable) unsigned[lvaCount]; @@ -5028,156 +4949,369 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a } } - // Pre-compute which locals will likely need zero-initialization in the prolog. - // This approximates the logic in genCheckUseBlockInit (codegencommon.cpp). - // When block init is used, the JIT zeros a contiguous range [untrLclLo, untrLclHi] - // using SIMD stores. The code size depends on the span and alignment, so layouts - // that scatter init-requiring locals produce larger prologs. - bool* lclNeedsInit = new (this, CMK_LvaTable) bool[lvaCount]; - unsigned initSlotCount = 0; + // Build per-pass buckets in allocOrder order. ALLOC_* values are powers of two; + // their bit indices (0..3) map to positions in allocOrder via passBitToAllocIdx. + const unsigned MAX_PASS_BITS = 4; + unsigned passBitToAllocIdx[MAX_PASS_BITS]; + for (unsigned k = 0; k < MAX_PASS_BITS; k++) + { + passBitToAllocIdx[k] = UINT_MAX; + } + + unsigned allocOrderLen = 0; + for (unsigned p = 0; allocOrder[p] != 0; p++) + { + // The optimization is gated off for compDbgEnC (which merges ALLOC_PTRS into the + // previous pass), so each allocOrder entry here is a single ALLOC_* bit. 
+ unsigned bit = BitOperations::Log2((unsigned)allocOrder[p]); + assert(bit < MAX_PASS_BITS); + assert(((unsigned)allocOrder[p] & ((unsigned)allocOrder[p] - 1)) == 0); + passBitToAllocIdx[bit] = p; + allocOrderLen++; + } + assert(allocOrderLen <= MAX_PASS_BITS); + + unsigned passCount[MAX_PASS_BITS + 1] = {0}; for (unsigned i = 0; i < lvaCount; i++) { - lclNeedsInit[i] = false; if (lclPassCategory[i] == 0) { continue; } + unsigned bit = BitOperations::Log2(lclPassCategory[i]); + unsigned p = passBitToAllocIdx[bit]; + assert(p != UINT_MAX); + passCount[p]++; + } + + unsigned passStart[MAX_PASS_BITS + 1]; + passStart[0] = 0; + for (unsigned p = 0; p < allocOrderLen; p++) + { + passStart[p + 1] = passStart[p] + passCount[p]; + } + const unsigned numAllocatable = passStart[allocOrderLen]; - LclVarDsc* varDsc = lvaGetDesc(i); + if (numAllocatable == 0) + { + return nullptr; + } - if (fgVarIsNeverZeroInitializedInProlog(i)) + // Concatenated bucket array: allocatable locals first (grouped by allocOrder pass, in + // ascending lclNum within each pass), then non-allocatable locals at the tail. + unsigned* bucketLcls = new (this, CMK_LvaTable) unsigned[lvaCount]; + unsigned writePos[MAX_PASS_BITS]; + for (unsigned p = 0; p < allocOrderLen; p++) + { + writePos[p] = passStart[p]; + } + unsigned tailPos = numAllocatable; + for (unsigned i = 0; i < lvaCount; i++) + { + if (lclPassCategory[i] == 0) { + bucketLcls[tailPos++] = i; continue; } - if (lvaIsFieldOfDependentlyPromotedStruct(varDsc)) + unsigned bit = BitOperations::Log2(lclPassCategory[i]); + unsigned p = passBitToAllocIdx[bit]; + bucketLcls[writePos[p]++] = i; + } + + // Walk buckets in their original order to determine where the disp8/disp32 boundary + // (simOff == -128) falls. simOff decreases monotonically across the walk, so at most + // ONE bucket can straddle the boundary. Buckets entirely above -128 contribute fixed + // cost = refCnt * 1; buckets entirely below contribute fixed cost = refCnt * 4. 
Only + // the straddling bucket's internal order affects total cost. + int bucketSimOffStart[MAX_PASS_BITS]; + int bucketSimOffEnd[MAX_PASS_BITS]; + int simOff = stkOffs; + unsigned straddleBucket = UINT_MAX; + for (unsigned p = 0; p < allocOrderLen; p++) + { + bucketSimOffStart[p] = simOff; + for (unsigned k = passStart[p]; k < passStart[p + 1]; k++) { - continue; + unsigned lcl = bucketLcls[k]; + int signedAlign = static_cast(lclAlignTo[lcl]); + if ((signedAlign != 0) && ((simOff % signedAlign) != 0)) + { + simOff -= signedAlign + (simOff % signedAlign); + } + simOff -= static_cast(lclSize[lcl]); } - if (varDsc->lvHasExplicitInit) + bucketSimOffEnd[p] = simOff; + + if ((bucketSimOffStart[p] > -128) && (bucketSimOffEnd[p] <= -128)) { - continue; + straddleBucket = p; } - if ((varDsc->lvIsTemp) && !varDsc->HasGCPtr()) + } + + if (straddleBucket == UINT_MAX) + { + // The frame either fits entirely in disp8 (nothing to optimize) or every + // allocated bucket already starts past disp8 (reordering within a bucket + // can't pull refs into disp8 range). Bail. + JITDUMP("Frame layout optimization: skipping — no straddling bucket " + "(simOff at end = %d)\n", + simOff); + return nullptr; + } + + // Pre-compute ref counts and weights. For MinOpts/Tier0, precise ref counts are not + // available (all lvRefCnt == 0), so we do a lightweight LIR walk to count local refs. 
+ unsigned* lclRefCnt = new (this, CMK_LvaTable) unsigned[lvaCount]; + weight_t* lclWeight = new (this, CMK_LvaTable) weight_t[lvaCount]; + bool isMinOpts = !PreciseRefCountsRequired(); + + if (isMinOpts) + { + memset(lclRefCnt, 0, lvaCount * sizeof(unsigned)); + + for (BasicBlock* const block : Blocks()) { - continue; + for (GenTree* node : LIR::AsRange(block)) + { + if (node->OperIsAnyLocal()) + { + unsigned lclNum = node->AsLclVarCommon()->GetLclNum(); + if (lclNum < lvaCount) + { + lclRefCnt[lclNum]++; + } + } + } } - if ((info.compInitMem) || varDsc->HasGCPtr() || (varDsc->lvMustInit)) + // For MinOpts, weighted = unweighted (no block weights available). + for (unsigned i = 0; i < lvaCount; i++) + { + lclWeight[i] = static_cast(lclRefCnt[i]); + } + } + else + { + for (unsigned i = 0; i < lvaCount; i++) { - lclNeedsInit[i] = true; - initSlotCount += (lclSize[i] + sizeof(int) - 1) / sizeof(int); + LclVarDsc* varDsc = lvaGetDesc(i); + lclRefCnt[i] = varDsc->lvRefCnt(lvaRefCountState); + lclWeight[i] = varDsc->lvRefCntWtd(lvaRefCountState); } } - // On AMD64, block init is used when initSlotCount > 4; on x86 when > 4. - // Block init zeros a contiguous range, so the code size depends on span. - // Individual init zeros each local separately, cost is independent of layout. - bool useBlockInit = (initSlotCount > 4); + // Precise upper bound on savings: walk the straddling bucket in its original order, + // count refs that land past disp8. Each such ref could save at most 3 bytes by being + // moved into disp8 range. Bail if even moving them all wouldn't help much. 
+ unsigned maxSavings; + { + int so = bucketSimOffStart[straddleBucket]; + unsigned refsInDisp32 = 0; + for (unsigned k = passStart[straddleBucket]; k < passStart[straddleBucket + 1]; k++) + { + unsigned lcl = bucketLcls[k]; + int signedAlign = static_cast(lclAlignTo[lcl]); + if ((signedAlign != 0) && ((so % signedAlign) != 0)) + { + so -= signedAlign + (so % signedAlign); + } + so -= static_cast(lclSize[lcl]); + if (so < -128) + { + refsInDisp32 += lclRefCnt[lcl]; + } + } + maxSavings = refsInDisp32 * 3; + if (maxSavings <= (unsigned)JitConfig.JitFrameLayoutMaxSavingsThreshold()) + { + JITDUMP("Frame layout optimization: skipping — max possible savings is %u bytes " + "(refsInDisp32=%u)\n", + maxSavings, refsInDisp32); + return nullptr; + } + } - // Simulate frame layout for a given sort order and return total encoding cost. - // Lower cost = better layout. Cost = Σ(refCnt × encodingBytes) where - // encodingBytes is 1 for disp8 (offset in [-128,+127]) or 4 for disp32. - // When block init is used, we also add a zero-init cost proportional to the - // span of init-requiring locals (larger span = more SIMD stores in the prolog). - auto estimateLayoutCost = [&](unsigned* order) -> unsigned { - unsigned totalCost = 0; - int simOff = stkOffs; - int initLo = 0; - int initHi = 0; + // Compute baseline cost from non-straddling buckets. These contributions are + // independent of any intra-bucket sort order, so they cancel out when comparing + // strategies. We still include them in totalCost for accurate JITDUMP output. + unsigned baseCost = 0; + for (unsigned p = 0; p < allocOrderLen; p++) + { + if (p == straddleBucket) + { + continue; + } + unsigned encoding = (bucketSimOffStart[p] <= -128) ? 4u : 1u; + for (unsigned k = passStart[p]; k < passStart[p + 1]; k++) + { + baseCost += lclRefCnt[bucketLcls[k]] * encoding; + } + } - for (int p = 0; allocOrder[p]; p++) + // Pre-compute zero-init data for the cost model. 
Only used at FullOpts where S4 + // (initGroupedDensity) is active. At MinOpts S4 is skipped and the init-span term + // is small relative to the encoding-cost term, so we omit it. + bool* lclNeedsInit = nullptr; + bool useBlockInit = false; + int baseInitLo = 0; + int baseInitHi = 0; + if (!isMinOpts) + { + lclNeedsInit = new (this, CMK_LvaTable) bool[lvaCount]; + unsigned initSlotCount = 0; + for (unsigned i = 0; i < lvaCount; i++) { - UINT pass = allocOrder[p]; - for (unsigned idx = 0; idx < lvaCount; idx++) + lclNeedsInit[i] = false; + if (lclPassCategory[i] == 0) { - unsigned lcl = order[idx]; - if (lclPassCategory[lcl] != pass) - { - continue; - } + continue; + } - unsigned size = lclSize[lcl]; - unsigned alignTo = lclAlignTo[lcl]; - - // Simulate alignment padding (mirrors lvaAllocLocalAndSetVirtualOffset). - // Use signed arithmetic throughout: simOff is negative, and the remainder - // (simOff % alignment) is non-positive, so pad = alignment + remainder - // yields a small positive value in [1, alignment-1]. Mixing unsigned alignTo - // with the signed remainder would convert the negative remainder to a huge - // unsigned value and corrupt simOff. - int signedAlign = static_cast(alignTo); - if ((signedAlign != 0) && ((simOff % signedAlign) != 0)) - { - simOff -= signedAlign + (simOff % signedAlign); - } + LclVarDsc* varDsc = lvaGetDesc(i); - simOff -= static_cast(size); + if (fgVarIsNeverZeroInitializedInProlog(i)) + { + continue; + } + if (lvaIsFieldOfDependentlyPromotedStruct(varDsc)) + { + continue; + } + if (varDsc->lvHasExplicitInit) + { + continue; + } + if ((varDsc->lvIsTemp) && !varDsc->HasGCPtr()) + { + continue; + } - totalCost += lclRefCnt[lcl] * ((simOff >= -128) ? 1u : 4u); + if ((info.compInitMem) || varDsc->HasGCPtr() || (varDsc->lvMustInit)) + { + lclNeedsInit[i] = true; + initSlotCount += (lclSize[i] + sizeof(int) - 1) / sizeof(int); + } + } + useBlockInit = (initSlotCount > 4); - // Track the zero-init span for block-init cost estimation. 
- if (useBlockInit && (lclNeedsInit[lcl])) + // If block init applies, precompute the init-span contribution from non-straddling + // buckets. The straddling bucket's contribution is folded in by walkStraddle. + if (useBlockInit) + { + int so = stkOffs; + for (unsigned p = 0; p < allocOrderLen; p++) + { + if (p == straddleBucket) + { + so = bucketSimOffEnd[p]; + continue; + } + for (unsigned k = passStart[p]; k < passStart[p + 1]; k++) { - int loOffs = simOff; - int hiOffs = simOff + static_cast(size); - if ((initLo == 0) && (initHi == 0)) + unsigned lcl = bucketLcls[k]; + int signedAlign = static_cast(lclAlignTo[lcl]); + if ((signedAlign != 0) && ((so % signedAlign) != 0)) { - initLo = loOffs; - initHi = hiOffs; + so -= signedAlign + (so % signedAlign); } - else + so -= static_cast(lclSize[lcl]); + if (lclNeedsInit[lcl]) { - initLo = min(initLo, loOffs); - initHi = max(initHi, hiOffs); + int loOffs = so; + int hiOffs = so + static_cast(lclSize[lcl]); + if ((baseInitLo == 0) && (baseInitHi == 0)) + { + baseInitLo = loOffs; + baseInitHi = hiOffs; + } + else + { + baseInitLo = min(baseInitLo, loOffs); + baseInitHi = max(baseInitHi, hiOffs); + } } } } } + } + + JITDUMP("Frame layout optimization: trying strategies for %u locals " + "(estimated frame size %u bytes%s, straddle bucket=%u of %u, baseCost=%u, " + "maxSavings=%u)\n", + lvaCount, estimatedLocalSize, isMinOpts ? ", using lightweight ref counts" : "", + straddleBucket, allocOrderLen, baseCost, maxSavings); + + const unsigned straddleStart = passStart[straddleBucket]; + const unsigned straddleCount = passStart[straddleBucket + 1] - straddleStart; + const int straddleSimOffEntry = bucketSimOffStart[straddleBucket]; + + // Compute the straddling bucket's cost contribution given a particular intra-bucket order. + // Folds in the init-span penalty when block init is in use. 
+ auto walkStraddle = [&](unsigned* order) -> unsigned { + unsigned cost = 0; + int so = straddleSimOffEntry; + int initLo = baseInitLo; + int initHi = baseInitHi; + for (unsigned k = 0; k < straddleCount; k++) + { + unsigned lcl = order[k]; + int signedAlign = static_cast(lclAlignTo[lcl]); + if ((signedAlign != 0) && ((so % signedAlign) != 0)) + { + so -= signedAlign + (so % signedAlign); + } + so -= static_cast(lclSize[lcl]); + cost += lclRefCnt[lcl] * ((so >= -128) ? 1u : 4u); + + if (useBlockInit && (lclNeedsInit[lcl])) + { + int loOffs = so; + int hiOffs = so + static_cast(lclSize[lcl]); + if ((initLo == 0) && (initHi == 0)) + { + initLo = loOffs; + initHi = hiOffs; + } + else + { + initLo = min(initLo, loOffs); + initHi = max(initHi, hiOffs); + } + } + } - // Add zero-init prolog cost when block init will be used. - // The JIT zeros the contiguous range [initLo, initHi) using SIMD stores. - // Each 16-byte chunk requires one SIMD store instruction. We add a - // small penalty per chunk to favor layouts that keep the init span tight, - // without overwhelming the main encoding cost. if (useBlockInit && (initHi > initLo)) { unsigned initSpan = static_cast(initHi - initLo); unsigned initCost = ((initSpan + 15) / 16) * 2; - totalCost += initCost; + cost += initCost; } - return totalCost; + return cost; }; - unsigned* sortOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; - unsigned* bestOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; - for (unsigned i = 0; i < lvaCount; i++) - { - sortOrder[i] = i; - } + unsigned* straddleOrder = new (this, CMK_LvaTable) unsigned[straddleCount]; + unsigned* bestStraddleOrder = new (this, CMK_LvaTable) unsigned[straddleCount]; + memcpy(straddleOrder, &bucketLcls[straddleStart], straddleCount * sizeof(unsigned)); + memcpy(bestStraddleOrder, straddleOrder, straddleCount * sizeof(unsigned)); // Score the original (unsorted) order as baseline. 
- unsigned origCost = estimateLayoutCost(sortOrder); + unsigned origCost = baseCost + walkStraddle(straddleOrder); unsigned bestCost = origCost; - int bestStrategy = -1; // -1 = original order + int bestStrategy = -1; const char* bestName = "original"; - // Helper to try a strategy: sort sortOrder, estimate cost, track if best. - // When a strategy improves on the current best, we save its permutation - // into bestOrder to avoid a redundant re-sort at the end. + // Helper to try a strategy: sort straddleOrder, score, track if best. auto tryStrategy = [&](int strategyIdx, const char* name, auto comparator) -> unsigned { - for (unsigned i = 0; i < lvaCount; i++) - { - sortOrder[i] = i; - } - jitstd::sort(sortOrder, sortOrder + lvaCount, comparator); - unsigned cost = estimateLayoutCost(sortOrder); + memcpy(straddleOrder, &bucketLcls[straddleStart], straddleCount * sizeof(unsigned)); + jitstd::sort(straddleOrder, straddleOrder + straddleCount, comparator); + unsigned cost = baseCost + walkStraddle(straddleOrder); if (cost < bestCost) { bestCost = cost; bestStrategy = strategyIdx; bestName = name; - memcpy(bestOrder, sortOrder, lvaCount * sizeof(unsigned)); + memcpy(bestStraddleOrder, straddleOrder, straddleCount * sizeof(unsigned)); } return cost; }; @@ -5292,10 +5426,37 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a initGroupedDensityCost = tryStrategy(4, "initGroupedDensity", initGroupedDensityCompare); } - // Return the winning permutation (saved in bestOrder), or nullptr if original won. 
if (bestStrategy < 0) { - bestOrder = nullptr; + JITDUMP("Frame layout costs: original=%u density=%u sizeAsc=%u weight=%u refDensity=%u " + "initGroupedDensity=%u; original order is best, no change\n", + origCost, densityCost, sizeAscCost, weightCost, refDensityCost, initGroupedDensityCost); + return nullptr; + } + + // Assemble the final permutation: each bucket in its original order, EXCEPT the + // straddling bucket, which uses the best strategy's intra-bucket sort; followed by + // non-allocatable locals (the caller filters those out anyway). + unsigned* bestOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; + unsigned outIdx = 0; + for (unsigned p = 0; p < allocOrderLen; p++) + { + if (p == straddleBucket) + { + memcpy(&bestOrder[outIdx], bestStraddleOrder, straddleCount * sizeof(unsigned)); + outIdx += straddleCount; + } + else + { + unsigned bucketSize = passStart[p + 1] - passStart[p]; + memcpy(&bestOrder[outIdx], &bucketLcls[passStart[p]], bucketSize * sizeof(unsigned)); + outIdx += bucketSize; + } + } + if (outIdx < lvaCount) + { + memcpy(&bestOrder[outIdx], &bucketLcls[numAllocatable], + (lvaCount - numAllocatable) * sizeof(unsigned)); } JITDUMP("Frame layout costs: original=%u density=%u sizeAsc=%u weight=%u refDensity=%u " From 32b6b75e81d909c02fc55c50366aee70732d132a Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 28 May 2026 06:53:15 -0700 Subject: [PATCH 24/28] JIT: tighten straddle-bucket end check, run jit-format Review feedback: when bucketSimOffEnd[p] == -128 the last local in the bucket sits exactly at -128 (still disp8) and the bucket is fully disp8, not straddling. Tighten the end-side check from <= -128 to < -128 so we don't run the straddler sort over a bucket whose internal order can't change cost. Also apply jit-format whitespace adjustments. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 47054186f2d3b4..d6b714c6786951 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5042,7 +5042,12 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a } bucketSimOffEnd[p] = simOff; - if ((bucketSimOffStart[p] > -128) && (bucketSimOffEnd[p] <= -128)) + // A bucket straddles the disp8/disp32 boundary when its first local can land in disp8 + // (simOff at entry strictly above -128, so at least 1 byte of disp8 budget remains) and + // its last local lands in disp32 (simOff after allocation strictly below -128). + // simOffEnd == -128 means the last local sits exactly at -128 (still disp8), so the + // bucket is fully disp8 and not a straddler. + if ((bucketSimOffStart[p] > -128) && (bucketSimOffEnd[p] < -128)) { straddleBucket = p; } @@ -5151,13 +5156,13 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a // Pre-compute zero-init data for the cost model. Only used at FullOpts where S4 // (initGroupedDensity) is active. At MinOpts S4 is skipped and the init-span term // is small relative to the encoding-cost term, so we omit it. 
- bool* lclNeedsInit = nullptr; - bool useBlockInit = false; - int baseInitLo = 0; - int baseInitHi = 0; + bool* lclNeedsInit = nullptr; + bool useBlockInit = false; + int baseInitLo = 0; + int baseInitHi = 0; if (!isMinOpts) { - lclNeedsInit = new (this, CMK_LvaTable) bool[lvaCount]; + lclNeedsInit = new (this, CMK_LvaTable) bool[lvaCount]; unsigned initSlotCount = 0; for (unsigned i = 0; i < lvaCount; i++) { @@ -5238,8 +5243,8 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a JITDUMP("Frame layout optimization: trying strategies for %u locals " "(estimated frame size %u bytes%s, straddle bucket=%u of %u, baseCost=%u, " "maxSavings=%u)\n", - lvaCount, estimatedLocalSize, isMinOpts ? ", using lightweight ref counts" : "", - straddleBucket, allocOrderLen, baseCost, maxSavings); + lvaCount, estimatedLocalSize, isMinOpts ? ", using lightweight ref counts" : "", straddleBucket, + allocOrderLen, baseCost, maxSavings); const unsigned straddleStart = passStart[straddleBucket]; const unsigned straddleCount = passStart[straddleBucket + 1] - straddleStart; @@ -5455,8 +5460,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a } if (outIdx < lvaCount) { - memcpy(&bestOrder[outIdx], &bucketLcls[numAllocatable], - (lvaCount - numAllocatable) * sizeof(unsigned)); + memcpy(&bestOrder[outIdx], &bucketLcls[numAllocatable], (lvaCount - numAllocatable) * sizeof(unsigned)); } JITDUMP("Frame layout costs: original=%u density=%u sizeAsc=%u weight=%u refDensity=%u " From 2ad09144ccb368b66a179d6b78a4ad925925fae9 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 28 May 2026 07:36:27 -0700 Subject: [PATCH 25/28] JIT: address PR review feedback for frame layout heuristic * Mirror skip cases from lvaAssignVirtualFrameOffsetsToLocals more fully: also exclude lvaIsUnknownSizeLocal (handled by lvaAllocUnknownSizeLocal), lvaAsyncThreadObjectVar, and lvaLocAllocSPvar (JIT32_GCENCODER). 
Without these the simulated simOff walk could diverge from the real layout for methods using those locals. * Replace the (lo == 0 && hi == 0) 'empty init span' sentinel with an explicit hasInit flag in both the non-straddling-bucket precompute and walkStraddle. Offset 0 is a legitimate hiOffs value (e.g. first init-needing local of size s placed at so == -s), so the prior sentinel could spuriously reset the recorded init span and produce an inaccurate FullOpts cost estimate. * Document the alignment-modeling approximation: lclAlignTo does not model the x86-only DOUBLE_ALIGN / mustDoubleAlign / have_LclVarDoubleAlign rules, so the simulated simOff can drift by a pointer-sized slot on x86 double-aligned frames. The real allocator still runs unchanged. Code size on libraries_tests_no_tiered_compilation.run.windows.x64.Release: -469,888 / +29,504 bytes (was -469,696 / +29,555). TP unchanged within noise (MinOpts +1.26%, FullOpts +0.07%, Overall +0.10%). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 54 ++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index d6b714c6786951..6ce64bbab16496 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4869,6 +4869,18 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a // Pre-compute alignment requirements for each local. // 0 = no alignment needed, otherwise the required alignment in bytes. + // + // NOTE: This is an approximation of the alignment behavior in + // lvaAssignVirtualFrameOffsetsToLocals + lvaAllocLocalAndSetVirtualOffset. + // In particular it does NOT model the x86-only DOUBLE_ALIGN / + // mustDoubleAlign / lvaIncrementFrameSize path for TYP_DOUBLE / TYP_LONG / + // lvStructDoubleAlign, nor the cross-bucket have_LclVarDoubleAlign + // pre-reservation.
Frames where those apply may see the simulated simOff + // drift from the real layout by a pointer-sized slot, causing the + // straddler boundary and per-strategy cost estimate to be slightly + // imprecise. This is a heuristic, not a correctness path: the real + // allocator still runs unchanged and only the chosen sort permutation + // is affected. unsigned* lclAlignTo = new (this, CMK_LvaTable) unsigned[lvaCount]; for (unsigned i = 0; i < lvaCount; i++) { @@ -4922,11 +4934,25 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a { continue; } + if (lvaIsUnknownSizeLocal(i)) + { + // The real loop calls lvaAllocUnknownSizeLocal for these; their stack home + // size is not modeled by lvaLclStackHomeSize so simulating them would skew + // the simOff walk. + continue; + } if (i == lvaRetAddrVar) { continue; } - if ((i == lvaMonAcquired) || (i == lvaAsyncExecutionContextVar) || (i == lvaAsyncSynchronizationContextVar)) +#ifdef JIT32_GCENCODER + if (i == lvaLocAllocSPvar) + { + continue; + } +#endif + if ((i == lvaMonAcquired) || (i == lvaAsyncThreadObjectVar) || (i == lvaAsyncExecutionContextVar) || + (i == lvaAsyncSynchronizationContextVar)) { continue; } @@ -5160,6 +5186,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a bool useBlockInit = false; int baseInitLo = 0; int baseInitHi = 0; + bool baseHasInit = false; if (!isMinOpts) { lclNeedsInit = new (this, CMK_LvaTable) bool[lvaCount]; @@ -5224,10 +5251,11 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a { int loOffs = so; int hiOffs = so + static_cast(lclSize[lcl]); - if ((baseInitLo == 0) && (baseInitHi == 0)) + if (!baseHasInit) { - baseInitLo = loOffs; - baseInitHi = hiOffs; + baseInitLo = loOffs; + baseInitHi = hiOffs; + baseHasInit = true; } else { @@ -5253,10 +5281,11 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a // Compute the straddling bucket's cost contribution given 
a particular intra-bucket order. // Folds in the init-span penalty when block init is in use. auto walkStraddle = [&](unsigned* order) -> unsigned { - unsigned cost = 0; - int so = straddleSimOffEntry; - int initLo = baseInitLo; - int initHi = baseInitHi; + unsigned cost = 0; + int so = straddleSimOffEntry; + int initLo = baseInitLo; + int initHi = baseInitHi; + bool hasInit = baseHasInit; for (unsigned k = 0; k < straddleCount; k++) { unsigned lcl = order[k]; @@ -5272,10 +5301,11 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a { int loOffs = so; int hiOffs = so + static_cast(lclSize[lcl]); - if ((initLo == 0) && (initHi == 0)) + if (!hasInit) { - initLo = loOffs; - initHi = hiOffs; + initLo = loOffs; + initHi = hiOffs; + hasInit = true; } else { @@ -5285,7 +5315,7 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a } } - if (useBlockInit && (initHi > initLo)) + if (useBlockInit && hasInit && (initHi > initLo)) { unsigned initSpan = static_cast(initHi - initLo); unsigned initCost = ((initSpan + 15) / 16) * 2; From aef42a711872e15676ae7044c0493ff66aa09eda Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 28 May 2026 16:57:03 -0700 Subject: [PATCH 26/28] JIT: extend frame layout sort to all pre-straddler buckets The bucket-and-straddle layout previously sorted only the straddling bucket's locals. Intra-bucket ordering of the non-straddling buckets that precede the straddler still affects alignment padding, which shifts the straddler's entry offset and can let more refs fit in disp8. Extend the cost search to also re-order locals within each pre-straddler bucket (preserving bucket boundaries), using the same comparator as the straddler. Post-straddler buckets remain in canonical order; they are fully disp32 and their cost is invariant under reordering. Recovers ~49% of the code-size savings lost relative to the pre-rewrite layout, with no measurable throughput cost. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 128 ++++++++++++++++++++++++++---------- 1 file changed, 95 insertions(+), 33 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 6ce64bbab16496..8257f40f417dc1 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5226,18 +5226,16 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a } useBlockInit = (initSlotCount > 4); - // If block init applies, precompute the init-span contribution from non-straddling - // buckets. The straddling bucket's contribution is folded in by walkStraddle. + // If block init applies, precompute the init-span contribution from POST-straddler + // buckets only. Pre-straddler buckets contribute order-dependent simOffs (since each + // strategy resorts them), so their init-span contribution is folded in per-strategy + // by walkLayout. The straddler's contribution is also folded in by walkLayout. + // Post-straddler buckets use canonical simOffs computed from bucketSimOffEnd[straddleBucket]. if (useBlockInit) { - int so = stkOffs; - for (unsigned p = 0; p < allocOrderLen; p++) + int so = bucketSimOffEnd[straddleBucket]; + for (unsigned p = straddleBucket + 1; p < allocOrderLen; p++) { - if (p == straddleBucket) - { - so = bucketSimOffEnd[p]; - continue; - } for (unsigned k = passStart[p]; k < passStart[p + 1]; k++) { unsigned lcl = bucketLcls[k]; @@ -5274,21 +5272,64 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a lvaCount, estimatedLocalSize, isMinOpts ? 
", using lightweight ref counts" : "", straddleBucket, allocOrderLen, baseCost, maxSavings); - const unsigned straddleStart = passStart[straddleBucket]; - const unsigned straddleCount = passStart[straddleBucket + 1] - straddleStart; - const int straddleSimOffEntry = bucketSimOffStart[straddleBucket]; + const unsigned straddleStart = passStart[straddleBucket]; + const unsigned straddleCount = passStart[straddleBucket + 1] - straddleStart; + const unsigned preStraddleCount = straddleStart; + + // Pre-straddler buckets (when present) participate in each strategy's sort. + // Reordering inside a pre-straddler bucket can change alignment padding, + // which shifts the straddler's entry simOff and can pull more refs into disp8. + unsigned* preStraddleOrder = (preStraddleCount > 0) ? new (this, CMK_LvaTable) unsigned[preStraddleCount] : nullptr; + unsigned* bestPreStraddleOrder = + (preStraddleCount > 0) ? new (this, CMK_LvaTable) unsigned[preStraddleCount] : nullptr; + unsigned* straddleOrder = new (this, CMK_LvaTable) unsigned[straddleCount]; + unsigned* bestStraddleOrder = new (this, CMK_LvaTable) unsigned[straddleCount]; - // Compute the straddling bucket's cost contribution given a particular intra-bucket order. - // Folds in the init-span penalty when block init is in use. - auto walkStraddle = [&](unsigned* order) -> unsigned { + // Walk the layout from the start of frame through the end of the straddler, + // using the given pre-straddler and straddler orders. Returns the variable + // part of the cost: the straddler's encoding cost plus the init-span penalty. + // (Non-straddler bucket encoding costs are invariant w.r.t. order and live in baseCost.) 
+ auto walkLayout = [&](unsigned* preOrder, unsigned* strOrder) -> unsigned { unsigned cost = 0; - int so = straddleSimOffEntry; + int so = stkOffs; int initLo = baseInitLo; int initHi = baseInitHi; bool hasInit = baseHasInit; + + // Pre-straddler buckets: walk for alignment padding (which shifts the straddler + // entry simOff) and for init-span contribution. Encoding cost is invariant. + for (unsigned k = 0; k < preStraddleCount; k++) + { + unsigned lcl = preOrder[k]; + int signedAlign = static_cast(lclAlignTo[lcl]); + if ((signedAlign != 0) && ((so % signedAlign) != 0)) + { + so -= signedAlign + (so % signedAlign); + } + so -= static_cast(lclSize[lcl]); + + if (useBlockInit && (lclNeedsInit[lcl])) + { + int loOffs = so; + int hiOffs = so + static_cast(lclSize[lcl]); + if (!hasInit) + { + initLo = loOffs; + initHi = hiOffs; + hasInit = true; + } + else + { + initLo = min(initLo, loOffs); + initHi = max(initHi, hiOffs); + } + } + } + + // Straddler bucket: encoding cost varies with order; init span continues to accumulate. for (unsigned k = 0; k < straddleCount; k++) { - unsigned lcl = order[k]; + unsigned lcl = strOrder[k]; int signedAlign = static_cast(lclAlignTo[lcl]); if ((signedAlign != 0) && ((so % signedAlign) != 0)) { @@ -5325,27 +5366,42 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a return cost; }; - unsigned* straddleOrder = new (this, CMK_LvaTable) unsigned[straddleCount]; - unsigned* bestStraddleOrder = new (this, CMK_LvaTable) unsigned[straddleCount]; + // Score the original (unsorted) order as baseline. + if (preStraddleCount > 0) + { + memcpy(preStraddleOrder, bucketLcls, preStraddleCount * sizeof(unsigned)); + memcpy(bestPreStraddleOrder, preStraddleOrder, preStraddleCount * sizeof(unsigned)); + } memcpy(straddleOrder, &bucketLcls[straddleStart], straddleCount * sizeof(unsigned)); memcpy(bestStraddleOrder, straddleOrder, straddleCount * sizeof(unsigned)); - - // Score the original (unsorted) order as baseline. 
- unsigned origCost = baseCost + walkStraddle(straddleOrder); + unsigned origCost = baseCost + walkLayout(preStraddleOrder, straddleOrder); unsigned bestCost = origCost; int bestStrategy = -1; const char* bestName = "original"; - // Helper to try a strategy: sort straddleOrder, score, track if best. + // Helper to try a strategy: sort each pre-straddler bucket and the straddler + // independently with the comparator (bucket boundaries are preserved), then + // score with walkLayout and track if best. auto tryStrategy = [&](int strategyIdx, const char* name, auto comparator) -> unsigned { + for (unsigned p = 0; p < straddleBucket; p++) + { + unsigned bStart = passStart[p]; + unsigned bEnd = passStart[p + 1]; + memcpy(&preStraddleOrder[bStart], &bucketLcls[bStart], (bEnd - bStart) * sizeof(unsigned)); + jitstd::sort(&preStraddleOrder[bStart], &preStraddleOrder[bEnd], comparator); + } memcpy(straddleOrder, &bucketLcls[straddleStart], straddleCount * sizeof(unsigned)); jitstd::sort(straddleOrder, straddleOrder + straddleCount, comparator); - unsigned cost = baseCost + walkStraddle(straddleOrder); + unsigned cost = baseCost + walkLayout(preStraddleOrder, straddleOrder); if (cost < bestCost) { bestCost = cost; bestStrategy = strategyIdx; bestName = name; + if (preStraddleCount > 0) + { + memcpy(bestPreStraddleOrder, preStraddleOrder, preStraddleCount * sizeof(unsigned)); + } memcpy(bestStraddleOrder, straddleOrder, straddleCount * sizeof(unsigned)); } return cost; @@ -5469,24 +5525,30 @@ unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* a return nullptr; } - // Assemble the final permutation: each bucket in its original order, EXCEPT the - // straddling bucket, which uses the best strategy's intra-bucket sort; followed by - // non-allocatable locals (the caller filters those out anyway). + // Assemble the final permutation: + // - Pre-straddler buckets use the best strategy's intra-bucket sort. 
+ // - The straddler bucket uses the best strategy's intra-bucket sort. + // - Post-straddler buckets stay in canonical order. + // - Non-allocatable locals tail (the caller filters those out anyway). unsigned* bestOrder = new (this, CMK_LvaTable) unsigned[lvaCount]; unsigned outIdx = 0; for (unsigned p = 0; p < allocOrderLen; p++) { - if (p == straddleBucket) + unsigned bStart = passStart[p]; + unsigned bucketSize = passStart[p + 1] - bStart; + if (p < straddleBucket) + { + memcpy(&bestOrder[outIdx], &bestPreStraddleOrder[bStart], bucketSize * sizeof(unsigned)); + } + else if (p == straddleBucket) { - memcpy(&bestOrder[outIdx], bestStraddleOrder, straddleCount * sizeof(unsigned)); - outIdx += straddleCount; + memcpy(&bestOrder[outIdx], bestStraddleOrder, bucketSize * sizeof(unsigned)); } else { - unsigned bucketSize = passStart[p + 1] - passStart[p]; - memcpy(&bestOrder[outIdx], &bucketLcls[passStart[p]], bucketSize * sizeof(unsigned)); - outIdx += bucketSize; + memcpy(&bestOrder[outIdx], &bucketLcls[bStart], bucketSize * sizeof(unsigned)); } + outIdx += bucketSize; } if (outIdx < lvaCount) { From 149dc418038bc70b55cf42c5d568fdb139c72930 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 28 May 2026 18:05:14 -0700 Subject: [PATCH 27/28] JIT: lower JitFrameLayoutMaxSavingsThreshold default from 12 to 0 The threshold pruned strategies that could save at most N bytes of encoding. With the bucket-and-straddle search, the underlying sort is cheap enough that the prune buys almost no throughput while it gives up real code-size opportunities (~68K bytes across linux-x64 SPMI collections). Drop the bound to 0 so the search is attempted whenever any saving is possible. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/jitconfigvalues.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 75df1430b9e6f7..1dea92af8fb938 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -697,7 +697,7 @@ RELEASE_CONFIG_INTEGER(JitExtDefaultPolicyProfScale, "JitExtDefaultPolicyProfSca RELEASE_CONFIG_INTEGER(JitInlinePolicyModel, "JitInlinePolicyModel", 0) RELEASE_CONFIG_INTEGER(JitInlinePolicyProfile, "JitInlinePolicyProfile", 0) RELEASE_CONFIG_INTEGER(JitInlinePolicyProfileThreshold, "JitInlinePolicyProfileThreshold", 40) -RELEASE_CONFIG_INTEGER(JitFrameLayoutMaxSavingsThreshold, "JitFrameLayoutMaxSavingsThreshold", 12) +RELEASE_CONFIG_INTEGER(JitFrameLayoutMaxSavingsThreshold, "JitFrameLayoutMaxSavingsThreshold", 0) CONFIG_STRING(JitObjectStackAllocationRange, "JitObjectStackAllocationRange") RELEASE_CONFIG_INTEGER(JitObjectStackAllocation, "JitObjectStackAllocation", 1) RELEASE_CONFIG_INTEGER(JitObjectStackAllocationRefClass, "JitObjectStackAllocationRefClass", 1) From 5550568a086b479d7fd99c9b08a941ff6bf7effd Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 28 May 2026 19:38:16 -0700 Subject: [PATCH 28/28] JIT: bail from lvaComputeOptimalFrameLayoutOrder when lvaCount == 0 Methods with no locals (e.g. small leaf wrappers around helper calls) were tripping the arenaAllocator 'size != 0' assert via the leading 'new unsigned[lvaCount]' allocation. Bail out at the function entry when lvaCount is zero so we never make a zero-sized arena allocation. This fixes a checked-build crossgen2 CoreLib failure on x86 surfaced by 'Build linux-x86 checked CoreCLR' and the Windows x86 checked legs. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lclvars.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 8257f40f417dc1..b408c2544549a9 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4848,6 +4848,13 @@ enum LclAllocCategory : UINT // unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* allocOrder) { + // No locals at all -- nothing to lay out, and we mustn't make zero-sized arena + // allocations below. + if (lvaCount == 0) + { + return nullptr; + } + // Pre-compute local sizes and total estimated frame size in one pass. // These arrays are indexed by lclNum and used throughout to avoid repeated // function calls in sort comparators and cost estimation.