diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index e3ed2ceb179d5a..597bf3d02d00a0 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -4351,6 +4351,9 @@ class Compiler
     void lvaAssignVirtualFrameOffsetsToArgs();
     bool lvaGetRelativeOffsetToCallerAllocatedSpaceForParameter(unsigned lclNum, int* offset);
     void lvaAssignVirtualFrameOffsetsToLocals();
+#ifdef TARGET_XARCH
+    unsigned* lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* allocOrder);
+#endif
     bool lvaParamHasLocalStackSpace(unsigned lclNum);
     int lvaAllocLocalAndSetVirtualOffset(unsigned lclNum, unsigned size, int stkOffs);
     int lvaAllocAsyncContexts(int stkOffs);
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 6e36a55a67ae02..1dea92af8fb938 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -697,6 +697,7 @@ RELEASE_CONFIG_INTEGER(JitExtDefaultPolicyProfScale, "JitExtDefaultPolicyProfSca
 RELEASE_CONFIG_INTEGER(JitInlinePolicyModel, "JitInlinePolicyModel", 0)
 RELEASE_CONFIG_INTEGER(JitInlinePolicyProfile, "JitInlinePolicyProfile", 0)
 RELEASE_CONFIG_INTEGER(JitInlinePolicyProfileThreshold, "JitInlinePolicyProfileThreshold", 40)
+RELEASE_CONFIG_INTEGER(JitFrameLayoutMaxSavingsThreshold, "JitFrameLayoutMaxSavingsThreshold", 0)
 CONFIG_STRING(JitObjectStackAllocationRange, "JitObjectStackAllocationRange")
 RELEASE_CONFIG_INTEGER(JitObjectStackAllocation, "JitObjectStackAllocation", 1)
 RELEASE_CONFIG_INTEGER(JitObjectStackAllocationRefClass, "JitObjectStackAllocationRefClass", 1)
diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index fc4df0ff6b87d0..b408c2544549a9 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -4801,6 +4801,777 @@ bool Compiler::lvaGetRelativeOffsetToCallerAllocatedSpaceForParameter(unsigned l
     return false;
 }
 
+// Allocation pass categories used by lvaAssignVirtualFrameOffsetsToLocals
+// and lvaComputeOptimalFrameLayoutOrder to classify locals by type.
+enum LclAllocCategory : UINT
+{
+    ALLOC_NON_PTRS                 = 0x1, // assign offsets to non-ptr
+    ALLOC_PTRS                     = 0x2, // Second pass, assign offsets to tracked ptrs
+    ALLOC_UNSAFE_BUFFERS           = 0x4,
+    ALLOC_UNSAFE_BUFFERS_WITH_PTRS = 0x8
+};
+
+#ifdef TARGET_XARCH
+//------------------------------------------------------------------------
+// lvaComputeOptimalFrameLayoutOrder: try multiple sort orders for locals and
+// pick the one that minimizes total displacement encoding cost.
+//
+// On x86/x64, stack accesses within [-128, +127] of the base register use a 1-byte
+// displacement (disp8), while larger offsets require 4 bytes (disp32) — 3 extra
+// bytes per access. We try multiple sort orders for locals and pick the one that
+// minimizes total encoding cost, estimated by simulating the frame allocation loop.
+//
+// The cost function computes Σ(refCnt × encodingBytes) where encodingBytes is 1
+// for disp8 or 4 for disp32, using unweighted refCnt as a proxy for instruction
+// count. This gives a direct estimate of total displacement encoding bytes.
+// (See also the comment in lvaAllocLocalAndSetVirtualOffset about sorting by alignment.)
+//
+// This optimization is safe because on x86/x64, lvaAssignFrameOffsets is only called
+// with FINAL_FRAME_LAYOUT (no tentative layout exists).
+//
+// We only run this for frame-pointer-based frames because the disp8 boundary check
+// assumes EBP/RBP-relative negative virtual offsets. ESP/RSP-based frames use positive
+// offsets after fixup and contribute negligible savings.
+//
+// We skip frames that fit entirely within the disp8 zone. For MinOpts/Tier0 where
+// precise ref counts are not computed, we do a lightweight LIR walk to count local
+// references for sorting purposes.
+//
+// Arguments:
+//    stkOffs    - current stack offset (after callee saves, XMM saves, and pre-allocated
+//                 special locals)
+//    allocOrder - zero-terminated array of allocation pass flags (ALLOC_NON_PTRS, etc.)
+//
+// Returns:
+//    An array of lclNum indices representing the optimal sort order, or nullptr if the
+//    original order should be kept (it is already the best, or the optimization bailed).
+//
+unsigned* Compiler::lvaComputeOptimalFrameLayoutOrder(int stkOffs, const UINT* allocOrder)
+{
+    // No locals at all -- nothing to lay out, and we mustn't make zero-sized arena
+    // allocations below.
+    if (lvaCount == 0)
+    {
+        return nullptr;
+    }
+
+    // Pre-compute local sizes and total estimated frame size in one pass.
+    // These arrays are indexed by lclNum and used throughout to avoid repeated
+    // function calls in sort comparators and cost estimation.
+    unsigned* lclSize = new (this, CMK_LvaTable) unsigned[lvaCount];
+    unsigned estimatedLocalSize = 0;
+    for (unsigned i = 0; i < lvaCount; i++)
+    {
+        lclSize[i] = lvaLclStackHomeSize(i);
+        estimatedLocalSize += lclSize[i];
+    }
+
+    // Skip frames where even with alignment padding, all locals will fit in disp8 range.
+    // We use 64 rather than 128 because alignment padding can inflate the actual frame
+    // size significantly beyond the raw sum of local sizes.
+    if (estimatedLocalSize <= 64)
+    {
+        return nullptr;
+    }
+
+    // Pre-compute alignment requirements for each local.
+    // 0 = no alignment needed, otherwise the required alignment in bytes.
+    //
+    // NOTE: This is an approximation of the alignment behavior in
+    // lvaAssignVirtualFrameOffsetsToLocals + lvaAllocLocalAndSetVirtualOffset.
+    // In particular it does NOT model the x86-only DOUBLE_ALIGN /
+    // mustDoubleAlign / lvaIncrementFrameSize path for TYP_DOUBLE / TYP_LONG /
+    // lvStructDoubleAlign, nor the cross-bucket have_LclVarDoubleAlign
+    // pre-reservation. Frames where those apply may see the simulated simOff
+    // drift from the real layout by a pointer-sized slot, causing the
+    // straddler boundary and per-strategy cost estimate to be slightly
+    // imprecise. This is a heuristic, not a correctness path: the real
+    // allocator still runs unchanged and only the chosen sort permutation
+    // is affected.
+    unsigned* lclAlignTo = new (this, CMK_LvaTable) unsigned[lvaCount];
+    for (unsigned i = 0; i < lvaCount; i++)
+    {
+        if (lclSize[i] < 8)
+        {
+            lclAlignTo[i] = 0;
+        }
+        else
+        {
+#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES
+            LclVarDsc* varDsc = lvaGetDesc(i);
+            if (varTypeIsSIMD(varDsc))
+            {
+                lclAlignTo[i] = static_cast<unsigned>(getSIMDTypeAlignment(varDsc->TypeGet()));
+            }
+            else
+#endif
+            {
+                lclAlignTo[i] = 8;
+            }
+        }
+    }
+
+    // Pre-compute which locals will be allocated in the main loop and their
+    // pass category. Category 0 means "not allocatable" (skipped by the loop).
+    unsigned* lclPassCategory = new (this, CMK_LvaTable) unsigned[lvaCount];
+    for (unsigned i = 0; i < lvaCount; i++)
+    {
+        lclPassCategory[i] = 0;
+        LclVarDsc* varDsc = lvaGetDesc(i);
+
+        if (lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+        {
+            continue;
+        }
+#if FEATURE_FIXED_OUT_ARGS
+        if (i == lvaOutgoingArgSpaceVar)
+        {
+            continue;
+        }
+#endif
+        if (lvaIsOSRLocal(i))
+        {
+            continue;
+        }
+        if (!varDsc->lvOnFrame)
+        {
+            continue;
+        }
+        if ((i == lvaGSSecurityCookie) && getNeedsGSSecurityCookie())
+        {
+            continue;
+        }
+        if (lvaIsUnknownSizeLocal(i))
+        {
+            // The real loop calls lvaAllocUnknownSizeLocal for these; their stack home
+            // size is not modeled by lvaLclStackHomeSize so simulating them would skew
+            // the simOff walk.
+            continue;
+        }
+        if (i == lvaRetAddrVar)
+        {
+            continue;
+        }
+#ifdef JIT32_GCENCODER
+        if (i == lvaLocAllocSPvar)
+        {
+            continue;
+        }
+#endif
+        if ((i == lvaMonAcquired) || (i == lvaAsyncThreadObjectVar) || (i == lvaAsyncExecutionContextVar) ||
+            (i == lvaAsyncSynchronizationContextVar))
+        {
+            continue;
+        }
+        if ((varDsc->lvIsParam) && !lvaParamHasLocalStackSpace(i))
+        {
+            continue;
+        }
+
+        if ((varDsc->lvIsUnsafeBuffer) && compGSReorderStackLayout)
+        {
+            lclPassCategory[i] = varDsc->lvIsPtr ? ALLOC_UNSAFE_BUFFERS_WITH_PTRS : ALLOC_UNSAFE_BUFFERS;
+        }
+        else if (varTypeIsGC(varDsc->TypeGet()) && (varDsc->lvTracked))
+        {
+            lclPassCategory[i] = ALLOC_PTRS;
+        }
+        else
+        {
+            lclPassCategory[i] = ALLOC_NON_PTRS;
+        }
+    }
+
+    // Build per-pass buckets in allocOrder order. ALLOC_* values are powers of two;
+    // their bit indices (0..3) map to positions in allocOrder via passBitToAllocIdx.
+    const unsigned MAX_PASS_BITS = 4;
+    unsigned passBitToAllocIdx[MAX_PASS_BITS];
+    for (unsigned k = 0; k < MAX_PASS_BITS; k++)
+    {
+        passBitToAllocIdx[k] = UINT_MAX;
+    }
+
+    unsigned allocOrderLen = 0;
+    for (unsigned p = 0; allocOrder[p] != 0; p++)
+    {
+        // The optimization is gated off for compDbgEnC (which merges ALLOC_PTRS into the
+        // previous pass), so each allocOrder entry here is a single ALLOC_* bit.
+        unsigned bit = BitOperations::Log2((unsigned)allocOrder[p]);
+        assert(bit < MAX_PASS_BITS);
+        assert(((unsigned)allocOrder[p] & ((unsigned)allocOrder[p] - 1)) == 0);
+        passBitToAllocIdx[bit] = p;
+        allocOrderLen++;
+    }
+    assert(allocOrderLen <= MAX_PASS_BITS);
+
+    unsigned passCount[MAX_PASS_BITS + 1] = {0};
+    for (unsigned i = 0; i < lvaCount; i++)
+    {
+        if (lclPassCategory[i] == 0)
+        {
+            continue;
+        }
+        unsigned bit = BitOperations::Log2(lclPassCategory[i]);
+        unsigned p = passBitToAllocIdx[bit];
+        assert(p != UINT_MAX);
+        passCount[p]++;
+    }
+
+    unsigned passStart[MAX_PASS_BITS + 1];
+    passStart[0] = 0;
+    for (unsigned p = 0; p < allocOrderLen; p++)
+    {
+        passStart[p + 1] = passStart[p] + passCount[p];
+    }
+    const unsigned numAllocatable = passStart[allocOrderLen];
+
+    if (numAllocatable == 0)
+    {
+        return nullptr;
+    }
+
+    // Concatenated bucket array: allocatable locals first (grouped by allocOrder pass, in
+    // ascending lclNum within each pass), then non-allocatable locals at the tail.
+    unsigned* bucketLcls = new (this, CMK_LvaTable) unsigned[lvaCount];
+    unsigned writePos[MAX_PASS_BITS];
+    for (unsigned p = 0; p < allocOrderLen; p++)
+    {
+        writePos[p] = passStart[p];
+    }
+    unsigned tailPos = numAllocatable;
+    for (unsigned i = 0; i < lvaCount; i++)
+    {
+        if (lclPassCategory[i] == 0)
+        {
+            bucketLcls[tailPos++] = i;
+            continue;
+        }
+        unsigned bit = BitOperations::Log2(lclPassCategory[i]);
+        unsigned p = passBitToAllocIdx[bit];
+        bucketLcls[writePos[p]++] = i;
+    }
+
+    // Walk buckets in their original order to determine where the disp8/disp32 boundary
+    // (simOff == -128) falls. simOff decreases monotonically across the walk, so at most
+    // ONE bucket can straddle the boundary. Buckets entirely above -128 contribute fixed
+    // cost = refCnt * 1; buckets entirely below contribute fixed cost = refCnt * 4. Only
+    // the straddling bucket's internal order affects total cost.
+    int bucketSimOffStart[MAX_PASS_BITS];
+    int bucketSimOffEnd[MAX_PASS_BITS];
+    int simOff = stkOffs;
+    unsigned straddleBucket = UINT_MAX;
+    for (unsigned p = 0; p < allocOrderLen; p++)
+    {
+        bucketSimOffStart[p] = simOff;
+        for (unsigned k = passStart[p]; k < passStart[p + 1]; k++)
+        {
+            unsigned lcl = bucketLcls[k];
+            int signedAlign = static_cast<int>(lclAlignTo[lcl]);
+            if ((signedAlign != 0) && ((simOff % signedAlign) != 0))
+            {
+                simOff -= signedAlign + (simOff % signedAlign);
+            }
+            simOff -= static_cast<int>(lclSize[lcl]);
+        }
+        bucketSimOffEnd[p] = simOff;
+
+        // A bucket straddles the disp8/disp32 boundary when its first local can land in disp8
+        // (simOff at entry strictly above -128, so at least 1 byte of disp8 budget remains) and
+        // its last local lands in disp32 (simOff after allocation strictly below -128).
+        // simOffEnd == -128 means the last local sits exactly at -128 (still disp8), so the
+        // bucket is fully disp8 and not a straddler.
+        if ((bucketSimOffStart[p] > -128) && (bucketSimOffEnd[p] < -128))
+        {
+            straddleBucket = p;
+        }
+    }
+
+    if (straddleBucket == UINT_MAX)
+    {
+        // The frame either fits entirely in disp8 (nothing to optimize) or every
+        // allocated bucket already starts past disp8 (reordering within a bucket
+        // can't pull refs into disp8 range). Bail.
+        JITDUMP("Frame layout optimization: skipping — no straddling bucket "
+                "(simOff at end = %d)\n",
+                simOff);
+        return nullptr;
+    }
+
+    // Pre-compute ref counts and weights. For MinOpts/Tier0, precise ref counts are not
+    // available (all lvRefCnt == 0), so we do a lightweight LIR walk to count local refs.
+    unsigned* lclRefCnt = new (this, CMK_LvaTable) unsigned[lvaCount];
+    weight_t* lclWeight = new (this, CMK_LvaTable) weight_t[lvaCount];
+    bool isMinOpts = !PreciseRefCountsRequired();
+
+    if (isMinOpts)
+    {
+        memset(lclRefCnt, 0, lvaCount * sizeof(unsigned));
+
+        for (BasicBlock* const block : Blocks())
+        {
+            for (GenTree* node : LIR::AsRange(block))
+            {
+                if (node->OperIsAnyLocal())
+                {
+                    unsigned lclNum = node->AsLclVarCommon()->GetLclNum();
+                    if (lclNum < lvaCount)
+                    {
+                        lclRefCnt[lclNum]++;
+                    }
+                }
+            }
+        }
+
+        // For MinOpts, weighted = unweighted (no block weights available).
+        for (unsigned i = 0; i < lvaCount; i++)
+        {
+            lclWeight[i] = static_cast<weight_t>(lclRefCnt[i]);
+        }
+    }
+    else
+    {
+        for (unsigned i = 0; i < lvaCount; i++)
+        {
+            LclVarDsc* varDsc = lvaGetDesc(i);
+            lclRefCnt[i] = varDsc->lvRefCnt(lvaRefCountState);
+            lclWeight[i] = varDsc->lvRefCntWtd(lvaRefCountState);
+        }
+    }
+
+    // Precise upper bound on savings: walk the straddling bucket in its original order,
+    // count refs that land past disp8. Each such ref could save at most 3 bytes by being
+    // moved into disp8 range. Bail if even moving them all wouldn't help much.
+    unsigned maxSavings;
+    {
+        int so = bucketSimOffStart[straddleBucket];
+        unsigned refsInDisp32 = 0;
+        for (unsigned k = passStart[straddleBucket]; k < passStart[straddleBucket + 1]; k++)
+        {
+            unsigned lcl = bucketLcls[k];
+            int signedAlign = static_cast<int>(lclAlignTo[lcl]);
+            if ((signedAlign != 0) && ((so % signedAlign) != 0))
+            {
+                so -= signedAlign + (so % signedAlign);
+            }
+            so -= static_cast<int>(lclSize[lcl]);
+            if (so < -128)
+            {
+                refsInDisp32 += lclRefCnt[lcl];
+            }
+        }
+        maxSavings = refsInDisp32 * 3;
+        if (maxSavings <= (unsigned)JitConfig.JitFrameLayoutMaxSavingsThreshold())
+        {
+            JITDUMP("Frame layout optimization: skipping — max possible savings is %u bytes "
+                    "(refsInDisp32=%u)\n",
+                    maxSavings, refsInDisp32);
+            return nullptr;
+        }
+    }
+
+    // Compute baseline cost from non-straddling buckets. These contributions are
+    // independent of any intra-bucket sort order, so they cancel out when comparing
+    // strategies. We still include them in totalCost for accurate JITDUMP output.
+    unsigned baseCost = 0;
+    for (unsigned p = 0; p < allocOrderLen; p++)
+    {
+        if (p == straddleBucket)
+        {
+            continue;
+        }
+        unsigned encoding = (bucketSimOffStart[p] <= -128) ? 4u : 1u;
+        for (unsigned k = passStart[p]; k < passStart[p + 1]; k++)
+        {
+            baseCost += lclRefCnt[bucketLcls[k]] * encoding;
+        }
+    }
+
+    // Pre-compute zero-init data for the cost model. Only used at FullOpts where S4
+    // (initGroupedDensity) is active. At MinOpts S4 is skipped and the init-span term
+    // is small relative to the encoding-cost term, so we omit it.
+    bool* lclNeedsInit = nullptr;
+    bool useBlockInit = false;
+    int baseInitLo = 0;
+    int baseInitHi = 0;
+    bool baseHasInit = false;
+    if (!isMinOpts)
+    {
+        lclNeedsInit = new (this, CMK_LvaTable) bool[lvaCount];
+        unsigned initSlotCount = 0;
+        for (unsigned i = 0; i < lvaCount; i++)
+        {
+            lclNeedsInit[i] = false;
+            if (lclPassCategory[i] == 0)
+            {
+                continue;
+            }
+
+            LclVarDsc* varDsc = lvaGetDesc(i);
+
+            if (fgVarIsNeverZeroInitializedInProlog(i))
+            {
+                continue;
+            }
+            if (lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+            {
+                continue;
+            }
+            if (varDsc->lvHasExplicitInit)
+            {
+                continue;
+            }
+            if ((varDsc->lvIsTemp) && !varDsc->HasGCPtr())
+            {
+                continue;
+            }
+
+            if ((info.compInitMem) || varDsc->HasGCPtr() || (varDsc->lvMustInit))
+            {
+                lclNeedsInit[i] = true;
+                initSlotCount += (lclSize[i] + sizeof(int) - 1) / sizeof(int);
+            }
+        }
+        useBlockInit = (initSlotCount > 4);
+
+        // If block init applies, precompute the init-span contribution from POST-straddler
+        // buckets only. Pre-straddler buckets contribute order-dependent simOffs (since each
+        // strategy resorts them), so their init-span contribution is folded in per-strategy
+        // by walkLayout. The straddler's contribution is also folded in by walkLayout.
+        // Post-straddler buckets use canonical simOffs computed from bucketSimOffEnd[straddleBucket].
+        if (useBlockInit)
+        {
+            int so = bucketSimOffEnd[straddleBucket];
+            for (unsigned p = straddleBucket + 1; p < allocOrderLen; p++)
+            {
+                for (unsigned k = passStart[p]; k < passStart[p + 1]; k++)
+                {
+                    unsigned lcl = bucketLcls[k];
+                    int signedAlign = static_cast<int>(lclAlignTo[lcl]);
+                    if ((signedAlign != 0) && ((so % signedAlign) != 0))
+                    {
+                        so -= signedAlign + (so % signedAlign);
+                    }
+                    so -= static_cast<int>(lclSize[lcl]);
+                    if (lclNeedsInit[lcl])
+                    {
+                        int loOffs = so;
+                        int hiOffs = so + static_cast<int>(lclSize[lcl]);
+                        if (!baseHasInit)
+                        {
+                            baseInitLo = loOffs;
+                            baseInitHi = hiOffs;
+                            baseHasInit = true;
+                        }
+                        else
+                        {
+                            baseInitLo = min(baseInitLo, loOffs);
+                            baseInitHi = max(baseInitHi, hiOffs);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    JITDUMP("Frame layout optimization: trying strategies for %u locals "
+            "(estimated frame size %u bytes%s, straddle bucket=%u of %u, baseCost=%u, "
+            "maxSavings=%u)\n",
+            lvaCount, estimatedLocalSize, isMinOpts ? ", using lightweight ref counts" : "", straddleBucket,
+            allocOrderLen, baseCost, maxSavings);
+
+    const unsigned straddleStart = passStart[straddleBucket];
+    const unsigned straddleCount = passStart[straddleBucket + 1] - straddleStart;
+    const unsigned preStraddleCount = straddleStart;
+
+    // Pre-straddler buckets (when present) participate in each strategy's sort.
+    // Reordering inside a pre-straddler bucket can change alignment padding,
+    // which shifts the straddler's entry simOff and can pull more refs into disp8.
+    unsigned* preStraddleOrder = (preStraddleCount > 0) ? new (this, CMK_LvaTable) unsigned[preStraddleCount] : nullptr;
+    unsigned* bestPreStraddleOrder =
+        (preStraddleCount > 0) ? new (this, CMK_LvaTable) unsigned[preStraddleCount] : nullptr;
+    unsigned* straddleOrder = new (this, CMK_LvaTable) unsigned[straddleCount];
+    unsigned* bestStraddleOrder = new (this, CMK_LvaTable) unsigned[straddleCount];
+
+    // Walk the layout from the start of frame through the end of the straddler,
+    // using the given pre-straddler and straddler orders. Returns the variable
+    // part of the cost: the straddler's encoding cost plus the init-span penalty.
+    // (Non-straddler bucket encoding costs are invariant w.r.t. order and live in baseCost.)
+    auto walkLayout = [&](unsigned* preOrder, unsigned* strOrder) -> unsigned {
+        unsigned cost = 0;
+        int so = stkOffs;
+        int initLo = baseInitLo;
+        int initHi = baseInitHi;
+        bool hasInit = baseHasInit;
+
+        // Pre-straddler buckets: walk for alignment padding (which shifts the straddler
+        // entry simOff) and for init-span contribution. Encoding cost is invariant.
+        for (unsigned k = 0; k < preStraddleCount; k++)
+        {
+            unsigned lcl = preOrder[k];
+            int signedAlign = static_cast<int>(lclAlignTo[lcl]);
+            if ((signedAlign != 0) && ((so % signedAlign) != 0))
+            {
+                so -= signedAlign + (so % signedAlign);
+            }
+            so -= static_cast<int>(lclSize[lcl]);
+
+            if (useBlockInit && (lclNeedsInit[lcl]))
+            {
+                int loOffs = so;
+                int hiOffs = so + static_cast<int>(lclSize[lcl]);
+                if (!hasInit)
+                {
+                    initLo = loOffs;
+                    initHi = hiOffs;
+                    hasInit = true;
+                }
+                else
+                {
+                    initLo = min(initLo, loOffs);
+                    initHi = max(initHi, hiOffs);
+                }
+            }
+        }
+
+        // Straddler bucket: encoding cost varies with order; init span continues to accumulate.
+        for (unsigned k = 0; k < straddleCount; k++)
+        {
+            unsigned lcl = strOrder[k];
+            int signedAlign = static_cast<int>(lclAlignTo[lcl]);
+            if ((signedAlign != 0) && ((so % signedAlign) != 0))
+            {
+                so -= signedAlign + (so % signedAlign);
+            }
+            so -= static_cast<int>(lclSize[lcl]);
+            cost += lclRefCnt[lcl] * ((so >= -128) ? 1u : 4u);
+
+            if (useBlockInit && (lclNeedsInit[lcl]))
+            {
+                int loOffs = so;
+                int hiOffs = so + static_cast<int>(lclSize[lcl]);
+                if (!hasInit)
+                {
+                    initLo = loOffs;
+                    initHi = hiOffs;
+                    hasInit = true;
+                }
+                else
+                {
+                    initLo = min(initLo, loOffs);
+                    initHi = max(initHi, hiOffs);
+                }
+            }
+        }
+
+        if (useBlockInit && hasInit && (initHi > initLo))
+        {
+            unsigned initSpan = static_cast<unsigned>(initHi - initLo);
+            unsigned initCost = ((initSpan + 15) / 16) * 2;
+            cost += initCost;
+        }
+
+        return cost;
+    };
+
+    // Score the original (unsorted) order as baseline.
+    if (preStraddleCount > 0)
+    {
+        memcpy(preStraddleOrder, bucketLcls, preStraddleCount * sizeof(unsigned));
+        memcpy(bestPreStraddleOrder, preStraddleOrder, preStraddleCount * sizeof(unsigned));
+    }
+    memcpy(straddleOrder, &bucketLcls[straddleStart], straddleCount * sizeof(unsigned));
+    memcpy(bestStraddleOrder, straddleOrder, straddleCount * sizeof(unsigned));
+    unsigned origCost = baseCost + walkLayout(preStraddleOrder, straddleOrder);
+    unsigned bestCost = origCost;
+    int bestStrategy = -1;
+    const char* bestName = "original";
+
+    // Helper to try a strategy: sort each pre-straddler bucket and the straddler
+    // independently with the comparator (bucket boundaries are preserved), then
+    // score with walkLayout and track if best.
+    auto tryStrategy = [&](int strategyIdx, const char* name, auto comparator) -> unsigned {
+        for (unsigned p = 0; p < straddleBucket; p++)
+        {
+            unsigned bStart = passStart[p];
+            unsigned bEnd = passStart[p + 1];
+            memcpy(&preStraddleOrder[bStart], &bucketLcls[bStart], (bEnd - bStart) * sizeof(unsigned));
+            jitstd::sort(&preStraddleOrder[bStart], &preStraddleOrder[bEnd], comparator);
+        }
+        memcpy(straddleOrder, &bucketLcls[straddleStart], straddleCount * sizeof(unsigned));
+        jitstd::sort(straddleOrder, straddleOrder + straddleCount, comparator);
+        unsigned cost = baseCost + walkLayout(preStraddleOrder, straddleOrder);
+        if (cost < bestCost)
+        {
+            bestCost = cost;
+            bestStrategy = strategyIdx;
+            bestName = name;
+            if (preStraddleCount > 0)
+            {
+                memcpy(bestPreStraddleOrder, preStraddleOrder, preStraddleCount * sizeof(unsigned));
+            }
+            memcpy(bestStraddleOrder, straddleOrder, straddleCount * sizeof(unsigned));
+        }
+        return cost;
+    };
+
+    // Strategy 0: Access density (weighted ref count / size) descending.
+    // A small hot local is more valuable per frame byte than a large hot local.
+    auto densityCompare = [lclSize, lclWeight, lclRefCnt](unsigned n1, unsigned n2) -> bool {
+        weight_t dens1 = lclWeight[n1] * lclSize[n2];
+        weight_t dens2 = lclWeight[n2] * lclSize[n1];
+        if (dens1 != dens2)
+        {
+            return dens1 > dens2;
+        }
+        bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8);
+        if (a1 != a2)
+        {
+            return a1;
+        }
+        if (lclRefCnt[n1] != lclRefCnt[n2])
+        {
+            return lclRefCnt[n1] > lclRefCnt[n2];
+        }
+        return n1 < n2;
+    };
+    unsigned densityCost = tryStrategy(0, "density", densityCompare);
+
+    // Strategy 1: Size ascending — maximize count of locals in disp8 range.
+    // Small locals consume less of the disp8 budget, so packing them first
+    // maximizes how many locals get short encodings.
+    auto sizeAscCompare = [lclSize, lclRefCnt](unsigned n1, unsigned n2) -> bool {
+        if (lclSize[n1] != lclSize[n2])
+        {
+            return lclSize[n1] < lclSize[n2];
+        }
+        if (lclRefCnt[n1] != lclRefCnt[n2])
+        {
+            return lclRefCnt[n1] > lclRefCnt[n2];
+        }
+        return n1 < n2;
+    };
+    unsigned sizeAscCost = tryStrategy(1, "sizeAsc", sizeAscCompare);
+
+    // Strategies 2-3: Weight-based sorts that differ from density/sizeAsc only when
+    // PGO block weights are available (FullOpts). For MinOpts, lclWeight == lclRefCnt,
+    // so S2 is redundant with density once D is in the set (empirically adds <1% of the
+    // code-size wins for the full per-strategy TP cost), and S3 == density. Skipped at MinOpts.
+    unsigned weightCost = 0;
+    unsigned refDensityCost = 0;
+    if (!isMinOpts)
+    {
+        // Strategy 2: Weighted ref count descending.
+        auto weightCompare = [lclSize, lclWeight](unsigned n1, unsigned n2) -> bool {
+            if (lclWeight[n1] != lclWeight[n2])
+            {
+                return lclWeight[n1] > lclWeight[n2];
+            }
+            bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8);
+            if (a1 != a2)
+            {
+                return a1;
+            }
+            return n1 < n2;
+        };
+        weightCost = tryStrategy(2, "weight", weightCompare);
+
+        // Strategy 3: Unweighted ref count density (refCnt / size) descending.
+        auto refDensityCompare = [lclSize, lclRefCnt](unsigned n1, unsigned n2) -> bool {
+            double dens1 = (double)lclRefCnt[n1] * lclSize[n2];
+            double dens2 = (double)lclRefCnt[n2] * lclSize[n1];
+            if (dens1 != dens2)
+            {
+                return dens1 > dens2;
+            }
+            bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8);
+            if (a1 != a2)
+            {
+                return a1;
+            }
+            return n1 < n2;
+        };
+        refDensityCost = tryStrategy(3, "refDensity", refDensityCompare);
+    }
+
+    // Strategy 4: Density with init-grouping — init-needing locals sorted by
+    // density first, then non-init locals by density. Keeps the zero-init span
+    // tight while still prioritizing hot locals within each group.
+    // Only useful when block init will be used (otherwise identical to density).
+    // Skipped at MinOpts: empirically adds <1% of the code-size wins for the full
+    // per-strategy TP cost.
+    unsigned initGroupedDensityCost = 0;
+    if (useBlockInit && !isMinOpts)
+    {
+        auto initGroupedDensityCompare = [lclSize, lclWeight, lclNeedsInit](unsigned n1, unsigned n2) -> bool {
+            bool init1 = lclNeedsInit[n1], init2 = lclNeedsInit[n2];
+            if (init1 != init2)
+            {
+                return init1; // init-needing first
+            }
+            weight_t dens1 = lclWeight[n1] * lclSize[n2];
+            weight_t dens2 = lclWeight[n2] * lclSize[n1];
+            if (dens1 != dens2)
+            {
+                return dens1 > dens2;
+            }
+            bool a1 = (lclSize[n1] >= 8), a2 = (lclSize[n2] >= 8);
+            if (a1 != a2)
+            {
+                return a1;
+            }
+            return n1 < n2;
+        };
+        initGroupedDensityCost = tryStrategy(4, "initGroupedDensity", initGroupedDensityCompare);
+    }
+
+    if (bestStrategy < 0)
+    {
+        JITDUMP("Frame layout costs: original=%u density=%u sizeAsc=%u weight=%u refDensity=%u "
+                "initGroupedDensity=%u; original order is best, no change\n",
+                origCost, densityCost, sizeAscCost, weightCost, refDensityCost, initGroupedDensityCost);
+        return nullptr;
+    }
+
+    // Assemble the final permutation:
+    // - Pre-straddler buckets use the best strategy's intra-bucket sort.
+    // - The straddler bucket uses the best strategy's intra-bucket sort.
+    // - Post-straddler buckets stay in canonical order.
+    // - Non-allocatable locals tail (the caller filters those out anyway).
+    unsigned* bestOrder = new (this, CMK_LvaTable) unsigned[lvaCount];
+    unsigned outIdx = 0;
+    for (unsigned p = 0; p < allocOrderLen; p++)
+    {
+        unsigned bStart = passStart[p];
+        unsigned bucketSize = passStart[p + 1] - bStart;
+        if (p < straddleBucket)
+        {
+            memcpy(&bestOrder[outIdx], &bestPreStraddleOrder[bStart], bucketSize * sizeof(unsigned));
+        }
+        else if (p == straddleBucket)
+        {
+            memcpy(&bestOrder[outIdx], bestStraddleOrder, bucketSize * sizeof(unsigned));
+        }
+        else
+        {
+            memcpy(&bestOrder[outIdx], &bucketLcls[bStart], bucketSize * sizeof(unsigned));
+        }
+        outIdx += bucketSize;
+    }
+    if (outIdx < lvaCount)
+    {
+        memcpy(&bestOrder[outIdx], &bucketLcls[numAllocatable], (lvaCount - numAllocatable) * sizeof(unsigned));
+    }
+
+    JITDUMP("Frame layout costs: original=%u density=%u sizeAsc=%u weight=%u refDensity=%u "
+            "initGroupedDensity=%u; "
+            "selected '%s' (cost=%u, saved %u encoding bytes est.)\n",
+            origCost, densityCost, sizeAscCost, weightCost, refDensityCost, initGroupedDensityCost, bestName, bestCost,
+            origCost > bestCost ? origCost - bestCost : 0);
+
+    return bestOrder;
+}
+#endif // TARGET_XARCH
+
 //-----------------------------------------------------------------------------
 // lvaAssignVirtualFrameOffsetsToLocals: compute the virtual stack offsets for
 // all elements on the stackframe.
@@ -5117,13 +5888,6 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
             non-pointer temps
      */
 
-    enum Allocation
-    {
-        ALLOC_NON_PTRS                 = 0x1, // assign offsets to non-ptr
-        ALLOC_PTRS                     = 0x2, // Second pass, assign offsets to tracked ptrs
-        ALLOC_UNSAFE_BUFFERS           = 0x4,
-        ALLOC_UNSAFE_BUFFERS_WITH_PTRS = 0x8
-    };
     UINT alloc_order[5];
 
     unsigned int cur = 0;
@@ -5179,6 +5943,20 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     UINT assignMore             = 0xFFFFFFFF;
     bool have_LclVarDoubleAlign = false;
 
+#ifdef TARGET_XARCH
+    // Multi-strategy frame layout optimization for x86/x64.
+    // See lvaComputeOptimalFrameLayoutOrder for details.
+    // Only attempt the optimization during the final layout pass; earlier passes
+    // (PRE_REGALLOC/REGALLOC/TENTATIVE) may invoke lvaAssignVirtualFrameOffsetsToLocals
+    // for size estimation, and the cost-model assumptions only hold for the final pass.
+    unsigned* lclVarSortOrder = nullptr;
+    if ((lvaDoneFrameLayout == FINAL_FRAME_LAYOUT) && lvaLocalVarRefCounted() && !opts.compDbgEnC &&
+        codeGen->isFramePointerUsed())
+    {
+        lclVarSortOrder = lvaComputeOptimalFrameLayoutOrder(stkOffs, alloc_order);
+    }
+#endif // TARGET_XARCH
+
     for (cur = 0; alloc_order[cur]; cur++)
     {
         if ((assignMore & alloc_order[cur]) == 0)
@@ -5191,8 +5969,15 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
         unsigned   lclNum;
         LclVarDsc* varDsc;
 
-        for (lclNum = 0, varDsc = lvaTable; lclNum < lvaCount; lclNum++, varDsc++)
+        for (unsigned sortIdx = 0; sortIdx < lvaCount; sortIdx++)
         {
+#ifdef TARGET_XARCH
+            lclNum = (lclVarSortOrder != nullptr) ? lclVarSortOrder[sortIdx] : sortIdx;
+#else
+            lclNum = sortIdx;
+#endif
+            varDsc = lvaGetDesc(lclNum);
+
             /* Ignore field locals of the promotion type PROMOTION_TYPE_FIELD_DEPENDENT.
                In other words, we will not calculate the "base" address of the struct local if
                the promotion type is PROMOTION_TYPE_FIELD_DEPENDENT.