Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/coreclr/nativeaot/Runtime/EHHelpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,12 @@ static uintptr_t UnwindSimpleHelperToCaller(
pContext->SetSp(sp+sizeof(uintptr_t)); // pop the stack
#elif defined(HOST_ARM) || defined(HOST_ARM64)
uintptr_t adjustedFaultingIP = pContext->GetLr();
#if defined(HOST_ARM)
// Interface dispatch pushes {r1,r2} (8 bytes) before the potential null-this AV.
// Restore SP to the caller's original value.
if (InInterfaceDispatchHelper(pContext->GetIp()))
Comment thread
jkotas marked this conversation as resolved.
pContext->SetSp(pContext->GetSp() + 8);
#endif
#elif defined(HOST_LOONGARCH64) || defined(HOST_RISCV64)
uintptr_t adjustedFaultingIP = pContext->GetRa();
#else
Expand Down
20 changes: 5 additions & 15 deletions src/coreclr/nativeaot/Runtime/ThunksMapping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#elif TARGET_X86
#define THUNK_SIZE 12
#elif TARGET_ARM
#define THUNK_SIZE 20
#define THUNK_SIZE 12
#elif TARGET_ARM64
Comment thread
jkotas marked this conversation as resolved.
#define THUNK_SIZE 16
#elif TARGET_LOONGARCH64
Expand Down Expand Up @@ -202,26 +202,16 @@ EXTERN_C HRESULT QCALLTYPE RhAllocateThunksMapping(void** ppThunksSection)
#elif TARGET_ARM

// mov r12,<thunk data address>
// str r12,[sp,#-4]
// ldr r12,[r12, <delta to get to last dword in data page>]
// bx r12
// ldr pc,[r12, <delta to get to last dword in data page>]
// r12 still holds the data address; RhCommonStub reads it directly from r12 instead of from the stack

EncodeThumb2Mov32((uint16_t*)pCurrentThunkAddress, (uint32_t)pCurrentDataAddress, 12);
pCurrentThunkAddress += 8;

*((uint32_t*)pCurrentThunkAddress) = 0xcc04f84d;
// ldr pc, [r12, #offset]
*((uint32_t*)pCurrentThunkAddress) = 0xf000f8dc | ((OS_PAGE_SIZE - POINTER_SIZE - (i * POINTER_SIZE * 2)) << 16);
pCurrentThunkAddress += 4;

*((uint32_t*)pCurrentThunkAddress) = 0xc000f8dc | ((OS_PAGE_SIZE - POINTER_SIZE - (i * POINTER_SIZE * 2)) << 16);
pCurrentThunkAddress += 4;

*((uint16_t*)pCurrentThunkAddress) = 0x4760;
pCurrentThunkAddress += 2;

// nops for alignment
*((uint16_t*)pCurrentThunkAddress) = 0xbf00;
pCurrentThunkAddress += 2;

#elif TARGET_ARM64

//adr xip0, <delta PC to thunk data address>
Expand Down
87 changes: 36 additions & 51 deletions src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,11 @@
#include <unixasmmacros.inc>

// Dispatching version of RhpResolveInterfaceMethod
LEAF_ENTRY RhpInterfaceDispatch, _TEXT

// r12 currently contains the indirection cell address. But we need more scratch registers and
// we may A/V on a null this. Store r1 and r2 in red zone.
str r1, [sp, #-8]
str r2, [sp, #-4]
NESTED_ENTRY RhpInterfaceDispatch, _TEXT, NoHandler

// Unwinder special cases this push to be able to unwind out of the potential nullref below.
PROLOG_PUSH "{r1,r2}"
Comment thread
jkotas marked this conversation as resolved.

// Load the MethodTable from the object instance in r0.
// The label marks the location of a potential nullref for the unwinder.
Expand All @@ -25,25 +24,21 @@ LEAF_ENTRY RhpInterfaceDispatch, _TEXT
bne LOCAL_LABEL(Hashtable)

// Fast path: restore r1/r2 before jumping to cached code.
ldr r1, [sp, #-8]
ldr r2, [sp, #-4]
EPILOG_POP "{r1,r2}"
// dmb ensures that the Code load below sees the value written before
// MethodTable. ARM32 has no load-acquire instruction (LDAR is ARMv8 only).
dmb
ldr r12, [r12, #4] // load the cached monomorphic resolved code address
bx r12
EPILOG_BRANCH_REG r12

LOCAL_LABEL(Hashtable):

// Match what the fast path has pushed.
.save {r1,r2}

// r1 = MethodTable, r12 = indirection cell address
Comment thread
jkotas marked this conversation as resolved.
// Look up the target in the dispatch cache hashtable (GenericCache<Key, nint>).
// Spill additional registers to the red zone below sp
// so we don't modify sp (this is a LEAF_ENTRY with no unwind info).
str r3, [sp, #-12]
str lr, [sp, #-16]
str r4, [sp, #-20]
str r5, [sp, #-24]
str r6, [sp, #-28]
PROLOG_PUSH "{r3,r4,r5,r6,r8}"

// Load the _table field (Entry[]) from the cache struct.
PREPARE_EXTERNAL_VAR_INDIRECT g_pDispatchCache, r2
Expand All @@ -53,13 +48,13 @@ LOCAL_LABEL(Hashtable):
// hash = (RotateLeft(dispatchCell, 16) ^ objectType) * GoldenRatio
ror r3, r12, #16
eor r3, r3, r1
movw lr, #0x79B9
movt lr, #0x9E37 // lr = 0x9E3779B9
mul r3, r3, lr
movw r8, #0x79B9
movt r8, #0x9E37 // r8 = 0x9E3779B9
mul r3, r3, r8

// HashToBucket: bucket = hash >> hashShift
ldrb lr, [r2, #8]
lsr r3, r3, lr
ldrb r8, [r2, #8]
lsr r3, r3, r8

mov r4, #0 // i = 0

Expand All @@ -75,15 +70,15 @@ LOCAL_LABEL(ProbeLoop):
dmb

// Compare key (dispatchCell, objectType)
ldr lr, [r5, #4]
cmp r12, lr
ldr r8, [r5, #4]
cmp r12, r8
bne LOCAL_LABEL(ProbeMiss)
ldr lr, [r5, #8]
cmp r1, lr
ldr r8, [r5, #8]
cmp r1, r8
bne LOCAL_LABEL(ProbeMiss)

// Read the cached code pointer, then re-verify the version has not changed.
ldr lr, [r5, #12]
ldr r8, [r5, #12]
dmb

// Verify: (original version & ~1) == re-read version.
Expand All @@ -94,17 +89,12 @@ LOCAL_LABEL(ProbeLoop):
bne LOCAL_LABEL(CacheMiss)

// Dispatch to cached target.
mov r12, lr
mov r12, r8

ldr r6, [sp, #-28]
ldr r5, [sp, #-24]
ldr r4, [sp, #-20]
ldr r3, [sp, #-12]
ldr lr, [sp, #-16]
ldr r1, [sp, #-8]
ldr r2, [sp, #-4]
EPILOG_POP "{r3,r4,r5,r6,r8}"
EPILOG_POP "{r1,r2}"

bx r12
EPILOG_BRANCH_REG r12

LOCAL_LABEL(ProbeMiss):
// If version is zero the rest of the bucket is unclaimed — stop probing.
Expand All @@ -114,28 +104,23 @@ LOCAL_LABEL(ProbeMiss):
// Quadratic reprobe: i++; index = (index + i) & tableMask
add r4, r4, #1
add r3, r3, r4
ldr lr, [r2, #4]
sub lr, lr, #2
and r3, r3, lr
ldr r8, [r2, #4]
sub r8, r8, #2
and r3, r3, r8
cmp r4, #8
blt LOCAL_LABEL(ProbeLoop)

LOCAL_LABEL(CacheMiss):
ldr r6, [sp, #-28]
ldr r5, [sp, #-24]
ldr r4, [sp, #-20]

LOCAL_LABEL(SlowPath):
// restore original value of r1, r2, r3, lr
ldr r3, [sp, #-12]
ldr lr, [sp, #-16]
ldr r1, [sp, #-8]
ldr r2, [sp, #-4]

str r12, [sp, #-8]
EPILOG_POP "{r3,r4,r5,r6,r8}"
EPILOG_POP "{r1,r2}"

// Push args for RhpUniversalTransitionTailCall:
// [sp+0] = extra arg (indirection cell), [sp+4] = target fn (RhpCidResolve)
PROLOG_STACK_ALLOC 8
str r12, [sp, #0]
PREPARE_EXTERNAL_VAR RhpCidResolve, r12
str r12, [sp, #-4]
str r12, [sp, #4]

b C_FUNC(RhpUniversalTransitionTailCall)

LEAF_END RhpInterfaceDispatch, _TEXT
NESTED_END RhpInterfaceDispatch, _TEXT
5 changes: 2 additions & 3 deletions src/coreclr/nativeaot/Runtime/arm/InteropThunksHelpers.S
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,8 @@
//
NESTED_ENTRY RhCommonStub, _TEXT, NoHandler
// Custom calling convention:
// red zone has pointer to the current thunk's data block (data contains 2 pointer values: context + target pointers)
// Copy red zone value into r12 so that the PROLOG_PUSH doesn't destroy it
ldr r12, [sp, #-4]
// r12 already has the current thunk's data block pointer
// (thunk branched here via ldr pc, preserving r12)
PROLOG_PUSH "{r0-r4, lr}"
PROLOG_VPUSH {d0-d7} // Capture the floating point argument registers

Expand Down
25 changes: 12 additions & 13 deletions src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,15 @@
//
// At input to this function, r0-3, d0-7 and the stack may contain any number of arguments.
//
// In addition, there are 2 extra arguments passed in the RED ZONE (8 byte negative space
// off of sp).
// sp-4 will contain the managed function that is to be called by this transition function
// sp-8 will contain the pointer sized extra argument to the managed function
// In addition, there are 2 extra arguments passed on the stack. The caller pushes them
// (sp adjusted by -8 before branching here):
// [sp+4] = managed function to be called
// [sp+0] = pointer sized extra argument
//
// When invoking the callee:
//
// r0 shall contain a pointer to the TransitionBlock
// r1 shall contain the value that was in sp-8 at entry to this function
// r1 shall contain the value that was in [sp+0] at entry to this function
//
// Frame layout is:
//
Expand Down Expand Up @@ -81,18 +81,17 @@

NESTED_ENTRY Rhp\FunctionName, _TEXT, NoHandler
// Save argument registers (including floating point) and the return address.
// NOTE: While we do that, capture the two arguments in the red zone into r12 and r3.
ldr r12, [sp, #-4] // Capture first argument from red zone into r12
PROLOG_PUSH "{r3}" // Push r3
ldr r3, [sp, #-4] // Capture second argument from red zone into r3
PROLOG_PUSH "{r0-r2}" // Push the rest of the registers
// Caller pushed 8 bytes: [sp]=extra arg, [sp+4]=target fn
Comment thread
jkotas marked this conversation as resolved.
.pad #8
PROLOG_PUSH "{r0-r1}"
ldr r12, [sp, #12] // Capture target function (caller's [sp+4], now at sp+8+4)
ldr r1, [sp, #8] // Capture extra arg (caller's [sp], now at sp+8)
str r3, [sp, #12] // Store remaining arg registers into the space used for the hidden args
str r2, [sp, #8]
PROLOG_STACK_ALLOC RETURN_BLOCK_SIZE // Reserve space for a buffer used to hold return buffer data.
PROLOG_VPUSH {d0-d7} // Capture the floating point argument registers
PROLOG_PUSH "{r11,lr}" // Save caller's frame chain pointer and PC

// Setup the arguments to the transition thunk.
mov r1, r3

#ifdef TRASH_SAVED_ARGUMENT_REGISTERS

// Before calling out, trash all of the argument registers except the ones (r0, r1) that
Expand Down
41 changes: 15 additions & 26 deletions src/coreclr/runtime/arm/StubDispatch.S
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@
.macro DEFINE_INTERFACE_DISPATCH_STUB entries

NESTED_ENTRY RhpInterfaceDispatch\entries, _TEXT, NoHandler
// r12 currently contains the indirection cell address. But we need more scratch registers and
// we may A/V on a null this. Store r1 and r2 in red zone.
str r1, [sp, #-8]
str r2, [sp, #-4]
// r12 currently contains the indirection cell address. But we need more scratch registers.
// Save r1 and r2 on the stack.
PROLOG_PUSH "{r1,r2}"

// r12 currently holds the indirection cell address. We need to get the cache structure instead.
ldr r2, [r12, #OFFSETOF__InterfaceDispatchCell__m_pCache]
Expand All @@ -37,7 +36,7 @@ NESTED_ENTRY RhpInterfaceDispatch\entries, _TEXT, NoHandler
// For each entry in the cache, see if its MethodTable type matches the MethodTable in r1.
// If so, call the second cache entry. If not, skip the InterfaceDispatchCacheEntry.
// R1 : Instance MethodTable*
Comment thread
jkotas marked this conversation as resolved.
// R2: Cache data structure
// R2 : Cache data structure
// R12 : Trashed. On successful check, set to the target address to jump to.
.rept \entries
ldr r12, [r2, #CurrentOffset]
Expand All @@ -52,28 +51,17 @@ NESTED_ENTRY RhpInterfaceDispatch\entries, _TEXT, NoHandler
// Point r12 to the indirection cell using the back pointer in the cache block
ldr r12, [r2, #OFFSETOF__InterfaceDispatchCache__m_pCell]

ldr r1, [sp, #-8]
ldr r2, [sp, #-4]
EPILOG_POP "{r1,r2}"
b C_FUNC(RhpInterfaceDispatchSlow)

// Race detected: r12 still holds the indirection cell address (not yet clobbered).
// Re-dispatch through the indirection cell to retry with the current stub and cache pair.
// ldr pc, [r12] branches to the current m_pStub without clobbering r12.
LOCAL_LABEL(RaceRetry_\entries):
Comment thread
jkotas marked this conversation as resolved.
ldr r1, [sp, #-8]
ldr r2, [sp, #-4]
EPILOG_POP "{r1,r2}"
ldr pc, [r12]

// Common epilog for cache hits. It has to be placed out of line here due to a limitation on the number of
// epilogs imposed by the unwind code macros.
LOCAL_LABEL(99_\entries):
// R2 contains address of the cache block. We store it in the red zone in case the target we jump
// to needs it.
// R12 contains the target address to jump to
ldr r1, [sp, #-8]
// We have to store R2 with address of the cache block into red zone before restoring original r2.
str r2, [sp, #-8]
ldr r2, [sp, #-4]
EPILOG_POP "{r1,r2}"
EPILOG_BRANCH_REG r12

NESTED_END RhpInterfaceDispatch\entries, _TEXT
Expand Down Expand Up @@ -108,17 +96,18 @@ LEAF_END RhpInitialDynamicInterfaceDispatch, _TEXT

// Cache miss case, call the runtime to resolve the target and update the cache.
// Use universal transition helper to allow an exception to flow out of resolution
LEAF_ENTRY RhpInterfaceDispatchSlow, _TEXT
NESTED_ENTRY RhpInterfaceDispatchSlow, _TEXT, NoHandler
// r12 has the interface dispatch cell address in it.
// The calling convention of the universal thunk is that the parameter
// for the universal thunk target is to be placed in sp-8
// and the universal thunk target address is to be placed in sp-4
str r12, [sp, #-8]
// Push the two arguments that the universal transition thunk expects:
// [sp] = parameter for the universal thunk target (cell address)
// [sp+4] = universal thunk target address (RhpCidResolve)
PROLOG_STACK_ALLOC 8
str r12, [sp]
PREPARE_EXTERNAL_VAR RhpCidResolve, r12
str r12, [sp, #-4]
str r12, [sp, #4]

// jump to universal transition thunk
b C_FUNC(RhpUniversalTransitionTailCall)
LEAF_END RhpInterfaceDispatchSlow, _TEXT
NESTED_END RhpInterfaceDispatchSlow, _TEXT

#endif // FEATURE_CACHED_INTERFACE_DISPATCH
16 changes: 8 additions & 8 deletions src/coreclr/vm/arm/virtualcallstubcpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,8 +288,8 @@ struct VTableCallStub

size_t cbSize = 4; // First ldr instruction

// If we never save r0 to the red zone, we have the short version of the stub
if (*(UINT32*)(&pStubCode[cbSize]) != 0x0c04f84d)
// If we never save r0 to the stack, we have the short version of the stub
if (*(UINT32*)(&pStubCode[cbSize]) != 0x0d04f84d)
{
return
4 + // ldr r12,[r0]
Expand All @@ -299,7 +299,7 @@ struct VTableCallStub
4; // Slot value (data storage, not a real instruction)
}

cbSize += 4; // Saving r0 into red zone
cbSize += 4; // Saving r0 (push)
cbSize += (*(WORD*)(&pStubCode[cbSize]) == 0xf8dc ? 4 : 12); // Loading of vtable into r12
cbSize += (*(WORD*)(&pStubCode[cbSize]) == 0xf8dc ? 4 : 12); // Loading of target address into r12

Expand Down Expand Up @@ -335,7 +335,7 @@ struct VTableCallHolder

int indirectionsSize = (offsetOfIndirection > 0xFFF ? 12 : 4) + (offsetAfterIndirection > 0xFFF ? 12 : 4);
if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF)
indirectionsSize += 8; // Save/restore r0 using red zone
indirectionsSize += 8; // Save/restore r0 (push/pop)

return 6 + indirectionsSize + 4;
}
Expand Down Expand Up @@ -429,8 +429,8 @@ void VTableCallHolder::Initialize(unsigned slot)

if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF)
{
// str r0, [sp, #-4]. Save r0 in the red zone
*(UINT32*)p = 0x0c04f84d; p += 4;
// str r0, [sp, #-4]!
*(UINT32*)p = 0x0d04f84d; p += 4;
}

if (offsetOfIndirection > 0xFFF)
Expand Down Expand Up @@ -463,8 +463,8 @@ void VTableCallHolder::Initialize(unsigned slot)

if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF)
{
// ldr r0, [sp, #-4]. Restore r0 from the red zone.
*(UINT32*)p = 0x0c04f85d; p += 4;
// ldr r0, [sp], #4
*(UINT32*)p = 0x0b04f85d; p += 4;
}

// bx r12
Expand Down