Skip to content

Commit 49b26e4

Browse files
authored
[clr-interp] Implement cached virtual/interface dispatch (#123815)
We create a simple hashtable (InterpDispatchCache) that maps DispatchToken + target MT to the target method to be called. The cache is similar to the `DispatchCache` used by VSD. It holds a single mapping per index; when a collision happens, the entry is replaced with the new one. Replaced entries are freed during GC. The expectation is that there will be few collisions, given that only a subset of methods are being interpreted. This makes a microbenchmark from the suite 4x faster.
1 parent 8611948 commit 49b26e4

9 files changed

Lines changed: 268 additions & 14 deletions

File tree

src/coreclr/interpreter/compiler.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5182,11 +5182,18 @@ void InterpCompiler::EmitCall(CORINFO_RESOLVED_TOKEN* pConstrainedToken, bool re
51825182
break;
51835183
}
51845184
case CORINFO_VIRTUALCALL_VTABLE:
5185+
{
51855186
// Traditional virtual call. In theory we could optimize this to using the vtable
51865187
AddIns(tailcall ? INTOP_CALLVIRT_TAIL : INTOP_CALLVIRT);
51875188
m_pLastNewIns->data[0] = GetDataItemIndex(callInfo.hMethod);
5189+
// Reserved slots for caching DispatchToken and its hash
5190+
m_pLastNewIns->data[1] = GetNewDataItemIndex(nullptr);
5191+
int32_t secondCache = GetNewDataItemIndex(nullptr);
5192+
#ifdef DEBUG
5193+
assert(secondCache == (m_pLastNewIns->data[1] + 1));
5194+
#endif
51885195
break;
5189-
5196+
}
51905197
case CORINFO_VIRTUALCALL_LDVIRTFTN:
51915198
if ((callInfo.sig.sigInst.methInstCount != 0) || (m_compHnd->getClassAttribs(m_compHnd->getMethodClass(callInfo.hMethod)) & CORINFO_FLG_SHAREDINST))
51925199
{
@@ -5217,6 +5224,11 @@ void InterpCompiler::EmitCall(CORINFO_RESOLVED_TOKEN* pConstrainedToken, bool re
52175224
{
52185225
AddIns(tailcall ? INTOP_CALLVIRT_TAIL : INTOP_CALLVIRT);
52195226
m_pLastNewIns->data[0] = GetDataItemIndex(callInfo.hMethod);
5227+
m_pLastNewIns->data[1] = GetNewDataItemIndex(nullptr);
5228+
int32_t secondCache = GetNewDataItemIndex(nullptr);
5229+
#ifdef DEBUG
5230+
assert(secondCache == (m_pLastNewIns->data[1] + 1));
5231+
#endif
52205232
}
52215233
break;
52225234

src/coreclr/interpreter/compiler.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,13 @@ class InterpDataItemIndexMap
105105
_dataItems = dataItems;
106106
}
107107

108+
// Allocates a slot in the data items that is not shared with other opcodes
// Typically used for caching data at runtime.
// Returns the index of the newly allocated slot; `data` is its initial value
// (callers pass nullptr to reserve an empty slot filled in at execution time).
int32_t GetNewDataItemIndex(void* data)
{
    // Unconditional Add: no deduplication, so each call yields a distinct slot.
    return _dataItems->Add(data);
}
114+
108115
int32_t GetDataItemIndex(const InterpGenericLookup& lookup)
109116
{
110117
const size_t sizeOfFieldsConcatenated = sizeof(InterpGenericLookup::offsets) +
@@ -687,6 +694,10 @@ class InterpCompiler
687694
{
688695
return m_genericLookupToDataItemIndex.GetDataItemIndex(data);
689696
}
697+
// Allocates a fresh (non-shared) data item slot; see
// InterpDataItemIndexMap::GetNewDataItemIndex for semantics.
int32_t GetNewDataItemIndex(void* data)
{
    return m_genericLookupToDataItemIndex.GetNewDataItemIndex(data);
}
690701

691702
void* GetDataItemAtIndex(int32_t index);
692703
void* GetAddrOfDataItemAtIndex(int32_t index);

src/coreclr/interpreter/inc/intops.def

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,7 @@ OPDEF(INTOP_CALL_NULLCHECK, "call.nullcheck", 4, 1, 1, InterpOpMethodHandle)
379379
OPDEF(INTOP_CALLDELEGATE, "call.delegate", 5, 1, 1, InterpOpMethodHandle)
380380
OPDEF(INTOP_CALLDELEGATE_TAIL, "call.delegate.tail", 4, 1, 1, InterpOpMethodHandle)
381381
OPDEF(INTOP_CALLI, "calli", 6, 1, 2, InterpOpLdPtr)
382-
OPDEF(INTOP_CALLVIRT, "callvirt", 4, 1, 1, InterpOpMethodHandle)
382+
OPDEF(INTOP_CALLVIRT, "callvirt", 5, 1, 1, InterpOpMethodHandle)
383383
OPDEF(INTOP_CALL_PINVOKE, "call.pinvoke", 6, 1, 1, InterpOpMethodHandle) // inlined (no marshaling wrapper) pinvokes only
384384
OPDEF(INTOP_NEWOBJ, "newobj", 5, 1, 1, InterpOpMethodHandle)
385385
OPDEF(INTOP_NEWOBJ_GENERIC, "newobj.generic", 6, 1, 2, InterpOpMethodHandle)
@@ -388,7 +388,7 @@ OPDEF(INTOP_NEWOBJ_VT, "newobj.vt", 5, 1, 1, InterpOpMethodHandle)
388388
// Tail calls
389389
OPDEF(INTOP_CALL_TAIL, "call.tail", 4, 1, 1, InterpOpMethodHandle)
390390
OPDEF(INTOP_CALLI_TAIL, "calli.tail", 6, 1, 2, InterpOpLdPtr)
391-
OPDEF(INTOP_CALLVIRT_TAIL, "callvirt.tail", 4, 1, 1, InterpOpMethodHandle)
391+
OPDEF(INTOP_CALLVIRT_TAIL, "callvirt.tail", 5, 1, 1, InterpOpMethodHandle)
392392

393393
// The following helper call instructions exist in 2 variants, one for normal methods, and one for cases where a shared generic lookup is needed.
394394
// In the case where a shared generic lookup is needed an extra argument is passed as an svar, which is a pointer to the generic context.

src/coreclr/vm/contractimpl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,10 @@ struct DispatchToken
403403
LIMITED_METHOD_CONTRACT;
404404
return !(m_token == INVALID_TOKEN);
405405
}
406+
407+
//------------------------------------------------------------------------
408+
// Returns a hash of the token suitable for use in dispatch caches.
409+
UINT16 GetHash() const;
406410
}; // struct DispatchToken
407411

408412
// DispatchToken.m_token should be the only field of DispatchToken.

src/coreclr/vm/interpexec.cpp

Lines changed: 209 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,184 @@
1616
#include <limits>
1717
#include <functional>
1818

19+
struct InterpDispatchCacheEntry
20+
{
21+
// MethodTable of the calling object
22+
MethodTable* pMT;
23+
size_t dispatchToken;
24+
// Resolved target MethodDesc
25+
MethodDesc* pTargetMD;
26+
// Used for linking dead entries for cleanup during GC
27+
InterpDispatchCacheEntry* pNextDead;
28+
29+
InterpDispatchCacheEntry(MethodTable* pMT, size_t dispatchToken, MethodDesc* pTargetMD)
30+
{
31+
this->pMT = pMT;
32+
this->dispatchToken = dispatchToken;
33+
this->pTargetMD = pTargetMD;
34+
pNextDead = nullptr;
35+
}
36+
};
37+
38+
// Number of index bits in the dispatch cache (table has 2^12 = 4096 buckets).
#define DISPATCH_CACHE_BITS 12
// Number of buckets in the direct-mapped cache table.
#define DISPATCH_CACHE_SIZE (1 << DISPATCH_CACHE_BITS)
// Mask applied to hashes to produce a valid bucket index.
#define DISPATCH_CACHE_MASK (DISPATCH_CACHE_SIZE - 1)
41+
42+
// A simple hash table that caches virtual method dispatch results.
// Maps (DispatchToken, MethodTable*) to MethodDesc*.
// The table is direct-mapped: each bucket holds at most one entry, and a
// colliding Insert unconditionally replaces the previous entry. Displaced
// entries are kept on a dead list and freed at a GC sync point, when no
// reader can still hold a pointer to them.
struct InterpDispatchCache
{
    InterpDispatchCacheEntry* m_cache[DISPATCH_CACHE_SIZE];
    // List of dead entries to be freed at GC time
    InterpDispatchCacheEntry* m_pDeadList;

    // Returns the cached target method for (dispatchToken, pMT), or NULL on a miss.
    // Lock-free; safe to call concurrently with Insert.
    MethodDesc* Lookup(size_t dispatchToken, void* pMT, uint16_t dispatchTokenHash)
    {
        LIMITED_METHOD_CONTRACT;

        size_t idx = Hash(dispatchToken, pMT, dispatchTokenHash);

        InterpDispatchCacheEntry* pEntry = VolatileLoadWithoutBarrier(&m_cache[idx]);
        // Data dependency ensures field reads are ordered after loading of `pEntry`
        // The entry struct is immutable once created, so these reads are safe
        if (pEntry != nullptr && pEntry->pMT == pMT && pEntry->dispatchToken == dispatchToken)
            return pEntry->pTargetMD;

        return NULL;
    }

    // Publishes a new (dispatchToken, pMT) -> pTargetMD mapping, replacing
    // whatever entry previously occupied the bucket. Allocation failure is
    // tolerated silently: the cache is an optimization, a miss just means the
    // caller resolves the dispatch again.
    void Insert(size_t dispatchToken, MethodTable* pMT, MethodDesc* pTargetMD, uint16_t dispatchTokenHash)
    {
        LIMITED_METHOD_CONTRACT;

        size_t idx = Hash(dispatchToken, pMT, dispatchTokenHash);

        InterpDispatchCacheEntry* pNewEntry = new (nothrow) InterpDispatchCacheEntry(pMT, dispatchToken, pTargetMD);
        if (pNewEntry == nullptr)
            return;

        // The interlocked exchange publishes the entry with (at least) release
        // semantics, so its fields hold correct values once it is visible to
        // other threads. The exchange is unconditional, so we always own the
        // old entry for freeing.
        InterpDispatchCacheEntry* pOldEntry = InterlockedExchangeT(&m_cache[idx], pNewEntry);

        if (pOldEntry != nullptr)
            AddToDeadList(pOldEntry);
    }

    // Called during GC sync point to free dead entries
    // At this point, no other threads are running, so it is safe to free
    void ReclaimDeadEntries()
    {
        LIMITED_METHOD_CONTRACT;

        InterpDispatchCacheEntry* pDeadList = VolatileLoadWithoutBarrier(&m_pDeadList);

        while (pDeadList != nullptr)
        {
            InterpDispatchCacheEntry* pNext = pDeadList->pNextDead;
            delete pDeadList;
            pDeadList = pNext;
        }

        // Reset the head; no CAS needed because no mutator can race with us here.
        VolatileStoreWithoutBarrier(&m_pDeadList, (InterpDispatchCacheEntry*)nullptr);
    }

    // Add an entry to the dead list for later cleanup
    // Standard lock-free list push: link the entry to the current head, then
    // CAS the head; retry if another thread pushed concurrently.
    void AddToDeadList(InterpDispatchCacheEntry* pEntry)
    {
        LIMITED_METHOD_CONTRACT;

        InterpDispatchCacheEntry* pOldHead;
        do
        {
            pOldHead = VolatileLoadWithoutBarrier(&m_pDeadList);
            pEntry->pNextDead = pOldHead;
        }
        while (InterlockedCompareExchangeT(&m_pDeadList, pEntry, pOldHead) != pOldHead);
    }

    // Same as VSD DispatchCache's HashToken + HashMT
    // NOTE: dispatchToken itself is unused; only its precomputed hash
    // (dispatchTokenHash) participates, XORed with a hash of the MethodTable
    // pointer. Result always fits in DISPATCH_CACHE_BITS bits due to masking.
    static uint16_t Hash(size_t dispatchToken, void* pMT, uint16_t dispatchTokenHash)
    {
        LIMITED_METHOD_CONTRACT;

        size_t mtHash = (size_t)pMT;
        mtHash = (((mtHash >> DISPATCH_CACHE_BITS) + mtHash) >> LOG2_PTRSIZE) & DISPATCH_CACHE_MASK;

        uint16_t hash = (uint16_t)mtHash;
        hash ^= (dispatchTokenHash & DISPATCH_CACHE_MASK);

        return hash;
    }

    // Drops (and frees immediately) every entry whose caller MethodTable or
    // target method belongs to the given LoaderAllocator. Used when a
    // collectible assembly unloads, so the cache never dangles into freed
    // loader memory. Relies on the EE being suspended (see comment below).
    void ClearEntriesForLoaderAllocator(LoaderAllocator* pLoaderAllocator)
    {
        LIMITED_METHOD_CONTRACT;

        for (size_t i = 0; i < DISPATCH_CACHE_SIZE; i++)
        {
            InterpDispatchCacheEntry* pEntry = VolatileLoadWithoutBarrier(&m_cache[i]);
            if (pEntry == nullptr)
                continue;

            if (pEntry->pMT->GetLoaderAllocator() == pLoaderAllocator ||
                pEntry->pTargetMD->GetLoaderAllocator() == pLoaderAllocator)
            {
                VolatileStoreWithoutBarrier(&m_cache[i], (InterpDispatchCacheEntry*)nullptr);
                // Given the EE is suspended, we can free the entry without worrying about races
                delete pEntry;
            }
        }
    }
};
150+
151+
// Global interpreter dispatch cache instance (single process-wide table;
// zero-initialized as a static, so all buckets start empty).
static InterpDispatchCache g_InterpDispatchCache;
153+
154+
// Called during GC, when we are guaranteed no entry is being used by any thread.
// Frees cache entries displaced by Insert since the previous GC sync point.
void InterpDispatchCache_ReclaimAll()
{
    CONTRACTL
    {
        NOTHROW;
        GC_NOTRIGGER;
        MODE_COOPERATIVE;
        // Should only be called during a GC suspension
        PRECONDITION(Debug_IsLockedViaThreadSuspension());
    }
    CONTRACTL_END;

    g_InterpDispatchCache.ReclaimDeadEntries();
}
169+
170+
// Clear entries that reference types/methods from the given LoaderAllocator.
// Called during collectible assembly unload when the EE is suspended.
// Thin global wrapper so callers (e.g. loaderallocator.cpp) need not see the
// InterpDispatchCache type.
void InterpDispatchCache_ClearForLoaderAllocator(LoaderAllocator* pLoaderAllocator)
{
    g_InterpDispatchCache.ClearEntriesForLoaderAllocator(pLoaderAllocator);
}
176+
177+
// Builds the DispatchToken used as the dispatch-cache key for a virtual or
// interface call to pMD, encoded as a size_t.
static size_t CreateDispatchTokenForMethod(MethodDesc* pMD)
{
    WRAPPER_NO_CONTRACT;

    uint32_t slotNumber = pMD->GetSlot();

    if (!pMD->IsInterface())
    {
        // For non-interface virtual methods, use TYPE_ID_THIS_CLASS
        return DispatchToken::CreateDispatchToken(slotNumber).To_SIZE_T();
    }

    // For interface methods, the token also encodes the interface's TypeID
    MethodTable* pInterfaceMT = pMD->GetMethodTable();
    return DispatchToken::CreateDispatchToken(pInterfaceMT->GetTypeID(), slotNumber).To_SIZE_T();
}
196+
19197
#ifdef TARGET_WASM
20198
// Unused on WASM
21199
#define SAVE_THE_LOWEST_SP do {} while (0)
@@ -2686,15 +2864,38 @@ void InterpExecMethod(InterpreterFrame *pInterpreterFrame, InterpMethodContextFr
26862864
OBJECTREF *pThisArg = LOCAL_VAR_ADDR(callArgsOffset, OBJECTREF);
26872865
NULL_CHECK(*pThisArg);
26882866

2689-
// Interpreter-TODO
2690-
// This needs to be optimized, not operating at MethodDesc level, rather with ftnptr
2691-
// slots containing the interpreter IR pointer
2692-
targetMethod = CallWithSEHWrapper(
2693-
[&pMD, &pThisArg]() {
2694-
return pMD->GetMethodDescOfVirtualizedCode(pThisArg, pMD->GetMethodTable());
2695-
});
2867+
MethodTable *pObjMT = (*pThisArg)->GetMethodTable();
26962868

2697-
ip += 4;
2869+
// Interpreter-FIXME: It would be nice to have these caches initialized at compile time instead
2870+
// Obtain the cached dispatch token or initialize it
2871+
size_t dispatchToken = (size_t)VolatileLoadWithoutBarrier(&pMethod->pDataItems[ip[4]]);
2872+
if (dispatchToken == 0)
2873+
{
2874+
dispatchToken = CreateDispatchTokenForMethod(pMD);
2875+
VolatileStoreWithoutBarrier(&pMethod->pDataItems[ip[4]], (void*)dispatchToken);
2876+
}
2877+
// The token hash is cached in the data item immediately following the dispatchToken
2878+
size_t dispatchTokenHash = (size_t)VolatileLoadWithoutBarrier(&pMethod->pDataItems[ip[4] + 1]);
2879+
if (dispatchTokenHash == 0)
2880+
{
2881+
dispatchTokenHash = DispatchToken::From_SIZE_T(dispatchToken).GetHash();
2882+
VolatileStoreWithoutBarrier(&pMethod->pDataItems[ip[4] + 1], (void*)dispatchTokenHash);
2883+
}
2884+
2885+
// Try cache lookup first
2886+
targetMethod = g_InterpDispatchCache.Lookup(dispatchToken, pObjMT, (uint16_t)dispatchTokenHash);
2887+
2888+
if (targetMethod == NULL)
2889+
{
2890+
// miss, resolve the virtual method and cache it
2891+
targetMethod = CallWithSEHWrapper(
2892+
[&pMD, &pThisArg]() {
2893+
return pMD->GetMethodDescOfVirtualizedCode(pThisArg, pMD->GetMethodTable());
2894+
});
2895+
g_InterpDispatchCache.Insert(dispatchToken, pObjMT, targetMethod, (uint16_t)dispatchTokenHash);
2896+
}
2897+
2898+
ip += 5;
26982899
goto CALL_INTERP_METHOD;
26992900
}
27002901

@@ -2852,7 +3053,6 @@ void InterpExecMethod(InterpreterFrame *pInterpreterFrame, InterpMethodContextFr
28523053
[&targetMethod, &pThisArg]() {
28533054
return targetMethod->GetMethodDescOfVirtualizedCode(pThisArg, targetMethod->GetMethodTable());
28543055
});
2855-
28563056
}
28573057
else
28583058
{

src/coreclr/vm/interpexec.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,4 +125,7 @@ struct UnmanagedMethodWithTransitionParam
125125
PCODE callTarget;
126126
};
127127

128+
// Frees dispatch-cache entries displaced since the last GC.
// Must only be called at a GC sync point (see interpexec.cpp).
void InterpDispatchCache_ReclaimAll();
// Drops dispatch-cache entries that reference the given LoaderAllocator.
// Called during collectible assembly unload while the EE is suspended.
void InterpDispatchCache_ClearForLoaderAllocator(LoaderAllocator* pLoaderAllocator);
130+
128131
#endif // _INTERPEXEC_H_

src/coreclr/vm/loaderallocator.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313
#endif
1414
#include "comcallablewrapper.h"
1515

16+
#ifdef FEATURE_INTERPRETER
17+
#include "interpexec.h"
18+
#endif
19+
1620
//#define ENABLE_LOG_LOADER_ALLOCATOR_CLEANUP 1
1721

1822
#define STUBMANAGER_RANGELIST(stubManager) (stubManager::g_pManager->GetRangeList())
@@ -639,6 +643,10 @@ void LoaderAllocator::GCLoaderAllocators(LoaderAllocator* pOriginalLoaderAllocat
639643
ExecutionManager::Unload(pDomainLoaderAllocatorDestroyIterator);
640644
pDomainLoaderAllocatorDestroyIterator->UninitVirtualCallStubManager();
641645

646+
#ifdef FEATURE_INTERPRETER
647+
InterpDispatchCache_ClearForLoaderAllocator(pDomainLoaderAllocatorDestroyIterator);
648+
#endif
649+
642650
// TODO: Do we really want to perform this on each LoaderAllocator?
643651
MethodTable::ClearMethodDataCache();
644652

src/coreclr/vm/syncclean.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
#include "virtualcallstub.h"
99
#include "threadsuspend.h"
1010

11+
#ifdef FEATURE_INTERPRETER
12+
#include "interpexec.h"
13+
#endif
14+
1115
VolatilePtr<Bucket> SyncClean::m_HashMap = NULL;
1216
VolatilePtr<EEHashEntry*> SyncClean::m_EEHashTable;
1317

@@ -95,4 +99,9 @@ void SyncClean::CleanUp ()
9599

96100
// Give others we want to reclaim during the GC sync point a chance to do it
97101
VirtualCallStubManager::ReclaimAll();
102+
103+
#ifdef FEATURE_INTERPRETER
104+
// Reclaim dead interpreter dispatch cache entries
105+
InterpDispatchCache_ReclaimAll();
106+
#endif
98107
}

0 commit comments

Comments
 (0)