@@ -3050,6 +3050,133 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
30503050 }
30513051}
30523052
3053+ // ------------------------------------------------------------------------
3054+ // genCodeForMemmove: Perform an unrolled memmove. The idea that we can
3055+ // ignore the fact that src and dst might overlap if we save the whole
3056+ // src to temp regs in advance, e.g. for memmove(dst: x1, src: x0, len: 30):
3057+ //
3058+ // ldr q16, [x0]
3059+ // ldr q17, [x0, #0x0E]
3060+ // str q16, [x1]
3061+ // str q17, [x1, #0x0E]
3062+ //
3063+ // Arguments:
3064+ // tree - GenTreeBlk node
3065+ //
3066+ void CodeGen::genCodeForMemmove (GenTreeBlk* tree)
3067+ {
3068+ #ifdef TARGET_ARM64
3069+ // TODO-CQ: Support addressing modes, for now we don't use them
3070+ GenTreeIndir* srcIndir = tree->Data ()->AsIndir ();
3071+ assert (srcIndir->isContained () && !srcIndir->Addr ()->isContained ());
3072+
3073+ regNumber dst = genConsumeReg (tree->Addr ());
3074+ regNumber src = genConsumeReg (srcIndir->Addr ());
3075+ unsigned size = tree->Size ();
3076+
3077+ auto emitLoadStore = [&](bool load, unsigned regSize, regNumber tempReg, unsigned offset) {
3078+ var_types memType;
3079+ switch (regSize)
3080+ {
3081+ case 1 :
3082+ memType = TYP_UBYTE;
3083+ break ;
3084+ case 2 :
3085+ memType = TYP_USHORT;
3086+ break ;
3087+ case 4 :
3088+ memType = TYP_INT;
3089+ break ;
3090+ case 8 :
3091+ memType = TYP_LONG;
3092+ break ;
3093+ case 16 :
3094+ memType = TYP_SIMD16;
3095+ break ;
3096+ default :
3097+ unreached ();
3098+ }
3099+ if (load)
3100+ {
3101+ GetEmitter ()->emitIns_R_R_I (ins_Load (memType), emitTypeSize (memType), tempReg, src, offset);
3102+ }
3103+ else
3104+ {
3105+ GetEmitter ()->emitIns_R_R_I (ins_Store (memType), emitTypeSize (memType), tempReg, dst, offset);
3106+ }
3107+ };
3108+
3109+ // Eventually, we'll emit CPYP+CPYM+CPYE on armv9 for large sizes here.
3110+
3111+ // Let's not use stp/ldp here and rely on the underlying peephole optimizations to merge subsequent
3112+ // ldr/str pairs into stp/ldp, see https://github.com/dotnet/runtime/issues/64815
3113+ unsigned simdSize = FP_REGSIZE_BYTES;
3114+ if (size >= simdSize)
3115+ {
3116+ // Number of SIMD regs needed to save the whole src to regs.
3117+ const unsigned numberOfSimdRegs = tree->AvailableTempRegCount (RBM_ALLFLOAT);
3118+
3119+ // Pop all temp regs to a local array, currently, this impl is limited with LSRA's MaxInternalCount
3120+ regNumber tempRegs[LinearScan::MaxInternalCount] = {};
3121+ for (unsigned i = 0 ; i < numberOfSimdRegs; i++)
3122+ {
3123+ tempRegs[i] = tree->ExtractTempReg (RBM_ALLFLOAT);
3124+ }
3125+
3126+ auto emitSimdLoadStore = [&](bool load) {
3127+ unsigned offset = 0 ;
3128+ int regIndex = 0 ;
3129+ do
3130+ {
3131+ emitLoadStore (load, simdSize, tempRegs[regIndex++], offset);
3132+ offset += simdSize;
3133+ if (size == offset)
3134+ {
3135+ break ;
3136+ }
3137+ if ((size - offset) < simdSize)
3138+ {
3139+ // Overlap with the previously processed data. We'll always use SIMD for simplicity
3140+ // TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
3141+ offset = size - simdSize;
3142+ }
3143+ } while (true );
3144+ };
3145+
3146+ // load everything from SRC to temp regs
3147+ emitSimdLoadStore (/* load */ true );
3148+ // store them to DST
3149+ emitSimdLoadStore (/* load */ false );
3150+ }
3151+ else
3152+ {
3153+ // Here we work with size 1..15
3154+ assert ((size > 0 ) && (size < FP_REGSIZE_BYTES));
3155+
3156+ // Use overlapping loads/stores, e. g. for size == 9: "ldr x2, [x0]; ldr x3, [x0, #0x01]".
3157+ const unsigned loadStoreSize = 1 << BitOperations::Log2 (size);
3158+ if (loadStoreSize == size)
3159+ {
3160+ const regNumber tmpReg = tree->GetSingleTempReg (RBM_ALLINT);
3161+ emitLoadStore (/* load */ true , loadStoreSize, tmpReg, 0 );
3162+ emitLoadStore (/* load */ false , loadStoreSize, tmpReg, 0 );
3163+ }
3164+ else
3165+ {
3166+ assert (tree->AvailableTempRegCount () == 2 );
3167+ const regNumber tmpReg1 = tree->ExtractTempReg (RBM_ALLINT);
3168+ const regNumber tmpReg2 = tree->ExtractTempReg (RBM_ALLINT);
3169+ emitLoadStore (/* load */ true , loadStoreSize, tmpReg1, 0 );
3170+ emitLoadStore (/* load */ true , loadStoreSize, tmpReg2, size - loadStoreSize);
3171+ emitLoadStore (/* load */ false , loadStoreSize, tmpReg1, 0 );
3172+ emitLoadStore (/* load */ false , loadStoreSize, tmpReg2, size - loadStoreSize);
3173+ }
3174+ }
3175+ #else // TARGET_ARM64
3176+ unreached ();
3177+ #endif
3178+ }
3179+
//------------------------------------------------------------------------
// genCodeForInitBlkHelper - Generate code for an InitBlk node by means of the VM memcpy helper call
//
@@ -4370,13 +4497,22 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp)
43704497 break ;
43714498
43724499 case GenTreeBlk::BlkOpKindUnroll:
4500+ case GenTreeBlk::BlkOpKindUnrollMemmove:
43734501 if (isCopyBlk)
43744502 {
43754503 if (blkOp->gtBlkOpGcUnsafe )
43764504 {
43774505 GetEmitter ()->emitDisableGC ();
43784506 }
4379- genCodeForCpBlkUnroll (blkOp);
4507+ if (blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
4508+ {
4509+ genCodeForCpBlkUnroll (blkOp);
4510+ }
4511+ else
4512+ {
4513+ assert (blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnrollMemmove);
4514+ genCodeForMemmove (blkOp);
4515+ }
43804516 if (blkOp->gtBlkOpGcUnsafe )
43814517 {
43824518 GetEmitter ()->emitEnableGC ();
0 commit comments