dotnet · EgorBo · Mar 28, 2022 · Mar 27, 2022 · Mar 27, 2022 · Mar 27, 2022
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
@@ -1298,17 +1298,28 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)
 
             if (simdSize == 32)
             {
-                cmpIntrinsic = NI_AVX2_CompareEqual;
-                mskIntrinsic = NI_AVX2_MoveMask;
+                // With AVX2 we use testz(xor(v1, v2))
+                cmpIntrinsic = NI_AVX2_Xor;
+                mskIntrinsic = NI_AVX_TestZ;
+                cmpJitType   = simdBaseJitType;
                 mskConstant  = -1;
             }
             else
             {
                 assert(simdSize == 16);
 
-                cmpIntrinsic = NI_SSE2_CompareEqual;
-                mskIntrinsic = NI_SSE2_MoveMask;
-                mskConstant  = 0xFFFF;
+                mskConstant = 0xFFFF;
+                if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
+                {
+                    // With SSE41 we use testz(xor(v1, v2))
+                    cmpIntrinsic = NI_SSE2_Xor;
+                    mskIntrinsic = NI_SSE41_TestZ;
+                }
+                else
+                {
+                    cmpIntrinsic = NI_SSE2_CompareEqual;
+                    mskIntrinsic = NI_SSE2_MoveMask;
+                }
             }
             break;
         }
@@ -1320,28 +1331,30 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)
 
             if (simdSize == 32)
             {
-                cmpIntrinsic = NI_AVX2_CompareEqual;
+                // With AVX2 we use testz(xor(v1, v2))
+                cmpIntrinsic = NI_AVX2_Xor;
                 cmpJitType   = simdBaseJitType;
-                mskIntrinsic = NI_AVX2_MoveMask;
+                mskIntrinsic = NI_AVX_TestZ;
                 mskConstant  = -1;
             }
             else
             {
                 assert(simdSize == 16);
+                mskConstant = 0xFFFF;
 
                 if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
                 {
-                    cmpIntrinsic = NI_SSE41_CompareEqual;
+                    // With SSE41 we use testz(xor(v1, v2))
+                    mskIntrinsic = NI_SSE41_TestZ;
+                    cmpIntrinsic = NI_SSE2_Xor;
                     cmpJitType   = simdBaseJitType;
                 }
                 else
                 {
+                    mskIntrinsic = NI_SSE2_MoveMask;
                     cmpIntrinsic = NI_SSE2_CompareEqual;
                     cmpJitType   = CORINFO_TYPE_UINT;
                 }
-
-                mskIntrinsic = NI_SSE2_MoveMask;
-                mskConstant  = 0xFFFF;
             }
             break;
         }
@@ -1411,6 +1424,23 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)
     BlockRange().InsertBefore(node, cmp);
     LowerNode(cmp);
 
+    // TestZ(Xor(v1, v2)) is smaller than the CompareEqual + MoveMask sequence emitted below
+    if ((mskIntrinsic == NI_SSE41_TestZ) || (mskIntrinsic == NI_AVX_TestZ))
+    {
+        // Save cmp's result into a temp
+        node->Op(1) = cmp;
+        LIR::Use cmpUse(BlockRange(), &node->Op(1), node);
+        ReplaceWithLclVar(cmpUse);
+        GenTree* cmpClone = comp->gtClone(node->Op(1));
+        BlockRange().InsertAfter(node->Op(1), cmpClone);
+
+        // Emit (v)ptest(tmp, tmpClone): both operands read the temp that holds cmp's (Xor) result
+        node->Op(2) = cmpClone;
+        node->ChangeHWIntrinsicId(mskIntrinsic);
+        LowerHWIntrinsicCC(node, mskIntrinsic == NI_SSE41_TestZ ? NI_SSE41_PTEST : NI_AVX_PTEST, cmpCnd);
+        return;
+    }
+
     GenTree* msk = comp->gtNewSimdHWIntrinsicNode(TYP_INT, cmp, mskIntrinsic, mskJitType, simdSize);
     BlockRange().InsertAfter(cmp, msk);
     LowerNode(msk);

diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
@@ -1779,11 +1779,10 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
             return true;
 
         Vector:
-            if (Sse2.IsSupported)
+            if (Vector128.IsHardwareAccelerated)
             {
-                if (Avx2.IsSupported && length >= (nuint)Vector256<byte>.Count)
+                if (Vector256.IsHardwareAccelerated && length >= (nuint)Vector256<byte>.Count)
                 {
-                    Vector256<byte> vecResult;
                     nuint offset = 0;
                     nuint lengthToExamine = length - (nuint)Vector256<byte>.Count;
                     // Unsigned, so it shouldn't have overflowed larger than length (rather than negative)
@@ -1792,8 +1791,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
                     {
                         do
                         {
-                            vecResult = Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset));
-                            if (Avx2.MoveMask(vecResult) != -1)
+                            if (Vector256.LoadUnsafe(ref first, offset) !=
+                                Vector256.LoadUnsafe(ref second, offset))
                             {
                                 goto NotEqual;
                             }
@@ -1802,8 +1801,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
                     }
 
                     // Do final compare as Vector256<byte>.Count from end rather than start
-                    vecResult = Avx2.CompareEqual(LoadVector256(ref first, lengthToExamine), LoadVector256(ref second, lengthToExamine));
-                    if (Avx2.MoveMask(vecResult) == -1)
+                    if (Vector256.LoadUnsafe(ref first, lengthToExamine) ==
+                        Vector256.LoadUnsafe(ref second, lengthToExamine))
                     {
                         // C# compiler inverts this test, making the outer goto the conditional jmp.
                         goto Equal;
@@ -1814,7 +1813,6 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
                 }
                 else if (length >= (nuint)Vector128<byte>.Count)
                 {
-                    Vector128<byte> vecResult;
                     nuint offset = 0;
                     nuint lengthToExamine = length - (nuint)Vector128<byte>.Count;
                     // Unsigned, so it shouldn't have overflowed larger than length (rather than negative)
@@ -1823,10 +1821,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
                     {
                         do
                         {
-                            // We use instrincs directly as .Equals calls .AsByte() which doesn't inline at R2R time
-                            // https://github.com/dotnet/runtime/issues/32714
-                            vecResult = Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset));
-                            if (Sse2.MoveMask(vecResult) != 0xFFFF)
+                            if (Vector128.LoadUnsafe(ref first, offset) !=
+                                Vector128.LoadUnsafe(ref second, offset))
                             {
                                 goto NotEqual;
                             }
@@ -1835,8 +1831,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
                     }
 
                     // Do final compare as Vector128<byte>.Count from end rather than start
-                    vecResult = Sse2.CompareEqual(LoadVector128(ref first, lengthToExamine), LoadVector128(ref second, lengthToExamine));
-                    if (Sse2.MoveMask(vecResult) == 0xFFFF)
+                    if (Vector128.LoadUnsafe(ref first, lengthToExamine) ==
+                        Vector128.LoadUnsafe(ref second, lengthToExamine))
                     {
                         // C# compiler inverts this test, making the outer goto the conditional jmp.
                         goto Equal;
@@ -1846,13 +1842,6 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
                     goto NotEqual;
                 }
             }
-            //else if (AdvSimd.Arm64.IsSupported)
-            //{
-            //    // This API is not optimized with ARM64 intrinsics because there is not much performance win seen
-            //    // when compared to the vectorized implementation below. In addition to comparing the bytes in chunks of
-            //    // 16-bytes, the only check that is done is if there is a mismatch and if yes, return false. This check
-            //    // done with Vector<T> will generate same code by JIT as that if used ARM64 intrinsic instead.
-            //}
             else if (Vector.IsHardwareAccelerated && length >= (nuint)Vector<byte>.Count)
             {
                 nuint offset = 0;
@@ -1883,7 +1872,7 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
             }
 
 #if TARGET_64BIT
-            if (Sse2.IsSupported)
+            if (Vector128.IsHardwareAccelerated)
             {
                 Debug.Assert(length <= (nuint)sizeof(nuint) * 2);