Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 41 additions & 11 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1298,17 +1298,28 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)

if (simdSize == 32)
{
cmpIntrinsic = NI_AVX2_CompareEqual;
mskIntrinsic = NI_AVX2_MoveMask;
// With AVX2 we use testz(xor(v1, v2))
cmpIntrinsic = NI_AVX2_Xor;
mskIntrinsic = NI_AVX_TestZ;
cmpJitType = simdBaseJitType;
mskConstant = -1;
}
else
{
assert(simdSize == 16);

cmpIntrinsic = NI_SSE2_CompareEqual;
mskIntrinsic = NI_SSE2_MoveMask;
mskConstant = 0xFFFF;
mskConstant = 0xFFFF;
if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
{
// With SSE41 we use testz(xor(v1, v2))
cmpIntrinsic = NI_SSE2_Xor;
mskIntrinsic = NI_SSE41_TestZ;
}
else
{
cmpIntrinsic = NI_SSE2_CompareEqual;
mskIntrinsic = NI_SSE2_MoveMask;
}
}
break;
}
Expand All @@ -1320,28 +1331,30 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)

if (simdSize == 32)
{
cmpIntrinsic = NI_AVX2_CompareEqual;
// With AVX2 we use testz(xor(v1, v2))
cmpIntrinsic = NI_AVX2_Xor;
cmpJitType = simdBaseJitType;
mskIntrinsic = NI_AVX2_MoveMask;
mskIntrinsic = NI_AVX_TestZ;
mskConstant = -1;
}
else
{
assert(simdSize == 16);
mskConstant = 0xFFFF;

if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
{
cmpIntrinsic = NI_SSE41_CompareEqual;
// With SSE41 we use testz(xor(v1, v2))
mskIntrinsic = NI_SSE41_TestZ;
cmpIntrinsic = NI_SSE2_Xor;
cmpJitType = simdBaseJitType;
}
else
{
mskIntrinsic = NI_SSE2_MoveMask;
cmpIntrinsic = NI_SSE2_CompareEqual;
cmpJitType = CORINFO_TYPE_UINT;
}

mskIntrinsic = NI_SSE2_MoveMask;
mskConstant = 0xFFFF;
}
break;
}
Expand Down Expand Up @@ -1411,6 +1424,23 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)
BlockRange().InsertBefore(node, cmp);
LowerNode(cmp);

// TestZ(Xor(v1, v2)) is smaller than MoveMask(CompareEqual(v1, v2)) compared against a mask constant
if ((mskIntrinsic == NI_SSE41_TestZ) || (mskIntrinsic == NI_AVX_TestZ))
{
// Save cmp's result into a temp
node->Op(1) = cmp;
LIR::Use cmpUse(BlockRange(), &node->Op(1), node);
ReplaceWithLclVar(cmpUse);
GenTree* cmpClone = comp->gtClone(node->Op(1));
BlockRange().InsertAfter(node->Op(1), cmpClone);

// Emit vptest(cmp, cmpClone)
node->Op(2) = cmpClone;
node->ChangeHWIntrinsicId(mskIntrinsic);
LowerHWIntrinsicCC(node, mskIntrinsic == NI_SSE41_TestZ ? NI_SSE41_PTEST : NI_AVX_PTEST, cmpCnd);
return;
}

GenTree* msk = comp->gtNewSimdHWIntrinsicNode(TYP_INT, cmp, mskIntrinsic, mskJitType, simdSize);
BlockRange().InsertAfter(cmp, msk);
LowerNode(msk);
Expand Down
33 changes: 11 additions & 22 deletions src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1779,11 +1779,10 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
return true;

Vector:
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
if (Avx2.IsSupported && length >= (nuint)Vector256<byte>.Count)
if (Vector256.IsHardwareAccelerated && length >= (nuint)Vector256<byte>.Count)
{
Vector256<byte> vecResult;
nuint offset = 0;
nuint lengthToExamine = length - (nuint)Vector256<byte>.Count;
// lengthToExamine is unsigned, so an underflow would have wrapped to a value larger than length (rather than going negative); that should not have happened here
Expand All @@ -1792,8 +1791,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
{
do
{
vecResult = Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset));
if (Avx2.MoveMask(vecResult) != -1)
if (Vector256.LoadUnsafe(ref first, offset) !=
Vector256.LoadUnsafe(ref second, offset))
{
goto NotEqual;
}
Expand All @@ -1802,8 +1801,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
}

// Do final compare as Vector256<byte>.Count from end rather than start
vecResult = Avx2.CompareEqual(LoadVector256(ref first, lengthToExamine), LoadVector256(ref second, lengthToExamine));
if (Avx2.MoveMask(vecResult) == -1)
if (Vector256.LoadUnsafe(ref first, lengthToExamine) ==
Vector256.LoadUnsafe(ref second, lengthToExamine))
{
// C# compiler inverts this test, making the outer goto the conditional jmp.
goto Equal;
Expand All @@ -1814,7 +1813,6 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
}
else if (length >= (nuint)Vector128<byte>.Count)
{
Vector128<byte> vecResult;
nuint offset = 0;
nuint lengthToExamine = length - (nuint)Vector128<byte>.Count;
// lengthToExamine is unsigned, so an underflow would have wrapped to a value larger than length (rather than going negative); that should not have happened here
Expand All @@ -1823,10 +1821,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
{
do
{
// We use intrinsics directly as .Equals calls .AsByte() which doesn't inline at R2R time
// https://github.com/dotnet/runtime/issues/32714
vecResult = Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset));
if (Sse2.MoveMask(vecResult) != 0xFFFF)
if (Vector128.LoadUnsafe(ref first, offset) !=
Vector128.LoadUnsafe(ref second, offset))
{
goto NotEqual;
}
Expand All @@ -1835,8 +1831,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
}

// Do final compare as Vector128<byte>.Count from end rather than start
vecResult = Sse2.CompareEqual(LoadVector128(ref first, lengthToExamine), LoadVector128(ref second, lengthToExamine));
if (Sse2.MoveMask(vecResult) == 0xFFFF)
if (Vector128.LoadUnsafe(ref first, lengthToExamine) ==
Vector128.LoadUnsafe(ref second, lengthToExamine))
{
// C# compiler inverts this test, making the outer goto the conditional jmp.
goto Equal;
Expand All @@ -1846,13 +1842,6 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
goto NotEqual;
}
}
//else if (AdvSimd.Arm64.IsSupported)
//{
// // This API is not optimized with ARM64 intrinsics because there is not much performance win seen
// // when compared to the vectorized implementation below. In addition to comparing the bytes in chunks of
// // 16-bytes, the only check that is done is if there is a mismatch and if yes, return false. This check
// // done with Vector<T> will generate same code by JIT as that if used ARM64 intrinsic instead.
//}
else if (Vector.IsHardwareAccelerated && length >= (nuint)Vector<byte>.Count)
{
nuint offset = 0;
Expand Down Expand Up @@ -1883,7 +1872,7 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
}

#if TARGET_64BIT
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
Debug.Assert(length <= (nuint)sizeof(nuint) * 2);

Expand Down