From e37c3840564290b4ae5826475c465c090237e273 Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Mon, 22 Jun 2026 07:02:45 +0000 Subject: [PATCH 1/5] arm64: Optimize ARM64 compare mask ExtractMostSignificantBits consumers - See discussion at https://github.com/dotnet/runtime/pull/121981#discussion_r2564553235 --- src/coreclr/jit/rationalize.cpp | 565 ++++++++++++++++++ src/coreclr/jit/rationalize.h | 6 + .../ExtractMostSignificantBits.cs | 536 +++++++++++++++++ .../ExtractMostSignificantBits.csproj | 17 + 4 files changed, 1124 insertions(+) create mode 100644 src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs create mode 100644 src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.csproj diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp index fc4545a2cb46e8..be4eb0a119c8c2 100644 --- a/src/coreclr/jit/rationalize.cpp +++ b/src/coreclr/jit/rationalize.cpp @@ -1366,6 +1366,543 @@ bool Rationalizer::ShouldRewriteToNonMaskHWIntrinsic(GenTree* node) } #endif // TARGET_XARCH +#if defined(TARGET_ARM64) +//---------------------------------------------------------------------------------------------- +// IsHWIntrinsicCmpMask: Checks if the hwintrinsic produces a SIMD comparison mask +// +// Arguments: +// intrinsic - The hwintrinsic id +// +// Return Value: +// True if the hwintrinsic produces a mask where each SIMD element is either all-bits-set or zero. 
+//
+// Notes:
+//    Only the AdvSimd / AdvSimd.Arm64 Compare{Equal,GreaterThan,GreaterThanOrEqual,LessThan,
+//    LessThanOrEqual} intrinsics are recognized; every other id yields false.
+//
+static bool IsHWIntrinsicCmpMask(NamedIntrinsic intrinsic)
+{
+    switch (intrinsic)
+    {
+        case NI_AdvSimd_CompareEqual:
+        case NI_AdvSimd_CompareGreaterThan:
+        case NI_AdvSimd_CompareGreaterThanOrEqual:
+        case NI_AdvSimd_CompareLessThan:
+        case NI_AdvSimd_CompareLessThanOrEqual:
+        case NI_AdvSimd_Arm64_CompareEqual:
+        case NI_AdvSimd_Arm64_CompareGreaterThan:
+        case NI_AdvSimd_Arm64_CompareGreaterThanOrEqual:
+        case NI_AdvSimd_Arm64_CompareLessThan:
+        case NI_AdvSimd_Arm64_CompareLessThanOrEqual:
+        {
+            return true;
+        }
+
+        default:
+        {
+            return false;
+        }
+    }
+}
+
+//----------------------------------------------------------------------------------------------
+// NormalizeCmpMaskSimdBaseType: Normalize a SIMD comparison mask's base type to unsigned.
+//
+// Arguments:
+//    simdBaseType - The SIMD base type.
+//
+// Return Value:
+//    The normalized SIMD base type, or TYP_UNDEF if it is unsupported.
+//
+// Notes:
+//    Only 8-, 16- and 32-bit integer base types are supported. Signed and unsigned variants map to
+//    the same unsigned type; anything else (including 64-bit and floating-point base types)
+//    returns TYP_UNDEF.
+//
+static var_types NormalizeCmpMaskSimdBaseType(var_types simdBaseType)
+{
+    switch (simdBaseType)
+    {
+        case TYP_BYTE:
+        case TYP_UBYTE:
+        {
+            return TYP_UBYTE;
+        }
+
+        case TYP_SHORT:
+        case TYP_USHORT:
+        {
+            return TYP_USHORT;
+        }
+
+        case TYP_INT:
+        case TYP_UINT:
+        {
+            return TYP_UINT;
+        }
+
+        default:
+        {
+            return TYP_UNDEF;
+        }
+    }
+}
+
+//----------------------------------------------------------------------------------------------
+// IsHWIntrinsicCmpMaskExtractMsb: Checks if an ExtractMostSignificantBits node consumes a SIMD
+// comparison mask and has a base type the comparison-mask rewrites can handle.
+//
+// Arguments:
+//    node - The hwintrinsic node.
+//
+// Return Value:
+//    True if the node is a Vector64/Vector128 ExtractMostSignificantBits whose base type is accepted
+//    by NormalizeCmpMaskSimdBaseType and whose operand is a SIMD comparison mask; otherwise false.
+//
+// Notes:
+//    The generic ExtractMostSignificantBits expansion is skipped when this returns true and the
+//    parent is a PopCount/TrailingZeroCount, on the expectation that the parent rewrite will consume
+//    the node. Those rewrites give up on unsupported base types, so such nodes must be rejected
+//    here; otherwise the extract would be left without any expansion.
+//
+static bool IsHWIntrinsicCmpMaskExtractMsb(GenTreeHWIntrinsic* node)
+{
+    if ((node->GetHWIntrinsicId() != NI_Vector64_ExtractMostSignificantBits) &&
+        (node->GetHWIntrinsicId() != NI_Vector128_ExtractMostSignificantBits))
+    {
+        return false;
+    }
+
+    // Keep this in sync with the consumer rewrites: they bail out when the base type does not
+    // normalize, so the node must not be reported as rewritable in that case.
+    if (NormalizeCmpMaskSimdBaseType(node->GetSimdBaseType()) == TYP_UNDEF)
+    {
+        return false;
+    }
+
+    GenTree* op1 = node->Op(1);
+
+    return op1->OperIsHWIntrinsic() && IsHWIntrinsicCmpMask(op1->AsHWIntrinsic()->GetHWIntrinsicId());
+}
+
+//----------------------------------------------------------------------------------------------
+// IsPrimitivePopCount: Checks if a node is a primitive PopCount intrinsic.
+//
+// Arguments:
+//    node - The node to check.
+//
+// Return Value:
+//    True if the node is a primitive PopCount intrinsic.
+//
+static bool IsPrimitivePopCount(GenTree* node)
+{
+    return node->OperIs(GT_INTRINSIC) && (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_PopCount);
+}
+
+//----------------------------------------------------------------------------------------------
+// IsPrimitiveTrailingZeroCount: Checks if a node is a primitive TrailingZeroCount intrinsic.
+//
+// Arguments:
+//    node - The node to check.
+//
+// Return Value:
+//    True if the node is a primitive TrailingZeroCount intrinsic.
+//
+static bool IsPrimitiveTrailingZeroCount(GenTree* node)
+{
+    return node->OperIs(GT_INTRINSIC) && (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_TrailingZeroCount);
+}
+
+//----------------------------------------------------------------------------------------------
+// ReplaceHWIntrinsicCmpMaskExtractMsbUse: Replace a scalarized comparison mask extraction with
+// the specified replacement node.
+//
+// Arguments:
+//    use - A pointer to the node being replaced
+//    parents - A reference to tree walk data providing the context
+//    oldNode - The node being replaced
+//    replacement - The node that replaces *use
+//
+// Notes:
+//    When the stack holds a parent above oldNode, the parent's operand edge is updated through
+//    ReplaceOperand; otherwise *use is overwritten directly. In both cases the top of the parent
+//    stack is swapped from oldNode to replacement.
+//
+static void ReplaceHWIntrinsicCmpMaskExtractMsbUse(GenTree** use,
+                                                   Compiler::GenTreeStack& parents,
+                                                   GenTree* oldNode,
+                                                   GenTree* replacement)
+{
+    if (parents.Height() > 1)
+    {
+        parents.Top(1)->ReplaceOperand(use, replacement);
+    }
+    else
+    {
+        *use = replacement;
+    }
+
+    // Adjust the parent stack so its top entry is the replacement rather than the node it replaced.
+    assert(parents.Top() == oldNode);
+    (void)parents.Pop();
+    parents.Push(replacement);
+}
+
+//----------------------------------------------------------------------------------------------
+// ScalarizeHWIntrinsicCmpMaskReduction: Update an ExtractMostSignificantBits node so it scalarizes
+// a vector reduction result.
+//
+// Arguments:
+//    node - The ExtractMostSignificantBits node to update
+//    reduction - The vector reduction node
+//    simdBaseType - The SIMD base type of the reduction
+//    simdSize - The SIMD size of the original input
+//
+// Notes:
+//    The node is mutated in place: its type becomes genActualType(simdBaseType), its intrinsic id
+//    becomes a ToScalar intrinsic, its SIMD size is set to 8 and its first operand is replaced by
+//    the reduction.
+//
+static void ScalarizeHWIntrinsicCmpMaskReduction(GenTreeHWIntrinsic* node,
+                                                 GenTree* reduction,
+                                                 var_types simdBaseType,
+                                                 unsigned simdSize)
+{
+    // NOTE(review): the ToScalar flavor is chosen from the original input size, while the node's
+    // SIMD size is forced to 8 below and every caller in this change passes a TYP_SIMD8 reduction.
+    // Confirm that pairing NI_Vector128_ToScalar with a SIMD size of 8 is intended.
+    NamedIntrinsic intrinsic = (simdSize == 8) ? NI_Vector64_ToScalar : NI_Vector128_ToScalar;
+
+    node->gtType = genActualType(simdBaseType);
+    node->ChangeHWIntrinsicId(intrinsic);
+    node->SetSimdSize(8);
+    node->SetSimdBaseType(simdBaseType);
+    node->Op(1) = reduction;
+}
+
+//----------------------------------------------------------------------------------------------
+// RewriteHWIntrinsicCmpMaskExtractMsb:
+// Rewrites an ExtractMostSignificantBits operation when the input is known to be a SIMD comparison
+// mask and the result is only checked for zero.
+//
+// Matches:
+//    ExtractMostSignificantBits(cmpMask) == 0
+//    ExtractMostSignificantBits(cmpMask) != 0
+//
+// Replaces the ExtractMostSignificantBits with:
+//    MaxAcross(cmpMask)
+//
+// This computes whether any all-bits-set comparison element exists without materializing the full
+// bitmask. For Vector64 with 32-bit elements (two lanes), the reduction is implemented with
+// MaxPairwise; every other supported shape uses MaxAcross.
+//
+// Arguments:
+//    use - A pointer to the hwintrinsic node
+//    parents - A reference to tree walk data providing the context
+//
+// Return Value:
+//    True if the node was rewritten; otherwise false.
+//
+// Notes:
+//    Nothing is modified when false is returned. On success the node is turned into a ToScalar of
+//    the reduction, wrapped in a cast, and the cast replaces the node in its parent and on the
+//    parent stack.
+//
+bool Rationalizer::RewriteHWIntrinsicCmpMaskExtractMsb(GenTree** use, Compiler::GenTreeStack& parents)
+{
+    GenTreeHWIntrinsic* node = (*use)->AsHWIntrinsic();
+
+    var_types simdBaseType = node->GetSimdBaseType();
+    unsigned simdSize = node->GetSimdSize();
+
+    // Only 8-, 16- and 32-bit integer element types are handled; signedness is irrelevant for a mask.
+    simdBaseType = NormalizeCmpMaskSimdBaseType(simdBaseType);
+
+    if (simdBaseType == TYP_UNDEF)
+    {
+        return false;
+    }
+
+    // The pattern is recognized from the consumer, so a parent is required.
+    if (parents.Height() <= 1)
+    {
+        return false;
+    }
+
+    GenTree* parent = parents.Top(1);
+
+    if (!parent->OperIs(GT_EQ, GT_NE))
+    {
+        return false;
+    }
+
+    GenTree* parentOp1 = parent->gtGetOp1();
+    GenTree* parentOp2 = parent->gtGetOp2();
+
+    // The other operand of the compare must be the integral constant zero (in either position).
+    if (!(((parentOp1 == node) && parentOp2->IsIntegralConst(0)) ||
+          ((parentOp2 == node) && parentOp1->IsIntegralConst(0))))
+    {
+        return false;
+    }
+
+    GenTree* op1 = node->Op(1);
+
+    if (!IsHWIntrinsicCmpMaskExtractMsb(node))
+    {
+        return false;
+    }
+
+    // A comparison produces elements whose value is either all-bits-set or zero. When the
+    // ExtractMostSignificantBits result is only being compared against zero, use a horizontal max
+    // reduction to determine if any element was all-bits-set without materializing the full mask.
+
+    GenTree* tmp;
+
+    if ((simdSize == 8) && (simdBaseType == TYP_UINT))
+    {
+        // A Vector64 of 32-bit elements has only two lanes and AdvSimd does not provide a 2S
+        // MaxAcross form. Use a pairwise reduction with the same vector as both operands instead.
+
+        LIR::Use op1Use;
+        LIR::Use::MakeDummyUse(BlockRange(), op1, &op1Use);
+
+        // The pairwise form consumes op1 twice, so spill it to a temp before cloning the use.
+        op1Use.ReplaceWithLclVar(m_compiler);
+        op1 = op1Use.Def();
+
+        GenTree* op2 = m_compiler->gtClone(op1);
+        BlockRange().InsertAfter(op1, op2);
+
+        tmp = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, op2, NI_AdvSimd_MaxPairwise, simdBaseType, simdSize);
+        BlockRange().InsertAfter(op2, tmp);
+    }
+    else
+    {
+        tmp = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_MaxAcross, simdBaseType, simdSize);
+        BlockRange().InsertAfter(op1, tmp);
+    }
+
+    op1 = tmp;
+
+    // Reuse the ExtractMostSignificantBits node as the ToScalar that reads the reduction result.
+    ScalarizeHWIntrinsicCmpMaskReduction(node, op1, simdBaseType, simdSize);
+
+    // Wrap the scalarized node in a cast to TYP_INT with the unsigned flag set; the cast is what
+    // the parent compare consumes from now on.
+    GenTree* castNode = m_compiler->gtNewCastNode(TYP_INT, node, /* isUnsigned */ true, TYP_INT);
+    BlockRange().InsertAfter(node, castNode);
+
+    ReplaceHWIntrinsicCmpMaskExtractMsbUse(use, parents, node, castNode);
+
+    return true;
+}
+
+//----------------------------------------------------------------------------------------------
+// RewriteHWIntrinsicCmpMaskExtractMsbPopCount:
+// Rewrites PopCount(ExtractMostSignificantBits(...)) when the input is known to be a SIMD
+// comparison mask.
+//
+// Matches:
+//    PopCount(ExtractMostSignificantBits(cmpMask))
+//
+// Replaces it with:
+//    AddAcross(ShiftRightLogical(cmpMask, elementBits - 1))
+//
+// This converts each all-bits-set comparison element to 1, each zero element to 0, then horizontally
+// sums those per-element counts. For Vector64 with 32-bit elements (two lanes), the reduction is
+// implemented with AddPairwise; every other supported shape uses AddAcross.
+//
+// Arguments:
+//    use - A pointer to the intrinsic node
+//    parents - A reference to tree walk data providing the context
+//
+// Return Value:
+//    True if the node was rewritten; otherwise false.
+//
+// Notes:
+//    Nothing is modified when false is returned. On success the PopCount node is removed from the
+//    block and replaced, in its parent and on the parent stack, by a cast of the scalarized
+//    reduction.
+//
+bool Rationalizer::RewriteHWIntrinsicCmpMaskExtractMsbPopCount(GenTree** use, Compiler::GenTreeStack& parents)
+{
+    GenTreeIntrinsic* popCount = (*use)->AsIntrinsic();
+    assert(popCount->gtIntrinsicName == NI_PRIMITIVE_PopCount);
+
+    GenTree* extract = popCount->gtGetOp1();
+
+    if (!extract->OperIsHWIntrinsic())
+    {
+        return false;
+    }
+
+    GenTreeHWIntrinsic* extractNode = extract->AsHWIntrinsic();
+    var_types simdBaseType = NormalizeCmpMaskSimdBaseType(extractNode->GetSimdBaseType());
+
+    // Only 8-, 16- and 32-bit integer element types are handled.
+    if (simdBaseType == TYP_UNDEF)
+    {
+        return false;
+    }
+
+    if (!IsHWIntrinsicCmpMaskExtractMsb(extractNode))
+    {
+        return false;
+    }
+
+    unsigned simdSize = extractNode->GetSimdSize();
+    unsigned elementBitSize = genTypeSize(simdBaseType) * BITS_PER_BYTE;
+
+    // A comparison produces elements whose value is either all-bits-set or zero. For a Count-style
+    // consumer, normalize each element to one or zero and then horizontally sum the elements.
+
+    GenTree* op1 = extractNode->Op(1);
+
+    // Shifting right by (elementBits - 1) leaves just the top bit of each element in bit 0.
+    GenTree* shiftAmount = m_compiler->gtNewIconNode(elementBitSize - 1);
+    BlockRange().InsertAfter(op1, shiftAmount);
+
+    GenTree* shift = m_compiler->gtNewSimdHWIntrinsicNode(Compiler::getSIMDTypeForSize(simdSize), op1, shiftAmount,
+                                                          NI_AdvSimd_ShiftRightLogical, simdBaseType, simdSize);
+    BlockRange().InsertAfter(shiftAmount, shift);
+    op1 = shift;
+
+    GenTree* add;
+
+    if ((simdSize == 8) && (simdBaseType == TYP_UINT))
+    {
+        // A Vector64 of 32-bit elements has only two lanes and AdvSimd does not provide a 2S
+        // AddAcross form. Use a pairwise reduction with the same vector as both operands instead.
+
+        LIR::Use op1Use;
+        LIR::Use::MakeDummyUse(BlockRange(), op1, &op1Use);
+
+        // The pairwise form consumes op1 twice, so spill it to a temp before cloning the use.
+        op1Use.ReplaceWithLclVar(m_compiler);
+        op1 = op1Use.Def();
+
+        GenTree* op2 = m_compiler->gtClone(op1);
+        BlockRange().InsertAfter(op1, op2);
+
+        add = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, op2, NI_AdvSimd_AddPairwise, simdBaseType, simdSize);
+        BlockRange().InsertAfter(op2, add);
+    }
+    else
+    {
+        add = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseType, simdSize);
+        BlockRange().InsertAfter(op1, add);
+    }
+
+    op1 = add;
+
+    // Reuse the ExtractMostSignificantBits node as the ToScalar that reads the reduction result.
+    ScalarizeHWIntrinsicCmpMaskReduction(extractNode, op1, simdBaseType, simdSize);
+
+    GenTree* castNode = m_compiler->gtNewCastNode(TYP_INT, extractNode, /* isUnsigned */ true, TYP_INT);
+    BlockRange().InsertAfter(extractNode, castNode);
+
+    // The cast now produces the count, so the PopCount node itself is no longer needed.
+    BlockRange().Remove(popCount);
+
+    ReplaceHWIntrinsicCmpMaskExtractMsbUse(use, parents, popCount, castNode);
+
+    return true;
+}
+
+//----------------------------------------------------------------------------------------------
+// RewriteHWIntrinsicCmpMaskExtractMsbTrailingZeroCount:
+// Rewrites TrailingZeroCount(ExtractMostSignificantBits(...)) when the input is known to be a
+// SIMD comparison mask.
+//
+// Matches:
+//    TrailingZeroCount(ExtractMostSignificantBits(cmpMask))
+//
+// Replaces it with:
+//    MinAcross(BitwiseSelect(cmpMask, IndexVector, SentinelVector)) - 1
+//
+// IndexVector holds one-based element indexes and SentinelVector holds 33. The selected minimum is
+// therefore the first matching element index plus one, or 33 if no element matched. Subtracting one
+// preserves the zero-mask TrailingZeroCount result of 32. For Vector64 with 32-bit elements (two
+// lanes), the reduction is implemented with MinPairwise; every other supported shape uses MinAcross.
+//
+// Arguments:
+//    use - A pointer to the intrinsic node
+//    parents - A reference to tree walk data providing the context
+//
+// Return Value:
+//    True if the node was rewritten; otherwise false.
+//
+// Notes:
+//    Nothing is modified when false is returned. On success the TrailingZeroCount node is removed
+//    from the block and replaced, in its parent and on the parent stack, by the GT_SUB that
+//    subtracts one from the scalarized reduction.
+//
+bool Rationalizer::RewriteHWIntrinsicCmpMaskExtractMsbTrailingZeroCount(GenTree** use, Compiler::GenTreeStack& parents)
+{
+    GenTreeIntrinsic* tzcnt = (*use)->AsIntrinsic();
+    assert(tzcnt->gtIntrinsicName == NI_PRIMITIVE_TrailingZeroCount);
+
+    GenTree* extract = tzcnt->gtGetOp1();
+
+    if (!extract->OperIsHWIntrinsic())
+    {
+        return false;
+    }
+
+    GenTreeHWIntrinsic* extractNode = extract->AsHWIntrinsic();
+    var_types simdBaseType = NormalizeCmpMaskSimdBaseType(extractNode->GetSimdBaseType());
+
+    // Only 8-, 16- and 32-bit integer element types are handled.
+    if (simdBaseType == TYP_UNDEF)
+    {
+        return false;
+    }
+
+    if (!IsHWIntrinsicCmpMaskExtractMsb(extractNode))
+    {
+        return false;
+    }
+
+    unsigned simdSize = extractNode->GetSimdSize();
+    var_types simdType = Compiler::getSIMDTypeForSize(simdSize);
+
+    // A comparison produces elements whose value is either all-bits-set or zero. For an
+    // IndexOf-style consumer, select a 1-based element index when the comparison is true and a
+    // sentinel value of 33 when false. The horizontal min reduction then finds either the first
+    // matching index plus one or the sentinel. Subtracting one produces the trailing-zero-count
+    // result, including 32 for the zero mask case.
+
+    GenTree* op1 = extractNode->Op(1);
+
+    GenTreeVecCon* indexVec = m_compiler->gtNewVconNode(simdType);
+    GenTreeVecCon* otherVec = m_compiler->gtNewVconNode(simdType);
+
+    const unsigned elementSize = genTypeSize(simdBaseType);
+    const unsigned elementCount = simdSize / elementSize;
+
+    // Fill every lane with its 1-based index (indexVec) and with the sentinel (otherVec). The cast
+    // target matches the width of the lane being written.
+    for (unsigned index = 0; index < elementCount; index++)
+    {
+        switch (simdBaseType)
+        {
+            case TYP_UBYTE:
+            {
+                indexVec->gtSimdVal.u8[index] = static_cast<uint8_t>(index + 1);
+                otherVec->gtSimdVal.u8[index] = 33;
+                break;
+            }
+
+            case TYP_USHORT:
+            {
+                indexVec->gtSimdVal.u16[index] = static_cast<uint16_t>(index + 1);
+                otherVec->gtSimdVal.u16[index] = 33;
+                break;
+            }
+
+            case TYP_UINT:
+            {
+                indexVec->gtSimdVal.u32[index] = static_cast<uint32_t>(index + 1);
+                otherVec->gtSimdVal.u32[index] = 33;
+                break;
+            }
+
+            default:
+            {
+                // NormalizeCmpMaskSimdBaseType only returns the three types handled above.
+                unreached();
+            }
+        }
+    }
+
+    BlockRange().InsertAfter(op1, indexVec);
+    BlockRange().InsertAfter(indexVec, otherVec);
+
+    GenTree* select = m_compiler->gtNewSimdCndSelNode(simdType, op1, indexVec, otherVec, simdBaseType, simdSize);
+    BlockRange().InsertAfter(otherVec, select);
+    op1 = select;
+
+    GenTree* min;
+
+    if ((simdSize == 8) && (simdBaseType == TYP_UINT))
+    {
+        // A Vector64 of 32-bit elements has only two lanes and AdvSimd does not provide a 2S
+        // MinAcross form. Use a pairwise reduction with the same vector as both operands instead.
+
+        LIR::Use op1Use;
+        LIR::Use::MakeDummyUse(BlockRange(), op1, &op1Use);
+
+        // The pairwise form consumes op1 twice, so spill it to a temp before cloning the use.
+        op1Use.ReplaceWithLclVar(m_compiler);
+        op1 = op1Use.Def();
+
+        GenTree* op2 = m_compiler->gtClone(op1);
+        BlockRange().InsertAfter(op1, op2);
+
+        min = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, op2, NI_AdvSimd_MinPairwise, simdBaseType, simdSize);
+        BlockRange().InsertAfter(op2, min);
+    }
+    else
+    {
+        min = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_MinAcross, simdBaseType, simdSize);
+        BlockRange().InsertAfter(op1, min);
+    }
+
+    op1 = min;
+
+    // Reuse the ExtractMostSignificantBits node as the ToScalar that reads the reduction result.
+    ScalarizeHWIntrinsicCmpMaskReduction(extractNode, op1, simdBaseType, simdSize);
+
+    GenTree* castNode = m_compiler->gtNewCastNode(TYP_INT, extractNode, /* isUnsigned */ true, TYP_INT);
+    BlockRange().InsertAfter(extractNode, castNode);
+
+    // Convert the 1-based minimum (or the sentinel 33) back to a 0-based count (or 32).
+    GenTree* one = m_compiler->gtNewIconNode(1);
+    BlockRange().InsertAfter(castNode, one);
+
+    GenTree* result = m_compiler->gtNewOperNode(GT_SUB, TYP_INT, castNode, one);
+    BlockRange().InsertAfter(one, result);
+
+    // The subtraction now produces the answer, so the TrailingZeroCount node is no longer needed.
+    BlockRange().Remove(tzcnt);
+
+    ReplaceHWIntrinsicCmpMaskExtractMsbUse(use, parents, tzcnt, result);
+
+    return true;
+}
+#endif // TARGET_ARM64
+
 //----------------------------------------------------------------------------------------------
 // RewriteHWIntrinsicExtractMsb: Rewrites a hwintrinsic ExtractMostSignificantBytes operation
 //
@@ -1385,6 +1922,18 @@ void Rationalizer::RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTree
     GenTree* op1 = node->Op(1);
 
 #if defined(TARGET_ARM64)
+    if (RewriteHWIntrinsicCmpMaskExtractMsb(use, parents))
+    {
+        return;
+    }
+
+    if ((parents.Height() > 1) &&
+        (IsPrimitivePopCount(parents.Top(1)) || IsPrimitiveTrailingZeroCount(parents.Top(1))) &&
+        IsHWIntrinsicCmpMaskExtractMsb(node))
+    {
+        return;
+    }
+
     // ARM64 doesn't have a single instruction that performs the behavior so we'll emulate it instead.
     // To do this, we effectively perform the following steps:
     // 1. 
tmp = input & 0x80 ; and the input to clear all but the most significant bit @@ -1874,6 +2423,22 @@ Compiler::fgWalkResult Rationalizer::RewriteNode(GenTree** useEdge, Compiler::Ge case GT_INTRINSIC: // Non-target intrinsics should have already been rewritten back into user calls. assert(m_compiler->IsTargetIntrinsic(node->AsIntrinsic()->gtIntrinsicName)); +#if defined(TARGET_ARM64) && defined(FEATURE_HW_INTRINSICS) + if (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_PopCount) + { + if (RewriteHWIntrinsicCmpMaskExtractMsbPopCount(useEdge, parentStack)) + { + node = *useEdge; + } + } + else if (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_TrailingZeroCount) + { + if (RewriteHWIntrinsicCmpMaskExtractMsbTrailingZeroCount(useEdge, parentStack)) + { + node = *useEdge; + } + } +#endif // TARGET_ARM64 && FEATURE_HW_INTRINSICS break; #if defined(FEATURE_HW_INTRINSICS) diff --git a/src/coreclr/jit/rationalize.h b/src/coreclr/jit/rationalize.h index d0449be7c70a56..f7dce954d41df0 100644 --- a/src/coreclr/jit/rationalize.h +++ b/src/coreclr/jit/rationalize.h @@ -64,6 +64,12 @@ class Rationalizer final : public Phase bool ShouldRewriteToNonMaskHWIntrinsic(GenTree* node); #endif // TARGET_XARCH +#if defined(TARGET_ARM64) + bool RewriteHWIntrinsicCmpMaskExtractMsb(GenTree** use, Compiler::GenTreeStack& parents); + bool RewriteHWIntrinsicCmpMaskExtractMsbPopCount(GenTree** use, Compiler::GenTreeStack& parents); + bool RewriteHWIntrinsicCmpMaskExtractMsbTrailingZeroCount(GenTree** use, Compiler::GenTreeStack& parents); +#endif // TARGET_ARM64 + void RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTreeStack& parents); #endif // FEATURE_HW_INTRINSICS diff --git a/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs b/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs new file mode 100644 index 00000000000000..02fc5a11715cb3 --- /dev/null +++ b/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs @@ -0,0 +1,536 
@@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using Xunit; + +namespace TestExtractMostSignificantBits +{ + public class Program + { + [Fact] + public static int TestEntryPoint() + { + bool fail = false; + + Vector128 utf16Data = Vector128.Create( + (ushort)0x0000, (ushort)0x0800, (ushort)0x07FF, (ushort)0x8000, + (ushort)0xD7FF, (ushort)0xD800, (ushort)0x0001, (ushort)0xFFFF); + + if (LessThanUInt16Mask(utf16Data, 0x0800) != 0x45) + { + fail = true; + } + + if (GreaterThanOrEqualUInt16Mask(utf16Data, 0x0800) != 0xBA) + { + fail = true; + } + + if (!AnyLessThanUInt16(utf16Data, 0x0800)) + { + fail = true; + } + + if (NoneLessThanUInt16(Vector128.Create((ushort)0x0800), 0x0800) != true) + { + fail = true; + } + + if (CountGreaterThanOrEqualUInt16(utf16Data, 0x0800) != 5) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualUInt16(utf16Data, 0x0800) != 1) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualUInt16(Vector128.Zero, 0x0800) != 32) + { + fail = true; + } + + Vector128 intData = Vector128.Create(-2, 0, 7, 8); + + if (LessThanInt32Mask(intData, 7) != 0x3) + { + fail = true; + } + + if (!AnyLessThanInt32(intData, 7)) + { + fail = true; + } + + if (NoneLessThanInt32(Vector128.Create(7), 7) != true) + { + fail = true; + } + + if (CountLessThanInt32(intData, 7) != 2) + { + fail = true; + } + + if (IndexOfFirstLessThanInt32(intData, 7) != 0) + { + fail = true; + } + + if (IndexOfFirstLessThanInt32(Vector128.Create(7), 7) != 32) + { + fail = true; + } + + Vector128 byteData = Vector128.Create( + (byte)0x00, (byte)0x80, (byte)0x7F, (byte)0xFF, + (byte)0x01, (byte)0x81, (byte)0x40, (byte)0xC0, + (byte)0x02, (byte)0x82, (byte)0x20, (byte)0xA0, + (byte)0x04, (byte)0x84, (byte)0x10, (byte)0x90); + + if (GreaterThanOrEqualByteMask(byteData, 
0x80) != 0xAAAA) + { + fail = true; + } + + if (!AnyGreaterThanOrEqualByte(byteData, 0x80)) + { + fail = true; + } + + if (NoneGreaterThanOrEqualByte(Vector128.Zero, 0x80) != true) + { + fail = true; + } + + if (CountGreaterThanOrEqualByte(byteData, 0x80) != 8) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualByte(byteData, 0x80) != 1) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualByte(Vector128.Zero, 0x80) != 32) + { + fail = true; + } + + Vector64 utf16Data64 = Vector64.Create( + (ushort)0x0000, (ushort)0x0800, (ushort)0x07FF, (ushort)0xFFFF); + + if (LessThanUInt16Mask64(utf16Data64, 0x0800) != 0x5) + { + fail = true; + } + + if (!AnyLessThanUInt1664(utf16Data64, 0x0800)) + { + fail = true; + } + + if (NoneLessThanUInt1664(Vector64.Create((ushort)0x0800), 0x0800) != true) + { + fail = true; + } + + if (CountGreaterThanOrEqualUInt1664(utf16Data64, 0x0800) != 2) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualUInt1664(utf16Data64, 0x0800) != 1) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualUInt1664(Vector64.Zero, 0x0800) != 32) + { + fail = true; + } + + Vector64 intData64 = Vector64.Create(-2, 8); + + if (LessThanInt32Mask64(intData64, 7) != 0x1) + { + fail = true; + } + + if (!AnyLessThanInt3264(intData64, 7)) + { + fail = true; + } + + if (NoneLessThanInt3264(Vector64.Create(7), 7) != true) + { + fail = true; + } + + if (CountLessThanInt3264(intData64, 7) != 1) + { + fail = true; + } + + if (IndexOfFirstLessThanInt3264(intData64, 7) != 0) + { + fail = true; + } + + if (IndexOfFirstLessThanInt3264(Vector64.Create(7), 7) != 32) + { + fail = true; + } + + Vector64 byteData64 = Vector64.Create( + (byte)0x00, (byte)0x80, (byte)0x7F, (byte)0xFF, + (byte)0x01, (byte)0x81, (byte)0x40, (byte)0xC0); + + if (GreaterThanOrEqualByteMask64(byteData64, 0x80) != 0xAA) + { + fail = true; + } + + if (!AnyGreaterThanOrEqualByte64(byteData64, 0x80)) + { + fail = true; + } + + if (NoneGreaterThanOrEqualByte64(Vector64.Zero, 
0x80) != true) + { + fail = true; + } + + if (CountGreaterThanOrEqualByte64(byteData64, 0x80) != 4) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualByte64(byteData64, 0x80) != 1) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualByte64(Vector64.Zero, 0x80) != 32) + { + fail = true; + } + + return fail ? 101 : 100; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint LessThanUInt16Mask(Vector128 value, ushort limit) + { + // ARM64-FULL-LINE: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: and {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: addv {{h[0-9]+}}, {{v[0-9]+}}.8h + return Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint GreaterThanOrEqualUInt16Mask(Vector128 value, ushort limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: and {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: addv {{h[0-9]+}}, {{v[0-9]+}}.8h + return Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool AnyLessThanUInt16(Vector128 value, ushort limit) + { + // ARM64-FULL-LINE: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, ne + return Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits() != 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool NoneLessThanUInt16(Vector128 value, ushort limit) + { + // ARM64-FULL-LINE: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + // ARM64-FULL-LINE: cset 
{{[wx][0-9]+}}, eq + return Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits() == 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int CountGreaterThanOrEqualUInt16(Vector128 value, ushort limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15 + // ARM64-FULL-LINE: addv {{h[0-9]+}}, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + return BitOperations.PopCount(Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int IndexOfFirstGreaterThanOrEqualUInt16(Vector128 value, ushort limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + // ARM64-FULL-LINE: sub {{w[0-9]+}}, {{w[0-9]+}}, #1 + return BitOperations.TrailingZeroCount(Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint LessThanInt32Mask(Vector128 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: and {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: addv {{s[0-9]+}}, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: smov {{x[0-9]+}}, {{v[0-9]+}}.s[0] + return Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool AnyLessThanInt32(Vector128 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + 
// ARM64-FULL-LINE: cset {{[wx][0-9]+}}, ne + return Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits() != 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool NoneLessThanInt32(Vector128 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, eq + return Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits() == 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int CountLessThanInt32(Vector128 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 + // ARM64-FULL-LINE: addv {{s[0-9]+}}, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + return BitOperations.PopCount(Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int IndexOfFirstLessThanInt32(Vector128 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + // ARM64-FULL-LINE: sub {{w[0-9]+}}, {{w[0-9]+}}, #1 + return BitOperations.TrailingZeroCount(Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint GreaterThanOrEqualByteMask(Vector128 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: addv {{h[0-9]+}}, {{v[0-9]+}}.8h + // 
ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + return Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool AnyGreaterThanOrEqualByte(Vector128 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, ne + return Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits() != 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool NoneGreaterThanOrEqualByte(Vector128 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, eq + return Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits() == 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int CountGreaterThanOrEqualByte(Vector128 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #7 + // ARM64-FULL-LINE: addv {{b[0-9]+}}, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + return BitOperations.PopCount(Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int IndexOfFirstGreaterThanOrEqualByte(Vector128 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umov 
{{w[0-9]+}}, {{v[0-9]+}}.b[0] + // ARM64-FULL-LINE: sub {{w[0-9]+}}, {{w[0-9]+}}, #1 + return BitOperations.TrailingZeroCount(Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint LessThanUInt16Mask64(Vector64 value, ushort limit) + { + // ARM64-FULL-LINE: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: and {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + return Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool AnyLessThanUInt1664(Vector64 value, ushort limit) + { + // ARM64-FULL-LINE: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: umaxv {{h[0-9]+}}, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, ne + return Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits() != 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool NoneLessThanUInt1664(Vector64 value, ushort limit) + { + // ARM64-FULL-LINE: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: umaxv {{h[0-9]+}}, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, eq + return Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits() == 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int CountGreaterThanOrEqualUInt1664(Vector64 value, ushort limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15 + // ARM64-FULL-LINE: addv {{h[0-9]+}}, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + return BitOperations.PopCount(Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits()); + 
} + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int IndexOfFirstGreaterThanOrEqualUInt1664(Vector64 value, ushort limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: uminv {{h[0-9]+}}, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + // ARM64-FULL-LINE: sub {{w[0-9]+}}, {{w[0-9]+}}, #1 + return BitOperations.TrailingZeroCount(Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint LessThanInt32Mask64(Vector64 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: and {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + return Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool AnyLessThanInt3264(Vector64 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, ne + return Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits() != 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool NoneLessThanInt3264(Vector64 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, eq + return Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits() == 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int CountLessThanInt3264(Vector64 value, int 
limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 + // ARM64-FULL-LINE: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + return BitOperations.PopCount(Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int IndexOfFirstLessThanInt3264(Vector64 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + // ARM64-FULL-LINE: sub {{w[0-9]+}}, {{w[0-9]+}}, #1 + return BitOperations.TrailingZeroCount(Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint GreaterThanOrEqualByteMask64(Vector64 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + return Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool AnyGreaterThanOrEqualByte64(Vector64 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: umaxv {{b[0-9]+}}, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, ne + return Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits() != 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool NoneGreaterThanOrEqualByte64(Vector64 value, byte limit) + { + // ARM64-FULL-LINE: cmhs 
{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: umaxv {{b[0-9]+}}, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, eq + return Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits() == 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int CountGreaterThanOrEqualByte64(Vector64 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #7 + // ARM64-FULL-LINE: addv {{b[0-9]+}}, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + return BitOperations.PopCount(Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int IndexOfFirstGreaterThanOrEqualByte64(Vector64 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: uminv {{b[0-9]+}}, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + // ARM64-FULL-LINE: sub {{w[0-9]+}}, {{w[0-9]+}}, #1 + return BitOperations.TrailingZeroCount(Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits()); + } + } +} diff --git a/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.csproj b/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.csproj new file mode 100644 index 00000000000000..46f205a5ec8387 --- /dev/null +++ b/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.csproj @@ -0,0 +1,17 @@ + + + + true + + + None + True + + + + true + + + + + From ab8c02706415642673421524fef12abecb91b4c6 Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Thu, 25 Jun 2026 12:43:09 +0000 Subject: [PATCH 2/5] JIT: Track zero/all-bits SIMD masks for EMSB rewrites 
Teach assertion propagation on ARM64 to recognize Vector64/128 ExtractMostSignificantBits inputs whose value numbers represent per-element boolean masks. The helper recognizes comparison masks, all-zero/all-bits constants, boolean-preserving operations, and reaching PHI values, and marks the EMSB node with a HW intrinsic flag. Consume the flag in rationalization so existing ExtractMostSignificantBits rewrites can handle mask values that have flowed through locals, while keeping unsupported element types filtered out. Add coverage for a comparison mask stored in a local before PopCount(ExtractMostSignificantBits()). --- src/coreclr/jit/assertionprop.cpp | 180 ++++++++++++++++++ src/coreclr/jit/gentree.h | 1 + src/coreclr/jit/rationalize.cpp | 10 + .../ExtractMostSignificantBits.cs | 16 ++ 4 files changed, 207 insertions(+) diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp index ca8688793d0f6e..183ce2fa3475f9 100644 --- a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -78,6 +78,180 @@ static Range GetRange(Compiler* comp, GenTree* tree, BasicBlock* block, ASSERT_V return Limit(Limit::keUnknown); } +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_ARM64) +//---------------------------------------------------------------------------------------------- +// IsHWIntrinsicCmpMask: Checks if the hwintrinsic produces a SIMD comparison mask. +// +// Arguments: +// intrinsic - The hwintrinsic id +// +// Return Value: +// True if the hwintrinsic produces a mask where each SIMD element is either all-bits-set or zero. 
+// +static bool IsHWIntrinsicCmpMask(NamedIntrinsic intrinsic) +{ + switch (intrinsic) + { + case NI_AdvSimd_CompareEqual: + case NI_AdvSimd_CompareGreaterThan: + case NI_AdvSimd_CompareGreaterThanOrEqual: + case NI_AdvSimd_CompareLessThan: + case NI_AdvSimd_CompareLessThanOrEqual: + case NI_AdvSimd_Arm64_CompareEqual: + case NI_AdvSimd_Arm64_CompareGreaterThan: + case NI_AdvSimd_Arm64_CompareGreaterThanOrEqual: + case NI_AdvSimd_Arm64_CompareLessThan: + case NI_AdvSimd_Arm64_CompareLessThanOrEqual: + { + return true; + } + + default: + { + return false; + } + } +} + +//---------------------------------------------------------------------------------------------- +// AllComponentsEitherZeroOrAllBitsSet: Check if a SIMD VN has per-element boolean values. +// +// Arguments: +// comp - The compiler instance +// vn - The value number +// baseType - The expected SIMD element base type +// +// Return Value: +// True if every SIMD element is known to be either all-bits-set or zero. +// +static bool AllComponentsEitherZeroOrAllBitsSet(Compiler* comp, ValueNum vn, var_types baseType) +{ + if (vn == ValueNumStore::NoVN) + { + return false; + } + + vn = comp->vnStore->VNNormalValue(vn); + + if (comp->vnStore->IsVNConstant(vn)) + { + switch (comp->vnStore->TypeOfVN(vn)) + { + case TYP_SIMD8: + { + simd8_t val = comp->vnStore->GetConstantSimd8(vn); + return val.IsAllBitsSet() || val.IsZero(); + } + + case TYP_SIMD16: + { + simd16_t val = comp->vnStore->GetConstantSimd16(vn); + return val.IsAllBitsSet() || val.IsZero(); + } + + default: + { + return false; + } + } + } + + VNFuncApp funcApp; + NamedIntrinsic intrinsicId; + unsigned simdSize; + var_types intrinsicSimdBaseType; + + if (!comp->vnStore->IsVNHWIntrinsicFunc(vn, &funcApp, &intrinsicId, &simdSize, &intrinsicSimdBaseType)) + { + return false; + } + + if ((simdSize != 8) && (simdSize != 16)) + { + return false; + } + + bool isScalar = false; + genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, 
baseType, &isScalar); + + if (isScalar) + { + return false; + } + + switch (oper) + { + case GT_EQ: + case GT_NE: + case GT_GT: + case GT_GE: + case GT_LE: + case GT_LT: + { + return varTypeIsIntegral(baseType) && + (genTypeSize(baseType) <= genTypeSize(intrinsicSimdBaseType)); + } + + case GT_NOT: + { + return AllComponentsEitherZeroOrAllBitsSet(comp, funcApp.GetArg(0), baseType); + } + + case GT_OR: + case GT_AND: + case GT_XOR: + case GT_AND_NOT: + { + return AllComponentsEitherZeroOrAllBitsSet(comp, funcApp.GetArg(0), baseType) && + AllComponentsEitherZeroOrAllBitsSet(comp, funcApp.GetArg(1), baseType); + } + + default: + { + return false; + } + } +} + +//---------------------------------------------------------------------------------------------- +// optAssertionProp_HWIntrinsic: Propagate VN-derived facts to hwintrinsic tree flags. +// +// Arguments: +// tree - The hwintrinsic node +// +static void optAssertionProp_HWIntrinsic(Compiler* comp, GenTreeHWIntrinsic* tree) +{ + NamedIntrinsic intrinsic = tree->GetHWIntrinsicId(); + + if ((intrinsic != NI_Vector64_ExtractMostSignificantBits) && + (intrinsic != NI_Vector128_ExtractMostSignificantBits)) + { + return; + } + + assert(tree->GetOperandCount() == 1); + + GenTree* op1 = tree->Op(1); + + if (op1->OperIsHWIntrinsic() && !IsHWIntrinsicCmpMask(op1->AsHWIntrinsic()->GetHWIntrinsicId())) + { + return; + } + + ValueNum op1VN = comp->vnStore->VNConservativeNormalValue(op1->gtVNPair); + + auto vnVisitor = [comp, tree](ValueNum vn) -> ValueNumStore::VNVisit { + return AllComponentsEitherZeroOrAllBitsSet(comp, vn, tree->GetSimdBaseType()) ? 
ValueNumStore::VNVisit::Continue + : ValueNumStore::VNVisit::Abort; + }; + + if (comp->vnStore->VNVisitReachingVNs(op1VN, vnVisitor) == ValueNumStore::VNVisit::Continue) + { + tree->gtFlags |= GTF_HW_ZERO_OR_ALL_BITS_SET; + } +} +#endif // FEATURE_HW_INTRINSICS && TARGET_ARM64 + //------------------------------------------------------------------------ // SymbolicToRealValue: Convert a symbolic value to a 64-bit signed integer. // @@ -5867,6 +6041,12 @@ GenTree* Compiler::optAssertionProp(ASSERT_VALARG_TP assertions, GenTree* tree, case GT_CALL: return optAssertionProp_Call(assertions, tree->AsCall(), stmt); +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_ARM64) + case GT_HWINTRINSIC: + optAssertionProp_HWIntrinsic(this, tree->AsHWIntrinsic()); + return nullptr; +#endif // FEATURE_HW_INTRINSICS && TARGET_ARM64 + case GT_EQ: case GT_NE: case GT_LT: diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index b66874618c106b..2780abac2ab423 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -541,6 +541,7 @@ enum GenTreeFlags : unsigned #ifdef FEATURE_HW_INTRINSICS GTF_HW_EM_OP = 0x10000000, // GT_HWINTRINSIC -- node is used as an operand to an embedded mask GTF_HW_USER_CALL = 0x20000000, // GT_HWINTRINSIC -- node is implemented via a user call + GTF_HW_ZERO_OR_ALL_BITS_SET = 0x40000000, // GT_HWINTRINSIC -- each SIMD element is either zero or all-bits-set #endif // FEATURE_HW_INTRINSICS }; diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp index be4eb0a119c8c2..306f2f02cfe79d 100644 --- a/src/coreclr/jit/rationalize.cpp +++ b/src/coreclr/jit/rationalize.cpp @@ -1457,8 +1457,18 @@ static bool IsHWIntrinsicCmpMaskExtractMsb(GenTreeHWIntrinsic* node) return false; } + if (NormalizeCmpMaskSimdBaseType(node->GetSimdBaseType()) == TYP_UNDEF) + { + return false; + } + GenTree* op1 = node->Op(1); + if (((node->gtFlags & GTF_HW_ZERO_OR_ALL_BITS_SET) != 0) && op1->OperIs(GT_LCL_VAR, GT_LCL_FLD)) + { + return 
true; + } + return op1->OperIsHWIntrinsic() && IsHWIntrinsicCmpMask(op1->AsHWIntrinsic()->GetHWIntrinsicId()); } diff --git a/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs b/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs index 02fc5a11715cb3..d7785b2a925edc 100644 --- a/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs +++ b/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs @@ -113,6 +113,11 @@ public static int TestEntryPoint() fail = true; } + if (CountGreaterThanOrEqualByteViaLocal(byteData, 0x80) != 8) + { + fail = true; + } + if (IndexOfFirstGreaterThanOrEqualByte(byteData, 0x80) != 1) { fail = true; @@ -375,6 +380,17 @@ private static int CountGreaterThanOrEqualByte(Vector128 value, byte limit return BitOperations.PopCount(Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits()); } + [MethodImpl(MethodImplOptions.NoInlining)] + private static int CountGreaterThanOrEqualByteViaLocal(Vector128 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #7 + // ARM64-FULL-LINE: addv {{b[0-9]+}}, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + Vector128 mask = Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)); + return BitOperations.PopCount(mask.ExtractMostSignificantBits()); + } + [MethodImpl(MethodImplOptions.NoInlining)] private static int IndexOfFirstGreaterThanOrEqualByte(Vector128 value, byte limit) { From f6c2d69f7492db1eafed95202f1c37a42c8a13cf Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Thu, 25 Jun 2026 13:24:36 +0000 Subject: [PATCH 3/5] Fix formatting --- src/coreclr/jit/assertionprop.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp index 183ce2fa3475f9..72e30989fc349c 100644 --- 
a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -156,7 +156,7 @@ static bool AllComponentsEitherZeroOrAllBitsSet(Compiler* comp, ValueNum vn, var } } - VNFuncApp funcApp; + VNFuncApp funcApp; NamedIntrinsic intrinsicId; unsigned simdSize; var_types intrinsicSimdBaseType; @@ -188,8 +188,7 @@ static bool AllComponentsEitherZeroOrAllBitsSet(Compiler* comp, ValueNum vn, var case GT_LE: case GT_LT: { - return varTypeIsIntegral(baseType) && - (genTypeSize(baseType) <= genTypeSize(intrinsicSimdBaseType)); + return varTypeIsIntegral(baseType) && (genTypeSize(baseType) <= genTypeSize(intrinsicSimdBaseType)); } case GT_NOT: @@ -223,8 +222,7 @@ static void optAssertionProp_HWIntrinsic(Compiler* comp, GenTreeHWIntrinsic* tre { NamedIntrinsic intrinsic = tree->GetHWIntrinsicId(); - if ((intrinsic != NI_Vector64_ExtractMostSignificantBits) && - (intrinsic != NI_Vector128_ExtractMostSignificantBits)) + if ((intrinsic != NI_Vector64_ExtractMostSignificantBits) && (intrinsic != NI_Vector128_ExtractMostSignificantBits)) { return; } From 9753a9068fdf09d29d2eaa34c5e66bd0f226f7c6 Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Fri, 26 Jun 2026 09:14:30 +0000 Subject: [PATCH 4/5] Address compare mask PR feedback - Share the ARM64 compare-mask intrinsic helper between assertion propagation and rationalization. - Document the widened unsigned compare-mask base type check and missing assertion-prop compiler argument. - Include the zero-or-all-bits-set hardware intrinsic flag in GenTree comparison. - Reuse the shuffle index-type helper when normalizing compare-mask base types. - Extend the compare-mask ExtractMostSignificantBits zero-count rewrite to cover LeadingZeroCount. - Add ARM64 ExtractMostSignificantBits LeadingZeroCount regression coverage. 
Change-Id: I074f573d5fabfe8b99ec6679267fe485f2209fd7 --- src/coreclr/jit/assertionprop.cpp | 39 +---- src/coreclr/jit/compiler.h | 36 ++++ src/coreclr/jit/gentree.cpp | 5 + src/coreclr/jit/rationalize.cpp | 154 +++++++++--------- src/coreclr/jit/rationalize.h | 2 +- .../ExtractMostSignificantBits.cs | 40 +++++ 6 files changed, 160 insertions(+), 116 deletions(-) diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp index 72e30989fc349c..3d24401a4a5d36 100644 --- a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -79,40 +79,6 @@ static Range GetRange(Compiler* comp, GenTree* tree, BasicBlock* block, ASSERT_V } #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_ARM64) -//---------------------------------------------------------------------------------------------- -// IsHWIntrinsicCmpMask: Checks if the hwintrinsic produces a SIMD comparison mask. -// -// Arguments: -// intrinsic - The hwintrinsic id -// -// Return Value: -// True if the hwintrinsic produces a mask where each SIMD element is either all-bits-set or zero. -// -static bool IsHWIntrinsicCmpMask(NamedIntrinsic intrinsic) -{ - switch (intrinsic) - { - case NI_AdvSimd_CompareEqual: - case NI_AdvSimd_CompareGreaterThan: - case NI_AdvSimd_CompareGreaterThanOrEqual: - case NI_AdvSimd_CompareLessThan: - case NI_AdvSimd_CompareLessThanOrEqual: - case NI_AdvSimd_Arm64_CompareEqual: - case NI_AdvSimd_Arm64_CompareGreaterThan: - case NI_AdvSimd_Arm64_CompareGreaterThanOrEqual: - case NI_AdvSimd_Arm64_CompareLessThan: - case NI_AdvSimd_Arm64_CompareLessThanOrEqual: - { - return true; - } - - default: - { - return false; - } - } -} - //---------------------------------------------------------------------------------------------- // AllComponentsEitherZeroOrAllBitsSet: Check if a SIMD VN has per-element boolean values. 
// @@ -188,6 +154,8 @@ static bool AllComponentsEitherZeroOrAllBitsSet(Compiler* comp, ValueNum vn, var case GT_LE: case GT_LT: { + // The compare may have used another base type of the same size or wider (e.g. a TYP_BYTE compare + // implemented as TYP_UBYTE); each baseType-sized element is still either all-bits-set or zero. return varTypeIsIntegral(baseType) && (genTypeSize(baseType) <= genTypeSize(intrinsicSimdBaseType)); } @@ -216,6 +184,7 @@ static bool AllComponentsEitherZeroOrAllBitsSet(Compiler* comp, ValueNum vn, var // optAssertionProp_HWIntrinsic: Propagate VN-derived facts to hwintrinsic tree flags. // // Arguments: +// comp - The compiler instance // tree - The hwintrinsic node // static void optAssertionProp_HWIntrinsic(Compiler* comp, GenTreeHWIntrinsic* tree) @@ -231,7 +200,7 @@ static void optAssertionProp_HWIntrinsic(Compiler* comp, GenTreeHWIntrinsic* tre GenTree* op1 = tree->Op(1); - if (op1->OperIsHWIntrinsic() && !IsHWIntrinsicCmpMask(op1->AsHWIntrinsic()->GetHWIntrinsicId())) + if (op1->OperIsHWIntrinsic() && !Compiler::IsHWIntrinsicCmpMask(op1->AsHWIntrinsic()->GetHWIntrinsicId())) { return; } diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 968a976d2846be..e690000b248631 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -10481,6 +10481,42 @@ class Compiler } } +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_ARM64) + //---------------------------------------------------------------------------------------------- + // IsHWIntrinsicCmpMask: Checks if the hwintrinsic produces a SIMD comparison mask. + // + // Arguments: + // intrinsic - The hwintrinsic id + // + // Return Value: + // True if the hwintrinsic produces a mask where each SIMD element is either all-bits-set or zero. 
+ // + static bool IsHWIntrinsicCmpMask(NamedIntrinsic intrinsic) + { + switch (intrinsic) + { + case NI_AdvSimd_CompareEqual: + case NI_AdvSimd_CompareGreaterThan: + case NI_AdvSimd_CompareGreaterThanOrEqual: + case NI_AdvSimd_CompareLessThan: + case NI_AdvSimd_CompareLessThanOrEqual: + case NI_AdvSimd_Arm64_CompareEqual: + case NI_AdvSimd_Arm64_CompareGreaterThan: + case NI_AdvSimd_Arm64_CompareGreaterThanOrEqual: + case NI_AdvSimd_Arm64_CompareLessThan: + case NI_AdvSimd_Arm64_CompareLessThanOrEqual: + { + return true; + } + + default: + { + return false; + } + } + } +#endif // FEATURE_HW_INTRINSICS && TARGET_ARM64 + private: unsigned getSIMDInitTempVarNum(var_types simdType); diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index f3889f960de4a2..e6195bf5785382 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -3001,6 +3001,11 @@ bool GenTree::Compare(GenTree* op1, GenTree* op2, bool swapOK) #ifdef FEATURE_HW_INTRINSICS case GT_HWINTRINSIC: + if ((op1->gtFlags & GTF_HW_ZERO_OR_ALL_BITS_SET) != (op2->gtFlags & GTF_HW_ZERO_OR_ALL_BITS_SET)) + { + return false; + } + return GenTreeHWIntrinsic::Equals(op1->AsHWIntrinsic(), op2->AsHWIntrinsic()); #endif diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp index 306f2f02cfe79d..91c4f22a6662e7 100644 --- a/src/coreclr/jit/rationalize.cpp +++ b/src/coreclr/jit/rationalize.cpp @@ -1367,40 +1367,6 @@ bool Rationalizer::ShouldRewriteToNonMaskHWIntrinsic(GenTree* node) #endif // TARGET_XARCH #if defined(TARGET_ARM64) -//---------------------------------------------------------------------------------------------- -// IsHWIntrinsicCmpMask: Checks if the hwintrinsic produces a SIMD comparison mask -// -// Arguments: -// intrinsic - The hwintrinsic id -// -// Return Value: -// True if the hwintrinsic produces a mask where each SIMD element is either all-bits-set or zero. 
-// -static bool IsHWIntrinsicCmpMask(NamedIntrinsic intrinsic) -{ - switch (intrinsic) - { - case NI_AdvSimd_CompareEqual: - case NI_AdvSimd_CompareGreaterThan: - case NI_AdvSimd_CompareGreaterThanOrEqual: - case NI_AdvSimd_CompareLessThan: - case NI_AdvSimd_CompareLessThanOrEqual: - case NI_AdvSimd_Arm64_CompareEqual: - case NI_AdvSimd_Arm64_CompareGreaterThan: - case NI_AdvSimd_Arm64_CompareGreaterThanOrEqual: - case NI_AdvSimd_Arm64_CompareLessThan: - case NI_AdvSimd_Arm64_CompareLessThanOrEqual: - { - return true; - } - - default: - { - return false; - } - } -} - //---------------------------------------------------------------------------------------------- // NormalizeCmpMaskSimdBaseType: Normalize a SIMD comparison mask's base type to unsigned. // @@ -1416,20 +1382,12 @@ static var_types NormalizeCmpMaskSimdBaseType(var_types simdBaseType) { case TYP_BYTE: case TYP_UBYTE: - { - return TYP_UBYTE; - } - case TYP_SHORT: case TYP_USHORT: - { - return TYP_USHORT; - } - case TYP_INT: case TYP_UINT: { - return TYP_UINT; + return Compiler::getIndexTypeForShuffle(simdBaseType); } default: @@ -1469,7 +1427,7 @@ static bool IsHWIntrinsicCmpMaskExtractMsb(GenTreeHWIntrinsic* node) return true; } - return op1->OperIsHWIntrinsic() && IsHWIntrinsicCmpMask(op1->AsHWIntrinsic()->GetHWIntrinsicId()); + return op1->OperIsHWIntrinsic() && Compiler::IsHWIntrinsicCmpMask(op1->AsHWIntrinsic()->GetHWIntrinsicId()); } //---------------------------------------------------------------------------------------------- @@ -1487,17 +1445,28 @@ static bool IsPrimitivePopCount(GenTree* node) } //---------------------------------------------------------------------------------------------- -// IsPrimitiveTrailingZeroCount: Checks if a node is a primitive TrailingZeroCount intrinsic. +// IsZeroCount: Checks if a node is a scalar zero-count intrinsic. // // Arguments: // node - The node to check. // // Return Value: -// True if the node is a primitive TrailingZeroCount intrinsic. 
+// True if the node is a TrailingZeroCount or LeadingZeroCount intrinsic. // -static bool IsPrimitiveTrailingZeroCount(GenTree* node) +static bool IsZeroCount(GenTree* node) { - return node->OperIs(GT_INTRINSIC) && (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_TrailingZeroCount); + if (node->OperIs(GT_INTRINSIC)) + { + return (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_TrailingZeroCount) || + (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_LeadingZeroCount); + } + + if (node->OperIsHWIntrinsic()) + { + return node->AsHWIntrinsic()->GetHWIntrinsicId() == NI_ArmBase_LeadingZeroCount; + } + + return false; } //---------------------------------------------------------------------------------------------- @@ -1763,20 +1732,25 @@ bool Rationalizer::RewriteHWIntrinsicCmpMaskExtractMsbPopCount(GenTree** use, Co } //---------------------------------------------------------------------------------------------- -// RewriteHWIntrinsicCmpMaskExtractMsbTrailingZeroCount: -// Rewrites TrailingZeroCount(ExtractMostSignificantBits(...)) when the input is known to be a -// SIMD comparison mask. +// RewriteHWIntrinsicCmpMaskExtractMsbZeroCount: +// Rewrites TrailingZeroCount(ExtractMostSignificantBits(...)) and +// LeadingZeroCount(ExtractMostSignificantBits(...)) when the input is known to be a SIMD comparison mask. // // Matches: // TrailingZeroCount(ExtractMostSignificantBits(cmpMask)) +// LeadingZeroCount(ExtractMostSignificantBits(cmpMask)) // // Replaces it with: -// MinAcross(BitwiseSelect(cmpMask, IndexVector, SentinelVector)) - 1 +// TrailingZeroCount: MinAcross(BitwiseSelect(cmpMask, IndexVector, SentinelVector)) - 1 +// LeadingZeroCount: MinAcross(BitwiseSelect(cmpMask, IndexVector, SentinelVector)) +// +// For TrailingZeroCount, IndexVector holds one-based element indexes and SentinelVector holds 33. +// The selected minimum is therefore the first matching element index plus one, or 33 if no element +// matched. 
Subtracting one preserves the zero-mask result of 32. // -// IndexVector holds one-based element indexes and SentinelVector holds 33. The selected minimum is -// therefore the first matching element index plus one, or 33 if no element matched. Subtracting one -// preserves the zero-mask TrailingZeroCount result of 32. For Vector64, the reduction is -// implemented with MinPairwise. +// For LeadingZeroCount, IndexVector holds 31 minus the element index and SentinelVector holds 32. +// The selected minimum is therefore the leading-zero-count result directly, including 32 for the +// zero-mask case. For Vector64, the reduction is implemented with MinPairwise. // // Arguments: // use - A pointer to the intrinsic node @@ -1785,12 +1759,15 @@ bool Rationalizer::RewriteHWIntrinsicCmpMaskExtractMsbPopCount(GenTree** use, Co // Return Value: // True if the node was rewritten; otherwise false. // -bool Rationalizer::RewriteHWIntrinsicCmpMaskExtractMsbTrailingZeroCount(GenTree** use, Compiler::GenTreeStack& parents) +bool Rationalizer::RewriteHWIntrinsicCmpMaskExtractMsbZeroCount(GenTree** use, Compiler::GenTreeStack& parents) { - GenTreeIntrinsic* tzcnt = (*use)->AsIntrinsic(); - assert(tzcnt->gtIntrinsicName == NI_PRIMITIVE_TrailingZeroCount); + GenTree* zeroCount = *use; + assert(IsZeroCount(zeroCount)); + + const bool isTrailingZeroCount = zeroCount->OperIs(GT_INTRINSIC) && + (zeroCount->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_TrailingZeroCount); - GenTree* extract = tzcnt->gtGetOp1(); + GenTree* extract = zeroCount->OperIs(GT_INTRINSIC) ? zeroCount->gtGetOp1() : zeroCount->AsHWIntrinsic()->Op(1); if (!extract->OperIsHWIntrinsic()) { @@ -1813,11 +1790,9 @@ bool Rationalizer::RewriteHWIntrinsicCmpMaskExtractMsbTrailingZeroCount(GenTree* unsigned simdSize = extractNode->GetSimdSize(); var_types simdType = Compiler::getSIMDTypeForSize(simdSize); - // A comparison produces elements whose value is either all-bits-set or zero. 
For an - // IndexOf-style consumer, select a 1-based element index when the comparison is true and a - // sentinel value of 33 when false. The horizontal min reduction then finds either the first - // matching index plus one or the sentinel. Subtracting one produces the trailing-zero-count - // result, including 32 for the zero mask case. + // A comparison produces elements whose value is either all-bits-set or zero. Select an element + // index value when the comparison is true and a sentinel when false. The horizontal min reduction + // then finds the zero-count result directly or with a final subtract, as described above. GenTree* op1 = extractNode->Op(1); @@ -1833,22 +1808,25 @@ bool Rationalizer::RewriteHWIntrinsicCmpMaskExtractMsbTrailingZeroCount(GenTree* { case TYP_UBYTE: { - indexVec->gtSimdVal.u8[index] = static_cast(index + 1); - otherVec->gtSimdVal.u8[index] = 33; + indexVec->gtSimdVal.u8[index] = + static_cast(isTrailingZeroCount ? index + 1 : 31 - index); + otherVec->gtSimdVal.u8[index] = static_cast(isTrailingZeroCount ? 33 : 32); break; } case TYP_USHORT: { - indexVec->gtSimdVal.u16[index] = static_cast(index + 1); - otherVec->gtSimdVal.u16[index] = 33; + indexVec->gtSimdVal.u16[index] = + static_cast(isTrailingZeroCount ? index + 1 : 31 - index); + otherVec->gtSimdVal.u16[index] = static_cast(isTrailingZeroCount ? 33 : 32); break; } case TYP_UINT: { - indexVec->gtSimdVal.u32[index] = static_cast(index + 1); - otherVec->gtSimdVal.u32[index] = 33; + indexVec->gtSimdVal.u32[index] = + static_cast(isTrailingZeroCount ? index + 1 : 31 - index); + otherVec->gtSimdVal.u32[index] = static_cast(isTrailingZeroCount ? 
33 : 32); break; } @@ -1899,15 +1877,20 @@ bool Rationalizer::RewriteHWIntrinsicCmpMaskExtractMsbTrailingZeroCount(GenTree* GenTree* castNode = m_compiler->gtNewCastNode(TYP_INT, extractNode, /* isUnsigned */ true, TYP_INT); BlockRange().InsertAfter(extractNode, castNode); - GenTree* one = m_compiler->gtNewIconNode(1); - BlockRange().InsertAfter(castNode, one); + GenTree* result = castNode; - GenTree* result = m_compiler->gtNewOperNode(GT_SUB, TYP_INT, castNode, one); - BlockRange().InsertAfter(one, result); + if (isTrailingZeroCount) + { + GenTree* one = m_compiler->gtNewIconNode(1); + BlockRange().InsertAfter(castNode, one); + + result = m_compiler->gtNewOperNode(GT_SUB, TYP_INT, castNode, one); + BlockRange().InsertAfter(one, result); + } - BlockRange().Remove(tzcnt); + BlockRange().Remove(zeroCount); - ReplaceHWIntrinsicCmpMaskExtractMsbUse(use, parents, tzcnt, result); + ReplaceHWIntrinsicCmpMaskExtractMsbUse(use, parents, zeroCount, result); return true; } @@ -1938,7 +1921,7 @@ void Rationalizer::RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTree } if ((parents.Height() > 1) && - (IsPrimitivePopCount(parents.Top(1)) || IsPrimitiveTrailingZeroCount(parents.Top(1))) && + (IsPrimitivePopCount(parents.Top(1)) || IsZeroCount(parents.Top(1))) && IsHWIntrinsicCmpMaskExtractMsb(node)) { return; @@ -2441,9 +2424,10 @@ Compiler::fgWalkResult Rationalizer::RewriteNode(GenTree** useEdge, Compiler::Ge node = *useEdge; } } - else if (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_TrailingZeroCount) + else if ((node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_TrailingZeroCount) || + (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_LeadingZeroCount)) { - if (RewriteHWIntrinsicCmpMaskExtractMsbTrailingZeroCount(useEdge, parentStack)) + if (RewriteHWIntrinsicCmpMaskExtractMsbZeroCount(useEdge, parentStack)) { node = *useEdge; } @@ -2453,6 +2437,16 @@ Compiler::fgWalkResult Rationalizer::RewriteNode(GenTree** useEdge, Compiler::Ge #if 
defined(FEATURE_HW_INTRINSICS) case GT_HWINTRINSIC: +#if defined(TARGET_ARM64) + if (IsZeroCount(node)) + { + if (RewriteHWIntrinsicCmpMaskExtractMsbZeroCount(useEdge, parentStack)) + { + node = *useEdge; + break; + } + } +#endif // TARGET_ARM64 RewriteHWIntrinsic(useEdge, parentStack); break; #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/rationalize.h b/src/coreclr/jit/rationalize.h index f7dce954d41df0..06674cc1f7ca94 100644 --- a/src/coreclr/jit/rationalize.h +++ b/src/coreclr/jit/rationalize.h @@ -67,7 +67,7 @@ class Rationalizer final : public Phase #if defined(TARGET_ARM64) bool RewriteHWIntrinsicCmpMaskExtractMsb(GenTree** use, Compiler::GenTreeStack& parents); bool RewriteHWIntrinsicCmpMaskExtractMsbPopCount(GenTree** use, Compiler::GenTreeStack& parents); - bool RewriteHWIntrinsicCmpMaskExtractMsbTrailingZeroCount(GenTree** use, Compiler::GenTreeStack& parents); + bool RewriteHWIntrinsicCmpMaskExtractMsbZeroCount(GenTree** use, Compiler::GenTreeStack& parents); #endif // TARGET_ARM64 void RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTreeStack& parents); diff --git a/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs b/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs index d7785b2a925edc..b9f26046adbcec 100644 --- a/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs +++ b/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs @@ -128,6 +128,16 @@ public static int TestEntryPoint() fail = true; } + if (LeadingZeroCountGreaterThanOrEqualByte(byteData, 0x80) != 16) + { + fail = true; + } + + if (LeadingZeroCountGreaterThanOrEqualByte(Vector128.Zero, 0x80) != 32) + { + fail = true; + } + Vector64 utf16Data64 = Vector64.Create( (ushort)0x0000, (ushort)0x0800, (ushort)0x07FF, (ushort)0xFFFF); @@ -227,6 +237,16 @@ public static int TestEntryPoint() fail = true; } + if (LeadingZeroCountGreaterThanOrEqualByte64(byteData64, 0x80) != 24) + { + fail = true; + } + + if 
(LeadingZeroCountGreaterThanOrEqualByte64(Vector64.Zero, 0x80) != 32) + { + fail = true; + } + return fail ? 101 : 100; } @@ -402,6 +422,16 @@ private static int IndexOfFirstGreaterThanOrEqualByte(Vector128 value, byt return BitOperations.TrailingZeroCount(Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits()); } + [MethodImpl(MethodImplOptions.NoInlining)] + private static int LeadingZeroCountGreaterThanOrEqualByte(Vector128 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + return BitOperations.LeadingZeroCount(Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits()); + } + [MethodImpl(MethodImplOptions.NoInlining)] private static uint LessThanUInt16Mask64(Vector64 value, ushort limit) { @@ -548,5 +578,15 @@ private static int IndexOfFirstGreaterThanOrEqualByte64(Vector64 value, by // ARM64-FULL-LINE: sub {{w[0-9]+}}, {{w[0-9]+}}, #1 return BitOperations.TrailingZeroCount(Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits()); } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int LeadingZeroCountGreaterThanOrEqualByte64(Vector64 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: uminv {{b[0-9]+}}, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + return BitOperations.LeadingZeroCount(Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits()); + } } } From 85bed58998bb6f26a5aada381d2b1393ae7f6152 Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Fri, 26 Jun 2026 09:50:11 +0000 Subject: [PATCH 5/5] Fix formatting --- 
src/coreclr/jit/rationalize.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp index 91c4f22a6662e7..0b006247d9267a 100644 --- a/src/coreclr/jit/rationalize.cpp +++ b/src/coreclr/jit/rationalize.cpp @@ -1808,24 +1808,21 @@ bool Rationalizer::RewriteHWIntrinsicCmpMaskExtractMsbZeroCount(GenTree** use, C { case TYP_UBYTE: { - indexVec->gtSimdVal.u8[index] = - static_cast(isTrailingZeroCount ? index + 1 : 31 - index); + indexVec->gtSimdVal.u8[index] = static_cast(isTrailingZeroCount ? index + 1 : 31 - index); otherVec->gtSimdVal.u8[index] = static_cast(isTrailingZeroCount ? 33 : 32); break; } case TYP_USHORT: { - indexVec->gtSimdVal.u16[index] = - static_cast(isTrailingZeroCount ? index + 1 : 31 - index); + indexVec->gtSimdVal.u16[index] = static_cast(isTrailingZeroCount ? index + 1 : 31 - index); otherVec->gtSimdVal.u16[index] = static_cast(isTrailingZeroCount ? 33 : 32); break; } case TYP_UINT: { - indexVec->gtSimdVal.u32[index] = - static_cast(isTrailingZeroCount ? index + 1 : 31 - index); + indexVec->gtSimdVal.u32[index] = static_cast(isTrailingZeroCount ? index + 1 : 31 - index); otherVec->gtSimdVal.u32[index] = static_cast(isTrailingZeroCount ? 33 : 32); break; } @@ -1920,8 +1917,7 @@ void Rationalizer::RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTree return; } - if ((parents.Height() > 1) && - (IsPrimitivePopCount(parents.Top(1)) || IsZeroCount(parents.Top(1))) && + if ((parents.Height() > 1) && (IsPrimitivePopCount(parents.Top(1)) || IsZeroCount(parents.Top(1))) && IsHWIntrinsicCmpMaskExtractMsb(node)) { return;