diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp
index ca8688793d0f6e..3d24401a4a5d36 100644
--- a/src/coreclr/jit/assertionprop.cpp
+++ b/src/coreclr/jit/assertionprop.cpp
@@ -78,6 +78,147 @@ static Range GetRange(Compiler* comp, GenTree* tree, BasicBlock* block, ASSERT_V
     return Limit(Limit::keUnknown);
 }
 
+#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_ARM64)
+//----------------------------------------------------------------------------------------------
+// AllComponentsEitherZeroOrAllBitsSet: Check if a SIMD VN has per-element boolean values.
+//
+// Arguments:
+//    comp     - The compiler instance
+//    vn       - The value number (NoVN is accepted and yields false)
+//    baseType - The expected SIMD element base type
+//
+// Return Value:
+//    True if every SIMD element is known to be either all-bits-set or zero.
+//
+static bool AllComponentsEitherZeroOrAllBitsSet(Compiler* comp, ValueNum vn, var_types baseType)
+{
+    if (vn == ValueNumStore::NoVN)
+    {
+        return false;
+    }
+
+    vn = comp->vnStore->VNNormalValue(vn);
+
+    if (comp->vnStore->IsVNConstant(vn))
+    {
+        switch (comp->vnStore->TypeOfVN(vn))
+        {
+            case TYP_SIMD8:
+            {
+                simd8_t val = comp->vnStore->GetConstantSimd8(vn);
+                return val.IsAllBitsSet() || val.IsZero();
+            }
+
+            case TYP_SIMD16:
+            {
+                simd16_t val = comp->vnStore->GetConstantSimd16(vn);
+                return val.IsAllBitsSet() || val.IsZero();
+            }
+
+            default:
+            {
+                return false; // constants of any other type are not analyzed
+            }
+        }
+    }
+
+    VNFuncApp      funcApp;
+    NamedIntrinsic intrinsicId;
+    unsigned       simdSize;
+    var_types      intrinsicSimdBaseType;
+
+    if (!comp->vnStore->IsVNHWIntrinsicFunc(vn, &funcApp, &intrinsicId, &simdSize, &intrinsicSimdBaseType))
+    {
+        return false;
+    }
+
+    if ((simdSize != 8) && (simdSize != 16))
+    {
+        return false;
+    }
+
+    bool       isScalar = false;
+    genTreeOps oper     = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, baseType, &isScalar);
+
+    if (isScalar)
+    {
+        return false;
+    }
+
+    switch (oper)
+    {
+        case GT_EQ:
+        case GT_NE:
+        case GT_GT:
+        case GT_GE:
+        case GT_LE:
+        case GT_LT:
+        {
+            // Comparisons yield all-bits-set or zero per intrinsicSimdBaseType element, so the result is
+            // also a per-element mask for any integral baseType whose elements are no wider than that.
+            return varTypeIsIntegral(baseType) && (genTypeSize(baseType) <= genTypeSize(intrinsicSimdBaseType));
+        }
+
+        case GT_NOT:
+        {
+            return AllComponentsEitherZeroOrAllBitsSet(comp, funcApp.GetArg(0), baseType);
+        }
+
+        case GT_OR:
+        case GT_AND:
+        case GT_XOR:
+        case GT_AND_NOT:
+        {
+            return AllComponentsEitherZeroOrAllBitsSet(comp, funcApp.GetArg(0), baseType) &&
+                   AllComponentsEitherZeroOrAllBitsSet(comp, funcApp.GetArg(1), baseType);
+        }
+
+        default:
+        {
+            return false;
+        }
+    }
+}
+
+//----------------------------------------------------------------------------------------------
+// optAssertionProp_HWIntrinsic: Propagate VN-derived facts to hwintrinsic tree flags.
+//
+// Arguments:
+//    comp - The compiler instance
+//    tree - The hwintrinsic node; only ExtractMostSignificantBits nodes are ever flagged
+//
+static void optAssertionProp_HWIntrinsic(Compiler* comp, GenTreeHWIntrinsic* tree)
+{
+    NamedIntrinsic intrinsic = tree->GetHWIntrinsicId();
+
+    if ((intrinsic != NI_Vector64_ExtractMostSignificantBits) && (intrinsic != NI_Vector128_ExtractMostSignificantBits))
+    {
+        return;
+    }
+
+    assert(tree->GetOperandCount() == 1);
+
+    GenTree* op1 = tree->Op(1);
+
+    if (op1->OperIsHWIntrinsic() && !Compiler::IsHWIntrinsicCmpMask(op1->AsHWIntrinsic()->GetHWIntrinsicId()))
+    {
+        return;
+    }
+
+    ValueNum op1VN = comp->vnStore->VNConservativeNormalValue(op1->gtVNPair);
+
+    auto vnVisitor = [comp, tree](ValueNum vn) -> ValueNumStore::VNVisit {
+        return AllComponentsEitherZeroOrAllBitsSet(comp, vn, tree->GetSimdBaseType()) ? ValueNumStore::VNVisit::Continue
+                                                                                      : ValueNumStore::VNVisit::Abort;
+    };
+
+    if (comp->vnStore->VNVisitReachingVNs(op1VN, vnVisitor) == ValueNumStore::VNVisit::Continue)
+    {
+        tree->gtFlags |= GTF_HW_ZERO_OR_ALL_BITS_SET; // set only when no visited VN aborted the walk
+    }
+}
+#endif // FEATURE_HW_INTRINSICS && TARGET_ARM64
+
 //------------------------------------------------------------------------
 // SymbolicToRealValue: Convert a symbolic value to a 64-bit signed integer.
 //
@@ -5867,6 +6008,12 @@ GenTree* Compiler::optAssertionProp(ASSERT_VALARG_TP assertions, GenTree* tree,
         case GT_CALL:
             return optAssertionProp_Call(assertions, tree->AsCall(), stmt);
 
+#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_ARM64)
+        case GT_HWINTRINSIC:
+            optAssertionProp_HWIntrinsic(this, tree->AsHWIntrinsic());
+            return nullptr;
+#endif // FEATURE_HW_INTRINSICS && TARGET_ARM64
+
         case GT_EQ:
         case GT_NE:
         case GT_LT:
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 968a976d2846be..e690000b248631 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -10481,6 +10481,42 @@ class Compiler
         }
     }
 
+#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_ARM64)
+    //----------------------------------------------------------------------------------------------
+    // IsHWIntrinsicCmpMask: Checks if the hwintrinsic produces a SIMD comparison mask.
+    //
+    // Arguments:
+    //    intrinsic - The hwintrinsic id
+    //
+    // Return Value:
+    //    True if the hwintrinsic produces a mask where each SIMD element is either all-bits-set or zero.
+    //
+    static bool IsHWIntrinsicCmpMask(NamedIntrinsic intrinsic)
+    {
+        switch (intrinsic)
+        {
+            case NI_AdvSimd_CompareEqual:
+            case NI_AdvSimd_CompareGreaterThan:
+            case NI_AdvSimd_CompareGreaterThanOrEqual:
+            case NI_AdvSimd_CompareLessThan:
+            case NI_AdvSimd_CompareLessThanOrEqual:
+            case NI_AdvSimd_Arm64_CompareEqual:
+            case NI_AdvSimd_Arm64_CompareGreaterThan:
+            case NI_AdvSimd_Arm64_CompareGreaterThanOrEqual:
+            case NI_AdvSimd_Arm64_CompareLessThan:
+            case NI_AdvSimd_Arm64_CompareLessThanOrEqual:
+            {
+                return true;
+            }
+
+            default:
+            {
+                return false; // any intrinsic not listed above is treated as a non-mask producer
+            }
+        }
+    }
+#endif // FEATURE_HW_INTRINSICS && TARGET_ARM64
+
 private:
     unsigned getSIMDInitTempVarNum(var_types simdType);
 
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index f3889f960de4a2..e6195bf5785382 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -3001,6 +3001,11 @@ bool GenTree::Compare(GenTree* op1, GenTree* op2, bool swapOK)
 
 #ifdef FEATURE_HW_INTRINSICS
             case GT_HWINTRINSIC:
+                if ((op1->gtFlags & GTF_HW_ZERO_OR_ALL_BITS_SET) != (op2->gtFlags & GTF_HW_ZERO_OR_ALL_BITS_SET))
+                {
+                    return false; // nodes that differ only in this flag do not compare equal
+                }
+
                 return GenTreeHWIntrinsic::Equals(op1->AsHWIntrinsic(), op2->AsHWIntrinsic());
 #endif
 
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index b66874618c106b..2780abac2ab423 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -541,6 +541,7 @@ enum GenTreeFlags : unsigned
 #ifdef FEATURE_HW_INTRINSICS
     GTF_HW_EM_OP                  = 0x10000000, // GT_HWINTRINSIC -- node is used as an operand to an embedded mask
     GTF_HW_USER_CALL              = 0x20000000, // GT_HWINTRINSIC -- node is implemented via a user call
+    GTF_HW_ZERO_OR_ALL_BITS_SET   = 0x40000000, // GT_HWINTRINSIC -- each SIMD element is either zero or all-bits-set
 #endif // FEATURE_HW_INTRINSICS
 };
 
diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp
index fc4545a2cb46e8..0b006247d9267a 100644
--- a/src/coreclr/jit/rationalize.cpp
+++ b/src/coreclr/jit/rationalize.cpp
@@ -1366,6 +1366,533 @@ bool Rationalizer::ShouldRewriteToNonMaskHWIntrinsic(GenTree* node)
 }
 #endif // TARGET_XARCH
 
+#if defined(TARGET_ARM64)
+//----------------------------------------------------------------------------------------------
+// NormalizeCmpMaskSimdBaseType: Normalize a SIMD comparison mask's base type to unsigned.
+//
+// Arguments:
+//    simdBaseType - The SIMD base type; only TYP_BYTE through TYP_UINT (signed or unsigned) are handled.
+//
+// Return Value:
+//    The normalized SIMD base type, or TYP_UNDEF if it is unsupported.
+//
+static var_types NormalizeCmpMaskSimdBaseType(var_types simdBaseType)
+{
+    switch (simdBaseType)
+    {
+        case TYP_BYTE:
+        case TYP_UBYTE:
+        case TYP_SHORT:
+        case TYP_USHORT:
+        case TYP_INT:
+        case TYP_UINT:
+        {
+            return Compiler::getIndexTypeForShuffle(simdBaseType);
+        }
+
+        default:
+        {
+            return TYP_UNDEF; // every other base type is rejected by the callers
+        }
+    }
+}
+
+//----------------------------------------------------------------------------------------------
+// IsHWIntrinsicCmpMaskExtractMsb: Checks if an ExtractMostSignificantBits node consumes a SIMD
+// comparison mask.
+//
+// Arguments:
+//    node - The hwintrinsic node.
+//
+// Return Value:
+//    True if the node is an ExtractMostSignificantBits over a SIMD comparison mask.
+//
+static bool IsHWIntrinsicCmpMaskExtractMsb(GenTreeHWIntrinsic* node)
+{
+    if ((node->GetHWIntrinsicId() != NI_Vector64_ExtractMostSignificantBits) &&
+        (node->GetHWIntrinsicId() != NI_Vector128_ExtractMostSignificantBits))
+    {
+        return false;
+    }
+
+    if (NormalizeCmpMaskSimdBaseType(node->GetSimdBaseType()) == TYP_UNDEF)
+    {
+        return false; // element type is not one the mask rewrites support
+    }
+
+    GenTree* op1 = node->Op(1);
+
+    if (((node->gtFlags & GTF_HW_ZERO_OR_ALL_BITS_SET) != 0) && op1->OperIs(GT_LCL_VAR, GT_LCL_FLD))
+    {
+        return true; // operand is a local and the node carries the zero-or-all-bits-set flag
+    }
+
+    return op1->OperIsHWIntrinsic() && Compiler::IsHWIntrinsicCmpMask(op1->AsHWIntrinsic()->GetHWIntrinsicId());
+}
+
+//----------------------------------------------------------------------------------------------
+// IsPrimitivePopCount: Checks if a node is a primitive PopCount intrinsic.
+//
+// Arguments:
+//    node - The node to check.
+//
+// Return Value:
+//    True if the node is a primitive PopCount intrinsic.
+//
+static bool IsPrimitivePopCount(GenTree* node)
+{
+    return node->OperIs(GT_INTRINSIC) && (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_PopCount);
+}
+
+//----------------------------------------------------------------------------------------------
+// IsZeroCount: Checks if a node is a scalar zero-count intrinsic.
+//
+// Arguments:
+//    node - The node to check.
+//
+// Return Value:
+//    True for a primitive TrailingZeroCount/LeadingZeroCount or an ArmBase LeadingZeroCount hwintrinsic.
+//
+static bool IsZeroCount(GenTree* node)
+{
+    if (node->OperIs(GT_INTRINSIC))
+    {
+        return (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_TrailingZeroCount) ||
+               (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_LeadingZeroCount);
+    }
+
+    if (node->OperIsHWIntrinsic())
+    {
+        return node->AsHWIntrinsic()->GetHWIntrinsicId() == NI_ArmBase_LeadingZeroCount;
+    }
+
+    return false;
+}
+
+//----------------------------------------------------------------------------------------------
+// ReplaceHWIntrinsicCmpMaskExtractMsbUse: Replace a scalarized comparison mask extraction with
+// the specified replacement node.
+//
+// Arguments:
+//    use         - A pointer to the node being replaced
+//    parents     - A reference to tree walk data providing the context
+//    oldNode     - The node being replaced; asserted to be the top of the parent stack
+//    replacement - The node that replaces *use
+//
+static void ReplaceHWIntrinsicCmpMaskExtractMsbUse(GenTree**               use,
+                                                   Compiler::GenTreeStack& parents,
+                                                   GenTree*                oldNode,
+                                                   GenTree*                replacement)
+{
+    if (parents.Height() > 1)
+    {
+        parents.Top(1)->ReplaceOperand(use, replacement);
+    }
+    else
+    {
+        *use = replacement; // no parent on the stack, so patch the use edge directly
+    }
+
+    // Adjust the parent stack so its top entry is the replacement rather than the old node
+    assert(parents.Top() == oldNode);
+    (void)parents.Pop();
+    parents.Push(replacement);
+}
+
+//----------------------------------------------------------------------------------------------
+// ScalarizeHWIntrinsicCmpMaskReduction: Update an ExtractMostSignificantBits node so it scalarizes
+// a vector reduction result.
+//
+// Arguments:
+//    node         - The ExtractMostSignificantBits node to update (mutated in place)
+//    reduction    - The vector reduction node
+//    simdBaseType - The SIMD base type of the reduction
+//    simdSize     - The SIMD size of the original input
+//
+static void ScalarizeHWIntrinsicCmpMaskReduction(GenTreeHWIntrinsic* node,
+                                                 GenTree*            reduction,
+                                                 var_types           simdBaseType,
+                                                 unsigned            simdSize)
+{
+    NamedIntrinsic intrinsic = (simdSize == 8) ? NI_Vector64_ToScalar : NI_Vector128_ToScalar;
+
+    node->gtType = genActualType(simdBaseType);
+    node->ChangeHWIntrinsicId(intrinsic);
+    node->SetSimdSize(8); // NOTE(review): 8 even when NI_Vector128_ToScalar was selected above -- confirm intended
+    node->SetSimdBaseType(simdBaseType);
+    node->Op(1) = reduction;
+}
+
+//----------------------------------------------------------------------------------------------
+// RewriteHWIntrinsicCmpMaskExtractMsb:
+//    Rewrites an ExtractMostSignificantBits operation when the input is known to be a SIMD comparison
+//    mask and the result is only checked for zero.
+//
+// Matches:
+//    ExtractMostSignificantBits(cmpMask) == 0
+//    ExtractMostSignificantBits(cmpMask) != 0
+//
+// Replaces the ExtractMostSignificantBits with:
+//    MaxAcross(cmpMask), scalarized with ToScalar and wrapped in a cast to TYP_INT
+//
+// This computes whether any all-bits-set comparison element exists without materializing the full
+// bitmask. For Vector64 with 32-bit elements, the reduction is implemented with MaxPairwise.
+//
+// Arguments:
+//    use     - A pointer to the hwintrinsic node
+//    parents - A reference to tree walk data providing the context
+//
+// Return Value:
+//    True if the node was rewritten; otherwise false.
+//
+bool Rationalizer::RewriteHWIntrinsicCmpMaskExtractMsb(GenTree** use, Compiler::GenTreeStack& parents)
+{
+    GenTreeHWIntrinsic* node = (*use)->AsHWIntrinsic();
+
+    var_types simdBaseType = node->GetSimdBaseType();
+    unsigned  simdSize     = node->GetSimdSize();
+
+    simdBaseType = NormalizeCmpMaskSimdBaseType(simdBaseType);
+
+    if (simdBaseType == TYP_UNDEF)
+    {
+        return false;
+    }
+
+    if (parents.Height() <= 1)
+    {
+        return false; // no parent node to inspect
+    }
+
+    GenTree* parent = parents.Top(1);
+
+    if (!parent->OperIs(GT_EQ, GT_NE))
+    {
+        return false;
+    }
+
+    GenTree* parentOp1 = parent->gtGetOp1();
+    GenTree* parentOp2 = parent->gtGetOp2();
+
+    if (!(((parentOp1 == node) && parentOp2->IsIntegralConst(0)) ||
+          ((parentOp2 == node) && parentOp1->IsIntegralConst(0))))
+    {
+        return false; // the other operand of the compare must be the constant zero
+    }
+
+    GenTree* op1 = node->Op(1);
+
+    if (!IsHWIntrinsicCmpMaskExtractMsb(node))
+    {
+        return false;
+    }
+
+    // A comparison produces elements whose value is either all-bits-set or zero. When the
+    // ExtractMostSignificantBits result is only being compared against zero, use a horizontal max
+    // reduction to determine if any element was all-bits-set without materializing the full mask.
+
+    GenTree* tmp;
+
+    if ((simdSize == 8) && (simdBaseType == TYP_UINT))
+    {
+        // Vector64 of 32-bit elements has only two lanes and AdvSimd does not provide a 2S MaxAcross form.
+        // Use a pairwise reduction with the same vector as both operands instead.
+
+        LIR::Use op1Use;
+        LIR::Use::MakeDummyUse(BlockRange(), op1, &op1Use);
+
+        // The pairwise form consumes op1 twice, so spill it to a temp before cloning the use.
+        op1Use.ReplaceWithLclVar(m_compiler);
+        op1 = op1Use.Def();
+
+        GenTree* op2 = m_compiler->gtClone(op1);
+        BlockRange().InsertAfter(op1, op2);
+
+        tmp = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, op2, NI_AdvSimd_MaxPairwise, simdBaseType, simdSize);
+        BlockRange().InsertAfter(op2, tmp);
+    }
+    else
+    {
+        tmp = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_MaxAcross, simdBaseType, simdSize);
+        BlockRange().InsertAfter(op1, tmp);
+    }
+
+    op1 = tmp;
+
+    ScalarizeHWIntrinsicCmpMaskReduction(node, op1, simdBaseType, simdSize); // node now reads the reduction
+
+    GenTree* castNode = m_compiler->gtNewCastNode(TYP_INT, node, /* isUnsigned */ true, TYP_INT);
+    BlockRange().InsertAfter(node, castNode);
+
+    ReplaceHWIntrinsicCmpMaskExtractMsbUse(use, parents, node, castNode);
+
+    return true;
+}
+
+//----------------------------------------------------------------------------------------------
+// RewriteHWIntrinsicCmpMaskExtractMsbPopCount:
+//    Rewrites PopCount(ExtractMostSignificantBits(...)) when the input is known to be a SIMD
+//    comparison mask.
+//
+// Matches:
+//    PopCount(ExtractMostSignificantBits(cmpMask))
+//
+// Replaces it with:
+//    AddAcross(ShiftRightLogical(cmpMask, elementBits - 1)), scalarized and cast to TYP_INT
+//
+// This converts each all-bits-set comparison element to 1, each zero element to 0, then horizontally
+// sums those per-element counts. For Vector64 with 32-bit elements, AddPairwise is used instead.
+//
+// Arguments:
+//    use     - A pointer to the intrinsic node
+//    parents - A reference to tree walk data providing the context
+//
+// Return Value:
+//    True if the node was rewritten; otherwise false.
+//
+bool Rationalizer::RewriteHWIntrinsicCmpMaskExtractMsbPopCount(GenTree** use, Compiler::GenTreeStack& parents)
+{
+    GenTreeIntrinsic* popCount = (*use)->AsIntrinsic();
+    assert(popCount->gtIntrinsicName == NI_PRIMITIVE_PopCount);
+
+    GenTree* extract = popCount->gtGetOp1();
+
+    if (!extract->OperIsHWIntrinsic())
+    {
+        return false;
+    }
+
+    GenTreeHWIntrinsic* extractNode  = extract->AsHWIntrinsic();
+    var_types           simdBaseType = NormalizeCmpMaskSimdBaseType(extractNode->GetSimdBaseType());
+
+    if (simdBaseType == TYP_UNDEF)
+    {
+        return false;
+    }
+
+    if (!IsHWIntrinsicCmpMaskExtractMsb(extractNode))
+    {
+        return false;
+    }
+
+    unsigned simdSize       = extractNode->GetSimdSize();
+    unsigned elementBitSize = genTypeSize(simdBaseType) * BITS_PER_BYTE;
+
+    // A comparison produces elements whose value is either all-bits-set or zero. For a Count-style
+    // consumer, normalize each element to one or zero and then horizontally sum the elements.
+
+    GenTree* op1 = extractNode->Op(1);
+
+    GenTree* shiftAmount = m_compiler->gtNewIconNode(elementBitSize - 1); // keeps only each element's top bit
+    BlockRange().InsertAfter(op1, shiftAmount);
+
+    GenTree* shift = m_compiler->gtNewSimdHWIntrinsicNode(Compiler::getSIMDTypeForSize(simdSize), op1, shiftAmount,
+                                                          NI_AdvSimd_ShiftRightLogical, simdBaseType, simdSize);
+    BlockRange().InsertAfter(shiftAmount, shift);
+    op1 = shift;
+
+    GenTree* add;
+
+    if ((simdSize == 8) && (simdBaseType == TYP_UINT))
+    {
+        // Vector64 of 32-bit elements has only two lanes and AdvSimd does not provide a 2S AddAcross form.
+        // Use a pairwise reduction with the same vector as both operands instead.
+
+        LIR::Use op1Use;
+        LIR::Use::MakeDummyUse(BlockRange(), op1, &op1Use);
+
+        // The pairwise form consumes op1 twice, so spill it to a temp before cloning the use.
+        op1Use.ReplaceWithLclVar(m_compiler);
+        op1 = op1Use.Def();
+
+        GenTree* op2 = m_compiler->gtClone(op1);
+        BlockRange().InsertAfter(op1, op2);
+
+        add = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, op2, NI_AdvSimd_AddPairwise, simdBaseType, simdSize);
+        BlockRange().InsertAfter(op2, add);
+    }
+    else
+    {
+        add = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseType, simdSize);
+        BlockRange().InsertAfter(op1, add);
+    }
+
+    op1 = add;
+
+    ScalarizeHWIntrinsicCmpMaskReduction(extractNode, op1, simdBaseType, simdSize);
+
+    GenTree* castNode = m_compiler->gtNewCastNode(TYP_INT, extractNode, /* isUnsigned */ true, TYP_INT);
+    BlockRange().InsertAfter(extractNode, castNode);
+
+    BlockRange().Remove(popCount); // the cast now stands in for the PopCount node
+
+    ReplaceHWIntrinsicCmpMaskExtractMsbUse(use, parents, popCount, castNode);
+
+    return true;
+}
+
+//----------------------------------------------------------------------------------------------
+// RewriteHWIntrinsicCmpMaskExtractMsbZeroCount:
+//    Rewrites TrailingZeroCount(ExtractMostSignificantBits(...)) and
+//    LeadingZeroCount(ExtractMostSignificantBits(...)) when the input is known to be a SIMD comparison mask.
+//
+// Matches:
+//    TrailingZeroCount(ExtractMostSignificantBits(cmpMask))
+//    LeadingZeroCount(ExtractMostSignificantBits(cmpMask))
+//
+// Replaces it with:
+//    TrailingZeroCount: MinAcross(BitwiseSelect(cmpMask, IndexVector, SentinelVector)) - 1
+//    LeadingZeroCount:  MinAcross(BitwiseSelect(cmpMask, IndexVector, SentinelVector))
+//
+// For TrailingZeroCount, IndexVector holds one-based element indexes and SentinelVector holds 33.
+// The selected minimum is therefore the first matching element index plus one, or 33 if no element
+// matched. Subtracting one preserves the zero-mask result of 32.
+//
+// For LeadingZeroCount, IndexVector holds 31 minus the element index and SentinelVector holds 32.
+// The selected minimum is therefore the leading-zero-count result directly, including 32 for the +// zero-mask case. For Vector64, the reduction is implemented with MinPairwise. +// +// Arguments: +// use - A pointer to the intrinsic node +// parents - A reference to tree walk data providing the context +// +// Return Value: +// True if the node was rewritten; otherwise false. +// +bool Rationalizer::RewriteHWIntrinsicCmpMaskExtractMsbZeroCount(GenTree** use, Compiler::GenTreeStack& parents) +{ + GenTree* zeroCount = *use; + assert(IsZeroCount(zeroCount)); + + const bool isTrailingZeroCount = zeroCount->OperIs(GT_INTRINSIC) && + (zeroCount->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_TrailingZeroCount); + + GenTree* extract = zeroCount->OperIs(GT_INTRINSIC) ? zeroCount->gtGetOp1() : zeroCount->AsHWIntrinsic()->Op(1); + + if (!extract->OperIsHWIntrinsic()) + { + return false; + } + + GenTreeHWIntrinsic* extractNode = extract->AsHWIntrinsic(); + var_types simdBaseType = NormalizeCmpMaskSimdBaseType(extractNode->GetSimdBaseType()); + + if (simdBaseType == TYP_UNDEF) + { + return false; + } + + if (!IsHWIntrinsicCmpMaskExtractMsb(extractNode)) + { + return false; + } + + unsigned simdSize = extractNode->GetSimdSize(); + var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + + // A comparison produces elements whose value is either all-bits-set or zero. Select an element + // index value when the comparison is true and a sentinel when false. The horizontal min reduction + // then finds the zero-count result directly or with a final subtract, as described above. 
+ + GenTree* op1 = extractNode->Op(1); + + GenTreeVecCon* indexVec = m_compiler->gtNewVconNode(simdType); + GenTreeVecCon* otherVec = m_compiler->gtNewVconNode(simdType); + + const unsigned elementSize = genTypeSize(simdBaseType); + const unsigned elementCount = simdSize / elementSize; + + for (unsigned index = 0; index < elementCount; index++) + { + switch (simdBaseType) + { + case TYP_UBYTE: + { + indexVec->gtSimdVal.u8[index] = static_cast(isTrailingZeroCount ? index + 1 : 31 - index); + otherVec->gtSimdVal.u8[index] = static_cast(isTrailingZeroCount ? 33 : 32); + break; + } + + case TYP_USHORT: + { + indexVec->gtSimdVal.u16[index] = static_cast(isTrailingZeroCount ? index + 1 : 31 - index); + otherVec->gtSimdVal.u16[index] = static_cast(isTrailingZeroCount ? 33 : 32); + break; + } + + case TYP_UINT: + { + indexVec->gtSimdVal.u32[index] = static_cast(isTrailingZeroCount ? index + 1 : 31 - index); + otherVec->gtSimdVal.u32[index] = static_cast(isTrailingZeroCount ? 33 : 32); + break; + } + + default: + { + unreached(); + } + } + } + + BlockRange().InsertAfter(op1, indexVec); + BlockRange().InsertAfter(indexVec, otherVec); + + GenTree* select = m_compiler->gtNewSimdCndSelNode(simdType, op1, indexVec, otherVec, simdBaseType, simdSize); + BlockRange().InsertAfter(otherVec, select); + op1 = select; + + GenTree* min; + + if ((simdSize == 8) && (simdBaseType == TYP_UINT)) + { + // Vector64 has only two lanes and AdvSimd does not provide a 2S MinAcross form. + // Use a pairwise reduction with the same vector as both operands instead. + + LIR::Use op1Use; + LIR::Use::MakeDummyUse(BlockRange(), op1, &op1Use); + + // The pairwise form consumes op1 twice, so spill it to a temp before cloning the use. 
+ op1Use.ReplaceWithLclVar(m_compiler); + op1 = op1Use.Def(); + + GenTree* op2 = m_compiler->gtClone(op1); + BlockRange().InsertAfter(op1, op2); + + min = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, op2, NI_AdvSimd_MinPairwise, simdBaseType, simdSize); + BlockRange().InsertAfter(op2, min); + } + else + { + min = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_MinAcross, simdBaseType, simdSize); + BlockRange().InsertAfter(op1, min); + } + + op1 = min; + + ScalarizeHWIntrinsicCmpMaskReduction(extractNode, op1, simdBaseType, simdSize); + + GenTree* castNode = m_compiler->gtNewCastNode(TYP_INT, extractNode, /* isUnsigned */ true, TYP_INT); + BlockRange().InsertAfter(extractNode, castNode); + + GenTree* result = castNode; + + if (isTrailingZeroCount) + { + GenTree* one = m_compiler->gtNewIconNode(1); + BlockRange().InsertAfter(castNode, one); + + result = m_compiler->gtNewOperNode(GT_SUB, TYP_INT, castNode, one); + BlockRange().InsertAfter(one, result); + } + + BlockRange().Remove(zeroCount); + + ReplaceHWIntrinsicCmpMaskExtractMsbUse(use, parents, zeroCount, result); + + return true; +} +#endif // TARGET_ARM64 + //---------------------------------------------------------------------------------------------- // RewriteHWIntrinsicExtractMsb: Rewrites a hwintrinsic ExtractMostSignificantBytes operation // @@ -1385,6 +1912,17 @@ void Rationalizer::RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTree GenTree* op1 = node->Op(1); #if defined(TARGET_ARM64) + if (RewriteHWIntrinsicCmpMaskExtractMsb(use, parents)) + { + return; + } + + if ((parents.Height() > 1) && (IsPrimitivePopCount(parents.Top(1)) || IsZeroCount(parents.Top(1))) && + IsHWIntrinsicCmpMaskExtractMsb(node)) + { + return; + } + // ARM64 doesn't have a single instruction that performs the behavior so we'll emulate it instead. // To do this, we effectively perform the following steps: // 1. 
tmp = input & 0x80 ; and the input to clear all but the most significant bit @@ -1874,10 +2412,37 @@ Compiler::fgWalkResult Rationalizer::RewriteNode(GenTree** useEdge, Compiler::Ge case GT_INTRINSIC: // Non-target intrinsics should have already been rewritten back into user calls. assert(m_compiler->IsTargetIntrinsic(node->AsIntrinsic()->gtIntrinsicName)); +#if defined(TARGET_ARM64) && defined(FEATURE_HW_INTRINSICS) + if (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_PopCount) + { + if (RewriteHWIntrinsicCmpMaskExtractMsbPopCount(useEdge, parentStack)) + { + node = *useEdge; + } + } + else if ((node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_TrailingZeroCount) || + (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_LeadingZeroCount)) + { + if (RewriteHWIntrinsicCmpMaskExtractMsbZeroCount(useEdge, parentStack)) + { + node = *useEdge; + } + } +#endif // TARGET_ARM64 && FEATURE_HW_INTRINSICS break; #if defined(FEATURE_HW_INTRINSICS) case GT_HWINTRINSIC: +#if defined(TARGET_ARM64) + if (IsZeroCount(node)) + { + if (RewriteHWIntrinsicCmpMaskExtractMsbZeroCount(useEdge, parentStack)) + { + node = *useEdge; + break; + } + } +#endif // TARGET_ARM64 RewriteHWIntrinsic(useEdge, parentStack); break; #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/rationalize.h b/src/coreclr/jit/rationalize.h index d0449be7c70a56..06674cc1f7ca94 100644 --- a/src/coreclr/jit/rationalize.h +++ b/src/coreclr/jit/rationalize.h @@ -64,6 +64,12 @@ class Rationalizer final : public Phase bool ShouldRewriteToNonMaskHWIntrinsic(GenTree* node); #endif // TARGET_XARCH +#if defined(TARGET_ARM64) + bool RewriteHWIntrinsicCmpMaskExtractMsb(GenTree** use, Compiler::GenTreeStack& parents); + bool RewriteHWIntrinsicCmpMaskExtractMsbPopCount(GenTree** use, Compiler::GenTreeStack& parents); + bool RewriteHWIntrinsicCmpMaskExtractMsbZeroCount(GenTree** use, Compiler::GenTreeStack& parents); +#endif // TARGET_ARM64 + void RewriteHWIntrinsicExtractMsb(GenTree** use, 
Compiler::GenTreeStack& parents); #endif // FEATURE_HW_INTRINSICS diff --git a/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs b/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs new file mode 100644 index 00000000000000..b9f26046adbcec --- /dev/null +++ b/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.cs @@ -0,0 +1,592 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using Xunit; + +namespace TestExtractMostSignificantBits +{ + public class Program + { + [Fact] + public static int TestEntryPoint() + { + bool fail = false; + + Vector128 utf16Data = Vector128.Create( + (ushort)0x0000, (ushort)0x0800, (ushort)0x07FF, (ushort)0x8000, + (ushort)0xD7FF, (ushort)0xD800, (ushort)0x0001, (ushort)0xFFFF); + + if (LessThanUInt16Mask(utf16Data, 0x0800) != 0x45) + { + fail = true; + } + + if (GreaterThanOrEqualUInt16Mask(utf16Data, 0x0800) != 0xBA) + { + fail = true; + } + + if (!AnyLessThanUInt16(utf16Data, 0x0800)) + { + fail = true; + } + + if (NoneLessThanUInt16(Vector128.Create((ushort)0x0800), 0x0800) != true) + { + fail = true; + } + + if (CountGreaterThanOrEqualUInt16(utf16Data, 0x0800) != 5) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualUInt16(utf16Data, 0x0800) != 1) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualUInt16(Vector128.Zero, 0x0800) != 32) + { + fail = true; + } + + Vector128 intData = Vector128.Create(-2, 0, 7, 8); + + if (LessThanInt32Mask(intData, 7) != 0x3) + { + fail = true; + } + + if (!AnyLessThanInt32(intData, 7)) + { + fail = true; + } + + if (NoneLessThanInt32(Vector128.Create(7), 7) != true) + { + fail = true; + } + + if (CountLessThanInt32(intData, 7) != 2) + { + fail = true; + } + + if (IndexOfFirstLessThanInt32(intData, 7) != 0) + { + fail = true; + } 
+ + if (IndexOfFirstLessThanInt32(Vector128.Create(7), 7) != 32) + { + fail = true; + } + + Vector128 byteData = Vector128.Create( + (byte)0x00, (byte)0x80, (byte)0x7F, (byte)0xFF, + (byte)0x01, (byte)0x81, (byte)0x40, (byte)0xC0, + (byte)0x02, (byte)0x82, (byte)0x20, (byte)0xA0, + (byte)0x04, (byte)0x84, (byte)0x10, (byte)0x90); + + if (GreaterThanOrEqualByteMask(byteData, 0x80) != 0xAAAA) + { + fail = true; + } + + if (!AnyGreaterThanOrEqualByte(byteData, 0x80)) + { + fail = true; + } + + if (NoneGreaterThanOrEqualByte(Vector128.Zero, 0x80) != true) + { + fail = true; + } + + if (CountGreaterThanOrEqualByte(byteData, 0x80) != 8) + { + fail = true; + } + + if (CountGreaterThanOrEqualByteViaLocal(byteData, 0x80) != 8) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualByte(byteData, 0x80) != 1) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualByte(Vector128.Zero, 0x80) != 32) + { + fail = true; + } + + if (LeadingZeroCountGreaterThanOrEqualByte(byteData, 0x80) != 16) + { + fail = true; + } + + if (LeadingZeroCountGreaterThanOrEqualByte(Vector128.Zero, 0x80) != 32) + { + fail = true; + } + + Vector64 utf16Data64 = Vector64.Create( + (ushort)0x0000, (ushort)0x0800, (ushort)0x07FF, (ushort)0xFFFF); + + if (LessThanUInt16Mask64(utf16Data64, 0x0800) != 0x5) + { + fail = true; + } + + if (!AnyLessThanUInt1664(utf16Data64, 0x0800)) + { + fail = true; + } + + if (NoneLessThanUInt1664(Vector64.Create((ushort)0x0800), 0x0800) != true) + { + fail = true; + } + + if (CountGreaterThanOrEqualUInt1664(utf16Data64, 0x0800) != 2) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualUInt1664(utf16Data64, 0x0800) != 1) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualUInt1664(Vector64.Zero, 0x0800) != 32) + { + fail = true; + } + + Vector64 intData64 = Vector64.Create(-2, 8); + + if (LessThanInt32Mask64(intData64, 7) != 0x1) + { + fail = true; + } + + if (!AnyLessThanInt3264(intData64, 7)) + { + fail = true; + } + + if 
(NoneLessThanInt3264(Vector64.Create(7), 7) != true) + { + fail = true; + } + + if (CountLessThanInt3264(intData64, 7) != 1) + { + fail = true; + } + + if (IndexOfFirstLessThanInt3264(intData64, 7) != 0) + { + fail = true; + } + + if (IndexOfFirstLessThanInt3264(Vector64.Create(7), 7) != 32) + { + fail = true; + } + + Vector64 byteData64 = Vector64.Create( + (byte)0x00, (byte)0x80, (byte)0x7F, (byte)0xFF, + (byte)0x01, (byte)0x81, (byte)0x40, (byte)0xC0); + + if (GreaterThanOrEqualByteMask64(byteData64, 0x80) != 0xAA) + { + fail = true; + } + + if (!AnyGreaterThanOrEqualByte64(byteData64, 0x80)) + { + fail = true; + } + + if (NoneGreaterThanOrEqualByte64(Vector64.Zero, 0x80) != true) + { + fail = true; + } + + if (CountGreaterThanOrEqualByte64(byteData64, 0x80) != 4) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualByte64(byteData64, 0x80) != 1) + { + fail = true; + } + + if (IndexOfFirstGreaterThanOrEqualByte64(Vector64.Zero, 0x80) != 32) + { + fail = true; + } + + if (LeadingZeroCountGreaterThanOrEqualByte64(byteData64, 0x80) != 24) + { + fail = true; + } + + if (LeadingZeroCountGreaterThanOrEqualByte64(Vector64.Zero, 0x80) != 32) + { + fail = true; + } + + return fail ? 
101 : 100; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint LessThanUInt16Mask(Vector128 value, ushort limit) + { + // ARM64-FULL-LINE: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: and {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: addv {{h[0-9]+}}, {{v[0-9]+}}.8h + return Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint GreaterThanOrEqualUInt16Mask(Vector128 value, ushort limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: and {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: addv {{h[0-9]+}}, {{v[0-9]+}}.8h + return Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool AnyLessThanUInt16(Vector128 value, ushort limit) + { + // ARM64-FULL-LINE: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, ne + return Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits() != 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool NoneLessThanUInt16(Vector128 value, ushort limit) + { + // ARM64-FULL-LINE: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, eq + return Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits() == 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int CountGreaterThanOrEqualUInt16(Vector128 value, ushort limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // 
ARM64-FULL-LINE: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15 + // ARM64-FULL-LINE: addv {{h[0-9]+}}, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + return BitOperations.PopCount(Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int IndexOfFirstGreaterThanOrEqualUInt16(Vector128 value, ushort limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + // ARM64-FULL-LINE: sub {{w[0-9]+}}, {{w[0-9]+}}, #1 + return BitOperations.TrailingZeroCount(Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint LessThanInt32Mask(Vector128 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: and {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: addv {{s[0-9]+}}, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: smov {{x[0-9]+}}, {{v[0-9]+}}.s[0] + return Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool AnyLessThanInt32(Vector128 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, ne + return Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits() != 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool NoneLessThanInt32(Vector128 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, 
{{v[0-9]+}}.4s + // ARM64-FULL-LINE: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, eq + return Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits() == 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int CountLessThanInt32(Vector128 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 + // ARM64-FULL-LINE: addv {{s[0-9]+}}, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + return BitOperations.PopCount(Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int IndexOfFirstLessThanInt32(Vector128 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + // ARM64-FULL-LINE: sub {{w[0-9]+}}, {{w[0-9]+}}, #1 + return BitOperations.TrailingZeroCount(Vector128.LessThan(value, Vector128.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint GreaterThanOrEqualByteMask(Vector128 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: addv {{h[0-9]+}}, {{v[0-9]+}}.8h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + return Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool AnyGreaterThanOrEqualByte(Vector128 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.16b, 
{{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, ne + return Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits() != 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool NoneGreaterThanOrEqualByte(Vector128 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, eq + return Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits() == 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int CountGreaterThanOrEqualByte(Vector128 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #7 + // ARM64-FULL-LINE: addv {{b[0-9]+}}, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + return BitOperations.PopCount(Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int CountGreaterThanOrEqualByteViaLocal(Vector128 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #7 + // ARM64-FULL-LINE: addv {{b[0-9]+}}, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + Vector128 mask = Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)); + return BitOperations.PopCount(mask.ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int IndexOfFirstGreaterThanOrEqualByte(Vector128 value, byte limit) + { + // ARM64-FULL-LINE: cmhs 
{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + // ARM64-FULL-LINE: sub {{w[0-9]+}}, {{w[0-9]+}}, #1 + return BitOperations.TrailingZeroCount(Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int LeadingZeroCountGreaterThanOrEqualByte(Vector128 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + return BitOperations.LeadingZeroCount(Vector128.GreaterThanOrEqual(value, Vector128.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint LessThanUInt16Mask64(Vector64 value, ushort limit) + { + // ARM64-FULL-LINE: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: and {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + return Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool AnyLessThanUInt1664(Vector64 value, ushort limit) + { + // ARM64-FULL-LINE: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: umaxv {{h[0-9]+}}, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, ne + return Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits() != 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool NoneLessThanUInt1664(Vector64 value, ushort limit) + { + // ARM64-FULL-LINE: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + // 
ARM64-FULL-LINE: umaxv {{h[0-9]+}}, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, eq + return Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits() == 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int CountGreaterThanOrEqualUInt1664(Vector64 value, ushort limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15 + // ARM64-FULL-LINE: addv {{h[0-9]+}}, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + return BitOperations.PopCount(Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int IndexOfFirstGreaterThanOrEqualUInt1664(Vector64 value, ushort limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: uminv {{h[0-9]+}}, {{v[0-9]+}}.4h + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.h[0] + // ARM64-FULL-LINE: sub {{w[0-9]+}}, {{w[0-9]+}}, #1 + return BitOperations.TrailingZeroCount(Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint LessThanInt32Mask64(Vector64 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: and {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + return Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool AnyLessThanInt3264(Vector64 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: 
umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, ne + return Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits() != 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool NoneLessThanInt3264(Vector64 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, eq + return Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits() == 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int CountLessThanInt3264(Vector64 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 + // ARM64-FULL-LINE: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + return BitOperations.PopCount(Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int IndexOfFirstLessThanInt3264(Vector64 value, int limit) + { + // ARM64-FULL-LINE: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.s[0] + // ARM64-FULL-LINE: sub {{w[0-9]+}}, {{w[0-9]+}}, #1 + return BitOperations.TrailingZeroCount(Vector64.LessThan(value, Vector64.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint GreaterThanOrEqualByteMask64(Vector64 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, 
{{v[0-9]+}}.8b + return Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool AnyGreaterThanOrEqualByte64(Vector64 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: umaxv {{b[0-9]+}}, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, ne + return Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits() != 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool NoneGreaterThanOrEqualByte64(Vector64 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: umaxv {{b[0-9]+}}, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + // ARM64-FULL-LINE: cset {{[wx][0-9]+}}, eq + return Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits() == 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int CountGreaterThanOrEqualByte64(Vector64 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #7 + // ARM64-FULL-LINE: addv {{b[0-9]+}}, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + return BitOperations.PopCount(Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int IndexOfFirstGreaterThanOrEqualByte64(Vector64 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: uminv {{b[0-9]+}}, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + // ARM64-FULL-LINE: sub {{w[0-9]+}}, 
{{w[0-9]+}}, #1 + return BitOperations.TrailingZeroCount(Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static int LeadingZeroCountGreaterThanOrEqualByte64(Vector64 value, byte limit) + { + // ARM64-FULL-LINE: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: uminv {{b[0-9]+}}, {{v[0-9]+}}.8b + // ARM64-FULL-LINE: umov {{w[0-9]+}}, {{v[0-9]+}}.b[0] + return BitOperations.LeadingZeroCount(Vector64.GreaterThanOrEqual(value, Vector64.Create(limit)).ExtractMostSignificantBits()); + } + } +} diff --git a/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.csproj b/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.csproj new file mode 100644 index 00000000000000..46f205a5ec8387 --- /dev/null +++ b/src/tests/JIT/opt/InstructionCombining/ExtractMostSignificantBits.csproj @@ -0,0 +1,17 @@ + + + + true + + + None + True + + + + true + + + + +