diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 1d5ed98d0b4910..4fda3845fa0891 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -25514,6 +25514,9 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
             case NI_AVX2_ConvertToVector256Int16:
             case NI_AVX2_ConvertToVector256Int32:
             case NI_AVX2_ConvertToVector256Int64:
+            case NI_AVX2_BroadcastVector128ToVector256:
+            case NI_AVX512F_BroadcastVector128ToVector512:
+            case NI_AVX512F_BroadcastVector256ToVector512:
                 if (GetAuxiliaryJitType() == CORINFO_TYPE_PTR)
                 {
                     addr = Op(1);
diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp
index 287b214d0f40b8..06803b3a76894c 100644
--- a/src/coreclr/jit/hwintrinsic.cpp
+++ b/src/coreclr/jit/hwintrinsic.cpp
@@ -1426,6 +1426,9 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
             case NI_AVX2_ConvertToVector256Int16:
             case NI_AVX2_ConvertToVector256Int32:
             case NI_AVX2_ConvertToVector256Int64:
+            case NI_AVX2_BroadcastVector128ToVector256:
+            case NI_AVX512F_BroadcastVector128ToVector512:
+            case NI_AVX512F_BroadcastVector256ToVector512:
            {
                // These intrinsics have both pointer and vector overloads
                // We want to be able to differentiate between them so lets
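Note on the importer change above: recording CORINFO_TYPE_PTR as the auxiliary type is what lets OperIsMemoryLoad tell the pointer overload apart from the vector overload later on. As a rough standalone analogue (an illustrative C++ sketch, not JIT code; the function names are invented), the two overload shapes look like this. AVX2's vbroadcasti128 only accepts a memory source, which is why the register form needs its own handling:

#include <immintrin.h>

// ~ BroadcastVector128ToVector256(byte* address): load 16 bytes and duplicate
// them into both 128-bit lanes. Compilers typically fuse the load and emit
// vbroadcasti128 with a memory operand.
__m256i broadcast_from_memory(const void* p)
{
    return _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)p));
}

// ~ BroadcastVector128ToVector256(Vector128<T> value): the source is already
// in a register; since AVX2's vbroadcasti128 has no register form, compilers
// typically expand this as vinserti128 ymm, ymm, xmm, 1 instead.
__m256i broadcast_from_register(__m128i v)
{
    return _mm256_broadcastsi128_si256(v);
}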
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index 0300c6b7ef0e24..bddcb05ad7fce4 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -766,7 +766,7 @@ HARDWARE_INTRINSIC(AVX2, Blend,
 HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, false, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector128, 16, 1, true, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector256, 32, 1, true, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, false, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, false, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, true, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, true, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, true, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
@@ -835,8 +835,8 @@ HARDWARE_INTRINSIC(AVX512F, And,
 HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible)
 HARDWARE_INTRINSIC(AVX512F, BlendVariable, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpblendmd, INS_vpblendmd, INS_vpblendmq, INS_vpblendmq, INS_vblendmps, INS_vblendmpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX512F, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x4, INS_vbroadcasti64x4, INS_invalid, INS_vbroadcastf64x4}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad)
+HARDWARE_INTRINSIC(AVX512F, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x4, INS_vbroadcasti64x4, INS_invalid, INS_vbroadcastf64x4}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX512F, CompareEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpeqd, INS_vpcmpeqd, INS_vpcmpeqq, INS_vpcmpeqq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX512F, CompareGreaterThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpgtd, INS_vpcmpud, INS_vpcmpgtq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h
index 157ed28e70fbb9..fcde0e67292e8c 100644
--- a/src/coreclr/jit/lower.h
+++ b/src/coreclr/jit/lower.h
@@ -118,6 +118,7 @@ class Lowering final : public Phase
     void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node);
 #ifdef TARGET_XARCH
     void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode);
+    void TryCompressConstVecData(GenTreeStoreInd* node);
 #endif // TARGET_XARCH
 #endif // FEATURE_HW_INTRINSICS
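The flag changes above (HW_Category_MemoryLoad replaced by HW_Flag_MaybeMemoryLoad) and the new TryCompressConstVecData declaration set up the lowering change below: when a 32- or 64-byte constant with repeating parts is stored to memory, the JIT keeps only the smallest repeating block in the data section and rebuilds the full value with a broadcast. A minimal standalone C++ check of the 32-byte equivalence this relies on (illustrative only; assumes an AVX2-capable CPU):

#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>

int main()
{
    // A 32-byte constant whose two 16-byte halves are identical...
    alignas(32) uint8_t full[32];
    for (int i = 0; i < 32; i++)
        full[i] = (uint8_t)(i % 16);

    // ...can be rebuilt from its first 16 bytes alone with vbroadcasti128.
    __m128i half  = _mm_loadu_si128((const __m128i*)full);
    __m256i bcast = _mm256_broadcastsi128_si256(half);

    alignas(32) uint8_t out[32];
    _mm256_store_si256((__m256i*)out, bcast);
    assert(memcmp(out, full, 32) == 0); // same value, half the constant data
    return 0;
}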
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 3daa96268d7213..954f795c1f7553 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -101,7 +101,32 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
     {
         node->Data()->ChangeType(TYP_BYTE);
     }
     ContainCheckStoreIndir(node);
+
+#if defined(FEATURE_HW_INTRINSICS)
+    if (comp->IsBaselineVector512IsaSupportedOpportunistically() ||
+        comp->compOpportunisticallyDependsOn(InstructionSet_AVX2))
+    {
+        if (!node->Data()->OperIs(GT_CNS_VEC))
+        {
+            return;
+        }
+
+        if (!node->Data()->AsVecCon()->TypeIs(TYP_SIMD32, TYP_SIMD64))
+        {
+            return;
+        }
+
+        if (node->Data()->AsVecCon()->IsAllBitsSet() || node->Data()->AsVecCon()->IsZero())
+        {
+            // All-zero and all-ones constants are already cheap to materialize; applying the
+            // compression to them risks unexpected regressions, so skip them.
+            return;
+        }
+
+        TryCompressConstVecData(node);
+    }
+#endif
 }
 
 //----------------------------------------------------------------------------------------------
@@ -7663,6 +7685,22 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
 {
     switch (parentIntrinsicId)
     {
+        case NI_AVX2_BroadcastVector128ToVector256:
+        case NI_AVX512F_BroadcastVector128ToVector512:
+        case NI_AVX512F_BroadcastVector256ToVector512:
+        {
+            if (parentNode->OperIsMemoryLoad())
+            {
+                // This is the pointer overload: op1 is the address, not a value to fold.
+                supportsGeneralLoads = !childNode->OperIsHWIntrinsic();
+            }
+            else
+            {
+                supportsGeneralLoads = true;
+            }
+            break;
+        }
+
         case NI_SSE41_ConvertToVector128Int16:
         case NI_SSE41_ConvertToVector128Int32:
         case NI_SSE41_ConvertToVector128Int64:
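For reference, the repetition checks that the new TryCompressConstVecData helper (below) performs on a GenTreeVecCon can be sketched over plain byte buffers as follows. CompressibleSize is a hypothetical name used for illustration, not part of the patch; only the 16- and 32-byte splits matter because those are the block sizes the broadcast instructions support:

#include <stdint.h>
#include <string.h>
#include <assert.h>

// Returns the smallest block size (16, 32, or 64 bytes) whose repetition
// reproduces the given 64-byte constant.
static size_t CompressibleSize(const uint8_t* data /* 64 bytes */)
{
    // Four identical 128-bit lanes: store 16 bytes, BroadcastVector128ToVector512.
    if (memcmp(data, data + 16, 16) == 0 && memcmp(data, data + 32, 16) == 0 &&
        memcmp(data, data + 48, 16) == 0)
    {
        return 16;
    }
    // Two identical 256-bit halves: store 32 bytes, BroadcastVector256ToVector512.
    if (memcmp(data, data + 32, 32) == 0)
    {
        return 32;
    }
    return 64; // no compression possible
}

int main()
{
    uint8_t buf[64];
    for (int i = 0; i < 64; i++)
        buf[i] = (uint8_t)(i % 16);
    assert(CompressibleSize(buf) == 16);
    for (int i = 0; i < 64; i++)
        buf[i] = (uint8_t)(i % 32);
    assert(CompressibleSize(buf) == 32);
    return 0;
}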
@@ -8512,6 +8550,80 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode,
     MakeSrcContained(parentNode, childNode);
 }
 
+//----------------------------------------------------------------------------------------------
+// TryCompressConstVecData:
+//  Try to compress the constant vector input if it has duplicated parts, so that it can be
+//  rematerialized with a broadcast from a smaller constant.
+//
+// Arguments:
+//    node - the storeind node.
+//
+// Return:
+//    None (the store's data is replaced in place when compression succeeds).
+void Lowering::TryCompressConstVecData(GenTreeStoreInd* node)
+{
+    assert(node->Data()->OperIs(GT_CNS_VEC));
+    GenTreeVecCon*      vecCon    = node->Data()->AsVecCon();
+    GenTreeHWIntrinsic* broadcast = nullptr;
+
+    if (vecCon->TypeIs(TYP_SIMD32))
+    {
+        assert(comp->compOpportunisticallyDependsOn(InstructionSet_AVX2));
+        if (vecCon->gtSimd32Val.v128[0] == vecCon->gtSimd32Val.v128[1])
+        {
+            simd16_t simd16Val = {};
+            simd16Val.f64[0]   = vecCon->gtSimd32Val.f64[0];
+            simd16Val.f64[1]   = vecCon->gtSimd32Val.f64[1];
+            GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD16);
+            memcpy(&compressedVecCon->gtSimdVal, &simd16Val, sizeof(simd16_t));
+            BlockRange().InsertBefore(node->Data(), compressedVecCon);
+            BlockRange().Remove(vecCon);
+            broadcast = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, compressedVecCon,
+                                                       NI_AVX2_BroadcastVector128ToVector256, CORINFO_TYPE_UINT, 32);
+        }
+    }
+    else
+    {
+        assert(vecCon->TypeIs(TYP_SIMD64));
+        assert(comp->IsBaselineVector512IsaSupportedOpportunistically());
+        if ((vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[1]) &&
+            (vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[2]) &&
+            (vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[3]))
+        {
+            simd16_t simd16Val = {};
+            simd16Val.f64[0]   = vecCon->gtSimd64Val.f64[0];
+            simd16Val.f64[1]   = vecCon->gtSimd64Val.f64[1];
+            GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD16);
+            memcpy(&compressedVecCon->gtSimdVal, &simd16Val, sizeof(simd16_t));
+            BlockRange().InsertBefore(node->Data(), compressedVecCon);
+            BlockRange().Remove(vecCon);
+            broadcast = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon,
+                                                       NI_AVX512F_BroadcastVector128ToVector512, CORINFO_TYPE_UINT, 64);
+        }
+        else if (vecCon->gtSimd64Val.v256[0] == vecCon->gtSimd64Val.v256[1])
+        {
+            simd32_t simd32Val = {};
+            simd32Val.v128[0]  = vecCon->gtSimd64Val.v128[0];
+            simd32Val.v128[1]  = vecCon->gtSimd64Val.v128[1];
+            GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD32);
+            memcpy(&compressedVecCon->gtSimdVal, &simd32Val, sizeof(simd32_t));
+            BlockRange().InsertBefore(node->Data(), compressedVecCon);
+            BlockRange().Remove(vecCon);
+            broadcast = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon,
+                                                       NI_AVX512F_BroadcastVector256ToVector512, CORINFO_TYPE_UINT, 64);
+        }
+    }
+
+    if (broadcast == nullptr)
+    {
+        return;
+    }
+
+    BlockRange().InsertBefore(node, broadcast);
+    node->Data() = broadcast;
+    LowerNode(broadcast);
+}
+
 //----------------------------------------------------------------------------------------------
 // ContainCheckHWIntrinsicAddr: Perform containment analysis for an address operand of a hardware
 //                              intrinsic node.
@@ -8708,6 +8820,20 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
             break;
         }
 
+        case NI_AVX2_BroadcastVector128ToVector256:
+        case NI_AVX512F_BroadcastVector128ToVector512:
+        case NI_AVX512F_BroadcastVector256ToVector512:
+        {
+            if (node->OperIsMemoryLoad())
+            {
+                ContainCheckHWIntrinsicAddr(node, op1);
+                return;
+            }
+
+            assert(op1->OperIs(GT_CNS_VEC));
+            break;
+        }
+
         case NI_AVX512F_ConvertToVector256Int32:
         case NI_AVX512F_ConvertToVector256UInt32:
         case NI_AVX512F_VL_ConvertToVector128UInt32:
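For the 64-byte cases, the table entries reuse the INS_vbroadcasti128/INS_vbroadcasti64x4 names; at 512-bit width the underlying encodings are vbroadcasti32x4 and vbroadcasti64x4. A standalone C++ check of both equivalences the new lowering relies on (illustrative only; assumes an AVX-512F capable CPU):

#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>

int main()
{
    alignas(64) uint8_t full[64];
    alignas(64) uint8_t out[64];

    // Four identical 16-byte lanes: rebuild from 16 bytes with vbroadcasti32x4.
    for (int i = 0; i < 64; i++)
        full[i] = (uint8_t)(i % 16);
    __m128i lane = _mm_loadu_si128((const __m128i*)full);
    _mm512_storeu_si512(out, _mm512_broadcast_i32x4(lane));
    assert(memcmp(out, full, 64) == 0);

    // Two identical 32-byte halves: rebuild from 32 bytes with vbroadcasti64x4.
    for (int i = 0; i < 64; i++)
        full[i] = (uint8_t)(i % 32);
    __m256i half = _mm256_loadu_si256((const __m256i*)full);
    _mm512_storeu_si512(out, _mm512_broadcast_i64x4(half));
    assert(memcmp(out, full, 64) == 0);
    return 0;
}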