Commit 91116c7

accelerate more casts on x86
1 parent 927e973 commit 91116c7

File tree

13 files changed, +547 -266 lines changed


src/coreclr/jit/codegenxarch.cpp

Lines changed: 4 additions & 23 deletions
@@ -7390,8 +7390,6 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
 // The treeNode must have an assigned register.
 // SrcType=float/double and DstType= int32/uint32/int64/uint64
 //
-// TODO-XArch-CQ: (Low-pri) - generate in-line code when DstType = uint64
-//
 void CodeGen::genFloatToIntCast(GenTree* treeNode)
 {
     // we don't expect to see overflow detecting float/double --> int type conversions here
@@ -7413,28 +7411,11 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode)
     var_types dstType = treeNode->CastToType();
     var_types srcType = op1->TypeGet();
     assert(varTypeIsFloating(srcType) && !varTypeIsFloating(dstType));
+    assert(!varTypeIsSmall(dstType));
 
-    // We should never be seeing dstType whose size is neither sizeof(TYP_INT) nor sizeof(TYP_LONG).
-    // For conversions to byte/sbyte/int16/uint16 from float/double, we would expect the
-    // front-end or lowering phase to have generated two levels of cast. The first one is
-    // for float or double to int32/uint32 and the second one for narrowing int32/uint32 to
-    // the required smaller int type.
-    emitAttr dstSize = EA_ATTR(genTypeSize(dstType));
-    noway_assert((dstSize == EA_ATTR(genTypeSize(TYP_INT))) || (dstSize == EA_ATTR(genTypeSize(TYP_LONG))));
-
-    // We shouldn't be seeing uint64 here as it should have been converted
-    // into a helper call by either front-end or lowering phase, unless we have AVX512
-    // accelerated conversions.
-    assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG))) ||
-           compiler->canUseEvexEncodingDebugOnly());
-
-    // If the dstType is TYP_UINT, we have 32-bits to encode the
-    // float number. Any of 33rd or above bits can be the sign bit.
-    // To achieve it we pretend as if we are converting it to a long.
-    if (varTypeIsUnsigned(dstType) && (dstSize == EA_ATTR(genTypeSize(TYP_INT))) && !compiler->canUseEvexEncoding())
-    {
-        dstType = TYP_LONG;
-    }
+    // Unless AVX10.2 saturating conversion instructions are available, these
+    // casts should have been lowered to a sequence of HWIntrinsic nodes.
+    assert(compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v2));
 
     // Note that we need to specify dstType here so that it will determine
     // the size of destination integer register and also the rex.w prefix.
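
Note: the assert above relies on the AVX10.2 saturating conversions (exposed as NI_AVX10v2_ConvertToVector*WithTruncationSaturation later in this commit) matching the semantics the other paths implement manually: NaN converts to zero, and out-of-range values clamp to MinValue/MaxValue. A minimal scalar sketch of that behavior, for reference only (the function name is illustrative, not part of this commit):

    #include <cmath>
    #include <cstdint>
    #include <limits>

    // Sketch: what a saturating double->int64 conversion computes in a
    // single instruction under AVX10.2 (illustrative, not JIT code).
    int64_t CastDoubleToInt64Saturating(double d)
    {
        if (std::isnan(d))
        {
            return 0; // NaN converts to zero
        }
        if (d >= 9223372036854775808.0) // 2^63: first double above the range
        {
            return std::numeric_limits<int64_t>::max();
        }
        if (d < -9223372036854775808.0) // -2^63 itself converts exactly
        {
            return std::numeric_limits<int64_t>::min();
        }
        return static_cast<int64_t>(d); // in range: truncate toward zero
    }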

src/coreclr/jit/compiler.h

Lines changed: 1 addition & 0 deletions
@@ -6286,6 +6286,7 @@ class Compiler
     void fgConvertBBToThrowBB(BasicBlock* block);
 
     bool fgCastNeeded(GenTree* tree, var_types toType);
+    bool fgCastRequiresHelper(var_types fromType, var_types toType, bool overflow = false);
 
     void fgLoopCallTest(BasicBlock* srcBB, BasicBlock* dstBB);
     void fgLoopCallMark();

src/coreclr/jit/decomposelongs.cpp

Lines changed: 195 additions & 20 deletions
@@ -138,8 +138,11 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
     }
 
 #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
-    if (!tree->TypeIs(TYP_LONG) &&
-        !(tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree)))
+    // On x86, long->floating casts are implemented in DecomposeCast.
+    bool isLongToFloatingCast =
+        (tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree));
+
+    if (!tree->TypeIs(TYP_LONG) && !isLongToFloatingCast)
 #else
     if (!tree->TypeIs(TYP_LONG))
 #endif // FEATURE_HW_INTRINSICS && TARGET_X86
@@ -159,6 +162,9 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
     // HWIntrinsics can consume/produce a long directly, provided its source/target is memory.
     // Here we do a conservative check for specific cases where it is certain the load/store
     // can be contained. In those cases, we can skip decomposition.
+    //
+    // We also look for longs consumed directly by a long->floating cast. These can skip
+    // decomposition because the cast is implemented using HWIntrinsics.
 
     GenTree* user = use.User();
@@ -582,44 +588,213 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
     }
 
 #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
-    if (varTypeIsFloating(dstType))
+    if (varTypeIsFloating(srcType) || varTypeIsFloating(dstType))
     {
         // We will reach this path only if morph did not convert the cast to a helper call,
         // meaning we can perform the cast using SIMD instructions.
-        // The sequence this creates is simply:
-        // AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()
-
-        NamedIntrinsic intrinsicId = NI_Illegal;
-        GenTree*       srcOp       = cast->CastOp();
-        var_types      dstType     = cast->CastToType();
-        CorInfoType    baseFloatingType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
-        CorInfoType    baseIntegralType = cast->IsUnsigned() ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
 
         assert(!cast->gtOverflow());
         assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512));
 
-        intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
+        GenTree*    srcOp       = cast->CastOp();
+        GenTree*    castResult  = nullptr;
+        LIR::Range  castRange   = LIR::EmptyRange();
+        CorInfoType srcBaseType = CORINFO_TYPE_UNDEF;
+        CorInfoType dstBaseType = CORINFO_TYPE_UNDEF;
 
-        GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, baseIntegralType, 16);
-        GenTree* convert =
-            m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, baseIntegralType, 16);
-        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, baseFloatingType, 16);
+        if (varTypeIsFloating(srcType))
+        {
+            srcBaseType = (srcType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
+            dstBaseType = (dstType == TYP_ULONG) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
+        }
+        else
+        {
+            srcBaseType = (srcType == TYP_ULONG) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
+            dstBaseType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
+        }
 
-        Range().InsertAfter(cast, createScalar, convert, toScalar);
-        Range().Remove(cast);
+        // This creates the equivalent of the following C# code:
+        //   var srcVec = Vector128.CreateScalarUnsafe(castOp);
 
-        if (createScalar->IsCnsVec())
+        GenTree* srcVector = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcBaseType, 16);
+        castRange.InsertAtEnd(srcVector);
+
+        if (srcVector->IsCnsVec())
         {
             Range().Remove(srcOp);
         }
 
+        if (varTypeIsFloating(dstType))
+        {
+            // long->floating casts don't require any kind of fixup. We simply use the vector
+            // form of the instructions, because the scalar form is not supported on 32-bit.
+
+            NamedIntrinsic intrinsicId =
+                (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcBaseType, 16);
+        }
+        else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
+        {
+            // Likewise, the AVX10.2 saturating floating->long instructions give the correct result,
+            // but we have to use the vector form.
+
+            NamedIntrinsic intrinsicId = (dstType == TYP_ULONG)
+                                             ? NI_AVX10v2_ConvertToVectorUInt64WithTruncationSaturation
+                                             : NI_AVX10v2_ConvertToVectorInt64WithTruncationSaturation;
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcBaseType, 16);
+        }
+        else if (dstType == TYP_ULONG)
+        {
+            // AVX-512 unsigned conversion instructions correctly saturate for positive overflow, so
+            // we only need to fix up negative or NaN values before conversion.
+            //
+            // maxs[sd] will take the value from the second operand if the first operand's value is
+            // NaN, which allows us to fix up both negative and NaN values with a single instruction.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var fixupVal = Sse.MaxScalar(srcVec, Vector128<T>.Zero);
+            //   castResult = Avx512DQ.VL.ConvertToVector128UInt64WithTruncation(fixupVal);
+
+            GenTree* zero     = m_compiler->gtNewZeroConNode(TYP_SIMD16);
+            GenTree* fixupVal = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, NI_X86Base_MaxScalar,
+                                                                     srcBaseType, 16);
+
+            castRange.InsertAtEnd(zero);
+            castRange.InsertAtEnd(fixupVal);
+
+            castResult =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal,
+                                                     NI_AVX512_ConvertToVector128UInt64WithTruncation, srcBaseType, 16);
+        }
+        else
+        {
+            assert(dstType == TYP_LONG);
+
+            // The logic for floating->signed long casts is similar to the AVX-512 implementation
+            // in LowerCast, except that all operations must be done in SIMD registers.
+
+            if (srcType == TYP_FLOAT)
+            {
+                // For float->long, the result will be twice as wide as the input. Broadcasting the
+                // input allows us to use two adjacent elements when creating the fixup mask later.
+
+                srcVector = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector,
+                                                                 NI_AVX2_BroadcastScalarToVector128, srcBaseType, 16);
+                castRange.InsertAtEnd(srcVector);
+            }
+
+            // We will use the input value multiple times, so we replace it with a lclVar.
+            LIR::Use srcUse;
+            LIR::Use::MakeDummyUse(castRange, srcVector, &srcUse);
+            srcUse.ReplaceWithLclVar(m_compiler);
+            srcVector = srcUse.Def();
+
+            // Fix up NaN values before conversion. Saturation is handled after conversion,
+            // because MaxValue is not precisely representable in the floating format.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var nanMask = Sse.CompareScalarOrdered(srcVec, srcVec);
+            //   var fixupVal = Sse.And(srcVec, nanMask);
+            //   convertResult = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(fixupVal);
+
+            GenTree* srcClone = m_compiler->gtClone(srcVector);
+            GenTree* nanMask  = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, srcClone,
+                                                                     NI_X86Base_CompareScalarOrdered, srcBaseType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(nanMask);
+
+            srcClone          = m_compiler->gtClone(srcVector);
+            GenTree* fixupVal = m_compiler->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, nanMask, srcClone, srcBaseType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(fixupVal);
+
+            GenTree* convertResult =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal,
+                                                     NI_AVX512_ConvertToVector128Int64WithTruncation, srcBaseType, 16);
+
+            castRange.InsertAtEnd(convertResult);
+
+            // Now we handle saturation of the result for positive overflow.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var maxFloatingValue = Vector128.Create(9223372036854775808.0);
+            //   var compareMode = FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling;
+            //   var compareMax = Avx.CompareScalar(srcVec, maxFloatingValue, compareMode);
+
+            NamedIntrinsic compareIntrinsic = (srcType == TYP_FLOAT) ? NI_AVX_Compare : NI_AVX_CompareScalar;
+            GenTreeVecCon* maxFloatingValue = m_compiler->gtNewVconNode(TYP_SIMD16);
+
+            if (srcType == TYP_FLOAT)
+            {
+                // For float->long, we broadcast the comparison value, same as we broadcast the input.
+                for (uint32_t index = 0; index < 4; index++)
+                {
+                    maxFloatingValue->gtSimdVal.f32[index] = 9223372036854775808.0f;
+                }
+            }
+            else
+            {
+                maxFloatingValue->gtSimdVal.f64[0] = 9223372036854775808.0;
+            }
+
+            castRange.InsertAtEnd(maxFloatingValue);
+
+            srcClone             = m_compiler->gtClone(srcVector);
+            GenTree* compareMode = m_compiler->gtNewIconNode(
+                static_cast<int32_t>(FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling));
+            GenTree* compareMax  = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone, maxFloatingValue,
+                                                                        compareMode, compareIntrinsic, srcBaseType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(compareMode);
+            castRange.InsertAtEnd(compareMax);
+
+            // We will use the compare mask multiple times, so we replace it with a lclVar.
+            LIR::Use cmpUse;
+            LIR::Use::MakeDummyUse(castRange, compareMax, &cmpUse);
+            cmpUse.ReplaceWithLclVar(m_compiler);
+            compareMax = cmpUse.Def();
+
+            // Mask in long.MaxValue for positive saturation. In the case of overflow, the compare
+            // mask will be all ones. We shift that value right by one to create the MaxValue vector.
+            // This is where we treat two adjacent elements from a float compare as one 64-bit mask.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var maxLong = Sse2.ShiftRightLogical(compareMax, 1);
+            //   castResult = Vector128.ConditionalSelect(compareMax, maxLong, convertResult);
+
+            GenTree* cmpClone = m_compiler->gtClone(compareMax);
+            GenTree* one      = m_compiler->gtNewIconNode(1);
+            GenTree* maxLong  = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, compareMax, one,
+                                                                     NI_X86Base_ShiftRightLogical, dstBaseType, 16);
+
+            castRange.InsertAtEnd(one);
+            castRange.InsertAtEnd(maxLong);
+            castRange.InsertAtEnd(cmpClone);
+
+            castResult = m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, cmpClone, maxLong, convertResult, dstBaseType, 16);
+        }
+
+        // Because the results are in a SIMD register, we need to ToScalar() them out.
+        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(genActualType(dstType), castResult, dstBaseType, 16);
+
+        castRange.InsertAtEnd(castResult);
+        castRange.InsertAtEnd(toScalar);
+
+        Range().InsertAfter(cast, std::move(castRange));
+        Range().Remove(cast);
+
         if (use.IsDummyUse())
         {
             toScalar->SetUnusedValue();
         }
         use.ReplaceWith(toScalar);
 
-        return toScalar->gtNext;
+        return toScalar;
     }
 #endif // FEATURE_HW_INTRINSICS && TARGET_X86
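
Note: two bit tricks carry the fixup logic above. CompareScalarOrdered(x, x) yields an all-zeros mask exactly when x is NaN, so ANDing the source with that mask zeroes NaN lanes; and the all-ones overflow mask, shifted right logically by one, produces long.MaxValue without loading a separate constant. A scalar sketch of both (illustrative only):

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    int main()
    {
        // 1) Ordered self-compare: (x == x) is false only for NaN, so the
        //    resulting mask is all zeros for NaN and all ones otherwise;
        //    And(srcVec, nanMask) therefore zeroes out NaN lanes.
        double   x       = std::numeric_limits<double>::quiet_NaN();
        uint64_t nanMask = (x == x) ? ~0ull : 0ull; // 0 for NaN

        // 2) On positive overflow (src >= 2^63) the compare mask is all ones;
        //    one logical right shift clears just the sign bit, which is
        //    exactly int64's MaxValue.
        uint64_t mask    = ~0ull;   // 0xFFFFFFFFFFFFFFFF
        uint64_t maxLong = mask >> 1; // 0x7FFFFFFFFFFFFFFF

        printf("nanMask=%016llX maxLong=%016llX\n",
               (unsigned long long)nanMask, (unsigned long long)maxLong);

        // ConditionalSelect(compareMax, maxLong, convertResult) then picks
        // MaxValue in the overflow lanes and the converted value elsewhere.
        return 0;
    }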

src/coreclr/jit/flowgraph.cpp

Lines changed: 27 additions & 0 deletions
@@ -1270,6 +1270,33 @@ bool Compiler::fgCastNeeded(GenTree* tree, var_types toType)
     return true;
 }
 
+bool Compiler::fgCastRequiresHelper(var_types fromType, var_types toType, bool overflow /* false */)
+{
+    if (varTypeIsFloating(fromType))
+    {
+        return (varTypeIsIntegral(toType) && overflow)
+#if defined(TARGET_X86)
+               || (varTypeIsLong(toType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512))
+#elif !defined(TARGET_64BIT)
+               || varTypeIsLong(toType)
+#endif
+            ;
+    }
+
+#if !defined(TARGET_64BIT)
+    if (varTypeIsFloating(toType))
+    {
+        return varTypeIsLong(fromType)
+#if defined(TARGET_X86)
+               && !compOpportunisticallyDependsOn(InstructionSet_AVX512)
+#endif // TARGET_X86
+            ;
+    }
+#endif // !TARGET_64BIT
+
+    return false;
+}
+
 GenTree* Compiler::fgGetCritSectOfStaticMethod()
 {
     noway_assert(!compIsForInlining());
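
Note: condensed into one place, the predicate's effective behavior on 32-bit x86 reads as below. This standalone restatement is a sketch, with booleans standing in for the varTypeIs* queries and the ISA check:

    // Sketch of fgCastRequiresHelper's truth table on 32-bit x86 (illustrative).
    bool CastRequiresHelperX86(bool srcIsFloating, bool dstIsFloating,
                               bool srcIsLong, bool dstIsLong,
                               bool overflow, bool hasAvx512)
    {
        if (srcIsFloating)
        {
            // Overflow-checking floating->integral casts always use a helper;
            // floating->long/ulong additionally needs one without AVX-512.
            return (!dstIsFloating && overflow) || (dstIsLong && !hasAvx512);
        }
        if (dstIsFloating)
        {
            // long/ulong->floating needs a helper only when AVX-512 is absent.
            return srcIsLong && !hasAvx512;
        }
        return false; // integral<->integral casts never need a helper
    }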

src/coreclr/jit/hwintrinsiccodegenxarch.cpp

Lines changed: 8 additions & 0 deletions
@@ -2057,6 +2057,14 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
                 }
                 baseReg = (isEBPbased) ? REG_EBP : REG_ESP;
             }
+            else if (op1->IsCnsVec())
+            {
+                CORINFO_FIELD_HANDLE hnd =
+                    GetEmitter()->emitSimdConst(&op1->AsVecCon()->gtSimdVal, emitTypeSize(op1));
+
+                baseReg = internalRegisters.GetSingle(node);
+                GetEmitter()->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), baseReg, hnd, 0, INS_OPTS_NONE);
+            }
             else
             {
                 // Require GT_IND addr to be not contained.
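
Note: this new branch handles a constant vector operand by emitting it to the read-only data section (emitSimdConst) and materializing its address with lea into an internal register, so the subsequent access can use the same base-register addressing as the other cases. A conceptual sketch of the resulting access pattern (illustrative; the names are not from this commit):

    #include <cstdint>
    #include <cstdio>

    // Stands in for the SIMD constant emitted into the data section.
    static const uint32_t simdConst[4] = {1, 2, 3, 4};

    int main()
    {
        const uint32_t* baseReg = simdConst; // lea baseReg, [rwdConst]
        size_t          index   = 2;         // non-constant element index
        printf("%u\n", baseReg[index]);      // load [baseReg + index*4] -> 3
        return 0;
    }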

src/coreclr/jit/importer.cpp

Lines changed: 2 additions & 22 deletions
@@ -8289,28 +8289,8 @@ void Compiler::impImportBlockCode(BasicBlock* block)
                 goto _CONV;
 
             _CONV:
-                // only converts from FLOAT or DOUBLE to an integer type
-                // and converts from ULONG (or LONG on ARM) to DOUBLE are morphed to calls
-
-                if (varTypeIsFloating(lclTyp))
-                {
-                    callNode = varTypeIsLong(impStackTop().val) ||
-                               uns // uint->dbl gets turned into uint->long->dbl
-#ifdef TARGET_64BIT
-                               // TODO-ARM64-Bug?: This was AMD64; I enabled it for ARM64 also. OK?
-                               // TYP_BYREF could be used as TYP_I_IMPL which is long.
-                               // TODO-CQ: remove this when we lower casts long/ulong --> float/double
-                               // and generate SSE2 code instead of going through helper calls.
-                               || impStackTop().val->TypeIs(TYP_BYREF)
-#endif
-                        ;
-                }
-                else
-                {
-                    callNode = varTypeIsFloating(impStackTop().val->TypeGet());
-                }
-
-                op1 = impPopStack().val;
+                op1      = impPopStack().val;
+                callNode = fgCastRequiresHelper(op1->TypeGet(), lclTyp, ovfl);
 
                 impBashVarAddrsToI(op1);