Commit 91116c7

accelerate more casts on x86
1 parent 927e973 commit 91116c7

File tree

13 files changed, +547 -266 lines changed


src/coreclr/jit/codegenxarch.cpp

Lines changed: 4 additions & 23 deletions
@@ -7390,8 +7390,6 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
 // The treeNode must have an assigned register.
 // SrcType=float/double and DstType= int32/uint32/int64/uint64
 //
-// TODO-XArch-CQ: (Low-pri) - generate in-line code when DstType = uint64
-//
 void CodeGen::genFloatToIntCast(GenTree* treeNode)
 {
     // we don't expect to see overflow detecting float/double --> int type conversions here
@@ -7413,28 +7411,11 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode)
     var_types dstType = treeNode->CastToType();
     var_types srcType = op1->TypeGet();
     assert(varTypeIsFloating(srcType) && !varTypeIsFloating(dstType));
+    assert(!varTypeIsSmall(dstType));
 
-    // We should never be seeing dstType whose size is neither sizeof(TYP_INT) nor sizeof(TYP_LONG).
-    // For conversions to byte/sbyte/int16/uint16 from float/double, we would expect the
-    // front-end or lowering phase to have generated two levels of cast. The first one is
-    // for float or double to int32/uint32 and the second one for narrowing int32/uint32 to
-    // the required smaller int type.
-    emitAttr dstSize = EA_ATTR(genTypeSize(dstType));
-    noway_assert((dstSize == EA_ATTR(genTypeSize(TYP_INT))) || (dstSize == EA_ATTR(genTypeSize(TYP_LONG))));
-
-    // We shouldn't be seeing uint64 here as it should have been converted
-    // into a helper call by either front-end or lowering phase, unless we have AVX512
-    // accelerated conversions.
-    assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG))) ||
-           compiler->canUseEvexEncodingDebugOnly());
-
-    // If the dstType is TYP_UINT, we have 32-bits to encode the
-    // float number. Any of 33rd or above bits can be the sign bit.
-    // To achieve it we pretend as if we are converting it to a long.
-    if (varTypeIsUnsigned(dstType) && (dstSize == EA_ATTR(genTypeSize(TYP_INT))) && !compiler->canUseEvexEncoding())
-    {
-        dstType = TYP_LONG;
-    }
+    // Unless AVX10.2 saturating conversion instructions are available, these
+    // casts should have been lowered to a sequence of HWIntrinsic nodes.
+    assert(compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v2));
 
     // Note that we need to specify dstType here so that it will determine
     // the size of destination integer register and also the rex.w prefix.
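
Note: the assert above relies on the AVX10.2 saturating conversions (exposed as NI_AVX10v2_ConvertToVector*WithTruncationSaturation later in this commit) matching the semantics the other paths implement manually: NaN converts to zero, and out-of-range values clamp to MinValue/MaxValue. A minimal scalar sketch of that behavior, for reference only (the function name is illustrative, not part of this commit):

    #include <cmath>
    #include <cstdint>
    #include <limits>

    // Sketch: what a saturating double->int64 conversion computes in a
    // single instruction under AVX10.2 (illustrative, not JIT code).
    int64_t CastDoubleToInt64Saturating(double d)
    {
        if (std::isnan(d))
        {
            return 0; // NaN converts to zero
        }
        if (d >= 9223372036854775808.0) // 2^63: first double above the range
        {
            return std::numeric_limits<int64_t>::max();
        }
        if (d < -9223372036854775808.0) // -2^63 itself converts exactly
        {
            return std::numeric_limits<int64_t>::min();
        }
        return static_cast<int64_t>(d); // in range: truncate toward zero
    }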

src/coreclr/jit/compiler.h

Lines changed: 1 addition & 0 deletions
@@ -6286,6 +6286,7 @@ class Compiler
     void fgConvertBBToThrowBB(BasicBlock* block);
 
     bool fgCastNeeded(GenTree* tree, var_types toType);
+    bool fgCastRequiresHelper(var_types fromType, var_types toType, bool overflow = false);
 
     void fgLoopCallTest(BasicBlock* srcBB, BasicBlock* dstBB);
     void fgLoopCallMark();

src/coreclr/jit/decomposelongs.cpp

Lines changed: 195 additions & 20 deletions
@@ -138,8 +138,11 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
     }
 
 #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
-    if (!tree->TypeIs(TYP_LONG) &&
-        !(tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree)))
+    // On x86, long->floating casts are implemented in DecomposeCast.
+    bool isLongToFloatingCast =
+        (tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree));
+
+    if (!tree->TypeIs(TYP_LONG) && !isLongToFloatingCast)
 #else
     if (!tree->TypeIs(TYP_LONG))
 #endif // FEATURE_HW_INTRINSICS && TARGET_X86
@@ -159,6 +162,9 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
     // HWIntrinsics can consume/produce a long directly, provided its source/target is memory.
     // Here we do a conservative check for specific cases where it is certain the load/store
     // can be contained. In those cases, we can skip decomposition.
+    //
+    // We also look for longs consumed directly by a long->floating cast. These can skip
+    // decomposition because the cast is implemented using HWIntrinsics.
 
     GenTree* user = use.User();
@@ -582,44 +588,213 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
     }
 
 #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
-    if (varTypeIsFloating(dstType))
+    if (varTypeIsFloating(srcType) || varTypeIsFloating(dstType))
     {
         // We will reach this path only if morph did not convert the cast to a helper call,
         // meaning we can perform the cast using SIMD instructions.
-        // The sequence this creates is simply:
-        // AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()
-
-        NamedIntrinsic intrinsicId = NI_Illegal;
-        GenTree*       srcOp       = cast->CastOp();
-        var_types      dstType     = cast->CastToType();
-        CorInfoType    baseFloatingType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
-        CorInfoType    baseIntegralType = cast->IsUnsigned() ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
 
         assert(!cast->gtOverflow());
         assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512));
 
-        intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
+        GenTree*    srcOp       = cast->CastOp();
+        GenTree*    castResult  = nullptr;
+        LIR::Range  castRange   = LIR::EmptyRange();
+        CorInfoType srcBaseType = CORINFO_TYPE_UNDEF;
+        CorInfoType dstBaseType = CORINFO_TYPE_UNDEF;
 
-        GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, baseIntegralType, 16);
-        GenTree* convert =
-            m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, baseIntegralType, 16);
-        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, baseFloatingType, 16);
+        if (varTypeIsFloating(srcType))
+        {
+            srcBaseType = (srcType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
+            dstBaseType = (dstType == TYP_ULONG) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
+        }
+        else
+        {
+            srcBaseType = (srcType == TYP_ULONG) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
+            dstBaseType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
+        }
 
-        Range().InsertAfter(cast, createScalar, convert, toScalar);
-        Range().Remove(cast);
+        // This creates the equivalent of the following C# code:
+        //   var srcVec = Vector128.CreateScalarUnsafe(castOp);
 
-        if (createScalar->IsCnsVec())
+        GenTree* srcVector = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcBaseType, 16);
+        castRange.InsertAtEnd(srcVector);
+
+        if (srcVector->IsCnsVec())
         {
             Range().Remove(srcOp);
         }
 
+        if (varTypeIsFloating(dstType))
+        {
+            // long->floating casts don't require any kind of fixup. We simply use the vector
+            // form of the instructions, because the scalar form is not supported on 32-bit.
+
+            NamedIntrinsic intrinsicId =
+                (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcBaseType, 16);
+        }
+        else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
+        {
+            // Likewise, the AVX10.2 saturating floating->long instructions give the correct result,
+            // but we have to use the vector form.
+
+            NamedIntrinsic intrinsicId = (dstType == TYP_ULONG)
+                                             ? NI_AVX10v2_ConvertToVectorUInt64WithTruncationSaturation
+                                             : NI_AVX10v2_ConvertToVectorInt64WithTruncationSaturation;
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcBaseType, 16);
+        }
+        else if (dstType == TYP_ULONG)
+        {
+            // AVX-512 unsigned conversion instructions correctly saturate for positive overflow, so
+            // we only need to fix up negative or NaN values before conversion.
+            //
+            // maxs[sd] will take the value from the second operand if the first operand's value is
+            // NaN, which allows us to fix up both negative and NaN values with a single instruction.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var fixupVal = Sse.MaxScalar(srcVec, Vector128<T>.Zero);
+            //   castResult = Avx512DQ.VL.ConvertToVector128UInt64WithTruncation(fixupVal);
+
+            GenTree* zero     = m_compiler->gtNewZeroConNode(TYP_SIMD16);
+            GenTree* fixupVal = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, NI_X86Base_MaxScalar,
+                                                                     srcBaseType, 16);
+
+            castRange.InsertAtEnd(zero);
+            castRange.InsertAtEnd(fixupVal);
+
+            castResult =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal,
+                                                     NI_AVX512_ConvertToVector128UInt64WithTruncation, srcBaseType, 16);
+        }
+        else
+        {
+            assert(dstType == TYP_LONG);
+
+            // The logic for floating->signed long casts is similar to the AVX-512 implementation
+            // in LowerCast, except that all operations must be done in SIMD registers.
+
+            if (srcType == TYP_FLOAT)
+            {
+                // For float->long, the result will be twice as wide as the input. Broadcasting the
+                // input allows us to use two adjacent elements when creating the fixup mask later.
+
+                srcVector = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector,
+                                                                 NI_AVX2_BroadcastScalarToVector128, srcBaseType, 16);
+                castRange.InsertAtEnd(srcVector);
+            }
+
+            // We will use the input value multiple times, so we replace it with a lclVar.
+            LIR::Use srcUse;
+            LIR::Use::MakeDummyUse(castRange, srcVector, &srcUse);
+            srcUse.ReplaceWithLclVar(m_compiler);
+            srcVector = srcUse.Def();
+
+            // Fix up NaN values before conversion. Saturation is handled after conversion,
+            // because MaxValue is not precisely representable in the floating format.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var nanMask = Sse.CompareScalarOrdered(srcVec, srcVec);
+            //   var fixupVal = Sse.And(srcVec, nanMask);
+            //   convertResult = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(fixupVal);
+
+            GenTree* srcClone = m_compiler->gtClone(srcVector);
+            GenTree* nanMask  = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, srcClone,
+                                                                     NI_X86Base_CompareScalarOrdered, srcBaseType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(nanMask);
+
+            srcClone          = m_compiler->gtClone(srcVector);
+            GenTree* fixupVal = m_compiler->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, nanMask, srcClone, srcBaseType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(fixupVal);
+
+            GenTree* convertResult =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal,
+                                                     NI_AVX512_ConvertToVector128Int64WithTruncation, srcBaseType, 16);
+
+            castRange.InsertAtEnd(convertResult);
+
+            // Now we handle saturation of the result for positive overflow.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var maxFloatingValue = Vector128.Create(9223372036854775808.0);
+            //   var compareMode = FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling;
+            //   var compareMax = Avx.CompareScalar(srcVec, maxFloatingValue, compareMode);
+
+            NamedIntrinsic compareIntrinsic = (srcType == TYP_FLOAT) ? NI_AVX_Compare : NI_AVX_CompareScalar;
+            GenTreeVecCon* maxFloatingValue = m_compiler->gtNewVconNode(TYP_SIMD16);
+
+            if (srcType == TYP_FLOAT)
+            {
+                // For float->long, we broadcast the comparison value, same as we broadcast the input.
+                for (uint32_t index = 0; index < 4; index++)
+                {
+                    maxFloatingValue->gtSimdVal.f32[index] = 9223372036854775808.0f;
+                }
+            }
+            else
+            {
+                maxFloatingValue->gtSimdVal.f64[0] = 9223372036854775808.0;
+            }
+
+            castRange.InsertAtEnd(maxFloatingValue);
+
+            srcClone             = m_compiler->gtClone(srcVector);
+            GenTree* compareMode = m_compiler->gtNewIconNode(
+                static_cast<int32_t>(FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling));
+            GenTree* compareMax  = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone, maxFloatingValue,
+                                                                        compareMode, compareIntrinsic, srcBaseType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(compareMode);
+            castRange.InsertAtEnd(compareMax);
+
+            // We will use the compare mask multiple times, so we replace it with a lclVar.
+            LIR::Use cmpUse;
+            LIR::Use::MakeDummyUse(castRange, compareMax, &cmpUse);
+            cmpUse.ReplaceWithLclVar(m_compiler);
+            compareMax = cmpUse.Def();
+
+            // Mask in long.MaxValue for positive saturation. In the case of overflow, the compare
+            // mask will be all ones. We shift that value right by one to create the MaxValue vector.
+            // This is where we treat two adjacent elements from a float compare as one 64-bit mask.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var maxLong = Sse2.ShiftRightLogical(compareMax, 1);
+            //   castResult = Vector128.ConditionalSelect(compareMax, maxLong, convertResult);
+
+            GenTree* cmpClone = m_compiler->gtClone(compareMax);
+            GenTree* one      = m_compiler->gtNewIconNode(1);
+            GenTree* maxLong  = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, compareMax, one,
+                                                                     NI_X86Base_ShiftRightLogical, dstBaseType, 16);
+
+            castRange.InsertAtEnd(one);
+            castRange.InsertAtEnd(maxLong);
+            castRange.InsertAtEnd(cmpClone);
+
+            castResult = m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, cmpClone, maxLong, convertResult, dstBaseType, 16);
+        }
+
+        // Because the results are in a SIMD register, we need to ToScalar() them out.
+        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(genActualType(dstType), castResult, dstBaseType, 16);
+
+        castRange.InsertAtEnd(castResult);
+        castRange.InsertAtEnd(toScalar);
+
+        Range().InsertAfter(cast, std::move(castRange));
+        Range().Remove(cast);
+
         if (use.IsDummyUse())
         {
             toScalar->SetUnusedValue();
         }
         use.ReplaceWith(toScalar);
 
-        return toScalar->gtNext;
+        return toScalar;
     }
 #endif // FEATURE_HW_INTRINSICS && TARGET_X86
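
Note: two bit tricks carry the fixup logic above. CompareScalarOrdered(x, x) yields an all-zeros mask exactly when x is NaN, so ANDing the source with that mask zeroes NaN lanes; and the all-ones overflow mask, shifted right logically by one, produces long.MaxValue without loading a separate constant. A scalar sketch of both (illustrative only):

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    int main()
    {
        // 1) Ordered self-compare: (x == x) is false only for NaN, so the
        //    resulting mask is all zeros for NaN and all ones otherwise;
        //    And(srcVec, nanMask) therefore zeroes out NaN lanes.
        double   x       = std::numeric_limits<double>::quiet_NaN();
        uint64_t nanMask = (x == x) ? ~0ull : 0ull; // 0 for NaN

        // 2) On positive overflow (src >= 2^63) the compare mask is all ones;
        //    one logical right shift clears just the sign bit, which is
        //    exactly int64's MaxValue.
        uint64_t mask    = ~0ull;   // 0xFFFFFFFFFFFFFFFF
        uint64_t maxLong = mask >> 1; // 0x7FFFFFFFFFFFFFFF

        printf("nanMask=%016llX maxLong=%016llX\n",
               (unsigned long long)nanMask, (unsigned long long)maxLong);

        // ConditionalSelect(compareMax, maxLong, convertResult) then picks
        // MaxValue in the overflow lanes and the converted value elsewhere.
        return 0;
    }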

src/coreclr/jit/flowgraph.cpp

Lines changed: 27 additions & 0 deletions
@@ -1270,6 +1270,33 @@ bool Compiler::fgCastNeeded(GenTree* tree, var_types toType)
     return true;
 }
 
+bool Compiler::fgCastRequiresHelper(var_types fromType, var_types toType, bool overflow /* false */)
+{
+    if (varTypeIsFloating(fromType))
+    {
+        return (varTypeIsIntegral(toType) && overflow)
+#if defined(TARGET_X86)
+               || (varTypeIsLong(toType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512))
+#elif !defined(TARGET_64BIT)
+               || varTypeIsLong(toType)
+#endif
+            ;
+    }
+
+#if !defined(TARGET_64BIT)
+    if (varTypeIsFloating(toType))
+    {
+        return varTypeIsLong(fromType)
+#if defined(TARGET_X86)
+               && !compOpportunisticallyDependsOn(InstructionSet_AVX512)
+#endif // TARGET_X86
+            ;
+    }
+#endif // !TARGET_64BIT
+
+    return false;
+}
+
 GenTree* Compiler::fgGetCritSectOfStaticMethod()
 {
     noway_assert(!compIsForInlining());
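
Note: condensed into one place, the predicate's effective behavior on 32-bit x86 reads as below. This standalone restatement is a sketch, with booleans standing in for the varTypeIs* queries and the ISA check:

    // Sketch of fgCastRequiresHelper's truth table on 32-bit x86 (illustrative).
    bool CastRequiresHelperX86(bool srcIsFloating, bool dstIsFloating,
                               bool srcIsLong, bool dstIsLong,
                               bool overflow, bool hasAvx512)
    {
        if (srcIsFloating)
        {
            // Overflow-checking floating->integral casts always use a helper;
            // floating->long/ulong additionally needs one without AVX-512.
            return (!dstIsFloating && overflow) || (dstIsLong && !hasAvx512);
        }
        if (dstIsFloating)
        {
            // long/ulong->floating needs a helper only when AVX-512 is absent.
            return srcIsLong && !hasAvx512;
        }
        return false; // integral<->integral casts never need a helper
    }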

src/coreclr/jit/hwintrinsiccodegenxarch.cpp

Lines changed: 8 additions & 0 deletions
@@ -2057,6 +2057,14 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
                 }
                 baseReg = (isEBPbased) ? REG_EBP : REG_ESP;
             }
+            else if (op1->IsCnsVec())
+            {
+                CORINFO_FIELD_HANDLE hnd =
+                    GetEmitter()->emitSimdConst(&op1->AsVecCon()->gtSimdVal, emitTypeSize(op1));
+
+                baseReg = internalRegisters.GetSingle(node);
+                GetEmitter()->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), baseReg, hnd, 0, INS_OPTS_NONE);
+            }
             else
             {
                 // Require GT_IND addr to be not contained.
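
Note: this new branch handles a constant vector operand by emitting it to the read-only data section (emitSimdConst) and materializing its address with lea into an internal register, so the subsequent access can use the same base-register addressing as the other cases. A conceptual sketch of the resulting access pattern (illustrative; the names are not from this commit):

    #include <cstdint>
    #include <cstdio>

    // Stands in for the SIMD constant emitted into the data section.
    static const uint32_t simdConst[4] = {1, 2, 3, 4};

    int main()
    {
        const uint32_t* baseReg = simdConst; // lea baseReg, [rwdConst]
        size_t          index   = 2;         // non-constant element index
        printf("%u\n", baseReg[index]);      // load [baseReg + index*4] -> 3
        return 0;
    }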

src/coreclr/jit/importer.cpp

Lines changed: 2 additions & 22 deletions
@@ -8289,28 +8289,8 @@ void Compiler::impImportBlockCode(BasicBlock* block)
                 goto _CONV;
 
             _CONV:
-                // only converts from FLOAT or DOUBLE to an integer type
-                // and converts from ULONG (or LONG on ARM) to DOUBLE are morphed to calls
-
-                if (varTypeIsFloating(lclTyp))
-                {
-                    callNode = varTypeIsLong(impStackTop().val) ||
-                               uns // uint->dbl gets turned into uint->long->dbl
-#ifdef TARGET_64BIT
-                               // TODO-ARM64-Bug?: This was AMD64; I enabled it for ARM64 also. OK?
-                               // TYP_BYREF could be used as TYP_I_IMPL which is long.
-                               // TODO-CQ: remove this when we lower casts long/ulong --> float/double
-                               // and generate SSE2 code instead of going through helper calls.
-                               || impStackTop().val->TypeIs(TYP_BYREF)
-#endif
-                        ;
-                }
-                else
-                {
-                    callNode = varTypeIsFloating(impStackTop().val->TypeGet());
-                }
-
-                op1 = impPopStack().val;
+                op1      = impPopStack().val;
+                callNode = fgCastRequiresHelper(op1->TypeGet(), lclTyp, ovfl);
 
                 impBashVarAddrsToI(op1);