@@ -138,8 +138,11 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
     }
 
 #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
-    if (!tree->TypeIs(TYP_LONG) &&
-        !(tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree)))
+    // On x86, long->floating casts are implemented in DecomposeCast.
+    bool isLongToFloatingCast =
+        (tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree));
+
+    if (!tree->TypeIs(TYP_LONG) && !isLongToFloatingCast)
 #else
     if (!tree->TypeIs(TYP_LONG))
 #endif // FEATURE_HW_INTRINSICS && TARGET_X86
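The restructured guard above lets floating-typed GT_CAST nodes with long operands flow into decomposition so DecomposeCast can expand them. As a hedged illustration (hypothetical C# source, not part of this change), these are the cast shapes that now take that path on x86:

```csharp
long  l = GetInt64();   // hypothetical source values
ulong u = GetUInt64();

float  f = (float)l;    // GT_CAST(float <- long): floating-typed node, long operand
double d = (double)l;   // GT_CAST(double <- long): same shape
double e = (double)u;   // unsigned sources take the same path (cast marked unsigned)
```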
@@ -159,6 +162,9 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
         // HWIntrinsics can consume/produce a long directly, provided its source/target is memory.
         // Here we do a conservative check for specific cases where it is certain the load/store
         // can be contained. In those cases, we can skip decomposition.
+        //
+        // We also look for longs consumed directly by a long->floating cast. These can skip
+        // decomposition because the cast is implemented using HWIntrinsics.
 
         GenTree* user = use.User();
 
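As a hedged example of the new skip case (hypothetical C#, not from this change): here the long is loaded from memory and consumed directly by a long->double cast, so decomposition can leave the 64-bit value intact for DecomposeCast to consume.

```csharp
// The ref-long load feeds the cast directly; the TYP_LONG node need not be
// split into lo/hi halves, because the SIMD-based cast consumes it whole.
static double LoadAndCast(ref long src) => (double)src;
```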
@@ -582,44 +588,213 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
     }
 
 #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
-    if (varTypeIsFloating(dstType))
+    if (varTypeIsFloating(srcType) || varTypeIsFloating(dstType))
     {
         // We will reach this path only if morph did not convert the cast to a helper call,
         // meaning we can perform the cast using SIMD instructions.
-        // The sequence this creates is simply:
-        //    AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()
-
-        NamedIntrinsic intrinsicId      = NI_Illegal;
-        GenTree*       srcOp            = cast->CastOp();
-        var_types      dstType          = cast->CastToType();
-        CorInfoType    baseFloatingType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
-        CorInfoType    baseIntegralType = cast->IsUnsigned() ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
 
         assert(!cast->gtOverflow());
         assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512));
 
-        intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
+        GenTree*    srcOp       = cast->CastOp();
+        GenTree*    castResult  = nullptr;
+        LIR::Range  castRange   = LIR::EmptyRange();
+        CorInfoType srcBaseType = CORINFO_TYPE_UNDEF;
+        CorInfoType dstBaseType = CORINFO_TYPE_UNDEF;
 
-        GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, baseIntegralType, 16);
-        GenTree* convert =
-            m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, baseIntegralType, 16);
-        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, baseFloatingType, 16);
+        if (varTypeIsFloating(srcType))
+        {
+            srcBaseType = (srcType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
+            dstBaseType = (dstType == TYP_ULONG) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
+        }
+        else
+        {
+            srcBaseType = (srcType == TYP_ULONG) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
+            dstBaseType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
+        }
 
-        Range().InsertAfter(cast, createScalar, convert, toScalar);
-        Range().Remove(cast);
+        // This creates the equivalent of the following C# code:
+        //   var srcVec = Vector128.CreateScalarUnsafe(castOp);
 
-        if (createScalar->IsCnsVec())
+        GenTree* srcVector = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcBaseType, 16);
+        castRange.InsertAtEnd(srcVector);
+
+        if (srcVector->IsCnsVec())
         {
             Range().Remove(srcOp);
         }
 
+        if (varTypeIsFloating(dstType))
+        {
+            // long->floating casts don't require any kind of fixup. We simply use the vector
+            // form of the instructions, because the scalar form is not supported on 32-bit.
+
+            NamedIntrinsic intrinsicId =
+                (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcBaseType, 16);
+        }
+        else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
+        {
+            // Likewise, the AVX10.2 saturating floating->long instructions give the correct result,
+            // but we have to use the vector form.
+
+            NamedIntrinsic intrinsicId = (dstType == TYP_ULONG)
+                                             ? NI_AVX10v2_ConvertToVectorUInt64WithTruncationSaturation
+                                             : NI_AVX10v2_ConvertToVectorInt64WithTruncationSaturation;
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcBaseType, 16);
+        }
+        else if (dstType == TYP_ULONG)
+        {
+            // AVX-512 unsigned conversion instructions correctly saturate for positive overflow, so
+            // we only need to fix up negative or NaN values before conversion.
+            //
+            // maxs[sd] will take the value from the second operand if the first operand's value is
+            // NaN, which allows us to fix up both negative and NaN values with a single instruction.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var fixupVal = Sse.MaxScalar(srcVec, Vector128<T>.Zero);
+            //   castResult = Avx512DQ.VL.ConvertToVector128UInt64WithTruncation(fixupVal);
+
+            GenTree* zero     = m_compiler->gtNewZeroConNode(TYP_SIMD16);
+            GenTree* fixupVal = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, NI_X86Base_MaxScalar,
+                                                                     srcBaseType, 16);
+
+            castRange.InsertAtEnd(zero);
+            castRange.InsertAtEnd(fixupVal);
+
+            castResult =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal,
+                                                     NI_AVX512_ConvertToVector128UInt64WithTruncation, srcBaseType, 16);
+        }
+        else
+        {
+            assert(dstType == TYP_LONG);
+
+            // The logic for floating->signed long casts is similar to the AVX-512 implementation
+            // in LowerCast, except that all operations must be done in SIMD registers.
+
+            if (srcType == TYP_FLOAT)
+            {
+                // For float->long, the result will be twice as wide as the input. Broadcasting the
+                // input allows us to use two adjacent elements when creating the fixup mask later.
+
+                srcVector = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector,
+                                                                 NI_AVX2_BroadcastScalarToVector128, srcBaseType, 16);
+                castRange.InsertAtEnd(srcVector);
+            }
+
+            // We will use the input value multiple times, so we replace it with a lclVar.
+            LIR::Use srcUse;
+            LIR::Use::MakeDummyUse(castRange, srcVector, &srcUse);
+            srcUse.ReplaceWithLclVar(m_compiler);
+            srcVector = srcUse.Def();
+
+            // Fix up NaN values before conversion. Saturation is handled after conversion,
+            // because MaxValue is not precisely representable in the floating format.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var nanMask = Sse.CompareScalarOrdered(srcVec, srcVec);
+            //   var fixupVal = Sse.And(srcVec, nanMask);
+            //   convertResult = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(fixupVal);
+
+            GenTree* srcClone = m_compiler->gtClone(srcVector);
+            GenTree* nanMask  = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, srcClone,
+                                                                     NI_X86Base_CompareScalarOrdered, srcBaseType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(nanMask);
+
+            srcClone          = m_compiler->gtClone(srcVector);
+            GenTree* fixupVal = m_compiler->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, nanMask, srcClone, srcBaseType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(fixupVal);
+
+            GenTree* convertResult =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal,
+                                                     NI_AVX512_ConvertToVector128Int64WithTruncation, srcBaseType, 16);
+
+            castRange.InsertAtEnd(convertResult);
+
+            // Now we handle saturation of the result for positive overflow.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var maxFloatingValue = Vector128.Create(9223372036854775808.0);
+            //   var compareMode = FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling;
+            //   var compareMax = Avx.CompareScalar(srcVec, maxFloatingValue, compareMode);
+
+            NamedIntrinsic compareIntrinsic = (srcType == TYP_FLOAT) ? NI_AVX_Compare : NI_AVX_CompareScalar;
+            GenTreeVecCon* maxFloatingValue = m_compiler->gtNewVconNode(TYP_SIMD16);
+
+            if (srcType == TYP_FLOAT)
+            {
+                // For float->long, we broadcast the comparison value, same as we broadcast the input.
+                for (uint32_t index = 0; index < 4; index++)
+                {
+                    maxFloatingValue->gtSimdVal.f32[index] = 9223372036854775808.0f;
+                }
+            }
+            else
+            {
+                maxFloatingValue->gtSimdVal.f64[0] = 9223372036854775808.0;
+            }
+
+            castRange.InsertAtEnd(maxFloatingValue);
+
+            srcClone             = m_compiler->gtClone(srcVector);
+            GenTree* compareMode = m_compiler->gtNewIconNode(
+                static_cast<int32_t>(FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling));
+            GenTree* compareMax = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone, maxFloatingValue,
+                                                                       compareMode, compareIntrinsic, srcBaseType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(compareMode);
+            castRange.InsertAtEnd(compareMax);
+
+            // We will use the compare mask multiple times, so we replace it with a lclVar.
+            LIR::Use cmpUse;
+            LIR::Use::MakeDummyUse(castRange, compareMax, &cmpUse);
+            cmpUse.ReplaceWithLclVar(m_compiler);
+            compareMax = cmpUse.Def();
+
+            // Mask in long.MaxValue for positive saturation. In the case of overflow, the compare
+            // mask will be all ones. We shift that value right by one to create the MaxValue vector.
+            // This is where we treat two adjacent elements from a float compare as one 64-bit mask.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var maxLong = Sse2.ShiftRightLogical(compareMax, 1);
+            //   castResult = Vector128.ConditionalSelect(compareMax, maxLong, convertResult);
+
+            GenTree* cmpClone = m_compiler->gtClone(compareMax);
+            GenTree* one      = m_compiler->gtNewIconNode(1);
+            GenTree* maxLong  = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, compareMax, one,
+                                                                     NI_X86Base_ShiftRightLogical, dstBaseType, 16);
+
+            castRange.InsertAtEnd(one);
+            castRange.InsertAtEnd(maxLong);
+            castRange.InsertAtEnd(cmpClone);
+
+            castResult = m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, cmpClone, maxLong, convertResult, dstBaseType, 16);
+        }
+
+        // Because the results are in a SIMD register, we need to ToScalar() them out.
+        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(genActualType(dstType), castResult, dstBaseType, 16);
+
+        castRange.InsertAtEnd(castResult);
+        castRange.InsertAtEnd(toScalar);
+
+        Range().InsertAfter(cast, std::move(castRange));
+        Range().Remove(cast);
+
         if (use.IsDummyUse())
         {
             toScalar->SetUnusedValue();
         }
         use.ReplaceWith(toScalar);
 
-        return toScalar->gtNext;
+        return toScalar;
     }
 #endif // FEATURE_HW_INTRINSICS && TARGET_X86
 
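To make the new sequences easier to follow outside the JIT, here is a hedged C# model of what the emitted code computes, stitched together from the C# equivalents quoted in the comments above. It assumes AVX-512 (DQ+VL) hardware, shows only the double paths (the float->long path additionally broadcasts the input so two adjacent mask elements act as one 64-bit mask), and the method names are illustrative, not part of the change:

```csharp
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class CastModel
{
    // long->double needs no fixup; only the vector form of the conversion
    // instruction exists on 32-bit, hence the round-trip through Vector128.
    static double Int64ToDouble(long value) =>
        Avx512DQ.VL.ConvertToVector128Double(Vector128.CreateScalarUnsafe(value)).ToScalar();

    static ulong DoubleToUInt64(double value)
    {
        // MAXSD takes the second operand when the first is NaN, so one
        // instruction clamps both negative and NaN inputs to zero.
        Vector128<double> srcVec   = Vector128.CreateScalarUnsafe(value);
        Vector128<double> fixupVal = Sse2.MaxScalar(srcVec, Vector128<double>.Zero);

        // The AVX-512 unsigned conversion already saturates positive overflow.
        return Avx512DQ.VL.ConvertToVector128UInt64WithTruncation(fixupVal).ToScalar();
    }

    static long DoubleToInt64(double value)
    {
        Vector128<double> srcVec = Vector128.CreateScalarUnsafe(value);

        // NaN fixup: an ordered self-compare is all-zeros only for NaN, so
        // the AND forces NaN inputs to zero before conversion.
        Vector128<double> nanMask   = Sse2.CompareScalarOrdered(srcVec, srcVec);
        Vector128<double> fixupVal  = Sse2.And(srcVec, nanMask);
        Vector128<long>   converted = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(fixupVal);

        // Positive saturation: 2^63 is the smallest double that overflows long.
        Vector128<double> maxValue = Vector128.Create(9223372036854775808.0);
        Vector128<long> compareMax =
            Avx.CompareScalar(srcVec, maxValue, FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling).AsInt64();

        // An all-ones compare mask shifted right logically by one is
        // 0x7FFF_FFFF_FFFF_FFFF, i.e. long.MaxValue.
        Vector128<long> maxLong = Sse2.ShiftRightLogical(compareMax.AsUInt64(), 1).AsInt64();

        return Vector128.ConditionalSelect(compareMax, maxLong, converted).ToScalar();
    }
}
```

On AVX10.2 hardware the two floating->long helpers collapse to the single saturating conversion instructions that the diff selects first, with no fixup code at all.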