Skip to content

Commit aeecdb8

Browse files
authored
Expose AVX512F embedded rounding intrinsics. (#97415)
* Expose embedded rounding related scalar intrinsic APIs * Expose embedded rounding related arithmetic intrinsic APIs * Ensure the new APIs are properly lowered * Bug fixes * Expose embedded rounding casting APIs * Expose arithmetic embedded rounding unit tests * Add a test template for embedded rounding APIs, this will be enough to cover all the binary APIs including vector and scalar operations. * Add template for unary ops * Expose all the embedded rounding unit tests generated by the templates * Expose embedded rounding casting APIs unit tests * Expose handwritten unit tests for embedded rounding APIs with special input arg lists. * Bug fixes: 1. ConvertToVector256Int32/UInt32 use special code gen path, adding a fallback path when embedded rounding is activated and the control byte is not constant. * Bug fix: Fix wrong data type in the API definition. * formatting * Update API documents for embedded rounding APIs. * resolve conflicts with #97569 * formatting * bug fix and remove unneeded SAE related intrinsics * resolve comments: 1. update the arg lists for genHWIntrinsic_R_RM * resolve comments: Add jumptable fallback to non-table driven embedded rounding intrinsics. * resolve comments: 1. remove some redundant checks on embedded rounding intrinsics * Bug fix: pass the correct operand GenTree node, when emitting the fallback for embedded rounding intrinsics. * formatting * revert an unexpected change. * 1. Resolve comments: 2. Added FMA intrinsics with embedded rounding and unit tests. * Expose the rest of embedded rounding APIs * formatting * Ensure the control byte local is assigned to the correct register.
1 parent f2d5b2f commit aeecdb8

23 files changed

+3737
-147
lines changed

src/coreclr/jit/codegen.h

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -976,13 +976,23 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
976976
#ifdef FEATURE_HW_INTRINSICS
977977
void genHWIntrinsic(GenTreeHWIntrinsic* node);
978978
#if defined(TARGET_XARCH)
979-
void genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber reg, GenTree* rmOp);
979+
void genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node,
980+
instruction ins,
981+
emitAttr attr,
982+
regNumber reg,
983+
GenTree* rmOp,
984+
insOpts instOptions = INS_OPTS_NONE);
980985
void genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival);
981986
void genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, insOpts instOptions);
982987
void genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival);
983988
void genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr);
984-
void genHWIntrinsic_R_R_R_RM(
985-
instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3);
989+
void genHWIntrinsic_R_R_R_RM(instruction ins,
990+
emitAttr attr,
991+
regNumber targetReg,
992+
regNumber op1Reg,
993+
regNumber op2Reg,
994+
GenTree* op3,
995+
insOpts instOptions = INS_OPTS_NONE);
986996
void genHWIntrinsic_R_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival);
987997

988998
void genBaseIntrinsic(GenTreeHWIntrinsic* node);
@@ -994,7 +1004,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
9941004
void genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
9951005
void genAESIntrinsic(GenTreeHWIntrinsic* node);
9961006
void genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
997-
void genFMAIntrinsic(GenTreeHWIntrinsic* node);
1007+
void genFMAIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
9981008
void genPermuteVar2x(GenTreeHWIntrinsic* node);
9991009
void genLZCNTIntrinsic(GenTreeHWIntrinsic* node);
10001010
void genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node);
@@ -1008,6 +1018,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
10081018
regNumber baseReg,
10091019
regNumber offsReg,
10101020
HWIntrinsicSwitchCaseBody emitSwCase);
1021+
1022+
void genNonTableDrivenHWIntrinsicsJumpTableFallback(GenTreeHWIntrinsic* node, GenTree* lastOp);
10111023
#endif // defined(TARGET_XARCH)
10121024

10131025
#ifdef TARGET_ARM64

src/coreclr/jit/emitxarch.cpp

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6565,7 +6565,7 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN
65656565
* Add an instruction with two register operands.
65666566
*/
65676567

6568-
void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2)
6568+
void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, insOpts instOptions)
65696569
{
65706570
if (IsMovInstruction(ins))
65716571
{
@@ -6587,6 +6587,13 @@ void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNum
65876587
id->idReg1(reg1);
65886588
id->idReg2(reg2);
65896589

6590+
if ((instOptions & INS_OPTS_EVEX_b_MASK) != INS_OPTS_NONE)
6591+
{
6592+
// if EVEX.b needs to be set in this path, then it should be embedded rounding.
6593+
assert(UseEvexEncoding());
6594+
id->idSetEvexbContext(instOptions);
6595+
}
6596+
65906597
UNATIVE_OFFSET sz = emitInsSizeRR(id);
65916598
id->idCodeSize(sz);
65926599

@@ -8545,20 +8552,32 @@ void emitter::emitIns_SIMD_R_R_R_C(instruction ins,
85458552
// op1Reg -- The register of the first operand
85468553
// op2Reg -- The register of the second operand
85478554
// op3Reg -- The register of the third operand
8555+
// instOptions - The options that modify how the instruction is generated
85488556
//
8549-
void emitter::emitIns_SIMD_R_R_R_R(
8550-
instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, regNumber op3Reg)
8557+
void emitter::emitIns_SIMD_R_R_R_R(instruction ins,
8558+
emitAttr attr,
8559+
regNumber targetReg,
8560+
regNumber op1Reg,
8561+
regNumber op2Reg,
8562+
regNumber op3Reg,
8563+
insOpts instOptions)
85518564
{
85528565
if (IsFMAInstruction(ins) || IsPermuteVar2xInstruction(ins) || IsAVXVNNIInstruction(ins))
85538566
{
85548567
assert(UseSimdEncoding());
85558568

8569+
if (instOptions != INS_OPTS_NONE)
8570+
{
8571+
// insOpts is currently available only in EVEX encoding.
8572+
assert(UseEvexEncoding());
8573+
}
8574+
85568575
// Ensure we aren't overwriting op2 or op3
85578576
assert((op2Reg != targetReg) || (op1Reg == targetReg));
85588577
assert((op3Reg != targetReg) || (op1Reg == targetReg));
85598578

85608579
emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true);
8561-
emitIns_R_R_R(ins, attr, targetReg, op2Reg, op3Reg);
8580+
emitIns_R_R_R(ins, attr, targetReg, op2Reg, op3Reg, instOptions);
85628581
}
85638582
else if (UseSimdEncoding())
85648583
{
@@ -11659,6 +11678,7 @@ void emitter::emitDispIns(
1165911678
default:
1166011679
{
1166111680
printf("%s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr));
11681+
emitDispEmbRounding(id);
1166211682
break;
1166311683
}
1166411684
}

src/coreclr/jit/emitxarch.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -635,7 +635,7 @@ void emitIns_R_I(instruction ins,
635635

636636
void emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regNumber srgReg, bool canSkip);
637637

638-
void emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2);
638+
void emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, insOpts instOptions = INS_OPTS_NONE);
639639

640640
void emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int ival);
641641

@@ -839,8 +839,13 @@ void emitIns_SIMD_R_R_R_C(instruction ins,
839839
regNumber op2Reg,
840840
CORINFO_FIELD_HANDLE fldHnd,
841841
int offs);
842-
void emitIns_SIMD_R_R_R_R(
843-
instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, regNumber op3Reg);
842+
void emitIns_SIMD_R_R_R_R(instruction ins,
843+
emitAttr attr,
844+
regNumber targetReg,
845+
regNumber op1Reg,
846+
regNumber op2Reg,
847+
regNumber op3Reg,
848+
insOpts instOptions = INS_OPTS_NONE);
844849
void emitIns_SIMD_R_R_R_S(
845850
instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, int varx, int offs);
846851

src/coreclr/jit/gentree.cpp

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26484,6 +26484,95 @@ bool GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic() const
2648426484
return Oper == GT_AND || Oper == GT_OR || Oper == GT_XOR || Oper == GT_AND_NOT;
2648526485
}
2648626486

26487+
//------------------------------------------------------------------------
26488+
// OperIsEmbRoundingEnabled: Reports whether this HWIntrinsic node is the embedded rounding form of its intrinsic, decided from the intrinsic id and the operand count.
26489+
//
26490+
// Return Value:
26491+
// false when the intrinsic is not embedded rounding compatible (or when not targeting xarch); true for the scalar intrinsics that only exist in embedded rounding form; otherwise true only when the operand count equals the count expected for that intrinsic's group (4, 3 or 2 - presumably the regular operands plus the rounding control operand; confirm against the API definitions).
26492+
//
26493+
bool GenTreeHWIntrinsic::OperIsEmbRoundingEnabled() const
26494+
{
26495+
#if defined(TARGET_XARCH)
26496+
NamedIntrinsic intrinsicId = GetHWIntrinsicId();
26497+
26498+
if (!HWIntrinsicInfo::IsEmbRoundingCompatible(intrinsicId))
26499+
{
26500+
return false;
26501+
}
26502+
26503+
size_t numArgs = GetOperandCount();
26504+
switch (intrinsicId)
26505+
{
26506+
// These intrinsics only have the embedded rounding enabled implementation, so no operand-count check is needed.
26507+
case NI_AVX512F_AddScalar:
26508+
case NI_AVX512F_DivideScalar:
26509+
case NI_AVX512F_MultiplyScalar:
26510+
case NI_AVX512F_SubtractScalar:
26511+
case NI_AVX512F_SqrtScalar:
26512+
{
26513+
return true;
26514+
}
26515+
26516+
case NI_AVX512F_FusedMultiplyAdd:
26517+
case NI_AVX512F_FusedMultiplyAddScalar:
26518+
case NI_AVX512F_FusedMultiplyAddNegated:
26519+
case NI_AVX512F_FusedMultiplyAddNegatedScalar:
26520+
case NI_AVX512F_FusedMultiplyAddSubtract:
26521+
case NI_AVX512F_FusedMultiplySubtract:
26522+
case NI_AVX512F_FusedMultiplySubtractAdd:
26523+
case NI_AVX512F_FusedMultiplySubtractNegated:
26524+
case NI_AVX512F_FusedMultiplySubtractNegatedScalar:
26525+
case NI_AVX512F_FusedMultiplySubtractScalar:
26526+
{
26527+
return numArgs == 4;
26528+
}
26529+
26530+
case NI_AVX512F_Add:
26531+
case NI_AVX512F_Divide:
26532+
case NI_AVX512F_Multiply:
26533+
case NI_AVX512F_Subtract:
26534+
26535+
case NI_AVX512F_Scale:
26536+
case NI_AVX512F_ScaleScalar:
26537+
26538+
case NI_AVX512F_ConvertScalarToVector128Single:
26539+
#if defined(TARGET_AMD64)
26540+
case NI_AVX512F_X64_ConvertScalarToVector128Double:
26541+
case NI_AVX512F_X64_ConvertScalarToVector128Single:
26542+
#endif // TARGET_AMD64
26543+
{
26544+
return numArgs == 3;
26545+
}
26546+
26547+
case NI_AVX512F_Sqrt:
26548+
case NI_AVX512F_ConvertToInt32:
26549+
case NI_AVX512F_ConvertToUInt32:
26550+
case NI_AVX512F_ConvertToVector256Int32:
26551+
case NI_AVX512F_ConvertToVector256Single:
26552+
case NI_AVX512F_ConvertToVector256UInt32:
26553+
case NI_AVX512F_ConvertToVector512Single:
26554+
case NI_AVX512F_ConvertToVector512UInt32:
26555+
case NI_AVX512F_ConvertToVector512Int32:
26556+
#if defined(TARGET_AMD64)
26557+
case NI_AVX512F_X64_ConvertToInt64:
26558+
case NI_AVX512F_X64_ConvertToUInt64:
26559+
#endif // TARGET_AMD64
26560+
case NI_AVX512DQ_ConvertToVector256Single:
26561+
case NI_AVX512DQ_ConvertToVector512Double:
26562+
case NI_AVX512DQ_ConvertToVector512Int64:
26563+
case NI_AVX512DQ_ConvertToVector512UInt64:
26564+
{
26565+
return numArgs == 2;
26566+
}
26567+
26568+
default:
26569+
unreached();
26570+
}
26571+
#else // !TARGET_XARCH
26572+
return false;
26573+
#endif // TARGET_XARCH
26574+
}
26575+
2648726576
//------------------------------------------------------------------------------
2648826577
// OperRequiresAsgFlag : Check whether the operation requires GTF_ASG flag regardless
2648926578
// of the children's flags.

src/coreclr/jit/gentree.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6387,6 +6387,7 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic
63876387
bool OperIsBroadcastScalar() const;
63886388
bool OperIsCreateScalarUnsafe() const;
63896389
bool OperIsBitwiseHWIntrinsic() const;
6390+
bool OperIsEmbRoundingEnabled() const;
63906391

63916392
bool OperRequiresAsgFlag() const;
63926393
bool OperRequiresCallFlag() const;

src/coreclr/jit/hwintrinsic.h

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -607,21 +607,6 @@ struct HWIntrinsicInfo
607607
HWIntrinsicFlag flags = lookupFlags(id);
608608
return (flags & HW_Flag_EmbMaskingIncompatible) == 0;
609609
}
610-
611-
static size_t EmbRoundingArgPos(NamedIntrinsic id)
612-
{
613-
// This helper function returns the expected position,
614-
// where the embedded rounding control argument should be.
615-
assert(IsEmbRoundingCompatible(id));
616-
switch (id)
617-
{
618-
case NI_AVX512F_Add:
619-
return 3;
620-
621-
default:
622-
unreached();
623-
}
624-
}
625610
#endif // TARGET_XARCH
626611

627612
static bool CanBenefitFromConstantProp(NamedIntrinsic id)

0 commit comments

Comments
 (0)