Skip to content

Commit 453d58e

Browse files
Tweak the register order for xarch to better account for callee saved (#88151)
Tweak the register order for xarch to better account for callee saved (#88151)

* Tweak the register order for xarch to better account for callee saved
* Have a different register order for leaf methods
* Preference R12 after R13 for 2x the savings
* Allow float/vector constants to use EVEX registers
* Fix Runtime_34937 to expect the right codegen on Unix
* Don't add VAR_ORDER_LEAF and add a comment explaining why we prefer callee trash then callee save
* Simplifying the REG_VAR_ORDER_FLT_EVEX_CALLEE_* defines
* Ensure x86 matches the general x64 ordering
* Ensure that AllBitsSet vector constants correctly handle high simd registers
* Ensure that Runtime_34937 expects the right codegen for Windows x64
* Improve the register ordering more
* Ensure scalar AllBitsSet handles EVEX registers
* Don't change register allocation order for x86
* Minimize the diffs
1 parent 0aca9e5 commit 453d58e

File tree

8 files changed

+185
-80
lines changed

8 files changed

+185
-80
lines changed

src/coreclr/jit/codegenxarch.cpp

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,16 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t
438438
simd8_t val8 = *(simd8_t*)val;
439439
if (val8.IsAllBitsSet())
440440
{
441-
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
441+
if (emitter::isHighSimdReg(targetReg))
442+
{
443+
assert(compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512F));
444+
emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg,
445+
static_cast<int8_t>(0xFF));
446+
}
447+
else
448+
{
449+
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
450+
}
442451
}
443452
else if (val8.IsZero())
444453
{
@@ -456,7 +465,16 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t
456465
simd12_t val12 = *(simd12_t*)val;
457466
if (val12.IsAllBitsSet())
458467
{
459-
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
468+
if (emitter::isHighSimdReg(targetReg))
469+
{
470+
assert(compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512F));
471+
emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg,
472+
static_cast<int8_t>(0xFF));
473+
}
474+
else
475+
{
476+
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
477+
}
460478
}
461479
else if (val12.IsZero())
462480
{
@@ -476,7 +494,16 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t
476494
simd16_t val16 = *(simd16_t*)val;
477495
if (val16.IsAllBitsSet())
478496
{
479-
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
497+
if (emitter::isHighSimdReg(targetReg))
498+
{
499+
assert(compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512F));
500+
emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg,
501+
static_cast<int8_t>(0xFF));
502+
}
503+
else
504+
{
505+
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
506+
}
480507
}
481508
else if (val16.IsZero())
482509
{
@@ -494,7 +521,16 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t
494521
simd32_t val32 = *(simd32_t*)val;
495522
if (val32.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
496523
{
497-
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
524+
if (emitter::isHighSimdReg(targetReg))
525+
{
526+
assert(compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512F));
527+
emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg,
528+
static_cast<int8_t>(0xFF));
529+
}
530+
else
531+
{
532+
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
533+
}
498534
}
499535
else if (val32.IsZero())
500536
{
@@ -592,8 +628,17 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
592628
}
593629
else if (tree->IsFloatAllBitsSet())
594630
{
595-
// A faster/smaller way to generate AllBitsSet
596-
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
631+
if (emitter::isHighSimdReg(targetReg))
632+
{
633+
assert(compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512F));
634+
emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, EA_16BYTE, targetReg, targetReg, targetReg,
635+
static_cast<int8_t>(0xFF));
636+
}
637+
else
638+
{
639+
// A faster/smaller way to generate AllBitsSet
640+
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
641+
}
597642
}
598643
else
599644
{

src/coreclr/jit/emitxarch.cpp

Lines changed: 6 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1570,38 +1570,19 @@ bool emitter::TakesRexWPrefix(const instrDesc* id) const
15701570
bool emitter::HasHighSIMDReg(const instrDesc* id) const
15711571
{
15721572
#if defined(TARGET_AMD64)
1573-
if (IsHighSIMDReg(id->idReg1()) || IsHighSIMDReg(id->idReg2()))
1573+
if (isHighSimdReg(id->idReg1()) || isHighSimdReg(id->idReg2()))
15741574
return true;
15751575

15761576
if (id->idIsSmallDsc())
15771577
return false;
15781578

1579-
if ((id->idHasReg3() && IsHighSIMDReg(id->idReg3())) || (id->idHasReg4() && IsHighSIMDReg(id->idReg4())))
1579+
if ((id->idHasReg3() && isHighSimdReg(id->idReg3())) || (id->idHasReg4() && isHighSimdReg(id->idReg4())))
15801580
return true;
15811581
#endif
15821582
// X86 JIT operates in 32-bit mode and hence extended regs are not available.
15831583
return false;
15841584
}
15851585

1586-
//------------------------------------------------------------------------
1587-
// IsHighSIMDReg: Checks if a register is strictly an EVEX encoded high SIMD
1588-
// registers (xmm16-xmm31).
1589-
//
1590-
// Arguments:
1591-
// reg -- register to check
1592-
//
1593-
// Return Value:
1594-
// true if the register is strictly an EVEX encoded high SIMD register
1595-
bool emitter::IsHighSIMDReg(regNumber reg) const
1596-
{
1597-
#ifdef TARGET_AMD64
1598-
return ((reg >= REG_XMM16) && (reg <= REG_XMM31));
1599-
#else
1600-
// X86 JIT operates in 32-bit mode and hence extended regs are not available.
1601-
return false;
1602-
#endif
1603-
}
1604-
16051586
//------------------------------------------------------------------------
16061587
// HasMaskReg: Checks if an instruction uses a KMask registers (k0-k7)
16071588
//
@@ -3160,7 +3141,7 @@ inline unsigned emitter::insEncodeReg012(const instrDesc* id, regNumber reg, emi
31603141

31613142
if (IsExtendedReg(reg))
31623143
{
3163-
if (IsHighSIMDReg(reg))
3144+
if (isHighSimdReg(reg))
31643145
{
31653146
*code = AddRexXPrefix(id, *code); // EVEX.X
31663147
}
@@ -3203,7 +3184,7 @@ inline unsigned emitter::insEncodeReg345(const instrDesc* id, regNumber reg, emi
32033184

32043185
if (IsExtendedReg(reg))
32053186
{
3206-
if (IsHighSIMDReg(reg))
3187+
if (isHighSimdReg(reg))
32073188
{
32083189
*code = AddEvexRPrimePrefix(*code); // EVEX.R'
32093190
}
@@ -3262,7 +3243,7 @@ inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber
32623243
// Rather see these paths cleaned up.
32633244
regBits = HighAwareRegEncoding(reg);
32643245

3265-
if (IsHighSIMDReg(reg))
3246+
if (isHighSimdReg(reg))
32663247
{
32673248
// Have to set the EVEX V' bit
32683249
code = AddEvexVPrimePrefix(code);
@@ -3308,7 +3289,7 @@ inline unsigned emitter::insEncodeRegSIB(const instrDesc* id, regNumber reg, cod
33083289

33093290
if (IsExtendedReg(reg))
33103291
{
3311-
if (IsHighSIMDReg(reg))
3292+
if (isHighSimdReg(reg))
33123293
{
33133294
*code = AddEvexVPrimePrefix(*code); // EVEX.X
33143295
}

src/coreclr/jit/emitxarch.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,16 @@ inline static bool isMaskReg(regNumber reg)
2828
return (reg >= REG_MASK_FIRST && reg <= REG_MASK_LAST);
2929
}
3030

31+
inline static bool isHighSimdReg(regNumber reg)
32+
{
33+
#ifdef TARGET_AMD64
34+
return ((reg >= REG_XMM16) && (reg <= REG_XMM31));
35+
#else
36+
// X86 JIT operates in 32-bit mode and hence extended regs are not available.
37+
return false;
38+
#endif
39+
}
40+
3141
/************************************************************************/
3242
/* Routines that compute the size of / encode instructions */
3343
/************************************************************************/
@@ -890,7 +900,6 @@ inline bool HasEmbeddedBroadcast(const instrDesc* id) const
890900
}
891901

892902
inline bool HasHighSIMDReg(const instrDesc* id) const;
893-
inline bool IsHighSIMDReg(regNumber) const;
894903

895904
inline bool HasMaskReg(const instrDesc* id) const;
896905

src/coreclr/jit/hwintrinsiccodegenxarch.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1861,6 +1861,7 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node)
18611861
lastOp = op3;
18621862

18631863
// generate all-one mask vector
1864+
assert(!emitter::isHighSimdReg(targetReg));
18641865
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
18651866
}
18661867

src/coreclr/jit/lsrabuild.cpp

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1913,12 +1913,13 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc
19131913

19141914
static const regNumber lsraRegOrder[] = {REG_VAR_ORDER};
19151915
const unsigned lsraRegOrderSize = ArrLen(lsraRegOrder);
1916-
// TODO-XARCH-AVX512 we might want to move this to be configured with the rbm variables too
1916+
19171917
static const regNumber lsraRegOrderFlt[] = {REG_VAR_ORDER_FLT};
19181918
const unsigned lsraRegOrderFltSize = ArrLen(lsraRegOrderFlt);
1919+
19191920
#if defined(TARGET_AMD64)
1920-
static const regNumber lsraRegOrderFltUpper[] = {REG_VAR_ORDER_FLT_UPPER};
1921-
const unsigned lsraRegOrderUpperFltSize = ArrLen(lsraRegOrderFltUpper);
1921+
static const regNumber lsraRegOrderFltEvex[] = {REG_VAR_ORDER_FLT_EVEX};
1922+
const unsigned lsraRegOrderFltEvexSize = ArrLen(lsraRegOrderFltEvex);
19221923
#endif // TARGET_AMD64
19231924

19241925
//------------------------------------------------------------------------
@@ -1945,23 +1946,37 @@ void LinearScan::buildPhysRegRecords()
19451946
// initializing the floating registers.
19461947
// For that `compFloatingPointUsed` should be set accurately
19471948
// before invoking allocator.
1948-
for (unsigned int i = 0; i < lsraRegOrderFltSize; i++)
1949-
{
1950-
regNumber reg = lsraRegOrderFlt[i];
1951-
RegRecord* curr = &physRegs[reg];
1952-
curr->regOrder = (unsigned char)i;
1953-
}
1949+
1950+
const regNumber* regOrderFlt;
1951+
unsigned regOrderFltSize;
1952+
19541953
#if defined(TARGET_AMD64)
1954+
// x64 has additional registers available when EVEX is supported
1955+
// and that causes a different ordering to be used since they are
1956+
// callee trash and should appear at the end of the existing callee
1957+
// trash set
1958+
19551959
if (compiler->canUseEvexEncoding())
19561960
{
1957-
for (unsigned int i = 0; i < lsraRegOrderUpperFltSize; i++)
1958-
{
1959-
regNumber reg = lsraRegOrderFltUpper[i];
1960-
RegRecord* curr = &physRegs[reg];
1961-
curr->regOrder = (unsigned char)(i + lsraRegOrderFltSize);
1962-
}
1961+
regOrderFlt = &lsraRegOrderFltEvex[0];
1962+
regOrderFltSize = lsraRegOrderFltEvexSize;
1963+
}
1964+
else
1965+
{
1966+
regOrderFlt = &lsraRegOrderFlt[0];
1967+
regOrderFltSize = lsraRegOrderFltSize;
1968+
}
1969+
#else
1970+
regOrderFlt = &lsraRegOrderFlt[0];
1971+
regOrderFltSize = lsraRegOrderFltSize;
1972+
#endif
1973+
1974+
for (unsigned int i = 0; i < regOrderFltSize; i++)
1975+
{
1976+
regNumber reg = regOrderFlt[i];
1977+
RegRecord* curr = &physRegs[reg];
1978+
curr->regOrder = (unsigned char)i;
19631979
}
1964-
#endif // TARGET_AMD64
19651980
}
19661981

19671982
//------------------------------------------------------------------------

src/coreclr/jit/lsraxarch.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,12 +154,14 @@ int LinearScan::BuildNode(GenTree* tree)
154154
case GT_CNS_VEC:
155155
{
156156
srcCount = 0;
157+
157158
assert(dstCount == 1);
158159
assert(!tree->IsReuseRegVal());
159-
RefPosition* def = BuildDef(tree, BuildEvexIncompatibleMask(tree));
160+
161+
RefPosition* def = BuildDef(tree);
160162
def->getInterval()->isConstant = true;
163+
break;
161164
}
162-
break;
163165

164166
#if !defined(TARGET_64BIT)
165167

0 commit comments

Comments
 (0)