
Commit 0fd6a44

Author: Li Tian (committed)
Remove AVX/SSE transition penalties
There are two kinds of transition penalties:

1. Transition from 256-bit AVX code to 128-bit legacy SSE code.
2. Transition from 128-bit legacy SSE code to either 128-bit or 256-bit AVX code. This only happens if there was a preceding AVX256-to-legacy-SSE transition (penalty #1).

The primary goal is to remove penalty #1, the AVX to SSE transition penalty.

Added two emitter flags: contains256bitAVXInstruction indicates that the JIT method contains 256-bit AVX code; containsAVXInstruction indicates that the method contains 128-bit or 256-bit AVX code.

Issue VZEROUPPER in the prolog if the method contains 128-bit or 256-bit AVX code, to avoid the legacy SSE to AVX transition penalty; this can happen in the reverse PInvoke situation.

Issue VZEROUPPER in the epilog if the method contains 256-bit AVX code, to avoid the AVX to legacy SSE transition penalty.

To limit the code-size increase, we only issue VZEROUPPER before a PInvoke call to a user-defined function if the JIT method contains 256-bit AVX code, on the assumption that the user-defined function may contain legacy SSE code. There is no need to issue VZEROUPPER after the PInvoke call, because penalty #2 (SSE to AVX) cannot occur once penalty #1 (AVX to SSE) has been taken care of before the call.

We measured a ~1% to 3% performance gain on TechEmpower plaintext and verified with VTune that the AVX/SSE transition events OTHER_ASSISTS.AVX_TO_SSE and OTHER_ASSISTS.SSE_TO_AVX have been reduced to 0.

Fix #7240
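For illustration, here is a minimal user-level sketch of the penalty being avoided. This snippet is not part of the commit: _mm256_zeroupper() is the compiler-intrinsic form of VZEROUPPER, and legacy_sse_routine is a hypothetical stand-in for any callee compiled with non-VEX (legacy SSE) encodings.

    #include <immintrin.h>

    // Hypothetical stand-in for native code compiled with legacy SSE encodings.
    extern "C" float legacy_sse_routine(const float* data, int n);

    float mixed_avx_sse(const float* a, const float* b, float* out, int n)
    {
        // 256-bit AVX loop: leaves the upper halves of the YMM registers dirty.
        for (int i = 0; i + 8 <= n; i += 8)
        {
            __m256 va = _mm256_loadu_ps(a + i);
            __m256 vb = _mm256_loadu_ps(b + i);
            _mm256_storeu_ps(out + i, _mm256_add_ps(va, vb));
        }

        // Without this, the first legacy SSE instruction in the callee pays the
        // AVX-to-SSE transition penalty (VTune: OTHER_ASSISTS.AVX_TO_SSE). This is
        // the same effect the JIT achieves by emitting VZEROUPPER before the call.
        _mm256_zeroupper();

        return legacy_sse_routine(out, n);
    }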
1 parent c10c1ff commit 0fd6a44

File tree

5 files changed: +139 additions, −20 deletions

src/jit/codegencommon.cpp

Lines changed: 50 additions & 20 deletions
@@ -10583,6 +10583,19 @@ GenTreePtr CodeGen::genMakeConst(const void* cnsAddr, var_types cnsType, GenTree
 // funclet frames: this will be FuncletInfo.fiSpDelta.
 void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
 {
+#ifdef FEATURE_AVX_SUPPORT
+    bool bVzeroupperIssued = false;
+    // If the method contains AVX instruction, issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
+    // This is to avoid AVX/SSE transition penalty since in Reverse PInvoke, native code that contains Legacy SSE
+    // instruction can call into this managed code that contains 128bit or 256bit AVX instruction
+    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX &&
+        getEmitter()->ContainsAVX())
+    {
+        instGen(INS_vzeroupper);
+        bVzeroupperIssued = true;
+    }
+#endif
+
     regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;

     // Only callee saved floating point registers should be in regMask
@@ -10611,6 +10624,16 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
         regMaskTP regBit = genRegMask(reg);
         if ((regBit & regMask) != 0)
         {
+#ifdef FEATURE_AVX_SUPPORT
+            // Vzeroupper needs to be issued before copyIns (vmovupd etc.) that are AVX instructions
+            if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX &&
+                !bVzeroupperIssued &&
+                getEmitter()->IsAVXInstruction(copyIns))
+            {
+                instGen(INS_vzeroupper);
+                bVzeroupperIssued = true;
+            }
+#endif
             // ABI requires us to preserve lower 128-bits of YMM register.
             getEmitter()->emitIns_AR_R(copyIns,
                                        EA_8BYTE, // TODO-XArch-Cleanup: size specified here doesn't matter but should be
@@ -10621,16 +10644,6 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
             offset -= XMM_REGSIZE_BYTES;
         }
     }
-
-#ifdef FEATURE_AVX_SUPPORT
-    // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
-    // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is
-    // using SSE2.
-    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
-    {
-        instGen(INS_vzeroupper);
-    }
-#endif
 }

 // Save/Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
@@ -10651,6 +10664,19 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
     // fast path return
     if (regMask == RBM_NONE)
     {
+#ifdef FEATURE_AVX_SUPPORT
+        // Before return, check if the method contains 256bit AVX instruction, issue a Vzeroupper to zero out
+        // upper 128-bits of all YMM regs. This is to avoid AVX-256 to legacy SSE transition penalty.
+        if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX &&
+            getEmitter()->Contains256bitAVX())
+        {
+            instGen(INS_vzeroupper);
+        }
+
+        // reset both AVX and 256bit AVX flags for the function before return
+        getEmitter()->SetContainsAVX(false);
+        getEmitter()->SetContains256bitAVX(false);
+#endif
         return;
     }

@@ -10682,16 +10708,6 @@
     assert((offset % 16) == 0);
 #endif // _TARGET_AMD64_

-#ifdef FEATURE_AVX_SUPPORT
-    // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
-    // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is
-    // using SSE2.
-    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
-    {
-        instGen(INS_vzeroupper);
-    }
-#endif
-
     for (regNumber reg = REG_FLT_CALLEE_SAVED_FIRST; regMask != RBM_NONE; reg = REG_NEXT(reg))
     {
         regMaskTP regBit = genRegMask(reg);
@@ -10706,6 +10722,20 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
             offset -= XMM_REGSIZE_BYTES;
         }
     }
+
+#ifdef FEATURE_AVX_SUPPORT
+    // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
+    // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is
+    // using SSE2.
+    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX &&
+        getEmitter()->Contains256bitAVX())
+    {
+        instGen(INS_vzeroupper);
+    }
+    // reset both AVX and 256bit AVX flags for the function before return
+    getEmitter()->SetContainsAVX(false);
+    getEmitter()->SetContains256bitAVX(false);
+#endif
 }
 #endif // defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87

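The emitter-side bookkeeping behind ContainsAVX/Contains256bitAVX lives in the two remaining changed files (presumably the emitter sources), which are not shown in this excerpt. A plausible sketch of the accessors used above, assuming plain boolean members on the emitter; the accessor and flag names come from the diff and commit message, but the layout here is an assumption:

    // Sketch only: the real declarations are in the emitter sources not shown in this excerpt.
    class emitter
    {
        bool containsAVXInstruction       = false; // any 128-bit or 256-bit VEX instruction emitted
        bool contains256bitAVXInstruction = false; // any 256-bit (YMM) VEX instruction emitted

    public:
        bool ContainsAVX() const              { return containsAVXInstruction; }
        void SetContainsAVX(bool value)       { containsAVXInstruction = value; }
        bool Contains256bitAVX() const        { return contains256bitAVXInstruction; }
        void SetContains256bitAVX(bool value) { contains256bitAVXInstruction = value; }
    };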

src/jit/codegenxarch.cpp

Lines changed: 14 additions & 0 deletions
@@ -5001,6 +5001,20 @@ void CodeGen::genCallInstruction(GenTreePtr node)

 #endif // defined(_TARGET_X86_)

+#ifdef FEATURE_AVX_SUPPORT
+    // When it's a PInvoke call and the call type is USER function, we issue VZEROUPPER here
+    // if the function contains 256bit AVX instructions, this is to avoid AVX-256 to Legacy SSE
+    // transition penalty, assume the user function may contains legacy SSE instruction
+    if (call->IsPInvoke() && call->gtCallType == CT_USER_FUNC && compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
+    {
+        if (getEmitter()->Contains256bitAVX())
+        {
+            instGen(INS_vzeroupper);
+        }
+    }
+#endif
+
+
     if (target != nullptr)
     {
 #ifdef _TARGET_X86_
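Pulling the three emission sites together, the policy this commit implements reads roughly as follows. This helper is a condensed restatement for clarity, not code from the commit; the JIT emits these checks inline at each site:

    enum class VzeroupperSite { Prolog, Epilog, BeforePInvokeUserCall };

    // Condensed restatement of the commit's policy (sketch, not actual JIT code).
    bool needsVzeroupper(VzeroupperSite site, bool containsAVX, bool contains256bitAVX)
    {
        switch (site)
        {
            case VzeroupperSite::Prolog:
                // Reverse PInvoke: a legacy-SSE caller may enter this AVX method (penalty #2).
                return containsAVX;
            case VzeroupperSite::Epilog:
                // Returning to a possibly legacy-SSE caller with dirty YMM uppers (penalty #1).
                return contains256bitAVX;
            case VzeroupperSite::BeforePInvokeUserCall:
                // The user-defined native callee may contain legacy SSE code (penalty #1).
                return contains256bitAVX;
        }
        // No site emits VZEROUPPER after a PInvoke call: penalty #2 requires a
        // preceding penalty #1 transition, which the pre-call VZEROUPPER removed.
        return false;
    }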

src/jit/compiler.cpp

Lines changed: 3 additions & 0 deletions
@@ -2310,6 +2310,9 @@ void Compiler::compSetProcessor()
     if (opts.compCanUseAVX)
     {
         codeGen->getEmitter()->SetUseAVX(true);
+        // Assume each JITted method does not contain AVX instruction at first
+        codeGen->getEmitter()->SetContainsAVX(false);
+        codeGen->getEmitter()->SetContains256bitAVX(false);
     }
     else
 #endif // FEATURE_AVX_SUPPORT
