Skip to content

Commit 3868373

Browse files
AMDGPU/GlobalISel: uniformity analysis based register bank selection
The current algorithm only considers register banks for inputs but does not take control flow into account. This is wrong in cases where the inputs are uniform (in sgpr) but, because of divergent control flow, the instruction is divergent and should use vgpr instead of sgpr register banks. The most notable example is phis. Also, in cases where the only available machine instruction uses vgpr registers, uniform instructions end up using vgpr register banks. Start with a simple implementation for G_FADD. Pre-select the register bank for the destination register using machine uniformity analysis info. Then select register banks that would allow selection of the available machine instructions. For G_FADD, a vgpr machine instruction is available on all targets but an sgpr version is not. When there is no sgpr version, assign vgpr register banks and move the vgpr destination to sgpr using readfirstlane.
1 parent cc3a8f3 commit 3868373

17 files changed

+1838
-1214
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@
1515
#include "AMDGPURegBankSelect.h"
1616
#include "AMDGPU.h"
1717
#include "GCNSubtarget.h"
18+
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
19+
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
1820
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
21+
#include "llvm/IR/IntrinsicsAMDGPU.h"
1922
#include "llvm/InitializePasses.h"
2023

2124
#define DEBUG_TYPE "regbankselect"
@@ -68,7 +71,26 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
6871
MachineUniformityInfo Uniformity =
6972
computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(),
7073
!ST.isSingleLaneExecution(F));
71-
(void)Uniformity; // TODO: Use this
74+
75+
// Switch for uniformity info based regbank selection. Pre-selects register
76+
// bank on dst registers using machine uniformity analysis.
77+
// Keep in sync with switches in getInstrMapping and applyMappingImpl.
78+
for (MachineBasicBlock &MBB : MF) {
79+
for (MachineInstr &MI : MBB) {
80+
switch (MI.getOpcode()) {
81+
case AMDGPU::G_FADD: {
82+
Register Dst = MI.getOperand(0).getReg();
83+
if (Uniformity.isUniform(Dst))
84+
MRI->setRegBank(Dst, RBI->getRegBank(AMDGPU::SGPRRegBankID));
85+
else
86+
MRI->setRegBank(Dst, RBI->getRegBank(AMDGPU::VGPRRegBankID));
87+
break;
88+
}
89+
default:
90+
break;
91+
}
92+
}
93+
}
7294

7395
assignRegisterBanks(MF);
7496

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 118 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -713,6 +713,26 @@ Register AMDGPURegisterBankInfo::buildReadFirstLaneSrc(MachineIRBuilder &B,
713713
return Dst;
714714
}
715715

716+
// Create new vgpr destination register for MI then move it to current
717+
// MI's sgpr destination using one or more V_READFIRSTLANE_B32 instructions.
718+
void AMDGPURegisterBankInfo::buildReadFirstLaneDst(MachineIRBuilder &B,
719+
MachineInstr &MI) const {
720+
MachineRegisterInfo &MRI = *B.getMRI();
721+
Register Dst = MI.getOperand(0).getReg();
722+
const RegisterBank *DstBank = getRegBank(Dst, MRI, *TRI);
723+
if (DstBank != &AMDGPU::SGPRRegBank)
724+
return;
725+
726+
Register VgprDst = MRI.createGenericVirtualRegister(MRI.getType(Dst));
727+
MRI.setRegBank(VgprDst, AMDGPU::VGPRRegBank);
728+
729+
MI.getOperand(0).setReg(VgprDst);
730+
MachineBasicBlock *MBB = MI.getParent();
731+
B.setInsertPt(*MBB, std::next(MI.getIterator()));
732+
// readFirstLane VgprDst into Dst after MI.
733+
return buildReadFirstLaneForType(B, Dst, VgprDst);
734+
}
735+
716736
void AMDGPURegisterBankInfo::buildReadFirstLaneB32(MachineIRBuilder &B,
717737
Register SgprDst,
718738
Register VgprSrc) const {
@@ -736,32 +756,42 @@ void AMDGPURegisterBankInfo::buildReadFirstLaneSequenceOfB32(
736756
}
737757

738758
B.buildUnmerge(VgprSrcParts, VgprSrc);
739-
for (unsigned i = 0; i < NumElts; ++i) {
759+
for (unsigned i = 0; i < NumElts; ++i)
740760
buildReadFirstLaneB32(B, SgprDstParts[i], VgprSrcParts[i]);
741-
}
761+
742762
B.buildMergeLikeInstr(SgprDst, SgprDstParts);
743763
}
744764

745765
// Moves the value held in VgprSrc into SgprDst, choosing the lowering from the
// LLT of SgprDst:
//   - s16: any-extend the source to a new s32 vgpr, run buildReadFirstLaneB32
//     into a new s32 sgpr, then truncate that into SgprDst;
//   - s32 and p3 (32-bit): a single buildReadFirstLaneB32;
//   - s64, p0 and p1 (64-bit): buildReadFirstLaneSequenceOfB32 with 2 parts;
//   - vectors with s32 elements: buildReadFirstLaneSequenceOfB32 with one part
//     per element.
// Any other type ends in llvm_unreachable("Type not supported").
void AMDGPURegisterBankInfo::buildReadFirstLaneForType(MachineIRBuilder &B,
746766
Register SgprDst,
747767
Register VgprSrc) const {
748768
MachineRegisterInfo &MRI = *B.getMRI();
769+
LLT S16 = LLT::scalar(16);
749770
LLT S32 = LLT::scalar(32);
750771
LLT S64 = LLT::scalar(64);
751772
LLT Ty = MRI.getType(SgprDst);
752773

753-
if (Ty == S32 || Ty == LLT::pointer(3, 32)) {
754-
return buildReadFirstLaneB32(B, SgprDst, VgprSrc);
774+
// 16-bit values are widened to 32 bits for the B32 helper; the temporaries
// get their register banks set explicitly (vgpr source, sgpr result).
if (Ty == S16) {
775+
Register VgprSrc32 = MRI.createGenericVirtualRegister(S32);
776+
MRI.setRegBank(VgprSrc32, AMDGPU::VGPRRegBank);
777+
Register SgprDst32 = MRI.createGenericVirtualRegister(S32);
778+
MRI.setRegBank(SgprDst32, AMDGPU::SGPRRegBank);
779+
780+
B.buildAnyExt(VgprSrc32, VgprSrc);
781+
buildReadFirstLaneB32(B, SgprDst32, VgprSrc32);
782+
B.buildTrunc(SgprDst, SgprDst32);
783+
return;
755784
}
756785

757-
if (Ty == S64 || Ty == LLT::pointer(0, 64) || Ty == LLT::pointer(1, 64)) {
786+
if (Ty == S32 || Ty == LLT::pointer(3, 32))
787+
return buildReadFirstLaneB32(B, SgprDst, VgprSrc);
788+
789+
if (Ty == S64 || Ty == LLT::pointer(0, 64) || Ty == LLT::pointer(1, 64))
758790
return buildReadFirstLaneSequenceOfB32(B, SgprDst, VgprSrc, 2);
759-
}
760791

761-
if (Ty.isVector() && Ty.getElementType() == S32) {
792+
if (Ty.isVector() && Ty.getElementType() == S32)
762793
return buildReadFirstLaneSequenceOfB32(B, SgprDst, VgprSrc,
763794
Ty.getNumElements());
764-
}
765795

766796
// No lowering is provided here for any remaining type.
llvm_unreachable("Type not supported");
767797
}
@@ -1036,6 +1066,17 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
10361066
MI.getOperand(OpIdx).setReg(Reg);
10371067
}
10381068

1069+
// MI has uniform inputs and output but only available machine instruction has
1070+
// vgpr dest. Make it uniform by moving dst to sgpr using readfirstlane.
1071+
void AMDGPURegisterBankInfo::constrainVgprDstOpWithReadfirstlane(
1072+
MachineIRBuilder &B, MachineInstr &MI,
1073+
const OperandsMapper &OpdMapper) const {
1074+
const RegisterBank *DstBank =
1075+
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1076+
if (DstBank != &AMDGPU::VGPRRegBank)
1077+
buildReadFirstLaneDst(B, MI);
1078+
}
1079+
10391080
/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
10401081
/// rest will be in the remainder.
10411082
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
@@ -2117,6 +2158,21 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
21172158
B.setInstrAndDebugLoc(MI);
21182159
unsigned Opc = MI.getOpcode();
21192160
MachineRegisterInfo &MRI = OpdMapper.getMRI();
2161+
2162+
// Switch for uniformity info based regbank selection.
2163+
// Keep in sync with switches in AMDGPURegBankSelect and getInstrMapping.
2164+
switch (Opc) {
2165+
case AMDGPU::G_FADD: {
2166+
applyDefaultMapping(OpdMapper);
2167+
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2168+
if (!Subtarget.hasSALUFloatInsts() || (Size != 32 && Size != 16))
2169+
constrainVgprDstOpWithReadfirstlane(B, MI, OpdMapper);
2170+
return;
2171+
}
2172+
default:
2173+
break;
2174+
}
2175+
21202176
switch (Opc) {
21212177
case AMDGPU::G_CONSTANT:
21222178
case AMDGPU::G_IMPLICIT_DEF: {
@@ -3372,6 +3428,28 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
33723428
MI.getNumOperands());
33733429
}
33743430

3431+
// Instruction mapping for a VOP-style instruction whose result register bank
// was chosen ahead of time. Operand 0 keeps whatever bank is already assigned
// to it; every other register operand is mapped to vcc when it is 1 bit wide
// and to vgpr otherwise. Non-register operands are left without a mapping.
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOPWithPreassignedDef(
    const MachineInstr &MI) const {
  const unsigned NumOps = MI.getNumOperands();
  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);

  // The destination bank is expected to have been set from uniformity info
  // before this mapping is requested.
  OpdsMapping[0] =
      getPreAssignedOpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

  for (unsigned Idx = 1; Idx != NumOps; ++Idx) {
    const MachineOperand &Src = MI.getOperand(Idx);
    if (Src.isReg()) {
      const unsigned Bits = getSizeInBits(Src.getReg(), MRI, *TRI);
      OpdsMapping[Idx] = AMDGPU::getValueMapping(
          Bits == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID, Bits);
    }
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}
3452+
33753453
const RegisterBankInfo::InstructionMapping &
33763454
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
33773455
const MachineFunction &MF = *MI.getParent()->getParent();
@@ -3524,6 +3602,20 @@ AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
35243602
return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
35253603
}
35263604

3605+
// Returns the value mapping for Reg built from the register bank that is
// already assigned to it and from Reg's size in bits. The bank must have been
// set before this is called (asserted); the only banks accepted are sgpr, or
// vcc for 1-bit values / vgpr for everything else (also asserted).
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getPreAssignedOpMapping(
    Register Reg, const MachineRegisterInfo &MRI,
    const TargetRegisterInfo &TRI) const {
  const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
  assert(Bank && "register bank must be pre-assigned before mapping");
  unsigned BankID = Bank->getID();
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  assert((BankID == AMDGPU::SGPRRegBankID ||
          BankID ==
              (Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID)) &&
         "pre-assigned bank must be sgpr, or vcc (1-bit) / vgpr");

  return AMDGPU::getValueMapping(BankID, Size);
}
3618+
35273619
const RegisterBankInfo::ValueMapping *
35283620
AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
35293621
const MachineRegisterInfo &MRI,
@@ -3640,6 +3732,24 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
36403732

36413733
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
36423734

3735+
// Switch for uniformity info based regbank selection.
3736+
// Requires pre-selected, by AMDGPURegBankSelect, reg-banks on dst registers.
3737+
// Keep in sync with switches in AMDGPURegBankSelect and applyMappingImpl.
3738+
switch (MI.getOpcode()) {
3739+
case AMDGPU::G_FADD: {
3740+
Register Dst = MI.getOperand(0).getReg();
3741+
unsigned Size = MRI.getType(Dst).getSizeInBits();
3742+
const RegisterBank *DstBank = getRegBank(Dst, MRI, *TRI);
3743+
assert(DstBank);
3744+
if (Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16) &&
3745+
DstBank == &AMDGPU::SGPRRegBank)
3746+
return getDefaultMappingSOP(MI);
3747+
return getDefaultMappingVOPWithPreassignedDef(MI);
3748+
}
3749+
default:
3750+
break;
3751+
}
3752+
36433753
switch (MI.getOpcode()) {
36443754
default:
36453755
return getInvalidInstructionMapping();
@@ -3735,7 +3845,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
37353845
if (isSALUMapping(MI))
37363846
return getDefaultMappingSOP(MI);
37373847
return getDefaultMappingVOP(MI);
3738-
case AMDGPU::G_FADD:
37393848
case AMDGPU::G_FSUB:
37403849
case AMDGPU::G_FMUL:
37413850
case AMDGPU::G_FMA:

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
5959

6060
Register buildReadFirstLaneSrc(MachineIRBuilder &B, Register Src) const;
6161

62+
void buildReadFirstLaneDst(MachineIRBuilder &B, MachineInstr &MI) const;
63+
6264
void buildReadFirstLaneForType(MachineIRBuilder &B, Register SgprDst,
6365
Register VgprSrc) const;
6466

@@ -74,6 +76,10 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
7476

7577
void constrainOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI,
7678
unsigned OpIdx) const;
79+
void
80+
constrainVgprDstOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI,
81+
const OperandsMapper &OpdMapper) const;
82+
7783
bool applyMappingDynStackAlloc(MachineIRBuilder &B,
7884
const OperandsMapper &OpdMapper,
7985
MachineInstr &MI) const;

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -49,21 +49,31 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
4949
; GFX9-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul:
5050
; GFX9-FAST-DENORM: ; %bb.0: ; %.entry
5151
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3
52-
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
53-
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
5452
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0
53+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
5554
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1
56-
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
5755
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0
58-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
59-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1
60-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
61-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2
62-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3
63-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4
64-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5
65-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6
66-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7
56+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
57+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
58+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v1
59+
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
60+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
61+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2
62+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s7, v0
63+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s1, v0
64+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s8, v4
65+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s2, v0
66+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s9, v1
67+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s6, v3
68+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s3, v0
69+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s10, v2
70+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s0, v3
71+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s4, v0
72+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s0
73+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s1
74+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s2
75+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s3
76+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s4
6777
; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog
6878
;
6979
; GFX10-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul:
@@ -90,23 +100,35 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg
90100
; GFX9-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs:
91101
; GFX9-FAST-DENORM: ; %bb.0: ; %.entry
92102
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3
93-
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
94-
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
95103
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0
104+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
96105
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1
97-
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
98106
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0
99-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
100-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1
101-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
102-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2
103-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
104-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3
105-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4
106-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5
107-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6
108-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7
109-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v5, s11, v8
107+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
108+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
109+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v1
110+
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
111+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
112+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v2
113+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
114+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s7, v0
115+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s1, v0
116+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s8, v4
117+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s2, v0
118+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s9, v1
119+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s3, v0
120+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s10, v5
121+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s6, v3
122+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s4, v0
123+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s11, v2
124+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s0, v3
125+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s5, v0
126+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s0
127+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s1
128+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s2
129+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s3
130+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s4
131+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v5, s5
110132
; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog
111133
;
112134
; GFX10-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs:

llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,10 @@ define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
199199
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
200200
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
201201
; GFX6-NEXT: v_add_f64 v[0:1], s[2:3], -v[0:1]
202+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
203+
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
204+
; GFX6-NEXT: v_mov_b32_e32 v0, s0
205+
; GFX6-NEXT: v_mov_b32_e32 v1, s1
202206
; GFX6-NEXT: ; return to shader part epilog
203207
;
204208
; GFX78-LABEL: s_floor_f64:
@@ -223,6 +227,10 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
223227
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
224228
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
225229
; GFX6-NEXT: v_add_f64 v[0:1], -s[2:3], -v[0:1]
230+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
231+
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
232+
; GFX6-NEXT: v_mov_b32_e32 v0, s0
233+
; GFX6-NEXT: v_mov_b32_e32 v1, s1
226234
; GFX6-NEXT: ; return to shader part epilog
227235
;
228236
; GFX78-LABEL: s_floor_f64_fneg:
@@ -248,6 +256,10 @@ define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
248256
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
249257
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
250258
; GFX6-NEXT: v_add_f64 v[0:1], |s[2:3]|, -v[0:1]
259+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
260+
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
261+
; GFX6-NEXT: v_mov_b32_e32 v0, s0
262+
; GFX6-NEXT: v_mov_b32_e32 v1, s1
251263
; GFX6-NEXT: ; return to shader part epilog
252264
;
253265
; GFX78-LABEL: s_floor_f64_fabs:
@@ -273,6 +285,10 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) {
273285
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
274286
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
275287
; GFX6-NEXT: v_add_f64 v[0:1], -|s[2:3]|, -v[0:1]
288+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
289+
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
290+
; GFX6-NEXT: v_mov_b32_e32 v0, s0
291+
; GFX6-NEXT: v_mov_b32_e32 v1, s1
276292
; GFX6-NEXT: ; return to shader part epilog
277293
;
278294
; GFX78-LABEL: s_floor_f64_fneg_fabs:

0 commit comments

Comments
 (0)