Skip to content

Commit 142a8cf

Browse files
AMDGPU/GlobalISel: uniformity analysis based register bank selection
The current algorithm only considers the register banks of the inputs and does not take control flow into account. This is wrong in cases where the inputs are uniform (in sgpr) but, because of divergent control flow, the instruction is divergent and should use vgpr instead of sgpr register banks. The most notable examples are phis. Also, in cases where the only available machine instruction uses vgpr registers, uniform instructions end up using vgpr register banks. Start with a simple implementation for G_FADD. Pre-select the register bank for the destination register using machine uniformity analysis info. Then select register banks that would allow selection of the available machine instructions. For G_FADD, the vgpr machine instruction is available on all targets but the sgpr version is not. When there is no sgpr version, assign vgpr register banks and move the vgpr destination to sgpr using readfirstlane.
1 parent 7266d7a commit 142a8cf

18 files changed

+1830
-1224
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,26 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
6868
MachineUniformityInfo Uniformity =
6969
computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(),
7070
!ST.isSingleLaneExecution(F));
71-
(void)Uniformity; // TODO: Use this
71+
72+
// Switch for uniformity info based regbank selection. Pre-selects register
73+
// bank on dst registers using machine uniformity analysis.
74+
// Keep in sync with switches in getInstrMapping and applyMappingImpl.
75+
for (MachineBasicBlock &MBB : MF) {
76+
for (MachineInstr &MI : MBB) {
77+
switch (MI.getOpcode()) {
78+
case AMDGPU::G_FADD: {
79+
Register Dst = MI.getOperand(0).getReg();
80+
if (Uniformity.isUniform(Dst))
81+
MRI->setRegBank(Dst, RBI->getRegBank(AMDGPU::SGPRRegBankID));
82+
else
83+
MRI->setRegBank(Dst, RBI->getRegBank(AMDGPU::VGPRRegBankID));
84+
break;
85+
}
86+
default:
87+
break;
88+
}
89+
}
90+
}
7291

7392
assignRegisterBanks(MF);
7493

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -715,6 +715,27 @@ Register AMDGPURegisterBankInfo::buildReadFirstLaneSrc(MachineIRBuilder &B,
715715
return buildReadFirstLaneForType(B, Ty, Src).getReg(0);
716716
}
717717

718+
// Create new vgpr destination register for MI then move it to current
719+
// MI's sgpr destination using one or more V_READFIRSTLANE_B32 instructions.
720+
void AMDGPURegisterBankInfo::buildReadFirstLaneDst(MachineIRBuilder &B,
721+
MachineInstr &MI) const {
722+
MachineRegisterInfo &MRI = *B.getMRI();
723+
Register Dst = MI.getOperand(0).getReg();
724+
const RegisterBank *DstBank = getRegBank(Dst, MRI, *TRI);
725+
if (DstBank != &AMDGPU::SGPRRegBank)
726+
return;
727+
728+
Register VgprDst = MRI.createGenericVirtualRegister(MRI.getType(Dst));
729+
MRI.setRegBank(VgprDst, AMDGPU::VGPRRegBank);
730+
731+
MI.getOperand(0).setReg(VgprDst);
732+
MachineBasicBlock *MBB = MI.getParent();
733+
B.setInsertPt(*MBB, std::next(MI.getIterator()));
734+
// readFirstLane VgprDst into Dst after MI.
735+
buildReadFirstLaneForType(B, Dst, VgprDst);
736+
return;
737+
}
738+
718739
MachineInstrBuilder AMDGPURegisterBankInfo::buildReadFirstLaneB32(
719740
MachineIRBuilder &B, const DstOp &SgprDst, const SrcOp &VgprSrc) const {
720741
MachineRegisterInfo &MRI = *B.getMRI();
@@ -745,10 +766,16 @@ MachineInstrBuilder AMDGPURegisterBankInfo::buildReadFirstLaneSequenceOfB32(
745766
MachineInstrBuilder AMDGPURegisterBankInfo::buildReadFirstLaneForType(
746767
MachineIRBuilder &B, const DstOp &SgprDst, const SrcOp &VgprSrc) const {
747768
MachineRegisterInfo &MRI = *B.getMRI();
769+
LLT S16 = LLT::scalar(16);
748770
LLT S32 = LLT::scalar(32);
749771
LLT S64 = LLT::scalar(64);
750772
LLT Ty = SgprDst.getLLTTy(MRI);
751773

774+
if (Ty == S16) {
775+
return B.buildTrunc(
776+
SgprDst, buildReadFirstLaneB32(B, S32, B.buildAnyExt(S32, VgprSrc)));
777+
}
778+
752779
if (Ty == S32 || (Ty.isPointer() && Ty.getSizeInBits() == 32)) {
753780
return buildReadFirstLaneB32(B, SgprDst, VgprSrc);
754781
}
@@ -1035,6 +1062,17 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
10351062
MI.getOperand(OpIdx).setReg(Reg);
10361063
}
10371064

1065+
// MI has uniform inputs and output but only available machine instruction has
1066+
// vgpr dest. Make it uniform by moving dst to sgpr using readfirstlane.
1067+
void AMDGPURegisterBankInfo::constrainVgprDstOpWithReadfirstlane(
1068+
MachineIRBuilder &B, MachineInstr &MI,
1069+
const OperandsMapper &OpdMapper) const {
1070+
const RegisterBank *DstBank =
1071+
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1072+
if (DstBank != &AMDGPU::VGPRRegBank)
1073+
buildReadFirstLaneDst(B, MI);
1074+
}
1075+
10381076
/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
10391077
/// rest will be in the remainder.
10401078
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
@@ -2191,6 +2229,21 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
21912229
B.setInstrAndDebugLoc(MI);
21922230
unsigned Opc = MI.getOpcode();
21932231
MachineRegisterInfo &MRI = OpdMapper.getMRI();
2232+
2233+
// Switch for uniformity info based regbank selection.
2234+
// Keep in sync with switches in AMDGPURegBankSelect and getInstrMapping.
2235+
switch (Opc) {
2236+
case AMDGPU::G_FADD: {
2237+
applyDefaultMapping(OpdMapper);
2238+
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2239+
if (!Subtarget.hasSALUFloatInsts() || (Size != 32 && Size != 16))
2240+
constrainVgprDstOpWithReadfirstlane(B, MI, OpdMapper);
2241+
return;
2242+
}
2243+
default:
2244+
break;
2245+
}
2246+
21942247
switch (Opc) {
21952248
case AMDGPU::G_CONSTANT:
21962249
case AMDGPU::G_IMPLICIT_DEF: {
@@ -3565,6 +3618,28 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
35653618
MI.getNumOperands());
35663619
}
35673620

3621+
const RegisterBankInfo::InstructionMapping &
3622+
AMDGPURegisterBankInfo::getDefaultMappingVOPWithPreassignedDef(
3623+
const MachineInstr &MI) const {
3624+
SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());
3625+
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3626+
// Dst reg bank should have been set already by uniformity info
3627+
OpdsMapping[0] =
3628+
getPreAssignedOpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3629+
3630+
for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) {
3631+
const MachineOperand &Op = MI.getOperand(i);
3632+
if (!Op.isReg())
3633+
continue;
3634+
3635+
unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3636+
unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3637+
OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3638+
}
3639+
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3640+
MI.getNumOperands());
3641+
}
3642+
35683643
const RegisterBankInfo::InstructionMapping &
35693644
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
35703645
const MachineFunction &MF = *MI.getParent()->getParent();
@@ -3717,6 +3792,20 @@ AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
37173792
return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
37183793
}
37193794

3795+
const RegisterBankInfo::ValueMapping *
3796+
AMDGPURegisterBankInfo::getPreAssignedOpMapping(
3797+
Register Reg, const MachineRegisterInfo &MRI,
3798+
const TargetRegisterInfo &TRI) const {
3799+
const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
3800+
assert(Bank);
3801+
unsigned BankId = Bank->getID();
3802+
unsigned Size = getSizeInBits(Reg, MRI, TRI);
3803+
assert(BankId == AMDGPU::SGPRRegBankID ||
3804+
BankId == (Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID));
3805+
3806+
return AMDGPU::getValueMapping(BankId, Size);
3807+
}
3808+
37203809
const RegisterBankInfo::ValueMapping *
37213810
AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
37223811
const MachineRegisterInfo &MRI,
@@ -3833,6 +3922,24 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
38333922

38343923
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
38353924

3925+
// Switch for uniformity info based regbank selection.
3926+
// Requires pre-selected, by AMDGPURegBankSelect, reg-banks on dst registers.
3927+
// Keep in sync with switches in AMDGPURegBankSelect and applyMappingImpl.
3928+
switch (MI.getOpcode()) {
3929+
case AMDGPU::G_FADD: {
3930+
Register Dst = MI.getOperand(0).getReg();
3931+
unsigned Size = MRI.getType(Dst).getSizeInBits();
3932+
const RegisterBank *DstBank = getRegBank(Dst, MRI, *TRI);
3933+
assert(DstBank);
3934+
if (Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16) &&
3935+
DstBank == &AMDGPU::SGPRRegBank)
3936+
return getDefaultMappingSOP(MI);
3937+
return getDefaultMappingVOPWithPreassignedDef(MI);
3938+
}
3939+
default:
3940+
break;
3941+
}
3942+
38363943
switch (MI.getOpcode()) {
38373944
default:
38383945
return getInvalidInstructionMapping();
@@ -3930,7 +4037,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
39304037
if (isSALUMapping(MI))
39314038
return getDefaultMappingSOP(MI);
39324039
return getDefaultMappingVOP(MI);
3933-
case AMDGPU::G_FADD:
39344040
case AMDGPU::G_FSUB:
39354041
case AMDGPU::G_FMUL:
39364042
case AMDGPU::G_FMA:

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
6060

6161
Register buildReadFirstLaneSrc(MachineIRBuilder &B, Register Src) const;
6262

63+
void buildReadFirstLaneDst(MachineIRBuilder &B, MachineInstr &MI) const;
64+
6365
MachineInstrBuilder buildReadFirstLaneForType(MachineIRBuilder &B,
6466
const DstOp &SgprDst,
6567
const SrcOp &VgprSrc) const;
@@ -78,6 +80,10 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
7880

7981
void constrainOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI,
8082
unsigned OpIdx) const;
83+
void
84+
constrainVgprDstOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI,
85+
const OperandsMapper &OpdMapper) const;
86+
8187
bool applyMappingDynStackAlloc(MachineIRBuilder &B,
8288
const OperandsMapper &OpdMapper,
8389
MachineInstr &MI) const;

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -49,21 +49,31 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
4949
; GFX9-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul:
5050
; GFX9-FAST-DENORM: ; %bb.0: ; %.entry
5151
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3
52-
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
53-
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
5452
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0
53+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
5554
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1
56-
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
5755
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0
58-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
59-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1
60-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
61-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2
62-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3
63-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4
64-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5
65-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6
66-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7
56+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
57+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
58+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v1
59+
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
60+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
61+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2
62+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s7, v0
63+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s1, v0
64+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s8, v4
65+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s2, v0
66+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s9, v1
67+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s6, v3
68+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s3, v0
69+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s10, v2
70+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s0, v3
71+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s4, v0
72+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s0
73+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s1
74+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s2
75+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s3
76+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s4
6777
; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog
6878
;
6979
; GFX10-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul:
@@ -90,23 +100,35 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg
90100
; GFX9-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs:
91101
; GFX9-FAST-DENORM: ; %bb.0: ; %.entry
92102
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3
93-
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
94-
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
95103
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0
104+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
96105
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1
97-
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
98106
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0
99-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
100-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1
101-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
102-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2
103-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
104-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3
105-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4
106-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5
107-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6
108-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7
109-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v5, s11, v8
107+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
108+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
109+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v1
110+
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
111+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
112+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v2
113+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
114+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s7, v0
115+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s1, v0
116+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s8, v4
117+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s2, v0
118+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s9, v1
119+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s3, v0
120+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s10, v5
121+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s6, v3
122+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s4, v0
123+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s11, v2
124+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s0, v3
125+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s5, v0
126+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s0
127+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s1
128+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s2
129+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s3
130+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s4
131+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v5, s5
110132
; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog
111133
;
112134
; GFX10-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs:

llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,10 @@ define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
199199
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
200200
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
201201
; GFX6-NEXT: v_add_f64 v[0:1], s[2:3], -v[0:1]
202+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
203+
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
204+
; GFX6-NEXT: v_mov_b32_e32 v0, s0
205+
; GFX6-NEXT: v_mov_b32_e32 v1, s1
202206
; GFX6-NEXT: ; return to shader part epilog
203207
;
204208
; GFX78-LABEL: s_floor_f64:
@@ -223,6 +227,10 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
223227
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
224228
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
225229
; GFX6-NEXT: v_add_f64 v[0:1], -s[2:3], -v[0:1]
230+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
231+
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
232+
; GFX6-NEXT: v_mov_b32_e32 v0, s0
233+
; GFX6-NEXT: v_mov_b32_e32 v1, s1
226234
; GFX6-NEXT: ; return to shader part epilog
227235
;
228236
; GFX78-LABEL: s_floor_f64_fneg:
@@ -248,6 +256,10 @@ define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
248256
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
249257
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
250258
; GFX6-NEXT: v_add_f64 v[0:1], |s[2:3]|, -v[0:1]
259+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
260+
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
261+
; GFX6-NEXT: v_mov_b32_e32 v0, s0
262+
; GFX6-NEXT: v_mov_b32_e32 v1, s1
251263
; GFX6-NEXT: ; return to shader part epilog
252264
;
253265
; GFX78-LABEL: s_floor_f64_fabs:
@@ -273,6 +285,10 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) {
273285
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
274286
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
275287
; GFX6-NEXT: v_add_f64 v[0:1], -|s[2:3]|, -v[0:1]
288+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
289+
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
290+
; GFX6-NEXT: v_mov_b32_e32 v0, s0
291+
; GFX6-NEXT: v_mov_b32_e32 v1, s1
276292
; GFX6-NEXT: ; return to shader part epilog
277293
;
278294
; GFX78-LABEL: s_floor_f64_fneg_fabs:

0 commit comments

Comments
 (0)