Skip to content

Commit a8e9007

Browse files
AMDGPU/GlobalISel: uniformity analysis based register bank selection
The current algorithm only considers the register banks of the inputs and does not take control flow into account. This is wrong in cases where the inputs are uniform (in sgpr) but, because of divergent control flow, the instruction is divergent and should use vgpr instead of sgpr register banks. The most notable example is phis. Also, in cases where the only available machine instruction uses vgpr registers, uniform instructions end up using vgpr register banks. Start with a simple implementation for G_FADD. Pre-select the register bank for the destination register using machine uniformity analysis info. Then select register banks that allow selection of the available machine instructions. For G_FADD, a vgpr machine instruction is available on all targets but an sgpr version is not. When there is no sgpr version, assign vgpr register banks and move the vgpr destination to sgpr using readfirstlane.
1 parent 100828d commit a8e9007

16 files changed

+1812
-1210
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@
1515
#include "AMDGPURegBankSelect.h"
1616
#include "AMDGPU.h"
1717
#include "GCNSubtarget.h"
18+
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
19+
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
1820
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
21+
#include "llvm/IR/IntrinsicsAMDGPU.h"
1922
#include "llvm/InitializePasses.h"
2023

2124
#define DEBUG_TYPE "regbankselect"
@@ -68,7 +71,26 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
6871
MachineUniformityInfo Uniformity =
6972
computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(),
7073
!ST.isSingleLaneExecution(F));
71-
(void)Uniformity; // TODO: Use this
74+
75+
// Switch for uniformity info based regbank selection. Pre-selects register
76+
// bank on dst registers using machine uniformity analysis.
77+
// Keep in sync with switches in getInstrMapping and applyMappingImpl.
78+
for (MachineBasicBlock &MBB : MF) {
79+
for (MachineInstr &MI : MBB) {
80+
switch (MI.getOpcode()) {
81+
case AMDGPU::G_FADD: {
82+
Register Dst = MI.getOperand(0).getReg();
83+
if (Uniformity.isUniform(Dst))
84+
MRI->setRegBank(Dst, RBI->getRegBank(AMDGPU::SGPRRegBankID));
85+
else
86+
MRI->setRegBank(Dst, RBI->getRegBank(AMDGPU::VGPRRegBankID));
87+
break;
88+
}
89+
default:
90+
break;
91+
}
92+
}
93+
}
7294

7395
assignRegisterBanks(MF);
7496

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 118 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,26 @@ Register AMDGPURegisterBankInfo::buildReadFirstLaneSrc(MachineIRBuilder &B,
718718
return Dst;
719719
}
720720

721+
// Create new vgpr destination register for MI then move it to current
722+
// MI's sgpr destination using one or more V_READFIRSTLANE_B32 instructions.
723+
void AMDGPURegisterBankInfo::buildReadFirstLaneDst(MachineIRBuilder &B,
724+
MachineInstr &MI) const {
725+
MachineRegisterInfo &MRI = *B.getMRI();
726+
Register Dst = MI.getOperand(0).getReg();
727+
const RegisterBank *DstBank = getRegBank(Dst, MRI, *TRI);
728+
if (DstBank != &AMDGPU::SGPRRegBank)
729+
return;
730+
731+
Register VgprDst = MRI.createGenericVirtualRegister(MRI.getType(Dst));
732+
MRI.setRegBank(VgprDst, AMDGPU::VGPRRegBank);
733+
734+
MI.getOperand(0).setReg(VgprDst);
735+
MachineBasicBlock *MBB = MI.getParent();
736+
B.setInsertPt(*MBB, std::next(MI.getIterator()));
737+
// readFirstLane VgprDst into Dst after MI.
738+
return buildReadFirstLaneForType(B, Dst, VgprDst);
739+
}
740+
721741
void AMDGPURegisterBankInfo::buildReadFirstLaneB32(MachineIRBuilder &B,
722742
Register SgprDst,
723743
Register VgprSrc) const {
@@ -741,32 +761,42 @@ void AMDGPURegisterBankInfo::buildReadFirstLaneSequenceOfB32(
741761
}
742762

743763
B.buildUnmerge(VgprSrcParts, VgprSrc);
744-
for (unsigned i = 0; i < NumElts; ++i) {
764+
for (unsigned i = 0; i < NumElts; ++i)
745765
buildReadFirstLaneB32(B, SgprDstParts[i], VgprSrcParts[i]);
746-
}
766+
747767
B.buildMergeLikeInstr(SgprDst, SgprDstParts);
748768
}
749769

750770
void AMDGPURegisterBankInfo::buildReadFirstLaneForType(MachineIRBuilder &B,
751771
Register SgprDst,
752772
Register VgprSrc) const {
753773
MachineRegisterInfo &MRI = *B.getMRI();
774+
LLT S16 = LLT::scalar(16);
754775
LLT S32 = LLT::scalar(32);
755776
LLT S64 = LLT::scalar(64);
756777
LLT Ty = MRI.getType(SgprDst);
757778

758-
if (Ty == S32 || Ty == LLT::pointer(3, 32)) {
759-
return buildReadFirstLaneB32(B, SgprDst, VgprSrc);
779+
if (Ty == S16) {
780+
Register VgprSrc32 = MRI.createGenericVirtualRegister(S32);
781+
MRI.setRegBank(VgprSrc32, AMDGPU::VGPRRegBank);
782+
Register SgprDst32 = MRI.createGenericVirtualRegister(S32);
783+
MRI.setRegBank(SgprDst32, AMDGPU::SGPRRegBank);
784+
785+
B.buildAnyExt(VgprSrc32, VgprSrc);
786+
buildReadFirstLaneB32(B, SgprDst32, VgprSrc32);
787+
B.buildTrunc(SgprDst, SgprDst32);
788+
return;
760789
}
761790

762-
if (Ty == S64 || Ty == LLT::pointer(0, 64) || Ty == LLT::pointer(1, 64)) {
791+
if (Ty == S32 || Ty == LLT::pointer(3, 32))
792+
return buildReadFirstLaneB32(B, SgprDst, VgprSrc);
793+
794+
if (Ty == S64 || Ty == LLT::pointer(0, 64) || Ty == LLT::pointer(1, 64))
763795
return buildReadFirstLaneSequenceOfB32(B, SgprDst, VgprSrc, 2);
764-
}
765796

766-
if (Ty.isVector() && Ty.getElementType() == S32) {
797+
if (Ty.isVector() && Ty.getElementType() == S32)
767798
return buildReadFirstLaneSequenceOfB32(B, SgprDst, VgprSrc,
768799
Ty.getNumElements());
769-
}
770800

771801
llvm_unreachable("Type not supported");
772802
}
@@ -1041,6 +1071,17 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
10411071
MI.getOperand(OpIdx).setReg(Reg);
10421072
}
10431073

1074+
// MI has uniform inputs and output but only available machine instruction has
1075+
// vgpr dest. Make it uniform by moving dst to sgpr using readfirstlane.
1076+
void AMDGPURegisterBankInfo::constrainVgprDstOpWithReadfirstlane(
1077+
MachineIRBuilder &B, MachineInstr &MI,
1078+
const OperandsMapper &OpdMapper) const {
1079+
const RegisterBank *DstBank =
1080+
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1081+
if (DstBank != &AMDGPU::VGPRRegBank)
1082+
buildReadFirstLaneDst(B, MI);
1083+
}
1084+
10441085
/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
10451086
/// rest will be in the remainder.
10461087
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
@@ -2197,6 +2238,21 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
21972238
B.setInstrAndDebugLoc(MI);
21982239
unsigned Opc = MI.getOpcode();
21992240
MachineRegisterInfo &MRI = OpdMapper.getMRI();
2241+
2242+
// Switch for uniformity info based regbank selection.
2243+
// Keep in sync with switches in AMDGPURegBankSelect and getInstrMapping.
2244+
switch (Opc) {
2245+
case AMDGPU::G_FADD: {
2246+
applyDefaultMapping(OpdMapper);
2247+
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2248+
if (!Subtarget.hasSALUFloatInsts() || (Size != 32 && Size != 16))
2249+
constrainVgprDstOpWithReadfirstlane(B, MI, OpdMapper);
2250+
return;
2251+
}
2252+
default:
2253+
break;
2254+
}
2255+
22002256
switch (Opc) {
22012257
case AMDGPU::G_CONSTANT:
22022258
case AMDGPU::G_IMPLICIT_DEF: {
@@ -3571,6 +3627,28 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
35713627
MI.getNumOperands());
35723628
}
35733629

3630+
const RegisterBankInfo::InstructionMapping &
3631+
AMDGPURegisterBankInfo::getDefaultMappingVOPWithPreassignedDef(
3632+
const MachineInstr &MI) const {
3633+
SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());
3634+
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3635+
// Dst reg bank should have been set already by uniformity info
3636+
OpdsMapping[0] =
3637+
getPreAssignedOpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3638+
3639+
for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) {
3640+
const MachineOperand &Op = MI.getOperand(i);
3641+
if (!Op.isReg())
3642+
continue;
3643+
3644+
unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3645+
unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3646+
OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3647+
}
3648+
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3649+
MI.getNumOperands());
3650+
}
3651+
35743652
const RegisterBankInfo::InstructionMapping &
35753653
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
35763654
const MachineFunction &MF = *MI.getParent()->getParent();
@@ -3723,6 +3801,20 @@ AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
37233801
return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
37243802
}
37253803

3804+
const RegisterBankInfo::ValueMapping *
3805+
AMDGPURegisterBankInfo::getPreAssignedOpMapping(
3806+
Register Reg, const MachineRegisterInfo &MRI,
3807+
const TargetRegisterInfo &TRI) const {
3808+
const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
3809+
assert(Bank);
3810+
unsigned BankId = Bank->getID();
3811+
unsigned Size = getSizeInBits(Reg, MRI, TRI);
3812+
assert(BankId == AMDGPU::SGPRRegBankID ||
3813+
BankId == (Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID));
3814+
3815+
return AMDGPU::getValueMapping(BankId, Size);
3816+
}
3817+
37263818
const RegisterBankInfo::ValueMapping *
37273819
AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
37283820
const MachineRegisterInfo &MRI,
@@ -3839,6 +3931,24 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
38393931

38403932
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
38413933

3934+
// Switch for uniformity info based regbank selection.
3935+
// Requires pre-selected, by AMDGPURegBankSelect, reg-banks on dst registers.
3936+
// Keep in sync with switches in AMDGPURegBankSelect and applyMappingImpl.
3937+
switch (MI.getOpcode()) {
3938+
case AMDGPU::G_FADD: {
3939+
Register Dst = MI.getOperand(0).getReg();
3940+
unsigned Size = MRI.getType(Dst).getSizeInBits();
3941+
const RegisterBank *DstBank = getRegBank(Dst, MRI, *TRI);
3942+
assert(DstBank);
3943+
if (Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16) &&
3944+
DstBank == &AMDGPU::SGPRRegBank)
3945+
return getDefaultMappingSOP(MI);
3946+
return getDefaultMappingVOPWithPreassignedDef(MI);
3947+
}
3948+
default:
3949+
break;
3950+
}
3951+
38423952
switch (MI.getOpcode()) {
38433953
default:
38443954
return getInvalidInstructionMapping();
@@ -3936,7 +4046,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
39364046
if (isSALUMapping(MI))
39374047
return getDefaultMappingSOP(MI);
39384048
return getDefaultMappingVOP(MI);
3939-
case AMDGPU::G_FADD:
39404049
case AMDGPU::G_FSUB:
39414050
case AMDGPU::G_FMUL:
39424051
case AMDGPU::G_FMA:

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
5959

6060
Register buildReadFirstLaneSrc(MachineIRBuilder &B, Register Src) const;
6161

62+
void buildReadFirstLaneDst(MachineIRBuilder &B, MachineInstr &MI) const;
63+
6264
void buildReadFirstLaneForType(MachineIRBuilder &B, Register SgprDst,
6365
Register VgprSrc) const;
6466

@@ -74,6 +76,10 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
7476

7577
void constrainOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI,
7678
unsigned OpIdx) const;
79+
void
80+
constrainVgprDstOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI,
81+
const OperandsMapper &OpdMapper) const;
82+
7783
bool applyMappingDynStackAlloc(MachineIRBuilder &B,
7884
const OperandsMapper &OpdMapper,
7985
MachineInstr &MI) const;

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -49,21 +49,31 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
4949
; GFX9-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul:
5050
; GFX9-FAST-DENORM: ; %bb.0: ; %.entry
5151
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3
52-
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
53-
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
5452
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0
53+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
5554
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1
56-
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
5755
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0
58-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
59-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1
60-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
61-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2
62-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3
63-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4
64-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5
65-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6
66-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7
56+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
57+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
58+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v1
59+
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
60+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
61+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2
62+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s7, v0
63+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s1, v0
64+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s8, v4
65+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s2, v0
66+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s9, v1
67+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s6, v3
68+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s3, v0
69+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s10, v2
70+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s0, v3
71+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s4, v0
72+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s0
73+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s1
74+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s2
75+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s3
76+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s4
6777
; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog
6878
;
6979
; GFX10-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul:
@@ -90,23 +100,35 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg
90100
; GFX9-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs:
91101
; GFX9-FAST-DENORM: ; %bb.0: ; %.entry
92102
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3
93-
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
94-
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
95103
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0
104+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
96105
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1
97-
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
98106
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0
99-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
100-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1
101-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
102-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2
103-
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
104-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3
105-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4
106-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5
107-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6
108-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7
109-
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v5, s11, v8
107+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
108+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
109+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v1
110+
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
111+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
112+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v2
113+
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
114+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s7, v0
115+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s1, v0
116+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s8, v4
117+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s2, v0
118+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s9, v1
119+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s3, v0
120+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s10, v5
121+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s6, v3
122+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s4, v0
123+
; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s11, v2
124+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s0, v3
125+
; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s5, v0
126+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s0
127+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s1
128+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s2
129+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s3
130+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s4
131+
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v5, s5
110132
; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog
111133
;
112134
; GFX10-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs:

llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,10 @@ define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
199199
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
200200
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
201201
; GFX6-NEXT: v_add_f64 v[0:1], s[2:3], -v[0:1]
202+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
203+
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
204+
; GFX6-NEXT: v_mov_b32_e32 v0, s0
205+
; GFX6-NEXT: v_mov_b32_e32 v1, s1
202206
; GFX6-NEXT: ; return to shader part epilog
203207
;
204208
; GFX78-LABEL: s_floor_f64:
@@ -223,6 +227,10 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
223227
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
224228
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
225229
; GFX6-NEXT: v_add_f64 v[0:1], -s[2:3], -v[0:1]
230+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
231+
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
232+
; GFX6-NEXT: v_mov_b32_e32 v0, s0
233+
; GFX6-NEXT: v_mov_b32_e32 v1, s1
226234
; GFX6-NEXT: ; return to shader part epilog
227235
;
228236
; GFX78-LABEL: s_floor_f64_fneg:
@@ -248,6 +256,10 @@ define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
248256
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
249257
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
250258
; GFX6-NEXT: v_add_f64 v[0:1], |s[2:3]|, -v[0:1]
259+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
260+
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
261+
; GFX6-NEXT: v_mov_b32_e32 v0, s0
262+
; GFX6-NEXT: v_mov_b32_e32 v1, s1
251263
; GFX6-NEXT: ; return to shader part epilog
252264
;
253265
; GFX78-LABEL: s_floor_f64_fabs:
@@ -273,6 +285,10 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) {
273285
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
274286
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
275287
; GFX6-NEXT: v_add_f64 v[0:1], -|s[2:3]|, -v[0:1]
288+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
289+
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
290+
; GFX6-NEXT: v_mov_b32_e32 v0, s0
291+
; GFX6-NEXT: v_mov_b32_e32 v1, s1
276292
; GFX6-NEXT: ; return to shader part epilog
277293
;
278294
; GFX78-LABEL: s_floor_f64_fneg_fabs:

0 commit comments

Comments
 (0)