Reland "[amdgpu] Add llvm.amdgcn.init.whole.wave intrinsic" (#108054)" #108173
Conversation
)" This reverts commit c7a7767. The buildbots failed because we used to remove a MI from its parent before updating LIS.
This should fix the sanitizer issues found by the buildbots.
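For context, the crux of the fix is the order of operations when deleting a machine instruction while LiveIntervals (LIS) is maintained: the instruction must be removed from the slot-index maps while it is still attached to its parent block, and only then erased. A minimal C++ sketch of the pattern, using the standard LiveIntervals API (the wrapper function and its name are illustrative, not part of the patch):

#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstr.h"

// Sketch: delete OldMI and register its replacement NewMI with LIS.
// eraseAndReindex is a hypothetical helper; the call ordering is the point.
static void eraseAndReindex(llvm::MachineInstr &OldMI, llvm::MachineInstr &NewMI,
                            llvm::Register NewVReg, llvm::LiveIntervals *LIS) {
  if (LIS)
    LIS->RemoveMachineInstrFromMaps(OldMI); // do this before unlinking OldMI
  OldMI.eraseFromParent();                  // now safe to unlink and delete
  if (LIS) {
    LIS->InsertMachineInstrInMaps(NewMI);          // index the replacement
    LIS->createAndComputeVirtRegInterval(NewVReg); // live range for its result
  }
}

This is the same ordering visible in the SIWholeQuadMode.cpp hunk below.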
@llvm/pr-subscribers-llvm-ir Author: Diana Picus (rovka) Changes: This reverts commit c7a7767. The buildbots failed because I removed an MI from its parent before updating LIS. This PR should fix that. Patch is 84.08 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/108173.diff. 22 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 2085113992ad17..37db49e393232c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -208,6 +208,16 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
[IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+// Sets the function into whole-wave-mode and returns whether the lane was
+// active when entering the function. A branch depending on this return will
+// revert the EXEC mask to what it was when entering the function, thus
+// resulting in a no-op. This pattern is used to optimize branches when function
+// tails need to be run in whole-wave-mode. It may also have other consequences
+// (mostly related to WWM CSR handling) that differentiate it from using
+// a plain `amdgcn.init.exec -1`.
+def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
+ IntrHasSideEffects, IntrNoMem, IntrConvergent]>;
+
def int_amdgcn_wavefrontsize :
ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
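As a side note on usage: a frontend would typically emit this intrinsic at the start of a chain function and branch on its result. A minimal sketch via IRBuilder (the helper name is made up for illustration; only CreateIntrinsic and the intrinsic ID come from LLVM):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

// Sketch: emit `%active = call i1 @llvm.amdgcn.init.whole.wave()` so the
// caller can branch on %active; emitInitWholeWave is a hypothetical helper.
static llvm::Value *emitInitWholeWave(llvm::IRBuilder<> &B) {
  // No overload types and no arguments: the intrinsic is nullary.
  return B.CreateIntrinsic(llvm::Intrinsic::amdgcn_init_whole_wave, {}, {});
}

The new tests at the bottom of the patch show the full pattern, with the false edge of the branch going straight to the whole-wave tail.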
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 0daaf6b6576030..380dc7d3312f32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2738,6 +2738,11 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
SelectDSBvhStackIntrinsic(N);
return;
+ case Intrinsic::amdgcn_init_whole_wave:
+ CurDAG->getMachineFunction()
+ .getInfo<SIMachineFunctionInfo>()
+ ->setInitWholeWave();
+ break;
}
SelectCode(N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 4dfd3f087c1ae4..53085d423cefb8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1772,6 +1772,14 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
+bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
+ MachineFunction *MF = MI.getParent()->getParent();
+ SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
+
+ MFInfo->setInitWholeWave();
+ return selectImpl(MI, *CoverageInfo);
+}
+
bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
if (TM.getOptLevel() > CodeGenOptLevel::None) {
unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
@@ -2099,6 +2107,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectDSAppendConsume(I, true);
case Intrinsic::amdgcn_ds_consume:
return selectDSAppendConsume(I, false);
+ case Intrinsic::amdgcn_init_whole_wave:
+ return selectInitWholeWave(I);
case Intrinsic::amdgcn_s_barrier:
return selectSBarrier(I);
case Intrinsic::amdgcn_raw_buffer_load_lds:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 068db5c1c14496..df39ecbd61bce6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -120,6 +120,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
+ bool selectInitWholeWave(MachineInstr &MI) const;
bool selectSBarrier(MachineInstr &MI) const;
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 7efb7f825348e3..b1022e48b8d34f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -67,6 +67,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
// Kernel may need limited waves per EU for better performance.
bool WaveLimiter = false;
+ bool HasInitWholeWave = false;
+
public:
AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);
@@ -109,6 +111,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
return WaveLimiter;
}
+ bool hasInitWholeWave() const { return HasInitWholeWave; }
+ void setInitWholeWave() { HasInitWholeWave = true; }
+
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
return allocateLDSGlobal(DL, GV, DynLDSAlign);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 46d98cad963bc3..f2c9619cb8276a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4997,6 +4997,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
break;
}
+ case Intrinsic::amdgcn_init_whole_wave:
case Intrinsic::amdgcn_live_mask: {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 95c4859674ecc4..2cd5fb2b94285c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -329,6 +329,7 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;
def : SourceOfDivergence<int_amdgcn_writelane>;
+def : SourceOfDivergence<int_amdgcn_init_whole_wave>;
foreach intr = AMDGPUMFMAIntrinsics908 in
def : SourceOfDivergence<intr>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 9c9c5051393730..7f659578a6d2d6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1739,6 +1739,9 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
? DenormalMode::IEEE
: DenormalMode::PreserveSign;
+ if (YamlMFI.HasInitWholeWave)
+ MFI->setInitWholeWave();
+
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 8c951105101d96..dfdc7ad32b00c7 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1343,10 +1343,14 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
// Allocate spill slots for WWM reserved VGPRs.
// For chain functions, we only need to do this if we have calls to
- // llvm.amdgcn.cs.chain.
- bool IsChainWithoutCalls =
- FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
- if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
+ // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
+ // chain functions do not return) and the function did not contain a call to
+ // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
+ // when entering the function).
+ bool IsChainWithoutRestores =
+ FuncInfo->isChainFunction() &&
+ (!MF.getFrameInfo().hasTailCall() || FuncInfo->hasInitWholeWave());
+ if (!FuncInfo->isEntryFunction() && !IsChainWithoutRestores) {
for (Register Reg : FuncInfo->getWWMReservedRegs()) {
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b7543238c1300a..f3eee9c807c1eb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -583,6 +583,16 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
let Defs = [EXEC];
}
+// Sets EXEC to all lanes and returns the previous EXEC.
+def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
+ (outs SReg_1:$dst), (ins),
+ [(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
+ let Defs = [EXEC];
+ let Uses = [EXEC];
+
+ let isConvergent = 1;
+}
+
// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 7af5e7388f841e..7cebfa29fe7b8d 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -289,6 +289,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
StringValue SGPRForEXECCopy;
StringValue LongBranchReservedReg;
+ bool HasInitWholeWave = false;
+
SIMachineFunctionInfo() = default;
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
const TargetRegisterInfo &TRI,
@@ -336,6 +338,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
StringValue()); // Don't print out when it's empty.
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
StringValue());
+ YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
}
};
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index f9d7ead4ff3ecc..313fd82df69ca1 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -594,7 +594,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
KillInstrs.push_back(&MI);
BBI.NeedsLowering = true;
} else if (Opcode == AMDGPU::SI_INIT_EXEC ||
- Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
+ Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
+ Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
InitExecInstrs.push_back(&MI);
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
@@ -1582,6 +1583,33 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
MachineBasicBlock *MBB = MI.getParent();
bool IsWave32 = ST->isWave32();
+ if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
+ assert(MBB == &MBB->getParent()->front() &&
+ "init whole wave not in entry block");
+ Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
+ MachineInstr *SaveExec =
+ BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+ TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
+ : AMDGPU::S_OR_SAVEEXEC_B64),
+ EntryExec)
+ .addImm(-1);
+
+ // Replace all uses of MI's destination reg with EntryExec.
+ MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
+
+ if (LIS) {
+ LIS->RemoveMachineInstrFromMaps(MI);
+ }
+
+ MI.eraseFromParent();
+
+ if (LIS) {
+ LIS->InsertMachineInstrInMaps(*SaveExec);
+ LIS->createAndComputeVirtRegInterval(EntryExec);
+ }
+ return;
+ }
+
if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
// This should be before all vector instructions.
MachineInstr *InitMI =
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
new file mode 100644
index 00000000000000..353f4d90cad1f2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -0,0 +1,1127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL12 %s
+; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL12 %s
+; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL10 %s
+; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL10 %s
+
+define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
+; GISEL12-LABEL: basic:
+; GISEL12: ; %bb.0: ; %entry
+; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL12-NEXT: s_wait_expcnt 0x0
+; GISEL12-NEXT: s_wait_samplecnt 0x0
+; GISEL12-NEXT: s_wait_bvhcnt 0x0
+; GISEL12-NEXT: s_wait_kmcnt 0x0
+; GISEL12-NEXT: s_or_saveexec_b32 s8, -1
+; GISEL12-NEXT: s_mov_b32 s6, s3
+; GISEL12-NEXT: s_mov_b32 s7, s4
+; GISEL12-NEXT: s_wait_alu 0xfffe
+; GISEL12-NEXT: s_and_saveexec_b32 s3, s8
+; GISEL12-NEXT: ; %bb.1: ; %shader
+; GISEL12-NEXT: v_add_nc_u32_e32 v12, 42, v12
+; GISEL12-NEXT: v_add_nc_u32_e32 v8, 5, v8
+; GISEL12-NEXT: ; %bb.2: ; %tail
+; GISEL12-NEXT: s_wait_alu 0xfffe
+; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL12-NEXT: v_add_nc_u32_e32 v11, 32, v12
+; GISEL12-NEXT: s_mov_b32 exec_lo, s5
+; GISEL12-NEXT: s_wait_alu 0xfffe
+; GISEL12-NEXT: s_setpc_b64 s[6:7]
+;
+; DAGISEL12-LABEL: basic:
+; DAGISEL12: ; %bb.0: ; %entry
+; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL12-NEXT: s_wait_expcnt 0x0
+; DAGISEL12-NEXT: s_wait_samplecnt 0x0
+; DAGISEL12-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL12-NEXT: s_wait_kmcnt 0x0
+; DAGISEL12-NEXT: s_or_saveexec_b32 s8, -1
+; DAGISEL12-NEXT: s_mov_b32 s7, s4
+; DAGISEL12-NEXT: s_mov_b32 s6, s3
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: s_and_saveexec_b32 s3, s8
+; DAGISEL12-NEXT: ; %bb.1: ; %shader
+; DAGISEL12-NEXT: v_add_nc_u32_e32 v12, 42, v12
+; DAGISEL12-NEXT: v_add_nc_u32_e32 v8, 5, v8
+; DAGISEL12-NEXT: ; %bb.2: ; %tail
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; DAGISEL12-NEXT: v_add_nc_u32_e32 v11, 32, v12
+; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: s_setpc_b64 s[6:7]
+;
+; GISEL10-LABEL: basic:
+; GISEL10: ; %bb.0: ; %entry
+; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10-NEXT: s_or_saveexec_b32 s8, -1
+; GISEL10-NEXT: s_mov_b32 s6, s3
+; GISEL10-NEXT: s_mov_b32 s7, s4
+; GISEL10-NEXT: s_and_saveexec_b32 s3, s8
+; GISEL10-NEXT: ; %bb.1: ; %shader
+; GISEL10-NEXT: v_add_nc_u32_e32 v12, 42, v12
+; GISEL10-NEXT: v_add_nc_u32_e32 v8, 5, v8
+; GISEL10-NEXT: ; %bb.2: ; %tail
+; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GISEL10-NEXT: v_add_nc_u32_e32 v11, 32, v12
+; GISEL10-NEXT: s_mov_b32 exec_lo, s5
+; GISEL10-NEXT: s_setpc_b64 s[6:7]
+;
+; DAGISEL10-LABEL: basic:
+; DAGISEL10: ; %bb.0: ; %entry
+; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT: s_or_saveexec_b32 s8, -1
+; DAGISEL10-NEXT: s_mov_b32 s7, s4
+; DAGISEL10-NEXT: s_mov_b32 s6, s3
+; DAGISEL10-NEXT: s_and_saveexec_b32 s3, s8
+; DAGISEL10-NEXT: ; %bb.1: ; %shader
+; DAGISEL10-NEXT: v_add_nc_u32_e32 v12, 42, v12
+; DAGISEL10-NEXT: v_add_nc_u32_e32 v8, 5, v8
+; DAGISEL10-NEXT: ; %bb.2: ; %tail
+; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; DAGISEL10-NEXT: v_add_nc_u32_e32 v11, 32, v12
+; DAGISEL10-NEXT: s_mov_b32 exec_lo, s5
+; DAGISEL10-NEXT: s_setpc_b64 s[6:7]
+entry:
+ %entry_exec = call i1 @llvm.amdgcn.init.whole.wave()
+ br i1 %entry_exec, label %shader, label %tail
+
+shader:
+ %newx = add i32 %x, 42
+ %oldval = extractvalue { i32, ptr addrspace(5), i32, i32 } %vgpr, 0
+ %newval = add i32 %oldval, 5
+ %newvgpr = insertvalue { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %newval, 0
+
+ br label %tail
+
+tail:
+ %full.x = phi i32 [%x, %entry], [%newx, %shader]
+ %full.vgpr = phi { i32, ptr addrspace(5), i32, i32 } [%vgpr, %entry], [%newvgpr, %shader]
+ %modified.x = add i32 %full.x, 32
+ %vgpr.args = insertvalue { i32, ptr addrspace(5), i32, i32 } %full.vgpr, i32 %modified.x, 3
+ call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr.args, i32 0)
+ unreachable
+}
+
+define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
+; GISEL12-LABEL: wwm_in_shader:
+; GISEL12: ; %bb.0: ; %entry
+; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL12-NEXT: s_wait_expcnt 0x0
+; GISEL12-NEXT: s_wait_samplecnt 0x0
+; GISEL12-NEXT: s_wait_bvhcnt 0x0
+; GISEL12-NEXT: s_wait_kmcnt 0x0
+; GISEL12-NEXT: s_or_saveexec_b32 s8, -1
+; GISEL12-NEXT: v_dual_mov_b32 v10, v12 :: v_dual_mov_b32 v11, v13
+; GISEL12-NEXT: s_mov_b32 s6, s3
+; GISEL12-NEXT: s_mov_b32 s7, s4
+; GISEL12-NEXT: s_wait_alu 0xfffe
+; GISEL12-NEXT: s_and_saveexec_b32 s3, s8
+; GISEL12-NEXT: ; %bb.1: ; %shader
+; GISEL12-NEXT: s_or_saveexec_b32 s4, -1
+; GISEL12-NEXT: s_wait_alu 0xfffe
+; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4
+; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0
+; GISEL12-NEXT: v_mov_b32_e32 v0, s8
+; GISEL12-NEXT: s_mov_b32 exec_lo, s4
+; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL12-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v10
+; GISEL12-NEXT: ; %bb.2: ; %tail
+; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GISEL12-NEXT: s_mov_b32 exec_lo, s5
+; GISEL12-NEXT: s_wait_alu 0xfffe
+; GISEL12-NEXT: s_setpc_b64 s[6:7]
+;
+; DAGISEL12-LABEL: wwm_in_shader:
+; DAGISEL12: ; %bb.0: ; %entry
+; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL12-NEXT: s_wait_expcnt 0x0
+; DAGISEL12-NEXT: s_wait_samplecnt 0x0
+; DAGISEL12-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL12-NEXT: s_wait_kmcnt 0x0
+; DAGISEL12-NEXT: s_or_saveexec_b32 s8, -1
+; DAGISEL12-NEXT: v_dual_mov_b32 v11, v13 :: v_dual_mov_b32 v10, v12
+; DAGISEL12-NEXT: s_mov_b32 s7, s4
+; DAGISEL12-NEXT: s_mov_b32 s6, s3
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: s_and_saveexec_b32 s3, s8
+; DAGISEL12-NEXT: ; %bb.1: ; %shader
+; DAGISEL12-NEXT: s_or_saveexec_b32 s4, -1
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4
+; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0
+; DAGISEL12-NEXT: s_mov_b32 exec_lo, s4
+; DAGISEL12-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v10
+; DAGISEL12-NEXT: ; %bb.2: ; %tail
+; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: s_setpc_b64 s[6:7]
+;
+; GISEL10-LABEL: wwm_in_shader:
+; GISEL10: ; %bb.0: ; %entry
+; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10-NEXT: s_or_saveexec_b32 s8, -1
+; GISEL10-NEXT: v_mov_b32_e32 v10, v12
+; GISEL10-NEXT: v_mov_b32_e32 v11, v13
+; GISEL10-NEXT: s_mov_b32 s6, s3
+; GISEL10-NEXT: s_mov_b32 s7, s4
+; GISEL10-NEXT: s_and_saveexec_b32 s3, s8
+; GISEL10-NEXT: ; %bb.1: ; %shader
+; GISEL10-NEXT: s_or_saveexec_b32 s4, -1
+; GISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4
+; GISEL10-NEXT: v_cmp_ne_u32_e64 s8, 0, v0
+; GISEL10-NEXT: v_mov_b32_e32 v0, s8
+; GISEL10-NEXT: s_mov_b32 exec_lo, s4
+; GISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v10
+; GISEL10-NEXT: v_mov_b32_e32 v11, v0
+; GISEL10-NEXT: ; %bb.2: ; %tail
+; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GISEL10-NEXT: s_mov_b32 exec_lo, s5
+; GISEL10-NEXT: s_setpc_b64 s[6:7]
+;
+; DAGISEL10-LABEL: wwm_in_shader:
+; DAGISEL10: ; %bb.0: ; %entry
+; DAGISEL10-NEXT:...
[truncated]
@llvm/pr-subscribers-backend-amdgpu Author: Diana Picus (rovka) Changes: This reverts commit c7a7767. The buildbots failed because I removed an MI from its parent before updating LIS. This PR should fix that. Patch is 84.08 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/108173.diff (same patch as above).
LLVM Buildbot has detected new failures on the following builders. Full details are available at:
- https://lab.llvm.org/buildbot/#/builders/140/builds/6380
- https://lab.llvm.org/buildbot/#/builders/174/builds/4962
- https://lab.llvm.org/buildbot/#/builders/185/builds/5052
- https://lab.llvm.org/buildbot/#/builders/137/builds/5098
- https://lab.llvm.org/buildbot/#/builders/175/builds/5072
- https://lab.llvm.org/buildbot/#/builders/108/builds/3613
- https://lab.llvm.org/buildbot/#/builders/33/builds/2870
- https://lab.llvm.org/buildbot/#/builders/56/builds/7164
- https://lab.llvm.org/buildbot/#/builders/60/builds/7370
- https://lab.llvm.org/buildbot/#/builders/16/builds/5199
- https://lab.llvm.org/buildbot/#/builders/153/builds/8653
This reverts commit c7a7767.
The buildbots failed because I removed an MI from its parent before updating LIS. This PR should fix that.