Skip to content

AMDGPU: Remove ds atomic fadd intrinsics #95396

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19084,7 +19084,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
EmitScalarExpr(E->getArg(3)), AO, SSID);
} else {
// The ds_fadd_* builtins do not have syncscope/order arguments.
// The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
SSID = llvm::SyncScope::System;
AO = AtomicOrdering::SequentiallyConsistent;

Expand Down
5 changes: 0 additions & 5 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -571,7 +571,6 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
def int_amdgcn_ds_fmax : AMDGPULDSIntrin;

Expand Down Expand Up @@ -2930,10 +2929,6 @@ multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
[llvm_v2i16_ty],
[LLVMQualPointerType<3>, llvm_v2i16_ty],
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;

defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
Expand Down
92 changes: 66 additions & 26 deletions llvm/lib/IR/AutoUpgrade.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,12 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
break; // No other 'amdgcn.atomic.*'
}

if (Name.starts_with("ds.fadd")) {
// Replaced with atomicrmw fadd, so there's no new declaration.
NewFn = nullptr;
return true;
}

if (Name.starts_with("ldexp.")) {
// Target specific intrinsic became redundant
NewFn = Intrinsic::getDeclaration(
Expand Down Expand Up @@ -2331,40 +2337,74 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,
llvm_unreachable("Unknown function for ARM CallBase upgrade.");
}

// These are expected to have the arguments:
// atomic.intrin (ptr, rmw_value, ordering, scope, isVolatile)
//
// Except for int_amdgcn_ds_fadd_v2bf16 which only has (ptr, rmw_value).
//
static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
Function *F, IRBuilder<> &Builder) {
const bool IsInc = Name.starts_with("atomic.inc.");
if (IsInc || Name.starts_with("atomic.dec.")) {
if (CI->getNumOperands() != 6) // Malformed bitcode.
return nullptr;
AtomicRMWInst::BinOp RMWOp =
StringSwitch<AtomicRMWInst::BinOp>(Name)
.StartsWith("ds.fadd", AtomicRMWInst::FAdd)
.StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
.StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);

unsigned NumOperands = CI->getNumOperands();
if (NumOperands < 3) // Malformed bitcode.
return nullptr;

AtomicRMWInst::BinOp RMWOp =
IsInc ? AtomicRMWInst::UIncWrap : AtomicRMWInst::UDecWrap;
Value *Ptr = CI->getArgOperand(0);
if (!isa<PointerType>(Ptr->getType())) // Malformed.
return nullptr;

Value *Ptr = CI->getArgOperand(0);
Value *Val = CI->getArgOperand(1);
ConstantInt *OrderArg = dyn_cast<ConstantInt>(CI->getArgOperand(2));
Value *Val = CI->getArgOperand(1);
if (Val->getType() != CI->getType()) // Malformed.
return nullptr;

ConstantInt *OrderArg = nullptr;
bool IsVolatile = false;

// These should have 5 arguments (plus the callee). A separate version of the
// ds_fadd intrinsic was defined for bf16 which was missing arguments.
if (NumOperands > 3)
OrderArg = dyn_cast<ConstantInt>(CI->getArgOperand(2));

// Ignore scope argument at 3

if (NumOperands > 5) {
ConstantInt *VolatileArg = dyn_cast<ConstantInt>(CI->getArgOperand(4));
IsVolatile = !VolatileArg || !VolatileArg->isZero();
}

AtomicOrdering Order = AtomicOrdering::SequentiallyConsistent;
if (OrderArg && isValidAtomicOrdering(OrderArg->getZExtValue()))
Order = static_cast<AtomicOrdering>(OrderArg->getZExtValue());
if (Order == AtomicOrdering::NotAtomic ||
Order == AtomicOrdering::Unordered)
Order = AtomicOrdering::SequentiallyConsistent;

// The scope argument never really worked correctly. Use agent as the most
// conservative option which should still always produce the instruction.
SyncScope::ID SSID = F->getContext().getOrInsertSyncScopeID("agent");
AtomicRMWInst *RMW =
Builder.CreateAtomicRMW(RMWOp, Ptr, Val, std::nullopt, Order, SSID);

if (!VolatileArg || !VolatileArg->isZero())
RMW->setVolatile(true);
return RMW;
AtomicOrdering Order = AtomicOrdering::SequentiallyConsistent;
if (OrderArg && isValidAtomicOrdering(OrderArg->getZExtValue()))
Order = static_cast<AtomicOrdering>(OrderArg->getZExtValue());
if (Order == AtomicOrdering::NotAtomic || Order == AtomicOrdering::Unordered)
Order = AtomicOrdering::SequentiallyConsistent;

LLVMContext &Ctx = F->getContext();

// Handle the v2bf16 intrinsic which used <2 x i16> instead of <2 x bfloat>
Type *RetTy = CI->getType();
if (VectorType *VT = dyn_cast<VectorType>(RetTy)) {
if (VT->getElementType()->isIntegerTy(16)) {
VectorType *AsBF16 =
VectorType::get(Type::getBFloatTy(Ctx), VT->getElementCount());
Val = Builder.CreateBitCast(Val, AsBF16);
}
}

llvm_unreachable("Unknown function for AMDGPU intrinsic upgrade.");
// The scope argument never really worked correctly. Use agent as the most
// conservative option which should still always produce the instruction.
SyncScope::ID SSID = Ctx.getOrInsertSyncScopeID("agent");
AtomicRMWInst *RMW =
Builder.CreateAtomicRMW(RMWOp, Ptr, Val, std::nullopt, Order, SSID);

if (IsVolatile)
RMW->setVolatile(true);

return Builder.CreateBitCast(RMW, RetTy);
}

/// Helper to unwrap intrinsic call MetadataAsValue operands.
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -630,7 +630,6 @@ defm int_amdgcn_global_atomic_fmin : noret_op;
defm int_amdgcn_global_atomic_fmax : noret_op;
defm int_amdgcn_global_atomic_csub : noret_op;
defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
defm int_amdgcn_flat_atomic_fmax_num : noret_op;
Expand Down
3 changes: 0 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5403,8 +5403,6 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,

static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
switch (IID) {
case Intrinsic::amdgcn_ds_fadd:
return AMDGPU::G_ATOMICRMW_FADD;
case Intrinsic::amdgcn_ds_fmin:
return AMDGPU::G_ATOMICRMW_FMIN;
case Intrinsic::amdgcn_ds_fmax:
Expand Down Expand Up @@ -7333,7 +7331,6 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeBufferAtomic(MI, B, IntrID);
case Intrinsic::amdgcn_rsq_clamp:
return legalizeRsqClampIntrinsic(MI, MRI, B);
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
Expand Down
3 changes: 1 addition & 2 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4905,8 +4905,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_load_tr_b128:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fadd_v2bf16: {
case Intrinsic::amdgcn_ds_ordered_swap: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
Expand Down
2 changes: 0 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Original file line number Diff line number Diff line change
Expand Up @@ -252,10 +252,8 @@ def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_ds_fadd>;
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
def : SourceOfDivergence<int_amdgcn_ds_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>;
Expand Down
3 changes: 0 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,6 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
switch (Inst->getIntrinsicID()) {
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
Expand Down Expand Up @@ -1019,7 +1018,6 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const {
switch (IID) {
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_is_shared:
Expand All @@ -1041,7 +1039,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value *NewV) const {
auto IntrID = II->getIntrinsicID();
switch (IntrID) {
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
Expand Down
10 changes: 0 additions & 10 deletions llvm/lib/Target/AMDGPU/DSInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1142,16 +1142,6 @@ def : DSAtomicRetPatIntrinsic<DS_ADD_F64, f64, int_amdgcn_flat_atomic_fadd_noret

let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;

def : GCNPat <
(v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)),
(DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
>;
let AddedComplexity = 1 in
def : GCNPat <
(v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)),
(DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
>;
} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts

let OtherPredicates = [HasGDS] in
Expand Down
15 changes: 0 additions & 15 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1280,7 +1280,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
switch (IntrID) {
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Expand Down Expand Up @@ -1451,7 +1450,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_ordered_add:
Expand Down Expand Up @@ -8700,19 +8698,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
case Intrinsic::amdgcn_ds_fadd: {
MemSDNode *M = cast<MemSDNode>(Op);
unsigned Opc;
switch (IntrID) {
case Intrinsic::amdgcn_ds_fadd:
Opc = ISD::ATOMIC_LOAD_FADD;
break;
}

return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
M->getOperand(0), M->getOperand(2), M->getOperand(3),
M->getMemOperand());
}
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
Expand Down
Loading
Loading