Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 106 additions & 29 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1241,46 +1241,123 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
(ScalarSize == 16 || ScalarSize == 8)) {
// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
unsigned RequestedElts =
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
//
// We assume that shuffling at a register granularity can be done for free.
// This is not true for vectors fed into memory instructions, but it is
// effectively true for all other shuffling. The emphasis of the logic here
// is to assist generic transform in cleaning up / canonicalizing those
// shuffles.

// With op_sel VOP3P instructions freely can access the low half or high
// half of a register, so any swizzle of two elements is free.
if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
unsigned NumSrcElts = SrcVecTy->getNumElements();
if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
(Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
Kind == TTI::SK_PermuteSingleSrc))
return 0;
}

unsigned EltsPerReg = 32 / ScalarSize;
if (RequestedElts == 0)
return 0;
switch (Kind) {
case TTI::SK_Broadcast:
// A single v_perm_b32 can be re-used for all destination registers.
return 1;
case TTI::SK_Reverse:
case TTI::SK_PermuteSingleSrc: {
// With op_sel VOP3P instructions freely can access the low half or high
// half of a register, so any swizzle of two elements is free.
if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
return 0;
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
// SK_Broadcast just reuses the same mask
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
return NumPerms + NumPermMasks;
}
// One instruction per register.
if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
return InstructionCost::getInvalid();
case TTI::SK_ExtractSubvector:
if (Index % EltsPerReg == 0)
return 0; // Shuffling at register granularity
if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
return InstructionCost::getInvalid();
case TTI::SK_InsertSubvector: {
// Even aligned accesses are free
if (!(Index % 2))
return 0;
// Insert/extract subvectors only require shifts / extract code to get the
// relevant bits
return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
if (!DstVecTy)
return InstructionCost::getInvalid();
unsigned NumDstElts = DstVecTy->getNumElements();
unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
unsigned EndIndex = Index + NumInsertElts;
unsigned BeginSubIdx = Index % EltsPerReg;
unsigned EndSubIdx = EndIndex % EltsPerReg;
unsigned Cost = 0;

if (BeginSubIdx != 0) {
// Need to shift the inserted vector into place. The cost is the number
// of destination registers overlapped by the inserted vector.
Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
}

// If the last register overlap is partial, there may be three source
// registers feeding into it; that takes an extra instruction.
if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
Cost += 1;

return Cost;
}
case TTI::SK_PermuteTwoSrc:
case TTI::SK_Splice:
case TTI::SK_Select: {
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
// SK_Select just reuses the same mask
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
return NumPerms + NumPermMasks;
case TTI::SK_Splice: {
auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
if (!DstVecTy)
return InstructionCost::getInvalid();
unsigned NumElts = DstVecTy->getNumElements();
assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
// Determine the sub-region of the result vector that requires
// sub-register shuffles / mixing.
unsigned EltsFromLHS = NumElts - Index;
bool LHSIsAligned = (Index % EltsPerReg) == 0;
bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
if (LHSIsAligned && RHSIsAligned)
return 0;
if (LHSIsAligned && !RHSIsAligned)
return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
if (!LHSIsAligned && RHSIsAligned)
return divideCeil(EltsFromLHS, EltsPerReg);
return divideCeil(NumElts, EltsPerReg);
}

default:
break;
}

if (!Mask.empty()) {
unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();

// Generically estimate the cost by assuming that each destination
// register is derived from sources via v_perm_b32 instructions if it
// can't be copied as-is.
//
// For each destination register, derive the cost of obtaining it based
// on the number of source registers that feed into it.
unsigned Cost = 0;
for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
SmallVector<int, 4> Regs;
bool Aligned = true;
for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
int SrcIdx = Mask[DstIdx + I];
if (SrcIdx == -1)
continue;
int Reg;
if (SrcIdx < (int)NumSrcElts) {
Reg = SrcIdx / EltsPerReg;
if (SrcIdx % EltsPerReg != I)
Aligned = false;
} else {
Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
Aligned = false;
}
if (!llvm::is_contained(Regs, Reg))
Regs.push_back(Reg);
}
if (Regs.size() >= 2)
Cost += Regs.size() - 1;
else if (!Aligned)
Cost += 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this accounting for?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the case where this destination register needs bytes from only a single source register, but those bytes aren't at the same position. So, one instruction is needed to produce the destination register.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense. Could do with a comment.

}
return Cost;
}
}

return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
Expand Down
Loading
Loading