diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 052b231d62a3e..e2210c1c5dea2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -22,6 +22,7 @@ #include "AMDKernelCodeT.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUMCExpr.h" #include "MCTargetDesc/AMDGPUMCKernelDescriptor.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "R600AsmPrinter.h" @@ -134,6 +135,15 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { getTargetStreamer()->getPALMetadata()->readFromIR(M); } +uint64_t AMDGPUAsmPrinter::getMCExprValue(const MCExpr *Value, MCContext &Ctx) { + int64_t Val; + if (!Value->evaluateAsAbsolute(Val)) { + Ctx.reportError(SMLoc(), "could not resolve expression when required."); + return 0; + } + return static_cast(Val); +} + void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { // Init target streamer if it has not yet happened if (!IsTargetStreamerInitialized) @@ -237,12 +247,14 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { getNameWithPrefix(KernelName, &MF->getFunction()); getTargetStreamer()->EmitAmdhsaKernelDescriptor( STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), - CurrentProgramInfo.NumVGPRsForWavesPerEU, - CurrentProgramInfo.NumSGPRsForWavesPerEU - + getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Context), + getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Context) - IsaInfo::getNumExtraSGPRs( - &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, + &STM, getMCExprValue(CurrentProgramInfo.VCCUsed, Context), + getMCExprValue(CurrentProgramInfo.FlatUsed, Context), getTargetStreamer()->getTargetID()->isXnackOnOrAny()), - CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed); + getMCExprValue(CurrentProgramInfo.VCCUsed, Context), + getMCExprValue(CurrentProgramInfo.FlatUsed, Context)); Streamer.popSection(); } @@ -422,7 +434,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; } - if (CurrentProgramInfo.DynamicCallStack && + if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, MF.getContext()) && CodeObjectVersion >= AMDGPU::AMDHSA_COV5) KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK; @@ -439,29 +451,22 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF, MCKernelDescriptor KernelDescriptor; - assert(isUInt<32>(PI.ScratchSize)); - assert(isUInt<32>(PI.getComputePGMRSrc1(STM))); - assert(isUInt<32>(PI.getComputePGMRSrc2())); - KernelDescriptor.group_segment_fixed_size = MCConstantExpr::create(PI.LDSSize, Ctx); - KernelDescriptor.private_segment_fixed_size = - MCConstantExpr::create(PI.ScratchSize, Ctx); + KernelDescriptor.private_segment_fixed_size = PI.ScratchSize; Align MaxKernArgAlign; KernelDescriptor.kernarg_size = MCConstantExpr::create( STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx); - KernelDescriptor.compute_pgm_rsrc1 = - MCConstantExpr::create(PI.getComputePGMRSrc1(STM), Ctx); - KernelDescriptor.compute_pgm_rsrc2 = - MCConstantExpr::create(PI.getComputePGMRSrc2(), Ctx); + KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx); + KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx); KernelDescriptor.kernel_code_properties = MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx); - assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); - KernelDescriptor.compute_pgm_rsrc3 = MCConstantExpr::create( - STM.hasGFX90AInsts() ? CurrentProgramInfo.ComputePGMRSrc3GFX90A : 0, Ctx); + assert(STM.hasGFX90AInsts() || + getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0); + KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A; KernelDescriptor.kernarg_preload = MCConstantExpr::create( AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0, @@ -477,9 +482,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { initTargetStreamer(*MF.getFunction().getParent()); ResourceUsage = &getAnalysis(); - CurrentProgramInfo = SIProgramInfo(); + CurrentProgramInfo.reset(MF); const AMDGPUMachineFunction *MFI = MF.getInfo(); + MCContext &Ctx = MF.getContext(); // The starting address of all shader programs must be 256 bytes aligned. // Regular functions just need the basic required instruction alignment. @@ -550,11 +556,13 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->emitRawComment(" Kernel info:", false); emitCommonFunctionComments( - CurrentProgramInfo.NumArchVGPR, - STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR + getMCExprValue(CurrentProgramInfo.NumArchVGPR, Ctx), + STM.hasMAIInsts() ? getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx) : std::optional(), - CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR, - CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI); + getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx), + getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx), + getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx), + getFunctionCodeSize(MF), MFI); OutStreamer->emitRawComment( " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); @@ -565,32 +573,44 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { " bytes/workgroup (compile time only)", false); OutStreamer->emitRawComment( - " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false); + " SGPRBlocks: " + + Twine(getMCExprValue(CurrentProgramInfo.SGPRBlocks, Ctx)), + false); OutStreamer->emitRawComment( - " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false); + " VGPRBlocks: " + + Twine(getMCExprValue(CurrentProgramInfo.VGPRBlocks, Ctx)), + false); OutStreamer->emitRawComment( - " NumSGPRsForWavesPerEU: " + - Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false); + " NumSGPRsForWavesPerEU: " + + Twine( + getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)), + false); OutStreamer->emitRawComment( - " NumVGPRsForWavesPerEU: " + - Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); + " NumVGPRsForWavesPerEU: " + + Twine( + getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)), + false); if (STM.hasGFX90AInsts()) OutStreamer->emitRawComment( - " AccumOffset: " + - Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false); + " AccumOffset: " + + Twine((getMCExprValue(CurrentProgramInfo.AccumOffset, Ctx) + 1) * + 4), + false); OutStreamer->emitRawComment( - " Occupancy: " + - Twine(CurrentProgramInfo.Occupancy), false); + " Occupancy: " + + Twine(getMCExprValue(CurrentProgramInfo.Occupancy, Ctx)), + false); OutStreamer->emitRawComment( " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); - OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " + - Twine(CurrentProgramInfo.ScratchEnable), - false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:SCRATCH_EN: " + + Twine(getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx)), + false); OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + Twine(CurrentProgramInfo.UserSGPR), false); @@ -611,18 +631,20 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); assert(STM.hasGFX90AInsts() || - CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); + getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0); if (STM.hasGFX90AInsts()) { OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " + - Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A, - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))), - false); + " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " + + Twine((AMDHSA_BITS_GET( + getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx), + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))), + false); OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " + - Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A, - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))), - false); + " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " + + Twine((AMDHSA_BITS_GET( + getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx), + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))), + false); } } @@ -702,23 +724,40 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = ResourceUsage->getResourceInfo(&MF.getFunction()); const GCNSubtarget &STM = MF.getSubtarget(); + MCContext &Ctx = MF.getContext(); + + auto CreateExpr = [&Ctx](int64_t Value) { + return MCConstantExpr::create(Value, Ctx); + }; - ProgInfo.NumArchVGPR = Info.NumVGPR; - ProgInfo.NumAccVGPR = Info.NumAGPR; - ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM); - ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1; + auto TryGetMCExprValue = [&Ctx](const MCExpr *Value, uint64_t &Res) -> bool { + int64_t Val; + if (Value->evaluateAsAbsolute(Val)) { + Res = Val; + return true; + } + return false; + }; + + ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR); + ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR); + ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM)); + ProgInfo.AccumOffset = + CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1); ProgInfo.TgSplit = STM.isTgSplitEnabled(); - ProgInfo.NumSGPR = Info.NumExplicitSGPR; - ProgInfo.ScratchSize = Info.PrivateSegmentSize; - ProgInfo.VCCUsed = Info.UsesVCC; - ProgInfo.FlatUsed = Info.UsesFlatScratch; - ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion; + ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR); + ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize); + ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC); + ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch); + ProgInfo.DynamicCallStack = + CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion); const uint64_t MaxScratchPerWorkitem = STM.getMaxWaveScratchSize() / STM.getWavefrontSize(); - if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) { - DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), - ProgInfo.ScratchSize, + uint64_t ScratchSize; + if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) && + ScratchSize > MaxScratchPerWorkitem) { + DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize, MaxScratchPerWorkitem, DS_Error); MF.getFunction().getContext().diagnose(DiagStackSize); } @@ -728,27 +767,29 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // The calculations related to SGPR/VGPR blocks are // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be // unified. - unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs( - &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed, - getTargetStreamer()->getTargetID()->isXnackOnOrAny()); + const MCExpr *ExtraSGPRs = AMDGPUVariadicMCExpr::createExtraSGPRs( + ProgInfo.VCCUsed, ProgInfo.FlatUsed, + getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx); // Check the addressable register limit before we add ExtraSGPRs. if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && !STM.hasSGPRInitBug()) { unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); - if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { + uint64_t NumSgpr; + if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) && + NumSgpr > MaxAddressableNumSGPRs) { // This can happen due to a compiler bug or when using inline asm. LLVMContext &Ctx = MF.getFunction().getContext(); DiagnosticInfoResourceLimit Diag( - MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR, + MF.getFunction(), "addressable scalar registers", NumSgpr, MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit); Ctx.diagnose(Diag); - ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1; + ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1); } } // Account for extra SGPRs and VGPRs reserved for debugger use. - ProgInfo.NumSGPR += ExtraSGPRs; + ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx); const Function &F = MF.getFunction(); @@ -819,40 +860,51 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } } } - ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR); - ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR); - ProgInfo.NumVGPR = - Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR); + ProgInfo.NumSGPR = AMDGPUVariadicMCExpr::createMax( + {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx); + + ProgInfo.NumArchVGPR = AMDGPUVariadicMCExpr::createMax( + {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx); + + ProgInfo.NumVGPR = AMDGPUVariadicMCExpr::createTotalNumVGPR( + ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx); } // Adjust number of registers used to meet default/requested minimum/maximum // number of waves per execution unit request. - ProgInfo.NumSGPRsForWavesPerEU = std::max( - std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU())); - ProgInfo.NumVGPRsForWavesPerEU = std::max( - std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU())); + unsigned MaxWaves = MFI->getMaxWavesPerEU(); + ProgInfo.NumSGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax( + {ProgInfo.NumSGPR, CreateExpr(1ul), + CreateExpr(STM.getMinNumSGPRs(MaxWaves))}, + Ctx); + ProgInfo.NumVGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax( + {ProgInfo.NumVGPR, CreateExpr(1ul), + CreateExpr(STM.getMinNumVGPRs(MaxWaves))}, + Ctx); if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || STM.hasSGPRInitBug()) { unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); - if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { + uint64_t NumSgpr; + if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) && + NumSgpr > MaxAddressableNumSGPRs) { // This can happen due to a compiler bug or when using inline asm to use // the registers which are usually reserved for vcc etc. LLVMContext &Ctx = MF.getFunction().getContext(); DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers", - ProgInfo.NumSGPR, MaxAddressableNumSGPRs, + NumSgpr, MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit); Ctx.diagnose(Diag); - ProgInfo.NumSGPR = MaxAddressableNumSGPRs; - ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs; + ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs); + ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs); } } if (STM.hasSGPRInitBug()) { ProgInfo.NumSGPR = - AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; + CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG); ProgInfo.NumSGPRsForWavesPerEU = - AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; + CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG); } if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) { @@ -871,11 +923,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, STM.getAddressableLocalMemorySize(), DS_Error); Ctx.diagnose(Diag); } + // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks: + // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1 + auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR, + unsigned Granule) { + const MCExpr *OneConst = CreateExpr(1ul); + const MCExpr *GranuleConst = CreateExpr(Granule); + const MCExpr *MaxNumGPR = + AMDGPUVariadicMCExpr::createMax({NumGPR, OneConst}, Ctx); + const MCExpr *AlignToGPR = + AMDGPUVariadicMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx); + const MCExpr *DivGPR = + MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx); + const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx); + return SubGPR; + }; - ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks( - &STM, ProgInfo.NumSGPRsForWavesPerEU); - ProgInfo.VGPRBlocks = - IsaInfo::getEncodedNumVGPRBlocks(&STM, ProgInfo.NumVGPRsForWavesPerEU); + ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU, + IsaInfo::getSGPREncodingGranule(&STM)); + ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU, + IsaInfo::getVGPREncodingGranule(&STM)); const SIModeRegisterDefaults Mode = MFI->getMode(); @@ -904,14 +971,23 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.LDSBlocks = alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift; + // The MCExpr equivalent of divideCeil. + auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) { + const MCExpr *Ceil = + AMDGPUVariadicMCExpr::createAlignTo(Numerator, Denominator, Ctx); + return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx); + }; + // Scratch is allocated in 64-dword or 256-dword blocks. unsigned ScratchAlignShift = STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10; // We need to program the hardware with the amount of scratch memory that // is used by the entire wave. ProgInfo.ScratchSize is the amount of // scratch memory used per thread. - ProgInfo.ScratchBlocks = divideCeil( - ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift); + ProgInfo.ScratchBlocks = DivideCeil( + MCBinaryExpr::createMul(ProgInfo.ScratchSize, + CreateExpr(STM.getWavefrontSize()), Ctx), + CreateExpr(1ULL << ScratchAlignShift)); if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; @@ -930,8 +1006,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // anything to disable it if we know the stack isn't used here. We may still // have emitted code reading it to initialize scratch, but if that's unused // reading garbage should be OK. - ProgInfo.ScratchEnable = - ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack; + ProgInfo.ScratchEnable = MCBinaryExpr::createLOr( + MCBinaryExpr::createGT(ProgInfo.ScratchBlocks, + MCConstantExpr::create(0, Ctx), Ctx), + ProgInfo.DynamicCallStack, Ctx); + ProgInfo.UserSGPR = MFI->getNumUserSGPRs(); // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. ProgInfo.TrapHandlerEnable = @@ -947,26 +1026,41 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.EXCPEnable = 0; if (STM.hasGFX90AInsts()) { - AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A, - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, - ProgInfo.AccumOffset); - AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A, - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, - ProgInfo.TgSplit); + // return ((Dst & ~Mask) | (Value << Shift)) + auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask, + uint32_t Shift) { + auto Shft = MCConstantExpr::create(Shift, Ctx); + auto Msk = MCConstantExpr::create(Mask, Ctx); + Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx); + Dst = MCBinaryExpr::createOr( + Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx); + return Dst; + }; + + ProgInfo.ComputePGMRSrc3GFX90A = + SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT); + ProgInfo.ComputePGMRSrc3GFX90A = + SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit), + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT); } - ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize, - ProgInfo.NumSGPRsForWavesPerEU, - ProgInfo.NumVGPRsForWavesPerEU); + ProgInfo.Occupancy = AMDGPUVariadicMCExpr::createOccupancy( + STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU, + ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx); + const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true); - if (ProgInfo.Occupancy < MinWEU) { + uint64_t Occupancy; + if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) { DiagnosticInfoOptimizationFailure Diag( F, F.getSubprogram(), "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in " "'" + F.getName() + "': desired occupancy was " + Twine(MinWEU) + - ", final occupancy is " + Twine(ProgInfo.Occupancy)); + ", final occupancy is " + Twine(Occupancy)); F.getContext().diagnose(Diag); } } @@ -989,36 +1083,78 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, const SIMachineFunctionInfo *MFI = MF.getInfo(); const GCNSubtarget &STM = MF.getSubtarget(); unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv()); + MCContext &Ctx = MF.getContext(); + + // (((Value) & Mask) << Shift) + auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) { + const MCExpr *msk = MCConstantExpr::create(Mask, Ctx); + const MCExpr *shft = MCConstantExpr::create(Shift, Ctx); + return MCBinaryExpr::createShl(MCBinaryExpr::createAnd(Value, msk, Ctx), + shft, Ctx); + }; + + auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) { + int64_t Val; + if (Value->evaluateAsAbsolute(Val)) + OutStreamer->emitIntValue(static_cast(Val), Size); + else + OutStreamer->emitValue(Value, Size); + }; if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1); - OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1(STM)); + EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx), + /*Size=*/4); OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2); - OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2()); + EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4); OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); - OutStreamer->emitInt32( - STM.getGeneration() >= AMDGPUSubtarget::GFX12 - ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks) - : STM.getGeneration() == AMDGPUSubtarget::GFX11 - ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks) - : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks)); + + // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the + // appropriate generation. + if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) { + EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, + /*Mask=*/0x3FFFF, /*Shift=*/12), + /*Size=*/4); + } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) { + EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, + /*Mask=*/0x7FFF, /*Shift=*/12), + /*Size=*/4); + } else { + EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, + /*Mask=*/0x1FFF, /*Shift=*/12), + /*Size=*/4); + } // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = // 0" comment but I don't see a corresponding field in the register spec. } else { OutStreamer->emitInt32(RsrcReg); - OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | - S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); + + const MCExpr *GPRBlocks = MCBinaryExpr::createOr( + SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0), + SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6), + MF.getContext()); + EmitResolvedOrExpr(GPRBlocks, /*Size=*/4); OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE); - OutStreamer->emitInt32( - STM.getGeneration() >= AMDGPUSubtarget::GFX12 - ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks) - : STM.getGeneration() == AMDGPUSubtarget::GFX11 - ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks) - : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks)); + + // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the + // appropriate generation. + if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) { + EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, + /*Mask=*/0x3FFFF, /*Shift=*/12), + /*Size=*/4); + } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) { + EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, + /*Mask=*/0x7FFF, /*Shift=*/12), + /*Size=*/4); + } else { + EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, + /*Mask=*/0x1FFF, /*Shift=*/12), + /*Size=*/4); + } } if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { @@ -1070,33 +1206,38 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, const SIMachineFunctionInfo *MFI = MF.getInfo(); auto CC = MF.getFunction().getCallingConv(); auto MD = getTargetStreamer()->getPALMetadata(); + auto &Ctx = MF.getContext(); MD->setEntryPoint(CC, MF.getFunction().getName()); - MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU); + MD->setNumUsedVgprs( + CC, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)); // Only set AGPRs for supported devices const GCNSubtarget &STM = MF.getSubtarget(); if (STM.hasMAIInsts()) { - MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR); + MD->setNumUsedAgprs(CC, getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx)); } - MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU); + MD->setNumUsedSgprs( + CC, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)); if (MD->getPALMajorVersion() < 3) { MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM)); if (AMDGPU::isCompute(CC)) { MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2()); } else { - if (CurrentProgramInfo.ScratchBlocks > 0) + if (getMCExprValue(CurrentProgramInfo.ScratchBlocks, Ctx) > 0) MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1)); } } else { MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode); - MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable); + MD->setHwStage(CC, ".scratch_en", + (bool)getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx)); EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM); } // ScratchSize is in bytes, 16 aligned. - MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16)); + MD->setScratchSize( + CC, alignTo(getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx), 16)); if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2) @@ -1145,6 +1286,7 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) { StringRef FnName = MF.getFunction().getName(); MD->setFunctionScratchSize(FnName, MFI.getStackSize()); const GCNSubtarget &ST = MF.getSubtarget(); + MCContext &Ctx = MF.getContext(); if (MD->getPALMajorVersion() < 3) { // Set compute registers @@ -1158,8 +1300,10 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) { // Set optional info MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize); - MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU); - MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU); + MD->setFunctionNumUsedVgprs( + FnName, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)); + MD->setFunctionNumUsedSgprs( + FnName, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)); } // This is supposed to be log2(Size) @@ -1185,6 +1329,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, const SIMachineFunctionInfo *MFI = MF.getInfo(); const GCNSubtarget &STM = MF.getSubtarget(); + MCContext &Ctx = MF.getContext(); AMDGPU::initDefaultAMDKernelCodeT(Out, &STM); @@ -1193,7 +1338,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, (CurrentProgramInfo.getComputePGMRSrc2() << 32); Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; - if (CurrentProgramInfo.DynamicCallStack) + if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, Ctx)) Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK; AMD_HSA_BITS_SET(Out.code_properties, @@ -1229,9 +1374,10 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, Align MaxKernArgAlign; Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); - Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; - Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; - Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; + Out.wavefront_sgpr_count = getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx); + Out.workitem_vgpr_count = getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx); + Out.workitem_private_segment_byte_size = + getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx); Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; // kernarg_segment_alignment is specified as log of the alignment. @@ -1322,19 +1468,28 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks( // remarks to simulate newlines. If and when clang does accept newlines, this // formatting should be aggregated into one remark with newlines to avoid // printing multiple diagnostic location and diag opts. + MCContext &MCCtx = MF.getContext(); EmitResourceUsageRemark("FunctionName", "Function Name", MF.getFunction().getName()); - EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR); - EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR); - if (hasMAIInsts) - EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR); - EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]", - CurrentProgramInfo.ScratchSize); + EmitResourceUsageRemark("NumSGPR", "SGPRs", + getMCExprValue(CurrentProgramInfo.NumSGPR, MCCtx)); + EmitResourceUsageRemark( + "NumVGPR", "VGPRs", + getMCExprValue(CurrentProgramInfo.NumArchVGPR, MCCtx)); + if (hasMAIInsts) { + EmitResourceUsageRemark( + "NumAGPR", "AGPRs", + getMCExprValue(CurrentProgramInfo.NumAccVGPR, MCCtx)); + } + EmitResourceUsageRemark( + "ScratchSize", "ScratchSize [bytes/lane]", + getMCExprValue(CurrentProgramInfo.ScratchSize, MCCtx)); StringRef DynamicStackStr = - CurrentProgramInfo.DynamicCallStack ? "True" : "False"; + getMCExprValue(CurrentProgramInfo.DynamicCallStack, MCCtx) ? "True" + : "False"; EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr); EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]", - CurrentProgramInfo.Occupancy); + getMCExprValue(CurrentProgramInfo.Occupancy, MCCtx)); EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill", CurrentProgramInfo.SGPRSpill); EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index b8b2718d293e6..16d8952a533ef 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -78,6 +78,8 @@ class AMDGPUAsmPrinter final : public AsmPrinter { void initTargetStreamer(Module &M); + static uint64_t getMCExprValue(const MCExpr *Value, MCContext &Ctx); + public: explicit AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 9e288ab50e170..7ab9ba2851332 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -19,6 +19,8 @@ #include "SIMachineFunctionInfo.h" #include "SIProgramInfo.h" #include "llvm/IR/Module.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" using namespace llvm; static std::pair getArgumentTypeAlign(const Argument &Arg, @@ -462,6 +464,16 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, const SIMachineFunctionInfo &MFI = *MF.getInfo(); const Function &F = MF.getFunction(); + auto GetMCExprValue = [&MF](const MCExpr *Value) { + int64_t Val; + if (!Value->evaluateAsAbsolute(Val)) { + MCContext &Ctx = MF.getContext(); + Ctx.reportError(SMLoc(), "could not resolve expression when required."); + Val = 0; + } + return static_cast(Val); + }; + auto Kern = HSAMetadataDoc->getMapNode(); Align MaxKernArgAlign; @@ -470,10 +482,11 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, Kern[".group_segment_fixed_size"] = Kern.getDocument()->getNode(ProgramInfo.LDSSize); Kern[".private_segment_fixed_size"] = - Kern.getDocument()->getNode(ProgramInfo.ScratchSize); - if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5) - Kern[".uses_dynamic_stack"] = - Kern.getDocument()->getNode(ProgramInfo.DynamicCallStack); + Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.ScratchSize)); + if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5) { + Kern[".uses_dynamic_stack"] = Kern.getDocument()->getNode( + static_cast(GetMCExprValue(ProgramInfo.DynamicCallStack))); + } if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5 && STM.supportsWGP()) Kern[".workgroup_processor_mode"] = @@ -484,12 +497,15 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value()); Kern[".wavefront_size"] = Kern.getDocument()->getNode(STM.getWavefrontSize()); - Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR); - Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR); + Kern[".sgpr_count"] = + Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumSGPR)); + Kern[".vgpr_count"] = + Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumVGPR)); // Only add AGPR count to metadata for supported devices if (STM.hasMAIInsts()) { - Kern[".agpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumAccVGPR); + Kern[".agpr_count"] = + Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumAccVGPR)); } Kern[".max_flat_workgroup_size"] = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 8f0eae362ecae..2e68e723283c1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -664,29 +664,8 @@ bool GCNSubtarget::useVGPRIndexMode() const { bool GCNSubtarget::useAA() const { return UseAA; } unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { - if (getGeneration() >= AMDGPUSubtarget::GFX10) - return getMaxWavesPerEU(); - - if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - if (SGPRs <= 80) - return 10; - if (SGPRs <= 88) - return 9; - if (SGPRs <= 100) - return 8; - return 7; - } - if (SGPRs <= 48) - return 10; - if (SGPRs <= 56) - return 9; - if (SGPRs <= 64) - return 8; - if (SGPRs <= 72) - return 7; - if (SGPRs <= 80) - return 6; - return 5; + return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(), + getGeneration()); } unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const { diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 3866723521147..7760af27a5ae9 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -8406,12 +8406,16 @@ bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { AGVK VK = StringSwitch(TokenId) .Case("max", AGVK::AGVK_Max) .Case("or", AGVK::AGVK_Or) + .Case("extrasgprs", AGVK::AGVK_ExtraSGPRs) + .Case("totalnumvgprs", AGVK::AGVK_TotalNumVGPRs) + .Case("alignto", AGVK::AGVK_AlignTo) + .Case("occupancy", AGVK::AGVK_Occupancy) .Default(AGVK::AGVK_None); if (VK != AGVK::AGVK_None && peekToken().is(AsmToken::LParen)) { SmallVector Exprs; uint64_t CommaCount = 0; - lex(); // Eat 'max'/'or' + lex(); // Eat Arg ('or', 'max', 'occupancy', etc.) lex(); // Eat '(' while (true) { if (trySkipToken(AsmToken::RParen)) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp index 4578c33d92dce..159664faf983f 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp @@ -7,6 +7,9 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCExpr.h" +#include "GCNSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/IR/Function.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" @@ -16,6 +19,7 @@ #include using namespace llvm; +using namespace llvm::AMDGPU; AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind, ArrayRef Args, @@ -61,6 +65,18 @@ void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS, case AGVK_Max: OS << "max("; break; + case AGVK_ExtraSGPRs: + OS << "extrasgprs("; + break; + case AGVK_TotalNumVGPRs: + OS << "totalnumvgprs("; + break; + case AGVK_AlignTo: + OS << "alignto("; + break; + case AGVK_Occupancy: + OS << "occupancy("; + break; } for (auto It = Args.begin(); It != Args.end(); ++It) { (*It)->print(OS, MAI, /*InParens=*/false); @@ -82,10 +98,151 @@ static int64_t op(AMDGPUVariadicMCExpr::VariadicKind Kind, int64_t Arg1, } } +bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res, + const MCAsmLayout *Layout, + const MCFixup *Fixup) const { + auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) { + MCValue MCVal; + if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) || + !MCVal.isAbsolute()) + return false; + + ConstantValue = MCVal.getConstant(); + return true; + }; + + assert(Args.size() == 3 && + "AMDGPUVariadic Argument count incorrect for ExtraSGPRs"); + const MCSubtargetInfo *STI = Ctx.getSubtargetInfo(); + uint64_t VCCUsed = 0, FlatScrUsed = 0, XNACKUsed = 0; + + bool Success = TryGetMCExprValue(Args[2], XNACKUsed); + + assert(Success && "Arguments 3 for ExtraSGPRs should be a known constant"); + if (!Success || !TryGetMCExprValue(Args[0], VCCUsed) || + !TryGetMCExprValue(Args[1], FlatScrUsed)) + return false; + + uint64_t ExtraSGPRs = IsaInfo::getNumExtraSGPRs( + STI, (bool)VCCUsed, (bool)FlatScrUsed, (bool)XNACKUsed); + Res = MCValue::get(ExtraSGPRs); + return true; +} + +bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res, + const MCAsmLayout *Layout, + const MCFixup *Fixup) const { + auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) { + MCValue MCVal; + if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) || + !MCVal.isAbsolute()) + return false; + + ConstantValue = MCVal.getConstant(); + return true; + }; + assert(Args.size() == 2 && + "AMDGPUVariadic Argument count incorrect for TotalNumVGPRs"); + const MCSubtargetInfo *STI = Ctx.getSubtargetInfo(); + uint64_t NumAGPR = 0, NumVGPR = 0; + + bool Has90AInsts = AMDGPU::isGFX90A(*STI); + + if (!TryGetMCExprValue(Args[0], NumAGPR) || + !TryGetMCExprValue(Args[1], NumVGPR)) + return false; + + uint64_t TotalNum = Has90AInsts && NumAGPR ? alignTo(NumVGPR, 4) + NumAGPR + : std::max(NumVGPR, NumAGPR); + Res = MCValue::get(TotalNum); + return true; +} + +bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res, + const MCAsmLayout *Layout, + const MCFixup *Fixup) const { + auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) { + MCValue MCVal; + if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) || + !MCVal.isAbsolute()) + return false; + + ConstantValue = MCVal.getConstant(); + return true; + }; + + assert(Args.size() == 2 && + "AMDGPUVariadic Argument count incorrect for AlignTo"); + uint64_t Value = 0, Align = 0; + if (!TryGetMCExprValue(Args[0], Value) || !TryGetMCExprValue(Args[1], Align)) + return false; + + Res = MCValue::get(alignTo(Value, Align)); + return true; +} + +bool AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res, + const MCAsmLayout *Layout, + const MCFixup *Fixup) const { + auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) { + MCValue MCVal; + if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) || + !MCVal.isAbsolute()) + return false; + + ConstantValue = MCVal.getConstant(); + return true; + }; + assert(Args.size() == 7 && + "AMDGPUVariadic Argument count incorrect for Occupancy"); + uint64_t InitOccupancy, MaxWaves, Granule, TargetTotalNumVGPRs, Generation, + NumSGPRs, NumVGPRs; + + bool Success = true; + Success &= TryGetMCExprValue(Args[0], MaxWaves); + Success &= TryGetMCExprValue(Args[1], Granule); + Success &= TryGetMCExprValue(Args[2], TargetTotalNumVGPRs); + Success &= TryGetMCExprValue(Args[3], Generation); + Success &= TryGetMCExprValue(Args[4], InitOccupancy); + + assert(Success && "Arguments 1 to 5 for Occupancy should be known constants"); + + if (!Success || !TryGetMCExprValue(Args[5], NumSGPRs) || + !TryGetMCExprValue(Args[6], NumVGPRs)) + return false; + + unsigned Occupancy = InitOccupancy; + if (NumSGPRs) + Occupancy = std::min( + Occupancy, IsaInfo::getOccupancyWithNumSGPRs( + NumSGPRs, MaxWaves, + static_cast(Generation))); + if (NumVGPRs) + Occupancy = std::min(Occupancy, + IsaInfo::getNumWavesPerEUWithNumVGPRs( + NumVGPRs, Granule, MaxWaves, TargetTotalNumVGPRs)); + + Res = MCValue::get(Occupancy); + return true; +} + bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl( MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const { std::optional Total; + switch (Kind) { + default: + break; + case AGVK_ExtraSGPRs: + return evaluateExtraSGPRs(Res, Layout, Fixup); + case AGVK_AlignTo: + return evaluateAlignTo(Res, Layout, Fixup); + case AGVK_TotalNumVGPRs: + return evaluateTotalNumVGPR(Res, Layout, Fixup); + case AGVK_Occupancy: + return evaluateOccupancy(Res, Layout, Fixup); + } + for (const MCExpr *Arg : Args) { MCValue ArgRes; if (!Arg->evaluateAsRelocatable(ArgRes, Layout, Fixup) || @@ -113,3 +270,47 @@ MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const { } return nullptr; } + +/// Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed +/// are unresolvable but needed for further MCExprs). Derived from +/// implementation of IsaInfo::getNumExtraSGPRs in AMDGPUBaseInfo.cpp. +/// +const AMDGPUVariadicMCExpr * +AMDGPUVariadicMCExpr::createExtraSGPRs(const MCExpr *VCCUsed, + const MCExpr *FlatScrUsed, + bool XNACKUsed, MCContext &Ctx) { + + return create(AGVK_ExtraSGPRs, + {VCCUsed, FlatScrUsed, MCConstantExpr::create(XNACKUsed, Ctx)}, + Ctx); +} + +const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createTotalNumVGPR( + const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx) { + return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx); +} + +/// Mimics GCNSubtarget::computeOccupancy for MCExpr. +/// +/// Remove dependency on GCNSubtarget and depend only only the necessary values +/// for said occupancy computation. Should match computeOccupancy implementation +/// without passing \p STM on. +const AMDGPUVariadicMCExpr * +AMDGPUVariadicMCExpr::createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, + const MCExpr *NumVGPRs, + const GCNSubtarget &STM, MCContext &Ctx) { + unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM); + unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM); + unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM); + unsigned Generation = STM.getGeneration(); + + auto CreateExpr = [&Ctx](unsigned Value) { + return MCConstantExpr::create(Value, Ctx); + }; + + return create(AGVK_Occupancy, + {CreateExpr(MaxWaves), CreateExpr(Granule), + CreateExpr(TargetTotalNumVGPRs), CreateExpr(Generation), + CreateExpr(InitOcc), NumSGPRs, NumVGPRs}, + Ctx); +} diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index 238e0dea791b2..f92350b592350 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -14,6 +14,9 @@ namespace llvm { +class Function; +class GCNSubtarget; + /// AMDGPU target specific variadic MCExpr operations. /// /// Takes in a minimum of 1 argument to be used with an operation. The supported @@ -26,7 +29,15 @@ namespace llvm { /// class AMDGPUVariadicMCExpr : public MCTargetExpr { public: - enum VariadicKind { AGVK_None, AGVK_Or, AGVK_Max }; + enum VariadicKind { + AGVK_None, + AGVK_Or, + AGVK_Max, + AGVK_ExtraSGPRs, + AGVK_TotalNumVGPRs, + AGVK_AlignTo, + AGVK_Occupancy + }; private: VariadicKind Kind; @@ -38,6 +49,15 @@ class AMDGPUVariadicMCExpr : public MCTargetExpr { MCContext &Ctx); ~AMDGPUVariadicMCExpr(); + bool evaluateExtraSGPRs(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const; + bool evaluateTotalNumVGPR(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const; + bool evaluateAlignTo(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const; + bool evaluateOccupancy(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const; + public: static const AMDGPUVariadicMCExpr * create(VariadicKind Kind, ArrayRef Args, MCContext &Ctx); @@ -52,6 +72,26 @@ class AMDGPUVariadicMCExpr : public MCTargetExpr { return create(VariadicKind::AGVK_Max, Args, Ctx); } + static const AMDGPUVariadicMCExpr *createExtraSGPRs(const MCExpr *VCCUsed, + const MCExpr *FlatScrUsed, + bool XNACKUsed, + MCContext &Ctx); + + static const AMDGPUVariadicMCExpr *createTotalNumVGPR(const MCExpr *NumAGPR, + const MCExpr *NumVGPR, + MCContext &Ctx); + + static const AMDGPUVariadicMCExpr * + createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx) { + return create(VariadicKind::AGVK_AlignTo, {Value, Align}, Ctx); + } + + static const AMDGPUVariadicMCExpr *createOccupancy(unsigned InitOcc, + const MCExpr *NumSGPRs, + const MCExpr *NumVGPRs, + const GCNSubtarget &STM, + MCContext &Ctx); + VariadicKind getKind() const { return Kind; } const MCExpr *getSubExpr(size_t Index) const; diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 9ed7aacc0538e..0d40816cdd4b8 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -18,57 +18,114 @@ #include "GCNSubtarget.h" #include "SIDefines.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/MC/MCExpr.h" using namespace llvm; -uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const { - uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) | - S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) | - S_00B848_PRIV(Priv) | S_00B848_DEBUG_MODE(DebugMode) | - S_00B848_WGP_MODE(WgpMode) | S_00B848_MEM_ORDERED(MemOrdered); +void SIProgramInfo::reset(const MachineFunction &MF) { + MCContext &Ctx = MF.getContext(); + + const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx); + + VGPRBlocks = ZeroExpr; + SGPRBlocks = ZeroExpr; + Priority = 0; + FloatMode = 0; + Priv = 0; + DX10Clamp = 0; + DebugMode = 0; + IEEEMode = 0; + WgpMode = 0; + MemOrdered = 0; + RrWgMode = 0; + ScratchSize = ZeroExpr; + + LDSBlocks = 0; + ScratchBlocks = ZeroExpr; + + ScratchEnable = ZeroExpr; + UserSGPR = 0; + TrapHandlerEnable = 0; + TGIdXEnable = 0; + TGIdYEnable = 0; + TGIdZEnable = 0; + TGSizeEnable = 0; + TIdIGCompCount = 0; + EXCPEnMSB = 0; + LdsSize = 0; + EXCPEnable = 0; + + ComputePGMRSrc3GFX90A = ZeroExpr; + + NumVGPR = ZeroExpr; + NumArchVGPR = ZeroExpr; + NumAccVGPR = ZeroExpr; + AccumOffset = ZeroExpr; + TgSplit = 0; + NumSGPR = ZeroExpr; + SGPRSpill = 0; + VGPRSpill = 0; + LDSSize = 0; + FlatUsed = ZeroExpr; + + NumSGPRsForWavesPerEU = ZeroExpr; + NumVGPRsForWavesPerEU = ZeroExpr; + Occupancy = ZeroExpr; + DynamicCallStack = ZeroExpr; + VCCUsed = ZeroExpr; +} + +static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo, + const GCNSubtarget &ST) { + uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) | + S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | + S_00B848_PRIV(ProgInfo.Priv) | + S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | + S_00B848_WGP_MODE(ProgInfo.WgpMode) | + S_00B848_MEM_ORDERED(ProgInfo.MemOrdered); if (ST.hasDX10ClampMode()) - Reg |= S_00B848_DX10_CLAMP(DX10Clamp); + Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp); if (ST.hasIEEEMode()) - Reg |= S_00B848_IEEE_MODE(IEEEMode); + Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode); if (ST.hasRrWGMode()) - Reg |= S_00B848_RR_WG_MODE(RrWgMode); + Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode); return Reg; } -uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC, - const GCNSubtarget &ST) const { - if (AMDGPU::isCompute(CC)) { - return getComputePGMRSrc1(ST); - } - uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) | - S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) | - S_00B848_PRIV(Priv) | S_00B848_DEBUG_MODE(DebugMode); +static uint64_t getPGMRSrc1Reg(const SIProgramInfo &ProgInfo, + CallingConv::ID CC, const GCNSubtarget &ST) { + uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) | + S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | + S_00B848_PRIV(ProgInfo.Priv) | + S_00B848_DEBUG_MODE(ProgInfo.DebugMode); if (ST.hasDX10ClampMode()) - Reg |= S_00B848_DX10_CLAMP(DX10Clamp); + Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp); if (ST.hasIEEEMode()) - Reg |= S_00B848_IEEE_MODE(IEEEMode); + Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode); if (ST.hasRrWGMode()) - Reg |= S_00B848_RR_WG_MODE(RrWgMode); + Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode); switch (CC) { case CallingConv::AMDGPU_PS: - Reg |= S_00B028_MEM_ORDERED(MemOrdered); + Reg |= S_00B028_MEM_ORDERED(ProgInfo.MemOrdered); break; case CallingConv::AMDGPU_VS: - Reg |= S_00B128_MEM_ORDERED(MemOrdered); + Reg |= S_00B128_MEM_ORDERED(ProgInfo.MemOrdered); break; case CallingConv::AMDGPU_GS: - Reg |= S_00B228_WGP_MODE(WgpMode) | S_00B228_MEM_ORDERED(MemOrdered); + Reg |= S_00B228_WGP_MODE(ProgInfo.WgpMode) | + S_00B228_MEM_ORDERED(ProgInfo.MemOrdered); break; case CallingConv::AMDGPU_HS: - Reg |= S_00B428_WGP_MODE(WgpMode) | S_00B428_MEM_ORDERED(MemOrdered); + Reg |= S_00B428_WGP_MODE(ProgInfo.WgpMode) | + S_00B428_MEM_ORDERED(ProgInfo.MemOrdered); break; default: break; @@ -76,22 +133,108 @@ uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC, return Reg; } -uint64_t SIProgramInfo::getComputePGMRSrc2() const { - uint64_t Reg = - S_00B84C_SCRATCH_EN(ScratchEnable) | S_00B84C_USER_SGPR(UserSGPR) | - S_00B84C_TRAP_HANDLER(TrapHandlerEnable) | - S_00B84C_TGID_X_EN(TGIdXEnable) | S_00B84C_TGID_Y_EN(TGIdYEnable) | - S_00B84C_TGID_Z_EN(TGIdZEnable) | S_00B84C_TG_SIZE_EN(TGSizeEnable) | - S_00B84C_TIDIG_COMP_CNT(TIdIGCompCount) | - S_00B84C_EXCP_EN_MSB(EXCPEnMSB) | S_00B84C_LDS_SIZE(LdsSize) | - S_00B84C_EXCP_EN(EXCPEnable); +static uint64_t getComputePGMRSrc2Reg(const SIProgramInfo &ProgInfo) { + uint64_t Reg = S_00B84C_USER_SGPR(ProgInfo.UserSGPR) | + S_00B84C_TRAP_HANDLER(ProgInfo.TrapHandlerEnable) | + S_00B84C_TGID_X_EN(ProgInfo.TGIdXEnable) | + S_00B84C_TGID_Y_EN(ProgInfo.TGIdYEnable) | + S_00B84C_TGID_Z_EN(ProgInfo.TGIdZEnable) | + S_00B84C_TG_SIZE_EN(ProgInfo.TGSizeEnable) | + S_00B84C_TIDIG_COMP_CNT(ProgInfo.TIdIGCompCount) | + S_00B84C_EXCP_EN_MSB(ProgInfo.EXCPEnMSB) | + S_00B84C_LDS_SIZE(ProgInfo.LdsSize) | + S_00B84C_EXCP_EN(ProgInfo.EXCPEnable); + + return Reg; +} + +static const MCExpr *MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift, + MCContext &Ctx) { + if (Mask) { + const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx); + Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx); + } + if (Shift) { + const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx); + Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx); + } + return Val; +} + +uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const { + int64_t VBlocks, SBlocks; + VGPRBlocks->evaluateAsAbsolute(VBlocks); + SGPRBlocks->evaluateAsAbsolute(SBlocks); + + uint64_t Reg = S_00B848_VGPRS(static_cast(VBlocks)) | + S_00B848_SGPRS(static_cast(SBlocks)) | + getComputePGMRSrc1Reg(*this, ST); return Reg; } +uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC, + const GCNSubtarget &ST) const { + if (AMDGPU::isCompute(CC)) { + return getComputePGMRSrc1(ST); + } + int64_t VBlocks, SBlocks; + VGPRBlocks->evaluateAsAbsolute(VBlocks); + SGPRBlocks->evaluateAsAbsolute(SBlocks); + + return getPGMRSrc1Reg(*this, CC, ST) | + S_00B848_VGPRS(static_cast(VBlocks)) | + S_00B848_SGPRS(static_cast(SBlocks)); +} + +uint64_t SIProgramInfo::getComputePGMRSrc2() const { + int64_t ScratchEn; + ScratchEnable->evaluateAsAbsolute(ScratchEn); + return ScratchEn | getComputePGMRSrc2Reg(*this); +} + uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const { if (AMDGPU::isCompute(CC)) return getComputePGMRSrc2(); return 0; } + +const MCExpr *SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST, + MCContext &Ctx) const { + uint64_t Reg = getComputePGMRSrc1Reg(*this, ST); + const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx); + const MCExpr *Res = MCBinaryExpr::createOr( + MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx), + MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx); + return MCBinaryExpr::createOr(RegExpr, Res, Ctx); +} + +const MCExpr *SIProgramInfo::getPGMRSrc1(CallingConv::ID CC, + const GCNSubtarget &ST, + MCContext &Ctx) const { + if (AMDGPU::isCompute(CC)) { + return getComputePGMRSrc1(ST, Ctx); + } + + uint64_t Reg = getPGMRSrc1Reg(*this, CC, ST); + const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx); + const MCExpr *Res = MCBinaryExpr::createOr( + MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx), + MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx); + return MCBinaryExpr::createOr(RegExpr, Res, Ctx); +} + +const MCExpr *SIProgramInfo::getComputePGMRSrc2(MCContext &Ctx) const { + uint64_t Reg = getComputePGMRSrc2Reg(*this); + const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx); + return MCBinaryExpr::createOr(ScratchEnable, RegExpr, Ctx); +} + +const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, + MCContext &Ctx) const { + if (AMDGPU::isCompute(CC)) + return getComputePGMRSrc2(Ctx); + + return MCConstantExpr::create(0, Ctx); +} diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index 8c26789f936cf..c0a353033c3c5 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -22,12 +22,15 @@ namespace llvm { class GCNSubtarget; +class MCContext; +class MCExpr; +class MachineFunction; /// Track resource usage for kernels / entry functions. struct SIProgramInfo { // Fields set in PGM_RSRC1 pm4 packet. - uint32_t VGPRBlocks = 0; - uint32_t SGPRBlocks = 0; + const MCExpr *VGPRBlocks = nullptr; + const MCExpr *SGPRBlocks = nullptr; uint32_t Priority = 0; uint32_t FloatMode = 0; uint32_t Priv = 0; @@ -37,14 +40,14 @@ struct SIProgramInfo { uint32_t WgpMode = 0; // GFX10+ uint32_t MemOrdered = 0; // GFX10+ uint32_t RrWgMode = 0; // GFX12+ - uint64_t ScratchSize = 0; + const MCExpr *ScratchSize = nullptr; // State used to calculate fields set in PGM_RSRC2 pm4 packet. uint32_t LDSBlocks = 0; - uint32_t ScratchBlocks = 0; + const MCExpr *ScratchBlocks = nullptr; // Fields set in PGM_RSRC2 pm4 packet - uint32_t ScratchEnable = 0; + const MCExpr *ScratchEnable = nullptr; uint32_t UserSGPR = 0; uint32_t TrapHandlerEnable = 0; uint32_t TGIdXEnable = 0; @@ -56,44 +59,56 @@ struct SIProgramInfo { uint32_t LdsSize = 0; uint32_t EXCPEnable = 0; - uint64_t ComputePGMRSrc3GFX90A = 0; + const MCExpr *ComputePGMRSrc3GFX90A = nullptr; - uint32_t NumVGPR = 0; - uint32_t NumArchVGPR = 0; - uint32_t NumAccVGPR = 0; - uint32_t AccumOffset = 0; + const MCExpr *NumVGPR = nullptr; + const MCExpr *NumArchVGPR = nullptr; + const MCExpr *NumAccVGPR = nullptr; + const MCExpr *AccumOffset = nullptr; uint32_t TgSplit = 0; - uint32_t NumSGPR = 0; + const MCExpr *NumSGPR = nullptr; unsigned SGPRSpill = 0; unsigned VGPRSpill = 0; uint32_t LDSSize = 0; - bool FlatUsed = false; + const MCExpr *FlatUsed = nullptr; // Number of SGPRs that meets number of waves per execution unit request. - uint32_t NumSGPRsForWavesPerEU = 0; + const MCExpr *NumSGPRsForWavesPerEU = nullptr; // Number of VGPRs that meets number of waves per execution unit request. - uint32_t NumVGPRsForWavesPerEU = 0; + const MCExpr *NumVGPRsForWavesPerEU = nullptr; // Final occupancy. - uint32_t Occupancy = 0; + const MCExpr *Occupancy = nullptr; // Whether there is recursion, dynamic allocas, indirect calls or some other // reason there may be statically unknown stack usage. - bool DynamicCallStack = false; + const MCExpr *DynamicCallStack = nullptr; // Bonus information for debugging. - bool VCCUsed = false; + const MCExpr *VCCUsed = nullptr; SIProgramInfo() = default; + // The constructor sets the values for each member as shown in the struct. + // However, setting the MCExpr members to their zero value equivalent + // happens in reset together with (duplicated) value re-set for the + // non-MCExpr members. + void reset(const MachineFunction &MF); + /// Compute the value of the ComputePGMRsrc1 register. uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const; uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const; + const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST, + MCContext &Ctx) const; + const MCExpr *getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST, + MCContext &Ctx) const; /// Compute the value of the ComputePGMRsrc2 register. uint64_t getComputePGMRSrc2() const; uint64_t getPGMRSrc2(CallingConv::ID CC) const; + const MCExpr *getComputePGMRSrc2(MCContext &Ctx) const; + const MCExpr *getPGMRSrc2(CallingConv::ID CC, MCContext &Ctx) const; }; } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 4e0074451aa58..05ff357f6676c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1129,12 +1129,45 @@ unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) { unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI, unsigned NumVGPRs) { - unsigned MaxWaves = getMaxWavesPerEU(STI); - unsigned Granule = getVGPRAllocGranule(STI); + return getNumWavesPerEUWithNumVGPRs(NumVGPRs, getVGPRAllocGranule(STI), + getMaxWavesPerEU(STI), + getTotalNumVGPRs(STI)); +} + +unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule, + unsigned MaxWaves, + unsigned TotalNumVGPRs) { if (NumVGPRs < Granule) return MaxWaves; unsigned RoundedRegs = alignTo(NumVGPRs, Granule); - return std::min(std::max(getTotalNumVGPRs(STI) / RoundedRegs, 1u), MaxWaves); + return std::min(std::max(TotalNumVGPRs / RoundedRegs, 1u), MaxWaves); +} + +unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves, + AMDGPUSubtarget::Generation Gen) { + if (Gen >= AMDGPUSubtarget::GFX10) + return MaxWaves; + + if (Gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (SGPRs <= 80) + return 10; + if (SGPRs <= 88) + return 9; + if (SGPRs <= 100) + return 8; + return 7; + } + if (SGPRs <= 48) + return 10; + if (SGPRs <= 56) + return 9; + if (SGPRs <= 64) + return 8; + if (SGPRs <= 72) + return 7; + if (SGPRs <= 80) + return 6; + return 5; } unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 943588fe701cc..905ac4d36153a 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H +#include "AMDGPUSubtarget.h" #include "SIDefines.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/InstrTypes.h" @@ -311,6 +312,17 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU); unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI, unsigned NumVGPRs); +/// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule +/// size, \p MaxWaves possible, and \p TotalNumVGPRs available. +unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule, + unsigned MaxWaves, + unsigned TotalNumVGPRs); + +/// \returns Occupancy for a given \p SGPRs usage, \p MaxWaves possible, and \p +/// Gen. +unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves, + AMDGPUSubtarget::Generation Gen); + /// \returns Number of VGPR blocks needed for given subtarget \p STI when /// \p NumVGPRs are used. We actually return the number of blocks -1, since /// that's what we encode. diff --git a/llvm/test/MC/AMDGPU/alignto_mcexpr.s b/llvm/test/MC/AMDGPU/alignto_mcexpr.s new file mode 100644 index 0000000000000..e864f3736828c --- /dev/null +++ b/llvm/test/MC/AMDGPU/alignto_mcexpr.s @@ -0,0 +1,15 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s + +// ASM: .set alignto_zero_eight, 0 +// ASM: .set alignto_one_eight, 8 +// ASM: .set alignto_five_eight, 8 +// ASM: .set alignto_seven_eight, 8 +// ASM: .set alignto_eight_eight, 8 +// ASM: .set alignto_ten_eight, 16 + +.set alignto_zero_eight, alignto(0, 8) +.set alignto_one_eight, alignto(1, 8) +.set alignto_five_eight, alignto(5, 8) +.set alignto_seven_eight, alignto(7, 8) +.set alignto_eight_eight, alignto(8, 8) +.set alignto_ten_eight, alignto(10, 8) diff --git a/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s b/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s new file mode 100644 index 0000000000000..e88b23bb34d4f --- /dev/null +++ b/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s @@ -0,0 +1,31 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck --check-prefix=GFX7 %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=GFX90A %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefix=GFX940 %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s + +// gfx940 has architected flat scratch enabled. + +// GFX7: .set extrasgpr_none, 0 +// GFX7: .set extrasgpr_vcc, 2 +// GFX7: .set extrasgpr_flatscr, 4 +// GFX7: .set extrasgpr_xnack, 0 + +// GFX90A: .set extrasgpr_none, 0 +// GFX90A: .set extrasgpr_vcc, 2 +// GFX90A: .set extrasgpr_flatscr, 6 +// GFX90A: .set extrasgpr_xnack, 4 + +// GFX940: .set extrasgpr_none, 6 +// GFX940: .set extrasgpr_vcc, 6 +// GFX940: .set extrasgpr_flatscr, 6 +// GFX940: .set extrasgpr_xnack, 6 + +// GFX10: .set extrasgpr_none, 0 +// GFX10: .set extrasgpr_vcc, 2 +// GFX10: .set extrasgpr_flatscr, 0 +// GFX10: .set extrasgpr_xnack, 0 + +.set extrasgpr_none, extrasgprs(0, 0, 0) +.set extrasgpr_vcc, extrasgprs(1, 0, 0) +.set extrasgpr_flatscr, extrasgprs(0, 1, 0) +.set extrasgpr_xnack, extrasgprs(0, 0, 1) diff --git a/llvm/test/MC/AMDGPU/occupancy_mcexpr.s b/llvm/test/MC/AMDGPU/occupancy_mcexpr.s new file mode 100644 index 0000000000000..06bec8c538dae --- /dev/null +++ b/llvm/test/MC/AMDGPU/occupancy_mcexpr.s @@ -0,0 +1,61 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s + +// ASM: .set occupancy_init_one, 1 +// ASM: .set occupancy_init_seven, 7 +// ASM: .set occupancy_init_eight, 8 + +.set occupancy_init_one, occupancy(0, 0, 0, 0, 1, 0, 0) +.set occupancy_init_seven, occupancy(0, 0, 0, 0, 7, 0, 0) +.set occupancy_init_eight, occupancy(0, 0, 0, 0, 8, 0, 0) + +// ASM: .set occupancy_numsgpr_seaisle_ten, 10 +// ASM: .set occupancy_numsgpr_seaisle_nine, 9 +// ASM: .set occupancy_numsgpr_seaisle_eight, 8 +// ASM: .set occupancy_numsgpr_seaisle_seven, 7 +// ASM: .set occupancy_numsgpr_seaisle_six, 6 +// ASM: .set occupancy_numsgpr_seaisle_five, 5 + +.set occupancy_numsgpr_seaisle_ten, occupancy(0, 0, 0, 6, 11, 1, 0) +.set occupancy_numsgpr_seaisle_nine, occupancy(0, 0, 0, 6, 11, 49, 0) +.set occupancy_numsgpr_seaisle_eight, occupancy(0, 0, 0, 6, 11, 57, 0) +.set occupancy_numsgpr_seaisle_seven, occupancy(0, 0, 0, 6, 11, 65, 0) +.set occupancy_numsgpr_seaisle_six, occupancy(0, 0, 0, 6, 11, 73, 0) +.set occupancy_numsgpr_seaisle_five, occupancy(0, 0, 0, 6, 11, 81, 0) + +// ASM: .set occupancy_numsgpr_gfx9_ten, 10 +// ASM: .set occupancy_numsgpr_gfx9_nine, 9 +// ASM: .set occupancy_numsgpr_gfx9_eight, 8 +// ASM: .set occupancy_numsgpr_gfx9_seven, 7 + +.set occupancy_numsgpr_gfx9_ten, occupancy(0, 0, 0, 8, 11, 1, 0) +.set occupancy_numsgpr_gfx9_nine, occupancy(0, 0, 0, 8, 11, 81, 0) +.set occupancy_numsgpr_gfx9_eight, occupancy(0, 0, 0, 8, 11, 89, 0) +.set occupancy_numsgpr_gfx9_seven, occupancy(0, 0, 0, 8, 11, 101, 0) + +// ASM: .set occupancy_numsgpr_gfx10_one, 1 +// ASM: .set occupancy_numsgpr_gfx10_seven, 7 +// ASM: .set occupancy_numsgpr_gfx10_eight, 8 + +.set occupancy_numsgpr_gfx10_one, occupancy(1, 0, 0, 9, 11, 1, 0) +.set occupancy_numsgpr_gfx10_seven, occupancy(7, 0, 0, 9, 11, 1, 0) +.set occupancy_numsgpr_gfx10_eight, occupancy(8, 0, 0, 9, 11, 1, 0) + +// ASM: .set occupancy_numvgpr_high_granule_one, 1 +// ASM: .set occupancy_numvgpr_high_granule_seven, 7 +// ASM: .set occupancy_numvgpr_high_granule_eight, 8 + +.set occupancy_numvgpr_high_granule_one, occupancy(1, 2, 0, 0, 11, 0, 1) +.set occupancy_numvgpr_high_granule_seven, occupancy(7, 2, 0, 0, 11, 0, 1) +.set occupancy_numvgpr_high_granule_eight, occupancy(8, 2, 0, 0, 11, 0, 1) + +// ASM: .set occupancy_numvgpr_low_total_one, 1 +// ASM: .set occupancy_numvgpr_one, 1 +// ASM: .set occupancy_numvgpr_seven, 7 +// ASM: .set occupancy_numvgpr_eight, 8 +// ASM: .set occupancy_numvgpr_ten, 10 + +.set occupancy_numvgpr_low_total_one, occupancy(11, 4, 2, 0, 11, 0, 4) +.set occupancy_numvgpr_one, occupancy(11, 4, 4, 0, 11, 0, 4) +.set occupancy_numvgpr_seven, occupancy(11, 4, 28, 0, 11, 0, 4) +.set occupancy_numvgpr_eight, occupancy(11, 4, 32, 0, 11, 0, 4) +.set occupancy_numvgpr_ten, occupancy(11, 4, 40, 0, 11, 0, 4) diff --git a/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s b/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s new file mode 100644 index 0000000000000..29bb885b20804 --- /dev/null +++ b/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s @@ -0,0 +1,26 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=GFX90A %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s + +// GFX10: .set totalvgpr_none, 0 +// GFX10: .set totalvgpr_one, 1 +// GFX10: .set totalvgpr_two, 2 + +.set totalvgpr_none, totalnumvgprs(0, 0) +.set totalvgpr_one, totalnumvgprs(1, 0) +.set totalvgpr_two, totalnumvgprs(1, 2) + +// GFX90A: .set totalvgpr90a_none, 0 +// GFX90A: .set totalvgpr90a_one, 1 +// GFX90A: .set totalvgpr90a_two, 2 + +.set totalvgpr90a_none, totalnumvgprs(0, 0) +.set totalvgpr90a_one, totalnumvgprs(0, 1) +.set totalvgpr90a_two, totalnumvgprs(0, 2) + +// GFX90A: .set totalvgpr90a_agpr_minimal, 1 +// GFX90A: .set totalvgpr90a_agpr_rounded_eight, 8 +// GFX90A: .set totalvgpr90a_agpr_exact_eight, 8 + +.set totalvgpr90a_agpr_minimal, totalnumvgprs(1, 0) +.set totalvgpr90a_agpr_rounded_eight, totalnumvgprs(4, 2) +.set totalvgpr90a_agpr_exact_eight, totalnumvgprs(4, 4) diff --git a/llvm/unittests/MC/AMDGPU/CMakeLists.txt b/llvm/unittests/MC/AMDGPU/CMakeLists.txt index 06ca89a72a7cd..be8ff572e6f7d 100644 --- a/llvm/unittests/MC/AMDGPU/CMakeLists.txt +++ b/llvm/unittests/MC/AMDGPU/CMakeLists.txt @@ -1,12 +1,20 @@ +include_directories( + ${PROJECT_SOURCE_DIR}/lib/Target/AMDGPU + ${PROJECT_BINARY_DIR}/lib/Target/AMDGPU + ) + set(LLVM_LINK_COMPONENTS AMDGPUCodeGen AMDGPUDesc AMDGPUInfo + CodeGen + Core MC Support TargetParser ) -add_llvm_unittest(AMDGPUDwarfTests +add_llvm_unittest(AMDGPUMCTests DwarfRegMappings.cpp + SIProgramInfoMCExprs.cpp ) diff --git a/llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp b/llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp new file mode 100644 index 0000000000000..f2161f71e6e99 --- /dev/null +++ b/llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp @@ -0,0 +1,81 @@ +//===- llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUHSAMetadataStreamer.h" +#include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" +#include "SIProgramInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "gtest/gtest.h" + +using namespace llvm; + +class SIProgramInfoMCExprsTest : public testing::Test { +protected: + std::unique_ptr TM; + std::unique_ptr Ctx; + std::unique_ptr ST; + std::unique_ptr MMI; + std::unique_ptr MF; + std::unique_ptr M; + + SIProgramInfo PI; + + static void SetUpTestSuite() { + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); + } + + SIProgramInfoMCExprsTest() { + std::string Triple = "amdgcn-amd-amdhsa"; + std::string CPU = "gfx1010"; + std::string FS = ""; + + std::string Error; + const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error); + TargetOptions Options; + + TM.reset(static_cast(TheTarget->createTargetMachine( + Triple, CPU, FS, Options, std::nullopt, std::nullopt))); + + Ctx = std::make_unique(); + M = std::make_unique("Module", *Ctx); + M->setDataLayout(TM->createDataLayout()); + auto *FType = FunctionType::get(Type::getVoidTy(*Ctx), false); + auto *F = Function::Create(FType, GlobalValue::ExternalLinkage, "Test", *M); + MMI = std::make_unique(TM.get()); + + ST = std::make_unique(TM->getTargetTriple(), + TM->getTargetCPU(), + TM->getTargetFeatureString(), *TM); + + MF = std::make_unique(*F, *TM, *ST, 1, *MMI); + PI.reset(*MF.get()); + } +}; + +TEST_F(SIProgramInfoMCExprsTest, TestDeathHSAKernelEmit) { + MCContext &Ctx = MF->getContext(); + MCSymbol *Sym = Ctx.getOrCreateSymbol("Unknown"); + PI.ScratchSize = MCSymbolRefExpr::create(Sym, Ctx); + + auto &Func = MF->getFunction(); + Func.setCallingConv(CallingConv::AMDGPU_KERNEL); + AMDGPU::HSAMD::MetadataStreamerMsgPackV4 MD; + EXPECT_DEATH(MD.emitKernel(*MF, PI), + "could not resolve expression when required."); +}