Skip to content

Commit fd3a6b6

Browse files
authored
[NVPTX] Improve modeling of inline PTX (#130675)
Improve the modeling of the memory effects and instruction cost of inline assembly. - MemoryEffects: The CUDA spec states that inline assembly is not assumed to have any side effects or to read or write memory. An inline assembly may therefore be treated as NoModRef unless it is explicitly marked as having side effects or has an explicit memory clobber. https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#incorrect-optimization > Normally any memory that is written to will be specified as an out operand, but if there is a hidden read or write on user memory (for example, indirect access of a memory location via an operand), or if you want to stop any memory optimizations around the asm() statement performed during generation of PTX, you can add a “memory” clobbers specification after a 3rd colon. - InstructionCost: This change implements a very rough string-parsing scheme to count the number of instructions in an inline asm. There are corner cases it will not handle well, but in general this is an improvement over the current cost of the number of arguments plus one.
1 parent e7e242e commit fd3a6b6

File tree

7 files changed

+137
-0
lines changed

7 files changed

+137
-0
lines changed

llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "MCTargetDesc/NVPTXBaseInfo.h"
1414
#include "NVPTX.h"
1515
#include "llvm/Analysis/ValueTracking.h"
16+
#include "llvm/IR/InlineAsm.h"
1617
#include "llvm/IR/Instructions.h"
1718
#include "llvm/Support/CommandLine.h"
1819

@@ -115,3 +116,29 @@ ModRefInfo NVPTXAAResult::getModRefInfoMask(const MemoryLocation &Loc,
115116

116117
return ModRefInfo::ModRef;
117118
}
119+
120+
/// Reports the memory effects of \p Call.
///
/// When the called operand is inline assembly, the result is
/// MemoryEffects::none() unless the asm has side effects, uses an indirect
/// constraint, or carries a "{memory}" clobber; in each of those cases the
/// result is MemoryEffects::unknown(). Calls whose callee is not inline
/// assembly are always reported as MemoryEffects::unknown().
MemoryEffects NVPTXAAResult::getMemoryEffects(const CallBase *Call,
121+
AAQueryInfo &AAQI) {
122+
// Inline assembly with no side-effect or memory clobbers should not
123+
// indirectly access memory in the PTX specification.
124+
if (const auto *IA = dyn_cast<InlineAsm>(Call->getCalledOperand())) {
125+
// Volatile is translated as side-effects.
126+
if (IA->hasSideEffects())
127+
return MemoryEffects::unknown();
128+
129+
for (const InlineAsm::ConstraintInfo &Constraint : IA->ParseConstraints()) {
130+
// Indirect constraints (e.g. =*m) are unsupported in inline PTX.
131+
if (Constraint.isIndirect)
132+
return MemoryEffects::unknown();
133+
134+
// Memory clobbers prevent optimization.
135+
if ((Constraint.Type & InlineAsm::ConstraintPrefix::isClobber) &&
136+
any_of(Constraint.Codes,
137+
[](const auto &Code) { return Code == "{memory}"; }))
138+
return MemoryEffects::unknown();
139+
}
140+
return MemoryEffects::none();
141+
}
142+
143+
// Not an inline-asm call: make no claim about what the callee touches.
return MemoryEffects::unknown();
144+
}

llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ class NVPTXAAResult : public AAResultBase {
3636

3737
ModRefInfo getModRefInfoMask(const MemoryLocation &Loc, AAQueryInfo &AAQI,
3838
bool IgnoreLocals);
39+
40+
MemoryEffects getMemoryEffects(const CallBase *Call, AAQueryInfo &AAQI);
41+
42+
/// Function-level query: unconditionally reports MemoryEffects::unknown()
/// for \p F.
MemoryEffects getMemoryEffects(const Function *F) {
43+
return MemoryEffects::unknown();
44+
}
3945
};
4046

4147
/// Analysis pass providing a never-invalidated alias analysis result.

llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "NVPTXTargetTransformInfo.h"
1010
#include "NVPTXUtilities.h"
11+
#include "llvm/ADT/STLExtras.h"
1112
#include "llvm/Analysis/LoopInfo.h"
1213
#include "llvm/Analysis/TargetTransformInfo.h"
1314
#include "llvm/Analysis/ValueTracking.h"
@@ -483,6 +484,35 @@ NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
483484
return std::nullopt;
484485
}
485486

487+
/// Estimates the cost of \p U.
///
/// For a CallInst whose called operand is inline assembly, the asm string is
/// split on ';' and each piece is counted as one instruction when, after
/// trimming whitespace and leading '{' / '}' characters, it is non-empty and
/// either starts with '@', starts with a letter, or contains ".pragma". The
/// returned cost is that count multiplied by TargetTransformInfo::TCC_Basic.
/// Every other user is forwarded to BaseT::getInstructionCost.
InstructionCost
488+
NVPTXTTIImpl::getInstructionCost(const User *U,
489+
ArrayRef<const Value *> Operands,
490+
TTI::TargetCostKind CostKind) {
491+
if (const auto *CI = dyn_cast<CallInst>(U))
492+
if (const auto *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
493+
// Without this implementation getCallCost() would return the number
494+
// of arguments+1 as the cost. Because the cost-model assumes it is a call
495+
// since it is classified as a call in the IR. A better cost model would
496+
// be to return the number of asm instructions embedded in the asm
497+
// string.
498+
auto &AsmStr = IA->getAsmString();
499+
const unsigned InstCount =
500+
count_if(split(AsmStr, ';'), [](StringRef AsmInst) {
501+
// Trim off scopes denoted by '{' and '}' as these can be ignored
502+
AsmInst = AsmInst.trim().ltrim("{} \t\n\v\f\r");
503+
// This is pretty coarse but does a reasonably good job of
504+
// identifying things that look like instructions, possibly with a
505+
// predicate ("@").
// Pieces starting with any other character (for example a leading
// '.') are only counted when they contain ".pragma".
506+
return !AsmInst.empty() &&
507+
(AsmInst[0] == '@' || isAlpha(AsmInst[0]) ||
508+
AsmInst.find(".pragma") != StringRef::npos);
509+
});
510+
return InstCount * TargetTransformInfo::TCC_Basic;
511+
}
512+
513+
// Anything that is not an inline-asm call uses the base implementation.
return BaseT::getInstructionCost(U, Operands, CostKind);
514+
}
515+
486516
InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
487517
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
488518
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,

llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,10 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
9494
// calls are particularly expensive in NVPTX.
9595
unsigned getInliningThresholdMultiplier() const { return 11; }
9696

97+
InstructionCost getInstructionCost(const User *U,
98+
ArrayRef<const Value *> Operands,
99+
TTI::TargetCostKind CostKind);
100+
97101
InstructionCost getArithmeticInstrCost(
98102
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
99103
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
2+
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output < %s | FileCheck %s
3+
4+
target triple = "nvptx64-nvidia-cuda"
5+
6+
define void @test1() {
7+
; CHECK-LABEL: 'test1'
8+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call double asm "rsqrt.approx.ftz.f64 $0, $1;", "=d,d"(double 1.000000e+00)
9+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call { i32, i32 } asm "{\0A\09mad.lo.cc.u32 $0, $2, $3, $4;\0A\09madc.hi.u32 $1, $2, $3, 0;\0A\09}", "=r,=r,r,r,r"(i32 2, i32 3, i32 3)
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09vote.ballot.b32 \09$0, %p1; \0A\09}", "=r,r"(i32 0)
11+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09@%p1 exit; \0A\09}", "=r,r"(i32 0)
12+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void asm sideeffect ".pragma \22nounroll\22;\0A\09", "~{memory}"()
13+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
14+
;
15+
%1 = call double asm "rsqrt.approx.ftz.f64 $0, $1;", "=d,d"(double 1.0)
16+
%2 = call { i32, i32 } asm "{\0A\09mad.lo.cc.u32 $0, $2, $3, $4;\0A\09madc.hi.u32 $1, $2, $3, 0;\0A\09}", "=r,=r,r,r,r"(i32 2, i32 3, i32 3)
17+
%3 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09vote.ballot.b32 \09$0, %p1; \0A\09}", "=r,r"(i32 0)
18+
%4 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09@%p1 exit; \0A\09}", "=r,r"(i32 0)
19+
call void asm sideeffect ".pragma \22nounroll\22;\0A\09", "~{memory}"()
20+
ret void
21+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
if not "NVPTX" in config.root.targets:
2+
config.unsupported = True
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
; RUN: opt -passes=aa-eval -aa-pipeline=nvptx-aa,basic-aa -print-all-alias-modref-info < %s -disable-output 2>&1 \
2+
; RUN: | FileCheck %s --check-prefixes CHECK-ALIAS
3+
4+
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
;;CHECK-ALIAS-LABEL: Function: test_sideeffect
8+
;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> call
9+
define void @test_sideeffect(ptr %out) {
10+
entry:
11+
%0 = addrspacecast ptr %out to ptr addrspace(1)
12+
call void asm sideeffect "membar.gl;", ""()
13+
store i32 5, ptr addrspace(1) %0, align 4
14+
ret void
15+
}
16+
17+
;;CHECK-ALIAS-LABEL: Function: test_indirect
18+
;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> %1 = call
19+
define i32 @test_indirect(ptr %out) {
20+
entry:
21+
%0 = addrspacecast ptr %out to ptr addrspace(1)
22+
store i32 0, ptr addrspace(1) %0, align 4
23+
%1 = call i32 asm "ld.global.u32 $0, [$1];", "=r,*m"(ptr addrspace(1) elementtype(i32) %0)
24+
store i32 0, ptr addrspace(1) %0, align 4
25+
ret i32 %1
26+
}
27+
28+
;;CHECK-ALIAS-LABEL: Function: test_memory
29+
;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> %1 = call
30+
define i32 @test_memory(ptr %out) {
31+
entry:
32+
%0 = addrspacecast ptr %out to ptr addrspace(1)
33+
store i32 0, ptr addrspace(1) %0, align 4
34+
%1 = call i32 asm "ld.global.u32 $0, [$1];", "=r,l,~{memory}"(ptr addrspace(1) %0)
35+
store i32 0, ptr addrspace(1) %0, align 4
36+
ret i32 %1
37+
}
38+
39+
;;CHECK-ALIAS-LABEL: Function: test_no_sideeffect
40+
;;CHECK-ALIAS: NoModRef: Ptr: i32* %0 <-> %1 = call
41+
define void @test_no_sideeffect(ptr %in, ptr %out) {
42+
entry:
43+
%0 = addrspacecast ptr %out to ptr addrspace(1)
44+
%1 = call i32 asm "cvt.u32.u64 $0, $1;", "=r,l"(ptr %in)
45+
store i32 %1, ptr addrspace(1) %0, align 4
46+
ret void
47+
}

0 commit comments

Comments
 (0)