Skip to content

Commit e4fa477

Browse files
committed
[NVPTX] Improve modeling of inline PTX
1 parent ca1833b commit e4fa477

File tree

7 files changed

+137
-0
lines changed

7 files changed

+137
-0
lines changed

llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "MCTargetDesc/NVPTXBaseInfo.h"
1414
#include "NVPTX.h"
1515
#include "llvm/Analysis/ValueTracking.h"
16+
#include "llvm/IR/InlineAsm.h"
1617
#include "llvm/IR/Instructions.h"
1718
#include "llvm/Support/CommandLine.h"
1819

@@ -115,3 +116,30 @@ ModRefInfo NVPTXAAResult::getModRefInfoMask(const MemoryLocation &Loc,
115116

116117
return ModRefInfo::ModRef;
117118
}
119+
120+
/// Compute the memory effects of \p Call.
///
/// Only calls whose callee is inline assembly are refined. Such a call is
/// reported as MemoryEffects::none() when the asm is not marked as having
/// side effects, has no indirect constraint, and has no "{memory}" clobber.
/// Every other call yields MemoryEffects::unknown().
///
/// \param Call the call site being queried.
/// \param AAQI per-query state; not consulted by this implementation.
/// \returns MemoryEffects::none() or MemoryEffects::unknown().
MemoryEffects NVPTXAAResult::getMemoryEffects(const CallBase *Call,
                                              AAQueryInfo &AAQI) {
  // Inline assembly with no side-effect or memory clobbers should not
  // indirectly access memory in the PTX specification.
  const auto *IA = dyn_cast<InlineAsm>(Call->getCalledOperand());
  if (!IA)
    return MemoryEffects::unknown();

  // Volatile is translated as side-effects.
  if (IA->hasSideEffects())
    return MemoryEffects::unknown();

  for (const InlineAsm::ConstraintInfo &Constraint : IA->ParseConstraints()) {
    // Indirect constraints (e.g. =*m) are unsupported in inline PTX.
    if (Constraint.isIndirect)
      return MemoryEffects::unknown();

    // Only clobber constraints can carry a memory clobber. ConstraintPrefix
    // is a plain enumeration rather than a bitmask, so compare for equality
    // instead of masking (a mask test would also match label constraints).
    if (Constraint.Type != InlineAsm::ConstraintPrefix::isClobber)
      continue;

    // Memory clobbers prevent optimization.
    for (const std::string &Code : Constraint.Codes)
      if (Code == "{memory}")
        return MemoryEffects::unknown();
  }

  return MemoryEffects::none();
}

llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ class NVPTXAAResult : public AAResultBase {
3636

3737
ModRefInfo getModRefInfoMask(const MemoryLocation &Loc, AAQueryInfo &AAQI,
3838
bool IgnoreLocals);
39+
40+
MemoryEffects getMemoryEffects(const CallBase *Call, AAQueryInfo &AAQI);
41+
42+
/// Function-level query: no refinement is attempted, so the answer is
/// always MemoryEffects::unknown().
MemoryEffects getMemoryEffects(const Function *F) {
  return MemoryEffects::unknown();
}
3945
};
4046

4147
/// Analysis pass providing a never-invalidated alias analysis result.

llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "NVPTXTargetTransformInfo.h"
1010
#include "NVPTXUtilities.h"
11+
#include "llvm/ADT/STLExtras.h"
1112
#include "llvm/Analysis/LoopInfo.h"
1213
#include "llvm/Analysis/TargetTransformInfo.h"
1314
#include "llvm/Analysis/ValueTracking.h"
@@ -483,6 +484,34 @@ NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
483484
return std::nullopt;
484485
}
485486

487+
/// Estimate the cost of \p U.
///
/// Calls to inline assembly are costed by counting the statements in the asm
/// string that look like instructions; everything else is forwarded to the
/// base implementation.
InstructionCost
NVPTXTTIImpl::getInstructionCost(const User *U,
                                 ArrayRef<const Value *> Operands,
                                 TTI::TargetCostKind CostKind) {
  const auto *Call = dyn_cast<CallInst>(U);
  const auto *Asm =
      Call ? dyn_cast<InlineAsm>(Call->getCalledOperand()) : nullptr;
  if (!Asm)
    return BaseT::getInstructionCost(U, Operands, CostKind);

  // Without this special case getCallCost() would report arguments+1, since
  // inline asm is represented as a call in the IR. Counting the instructions
  // embedded in the asm string is a better approximation.
  SmallVector<StringRef, 4> Statements;
  SplitString(Asm->getAsmString(), Statements, ";\n");

  unsigned NumInsts = 0;
  for (StringRef Stmt : Statements) {
    Stmt = Stmt.trim();
    if (Stmt.empty())
      continue;

    // This is a coarse heuristic, but it does a reasonable job of spotting
    // things that look like instructions, possibly guarded by a predicate
    // ("@"), as well as ".pragma" directives.
    if (Stmt[0] == '@' || isAlpha(Stmt[0]) ||
        Stmt.find(".pragma") != StringRef::npos)
      ++NumInsts;
  }

  return NumInsts * TargetTransformInfo::TCC_Basic;
}
514+
486515
InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
487516
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
488517
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,

llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,10 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
9494
// calls are particularly expensive in NVPTX.
9595
unsigned getInliningThresholdMultiplier() const { return 11; }
9696

97+
InstructionCost getInstructionCost(const User *U,
98+
ArrayRef<const Value *> Operands,
99+
TTI::TargetCostKind CostKind);
100+
97101
InstructionCost getArithmeticInstrCost(
98102
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
99103
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
2+
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output < %s | FileCheck %s
3+
4+
target triple = "nvptx64-nvidia-cuda"
5+
6+
define void @test1() {
7+
; CHECK-LABEL: 'test1'
8+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call double asm "rsqrt.approx.ftz.f64 $0, $1;", "=d,d"(double 1.000000e+00)
9+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call { i32, i32 } asm "{\0A\09mad.lo.cc.u32 $0, $2, $3, $4;\0A\09madc.hi.u32 $1, $2, $3, 0;\0A\09}", "=r,=r,r,r,r"(i32 2, i32 3, i32 3)
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09vote.ballot.b32 \09$0, %p1; \0A\09}", "=r,r"(i32 0)
11+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09@%p1 exit; \0A\09}", "=r,r"(i32 0)
12+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void asm sideeffect ".pragma \22nounroll\22;\0A\09", "~{memory}"()
13+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
14+
;
15+
%1 = call double asm "rsqrt.approx.ftz.f64 $0, $1;", "=d,d"(double 1.0)
16+
%2 = call { i32, i32 } asm "{\0A\09mad.lo.cc.u32 $0, $2, $3, $4;\0A\09madc.hi.u32 $1, $2, $3, 0;\0A\09}", "=r,=r,r,r,r"(i32 2, i32 3, i32 3)
17+
%3 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09vote.ballot.b32 \09$0, %p1; \0A\09}", "=r,r"(i32 0)
18+
%4 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09@%p1 exit; \0A\09}", "=r,r"(i32 0)
19+
call void asm sideeffect ".pragma \22nounroll\22;\0A\09", "~{memory}"()
20+
ret void
21+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Mark every test in this directory unsupported unless the NVPTX target
# is among the configured targets.
if "NVPTX" not in config.root.targets:
    config.unsupported = True
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
; RUN: opt -passes=aa-eval -aa-pipeline=nvptx-aa -print-all-alias-modref-info < %s -disable-output 2>&1 \
2+
; RUN: | FileCheck %s --check-prefixes CHECK-ALIAS
3+
4+
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
;;CHECK-ALIAS-LABEL: Function: test_sideeffect
8+
;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> call
9+
define void @test_sideeffect(ptr %out) {
10+
entry:
11+
%0 = addrspacecast ptr %out to ptr addrspace(1)
12+
call void asm sideeffect "membar.gl;", ""()
13+
store i32 5, ptr addrspace(1) %0, align 4
14+
ret void
15+
}
16+
17+
;;CHECK-ALIAS-LABEL: Function: test_indirect
18+
;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> %1 = call
19+
define i32 @test_indirect(ptr %out) {
20+
entry:
21+
%0 = addrspacecast ptr %out to ptr addrspace(1)
22+
store i32 0, ptr addrspace(1) %0, align 4
23+
%1 = call i32 asm "ld.global.u32 $0, [$1];", "=r,*m"(ptr addrspace(1) elementtype(i32) %0)
24+
store i32 0, ptr addrspace(1) %0, align 4
25+
ret i32 %1
26+
}
27+
28+
;;CHECK-ALIAS-LABEL: Function: test_memory
29+
;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> %1 = call
30+
define i32 @test_memory(ptr %out) {
31+
entry:
32+
%0 = addrspacecast ptr %out to ptr addrspace(1)
33+
store i32 0, ptr addrspace(1) %0, align 4
34+
%1 = call i32 asm "ld.global.u32 $0, [$1];", "=r,l,~{memory}"(ptr addrspace(1) %0)
35+
store i32 0, ptr addrspace(1) %0, align 4
36+
ret i32 %1
37+
}
38+
39+
;;CHECK-ALIAS-LABEL: Function: test_no_sideeffect
40+
;;CHECK-ALIAS: NoModRef: Ptr: i32* %0 <-> %1 = call
41+
define void @test_no_sideeffect(ptr %in, ptr %out) {
42+
entry:
43+
%0 = addrspacecast ptr %out to ptr addrspace(1)
44+
%1 = call i32 asm "cvt.u32.u64 $0, $1;", "=r,l"(ptr %in)
45+
store i32 %1, ptr addrspace(1) %0, align 4
46+
ret void
47+
}

0 commit comments

Comments
 (0)