Skip to content

Commit 560b72c

Browse files
authored
[NVPTX] Support address offsets added with disjoint or (#122042)
Sometimes DAGCombiner gets a little too clever and converts an add of a small constant offset to a highly aligned pointer into a 'disjoint or'. When looking for address operands, handle this case as well.
1 parent 8948340 commit 560b72c

File tree

4 files changed

+62
-37
lines changed

4 files changed

+62
-37
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "NVPTXUtilities.h"
1515
#include "llvm/Analysis/ValueTracking.h"
1616
#include "llvm/CodeGen/ISDOpcodes.h"
17+
#include "llvm/CodeGen/SelectionDAGNodes.h"
1718
#include "llvm/IR/GlobalValue.h"
1819
#include "llvm/IR/Instructions.h"
1920
#include "llvm/IR/IntrinsicsNVPTX.h"
@@ -2449,6 +2450,11 @@ bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
24492450
return true;
24502451
}
24512452

2453+
// Returns true if V is an ISD::ADD node, or an ISD::OR node whose flags mark
// it as disjoint. Per the commit description, DAGCombiner may turn an add of a
// small constant offset to a highly aligned pointer into a 'disjoint or', so
// address matching accepts both forms.
static inline bool isAddLike(const SDValue V) {
2454+
return V.getOpcode() == ISD::ADD ||
2455+
(V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
2456+
}
2457+
24522458
// SelectDirectAddr - Match a direct address for DAG.
24532459
// A direct address could be a globaladdress or externalsymbol.
24542460
bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
@@ -2475,7 +2481,7 @@ bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
24752481
// symbol+offset
24762482
bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
24772483
SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
2478-
if (Addr.getOpcode() == ISD::ADD) {
2484+
if (isAddLike(Addr)) {
24792485
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
24802486
SDValue base = Addr.getOperand(0);
24812487
if (SelectDirectAddr(base, Base)) {
@@ -2512,7 +2518,7 @@ bool NVPTXDAGToDAGISel::SelectADDRri_imp(
25122518
Addr.getOpcode() == ISD::TargetGlobalAddress)
25132519
return false; // direct calls.
25142520

2515-
if (Addr.getOpcode() == ISD::ADD) {
2521+
if (isAddLike(Addr)) {
25162522
if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
25172523
return false;
25182524
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
3+
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
4+
target triple = "nvptx64-nvidia-cuda"
5+
6+
@a = external global ptr align 16
7+
8+
define i32 @test_disjoint_or_addr(i16 %a) {
9+
; CHECK-LABEL: test_disjoint_or_addr(
10+
; CHECK: {
11+
; CHECK-NEXT: .reg .b32 %r<2>;
12+
; CHECK-NEXT: .reg .b64 %rd<3>;
13+
; CHECK-EMPTY:
14+
; CHECK-NEXT: // %bb.0:
15+
; CHECK-NEXT: mov.u64 %rd1, a;
16+
; CHECK-NEXT: cvta.global.u64 %rd2, %rd1;
17+
; CHECK-NEXT: ld.u32 %r1, [%rd2+8];
18+
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
19+
; CHECK-NEXT: ret;
20+
%a1 = ptrtoint ptr @a to i64
21+
%a2 = or disjoint i64 %a1, 8
22+
%a3 = inttoptr i64 %a2 to ptr
23+
%v = load i32, ptr %a3
24+
ret i32 %v
25+
}

llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
2929
; PTX-NEXT: .reg .pred %p<2>;
3030
; PTX-NEXT: .reg .b16 %rs<3>;
3131
; PTX-NEXT: .reg .b32 %r<11>;
32-
; PTX-NEXT: .reg .b64 %rd<10>;
32+
; PTX-NEXT: .reg .b64 %rd<9>;
3333
; PTX-EMPTY:
3434
; PTX-NEXT: // %bb.0: // %entry
3535
; PTX-NEXT: mov.u64 %SPL, __local_depot0;
@@ -38,23 +38,22 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
3838
; PTX-NEXT: and.b16 %rs2, %rs1, 1;
3939
; PTX-NEXT: setp.eq.b16 %p1, %rs2, 1;
4040
; PTX-NEXT: ld.param.s32 %rd1, [non_kernel_function_param_2];
41-
; PTX-NEXT: add.u64 %rd2, %SP, 0;
42-
; PTX-NEXT: or.b64 %rd3, %rd2, 8;
43-
; PTX-NEXT: ld.param.u64 %rd4, [non_kernel_function_param_0+8];
44-
; PTX-NEXT: st.u64 [%rd3], %rd4;
45-
; PTX-NEXT: ld.param.u64 %rd5, [non_kernel_function_param_0];
46-
; PTX-NEXT: st.u64 [%SP], %rd5;
47-
; PTX-NEXT: mov.u64 %rd6, gi;
48-
; PTX-NEXT: cvta.global.u64 %rd7, %rd6;
49-
; PTX-NEXT: selp.b64 %rd8, %rd2, %rd7, %p1;
50-
; PTX-NEXT: add.s64 %rd9, %rd8, %rd1;
51-
; PTX-NEXT: ld.u8 %r1, [%rd9];
52-
; PTX-NEXT: ld.u8 %r2, [%rd9+1];
41+
; PTX-NEXT: ld.param.u64 %rd2, [non_kernel_function_param_0+8];
42+
; PTX-NEXT: st.u64 [%SP+8], %rd2;
43+
; PTX-NEXT: ld.param.u64 %rd3, [non_kernel_function_param_0];
44+
; PTX-NEXT: st.u64 [%SP], %rd3;
45+
; PTX-NEXT: mov.u64 %rd4, gi;
46+
; PTX-NEXT: cvta.global.u64 %rd5, %rd4;
47+
; PTX-NEXT: add.u64 %rd6, %SP, 0;
48+
; PTX-NEXT: selp.b64 %rd7, %rd6, %rd5, %p1;
49+
; PTX-NEXT: add.s64 %rd8, %rd7, %rd1;
50+
; PTX-NEXT: ld.u8 %r1, [%rd8];
51+
; PTX-NEXT: ld.u8 %r2, [%rd8+1];
5352
; PTX-NEXT: shl.b32 %r3, %r2, 8;
5453
; PTX-NEXT: or.b32 %r4, %r3, %r1;
55-
; PTX-NEXT: ld.u8 %r5, [%rd9+2];
54+
; PTX-NEXT: ld.u8 %r5, [%rd8+2];
5655
; PTX-NEXT: shl.b32 %r6, %r5, 16;
57-
; PTX-NEXT: ld.u8 %r7, [%rd9+3];
56+
; PTX-NEXT: ld.u8 %r7, [%rd8+3];
5857
; PTX-NEXT: shl.b32 %r8, %r7, 24;
5958
; PTX-NEXT: or.b32 %r9, %r8, %r6;
6059
; PTX-NEXT: or.b32 %r10, %r9, %r4;

llvm/test/CodeGen/NVPTX/variadics-backend.ll

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
153153
; CHECK-PTX-NEXT: .reg .b64 %SPL;
154154
; CHECK-PTX-NEXT: .reg .b16 %rs<6>;
155155
; CHECK-PTX-NEXT: .reg .b32 %r<7>;
156-
; CHECK-PTX-NEXT: .reg .b64 %rd<11>;
156+
; CHECK-PTX-NEXT: .reg .b64 %rd<7>;
157157
; CHECK-PTX-EMPTY:
158158
; CHECK-PTX-NEXT: // %bb.0: // %entry
159159
; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot2;
@@ -163,24 +163,20 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
163163
; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 7;
164164
; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8;
165165
; CHECK-PTX-NEXT: ld.u32 %r2, [%rd3];
166-
; CHECK-PTX-NEXT: or.b64 %rd4, %rd3, 4;
167-
; CHECK-PTX-NEXT: ld.s8 %r3, [%rd4];
168-
; CHECK-PTX-NEXT: or.b64 %rd5, %rd3, 5;
169-
; CHECK-PTX-NEXT: or.b64 %rd6, %rd3, 7;
170-
; CHECK-PTX-NEXT: ld.u8 %rs1, [%rd6];
166+
; CHECK-PTX-NEXT: ld.s8 %r3, [%rd3+4];
167+
; CHECK-PTX-NEXT: ld.u8 %rs1, [%rd3+7];
171168
; CHECK-PTX-NEXT: st.u8 [%SP+2], %rs1;
172-
; CHECK-PTX-NEXT: ld.u8 %rs2, [%rd5];
173-
; CHECK-PTX-NEXT: or.b64 %rd7, %rd3, 6;
174-
; CHECK-PTX-NEXT: ld.u8 %rs3, [%rd7];
169+
; CHECK-PTX-NEXT: ld.u8 %rs2, [%rd3+5];
170+
; CHECK-PTX-NEXT: ld.u8 %rs3, [%rd3+6];
175171
; CHECK-PTX-NEXT: shl.b16 %rs4, %rs3, 8;
176172
; CHECK-PTX-NEXT: or.b16 %rs5, %rs4, %rs2;
177173
; CHECK-PTX-NEXT: st.u16 [%SP], %rs5;
178-
; CHECK-PTX-NEXT: ld.u64 %rd8, [%rd3+8];
174+
; CHECK-PTX-NEXT: ld.u64 %rd4, [%rd3+8];
179175
; CHECK-PTX-NEXT: add.s32 %r4, %r1, %r2;
180176
; CHECK-PTX-NEXT: add.s32 %r5, %r4, %r3;
181-
; CHECK-PTX-NEXT: cvt.u64.u32 %rd9, %r5;
182-
; CHECK-PTX-NEXT: add.s64 %rd10, %rd9, %rd8;
183-
; CHECK-PTX-NEXT: cvt.u32.u64 %r6, %rd10;
177+
; CHECK-PTX-NEXT: cvt.u64.u32 %rd5, %r5;
178+
; CHECK-PTX-NEXT: add.s64 %rd6, %rd5, %rd4;
179+
; CHECK-PTX-NEXT: cvt.u32.u64 %r6, %rd6;
184180
; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r6;
185181
; CHECK-PTX-NEXT: ret;
186182
entry:
@@ -219,7 +215,7 @@ define dso_local i32 @bar() {
219215
; CHECK-PTX-NEXT: .reg .b64 %SPL;
220216
; CHECK-PTX-NEXT: .reg .b16 %rs<10>;
221217
; CHECK-PTX-NEXT: .reg .b32 %r<4>;
222-
; CHECK-PTX-NEXT: .reg .b64 %rd<8>;
218+
; CHECK-PTX-NEXT: .reg .b64 %rd<7>;
223219
; CHECK-PTX-EMPTY:
224220
; CHECK-PTX-NEXT: // %bb.0: // %entry
225221
; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot3;
@@ -240,17 +236,16 @@ define dso_local i32 @bar() {
240236
; CHECK-PTX-NEXT: st.u16 [%SP], %rs8;
241237
; CHECK-PTX-NEXT: mov.b32 %r1, 1;
242238
; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1;
243-
; CHECK-PTX-NEXT: add.u64 %rd5, %SP, 8;
244-
; CHECK-PTX-NEXT: or.b64 %rd6, %rd5, 4;
245239
; CHECK-PTX-NEXT: mov.b16 %rs9, 1;
246-
; CHECK-PTX-NEXT: st.u8 [%rd6], %rs9;
247-
; CHECK-PTX-NEXT: mov.b64 %rd7, 1;
248-
; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd7;
240+
; CHECK-PTX-NEXT: st.u8 [%SP+12], %rs9;
241+
; CHECK-PTX-NEXT: mov.b64 %rd5, 1;
242+
; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd5;
243+
; CHECK-PTX-NEXT: add.u64 %rd6, %SP, 8;
249244
; CHECK-PTX-NEXT: { // callseq 1, 0
250245
; CHECK-PTX-NEXT: .param .b32 param0;
251246
; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
252247
; CHECK-PTX-NEXT: .param .b64 param1;
253-
; CHECK-PTX-NEXT: st.param.b64 [param1], %rd5;
248+
; CHECK-PTX-NEXT: st.param.b64 [param1], %rd6;
254249
; CHECK-PTX-NEXT: .param .b32 retval0;
255250
; CHECK-PTX-NEXT: call.uni (retval0),
256251
; CHECK-PTX-NEXT: variadics2,

0 commit comments

Comments
 (0)