-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[NVPTX] Support address offsets added with disjoint or #122042
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NVPTX] Support address offsets added with disjoint or #122042
Conversation
@llvm/pr-subscribers-backend-nvptx Author: Alex MacLean (AlexMaclean) ChangesSometime DAGCombiner gets a little too clever and converts an add of a small constant offset to a highly aligned pointer into a 'disjoint or'. When looking for address operands handle this case as well. Full diff: https://github.com/llvm/llvm-project/pull/122042.diff 4 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index c51729e224bf54..75a2bb157c99be 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -14,6 +14,7 @@
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
@@ -2449,6 +2450,11 @@ bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
return true;
}
+static inline bool isAddLike(const SDValue V) {
+ return V.getOpcode() == ISD::ADD ||
+ (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
+}
+
// SelectDirectAddr - Match a direct address for DAG.
// A direct address could be a globaladdress or externalsymbol.
bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
@@ -2475,7 +2481,7 @@ bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
- if (Addr.getOpcode() == ISD::ADD) {
+ if (isAddLike(Addr)) {
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
SDValue base = Addr.getOperand(0);
if (SelectDirectAddr(base, Base)) {
@@ -2512,7 +2518,7 @@ bool NVPTXDAGToDAGISel::SelectADDRri_imp(
Addr.getOpcode() == ISD::TargetGlobalAddress)
return false; // direct calls.
- if (Addr.getOpcode() == ISD::ADD) {
+ if (isAddLike(Addr)) {
if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
return false;
}
diff --git a/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll b/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll
new file mode 100644
index 00000000000000..17c8b9cdc343ce
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+target triple = "nvptx64-nvidia-cuda"
+
+@a = external global ptr align 16
+
+define i32 @test_v2i8(i16 %a) {
+; CHECK-LABEL: test_v2i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u64 %rd1, a;
+; CHECK-NEXT: cvta.global.u64 %rd2, %rd1;
+; CHECK-NEXT: ld.volatile.u32 %r1, [%rd2+8];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %a1 = ptrtoint ptr @a to i64
+ %a2 = or disjoint i64 %a1, 8
+ %a3 = inttoptr i64 %a2 to ptr
+ %v = load volatile i32, ptr %a3
+ ret i32 %v
+}
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 9cfe9192772b89..69d51fd1fcc1b3 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -29,7 +29,7 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
; PTX-NEXT: .reg .pred %p<2>;
; PTX-NEXT: .reg .b16 %rs<3>;
; PTX-NEXT: .reg .b32 %r<11>;
-; PTX-NEXT: .reg .b64 %rd<10>;
+; PTX-NEXT: .reg .b64 %rd<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
; PTX-NEXT: mov.u64 %SPL, __local_depot0;
@@ -38,23 +38,22 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
; PTX-NEXT: and.b16 %rs2, %rs1, 1;
; PTX-NEXT: setp.eq.b16 %p1, %rs2, 1;
; PTX-NEXT: ld.param.s32 %rd1, [non_kernel_function_param_2];
-; PTX-NEXT: add.u64 %rd2, %SP, 0;
-; PTX-NEXT: or.b64 %rd3, %rd2, 8;
-; PTX-NEXT: ld.param.u64 %rd4, [non_kernel_function_param_0+8];
-; PTX-NEXT: st.u64 [%rd3], %rd4;
-; PTX-NEXT: ld.param.u64 %rd5, [non_kernel_function_param_0];
-; PTX-NEXT: st.u64 [%SP], %rd5;
-; PTX-NEXT: mov.u64 %rd6, gi;
-; PTX-NEXT: cvta.global.u64 %rd7, %rd6;
-; PTX-NEXT: selp.b64 %rd8, %rd2, %rd7, %p1;
-; PTX-NEXT: add.s64 %rd9, %rd8, %rd1;
-; PTX-NEXT: ld.u8 %r1, [%rd9];
-; PTX-NEXT: ld.u8 %r2, [%rd9+1];
+; PTX-NEXT: ld.param.u64 %rd2, [non_kernel_function_param_0+8];
+; PTX-NEXT: st.u64 [%SP+8], %rd2;
+; PTX-NEXT: ld.param.u64 %rd3, [non_kernel_function_param_0];
+; PTX-NEXT: st.u64 [%SP], %rd3;
+; PTX-NEXT: mov.u64 %rd4, gi;
+; PTX-NEXT: cvta.global.u64 %rd5, %rd4;
+; PTX-NEXT: add.u64 %rd6, %SP, 0;
+; PTX-NEXT: selp.b64 %rd7, %rd6, %rd5, %p1;
+; PTX-NEXT: add.s64 %rd8, %rd7, %rd1;
+; PTX-NEXT: ld.u8 %r1, [%rd8];
+; PTX-NEXT: ld.u8 %r2, [%rd8+1];
; PTX-NEXT: shl.b32 %r3, %r2, 8;
; PTX-NEXT: or.b32 %r4, %r3, %r1;
-; PTX-NEXT: ld.u8 %r5, [%rd9+2];
+; PTX-NEXT: ld.u8 %r5, [%rd8+2];
; PTX-NEXT: shl.b32 %r6, %r5, 16;
-; PTX-NEXT: ld.u8 %r7, [%rd9+3];
+; PTX-NEXT: ld.u8 %r7, [%rd8+3];
; PTX-NEXT: shl.b32 %r8, %r7, 24;
; PTX-NEXT: or.b32 %r9, %r8, %r6;
; PTX-NEXT: or.b32 %r10, %r9, %r4;
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index cb54812dea6d98..f7ed690efabcf3 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -153,7 +153,7 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b16 %rs<6>;
; CHECK-PTX-NEXT: .reg .b32 %r<7>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<11>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<7>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot2;
@@ -163,24 +163,20 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 7;
; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8;
; CHECK-PTX-NEXT: ld.u32 %r2, [%rd3];
-; CHECK-PTX-NEXT: or.b64 %rd4, %rd3, 4;
-; CHECK-PTX-NEXT: ld.s8 %r3, [%rd4];
-; CHECK-PTX-NEXT: or.b64 %rd5, %rd3, 5;
-; CHECK-PTX-NEXT: or.b64 %rd6, %rd3, 7;
-; CHECK-PTX-NEXT: ld.u8 %rs1, [%rd6];
+; CHECK-PTX-NEXT: ld.s8 %r3, [%rd3+4];
+; CHECK-PTX-NEXT: ld.u8 %rs1, [%rd3+7];
; CHECK-PTX-NEXT: st.u8 [%SP+2], %rs1;
-; CHECK-PTX-NEXT: ld.u8 %rs2, [%rd5];
-; CHECK-PTX-NEXT: or.b64 %rd7, %rd3, 6;
-; CHECK-PTX-NEXT: ld.u8 %rs3, [%rd7];
+; CHECK-PTX-NEXT: ld.u8 %rs2, [%rd3+5];
+; CHECK-PTX-NEXT: ld.u8 %rs3, [%rd3+6];
; CHECK-PTX-NEXT: shl.b16 %rs4, %rs3, 8;
; CHECK-PTX-NEXT: or.b16 %rs5, %rs4, %rs2;
; CHECK-PTX-NEXT: st.u16 [%SP], %rs5;
-; CHECK-PTX-NEXT: ld.u64 %rd8, [%rd3+8];
+; CHECK-PTX-NEXT: ld.u64 %rd4, [%rd3+8];
; CHECK-PTX-NEXT: add.s32 %r4, %r1, %r2;
; CHECK-PTX-NEXT: add.s32 %r5, %r4, %r3;
-; CHECK-PTX-NEXT: cvt.u64.u32 %rd9, %r5;
-; CHECK-PTX-NEXT: add.s64 %rd10, %rd9, %rd8;
-; CHECK-PTX-NEXT: cvt.u32.u64 %r6, %rd10;
+; CHECK-PTX-NEXT: cvt.u64.u32 %rd5, %r5;
+; CHECK-PTX-NEXT: add.s64 %rd6, %rd5, %rd4;
+; CHECK-PTX-NEXT: cvt.u32.u64 %r6, %rd6;
; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r6;
; CHECK-PTX-NEXT: ret;
entry:
@@ -219,7 +215,7 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b16 %rs<10>;
; CHECK-PTX-NEXT: .reg .b32 %r<4>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<8>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<7>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot3;
@@ -240,17 +236,16 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: st.u16 [%SP], %rs8;
; CHECK-PTX-NEXT: mov.b32 %r1, 1;
; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1;
-; CHECK-PTX-NEXT: add.u64 %rd5, %SP, 8;
-; CHECK-PTX-NEXT: or.b64 %rd6, %rd5, 4;
; CHECK-PTX-NEXT: mov.b16 %rs9, 1;
-; CHECK-PTX-NEXT: st.u8 [%rd6], %rs9;
-; CHECK-PTX-NEXT: mov.b64 %rd7, 1;
-; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd7;
+; CHECK-PTX-NEXT: st.u8 [%SP+12], %rs9;
+; CHECK-PTX-NEXT: mov.b64 %rd5, 1;
+; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd5;
+; CHECK-PTX-NEXT: add.u64 %rd6, %SP, 8;
; CHECK-PTX-NEXT: { // callseq 1, 0
; CHECK-PTX-NEXT: .param .b32 param0;
; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd5;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd6;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: call.uni (retval0),
; CHECK-PTX-NEXT: variadics2,
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM :)
%a1 = ptrtoint ptr @a to i64 | ||
%a2 = or disjoint i64 %a1, 8 | ||
%a3 = inttoptr i64 %a2 to ptr | ||
%v = load volatile i32, ptr %a3 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this load
need to be volatile
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No. Removed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice. LGTM.
|
||
@a = external global ptr align 16 | ||
|
||
define i32 @test_v2i8(i16 %a) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
v2i8?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oops, copied from somewhere else. Fixed.
Sometime DAGCombiner gets a little too clever and converts an add of a small constant offset to a highly aligned pointer into a 'disjoint or'. When looking for address operands handle this case as well.