Skip to content

[NVPTX] Support address offsets added with disjoint or #122042

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

Conversation

AlexMaclean
Copy link
Member

Sometime DAGCombiner gets a little too clever and converts an add of a small constant offset to a highly aligned pointer into a 'disjoint or'. When looking for address operands handle this case as well.

@llvmbot
Copy link
Member

llvmbot commented Jan 8, 2025

@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

Sometime DAGCombiner gets a little too clever and converts an add of a small constant offset to a highly aligned pointer into a 'disjoint or'. When looking for address operands handle this case as well.


Full diff: https://github.com/llvm/llvm-project/pull/122042.diff

4 Files Affected:

  • (modified) llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (+8-2)
  • (added) llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll (+25)
  • (modified) llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll (+14-15)
  • (modified) llvm/test/CodeGen/NVPTX/variadics-backend.ll (+15-20)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index c51729e224bf54..75a2bb157c99be 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -14,6 +14,7 @@
 #include "NVPTXUtilities.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
@@ -2449,6 +2450,11 @@ bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
   return true;
 }
 
+static inline bool isAddLike(const SDValue V) {
+  return V.getOpcode() == ISD::ADD ||
+         (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
+}
+
 // SelectDirectAddr - Match a direct address for DAG.
 // A direct address could be a globaladdress or externalsymbol.
 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
@@ -2475,7 +2481,7 @@ bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
 // symbol+offset
 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
-  if (Addr.getOpcode() == ISD::ADD) {
+  if (isAddLike(Addr)) {
     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
       SDValue base = Addr.getOperand(0);
       if (SelectDirectAddr(base, Base)) {
@@ -2512,7 +2518,7 @@ bool NVPTXDAGToDAGISel::SelectADDRri_imp(
       Addr.getOpcode() == ISD::TargetGlobalAddress)
     return false; // direct calls.
 
-  if (Addr.getOpcode() == ISD::ADD) {
+  if (isAddLike(Addr)) {
     if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
       return false;
     }
diff --git a/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll b/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll
new file mode 100644
index 00000000000000..17c8b9cdc343ce
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+target triple = "nvptx64-nvidia-cuda"
+
+@a = external global ptr align 16
+
+define i32  @test_v2i8(i16 %a) {
+; CHECK-LABEL: test_v2i8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.u64 %rd1, a;
+; CHECK-NEXT:    cvta.global.u64 %rd2, %rd1;
+; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd2+8];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %a1 = ptrtoint ptr @a to i64
+  %a2 = or disjoint i64 %a1, 8
+  %a3 = inttoptr i64 %a2 to ptr
+  %v = load volatile i32, ptr %a3
+  ret i32 %v
+}
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 9cfe9192772b89..69d51fd1fcc1b3 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -29,7 +29,7 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT:    .reg .pred %p<2>;
 ; PTX-NEXT:    .reg .b16 %rs<3>;
 ; PTX-NEXT:    .reg .b32 %r<11>;
-; PTX-NEXT:    .reg .b64 %rd<10>;
+; PTX-NEXT:    .reg .b64 %rd<9>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0: // %entry
 ; PTX-NEXT:    mov.u64 %SPL, __local_depot0;
@@ -38,23 +38,22 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
 ; PTX-NEXT:    setp.eq.b16 %p1, %rs2, 1;
 ; PTX-NEXT:    ld.param.s32 %rd1, [non_kernel_function_param_2];
-; PTX-NEXT:    add.u64 %rd2, %SP, 0;
-; PTX-NEXT:    or.b64 %rd3, %rd2, 8;
-; PTX-NEXT:    ld.param.u64 %rd4, [non_kernel_function_param_0+8];
-; PTX-NEXT:    st.u64 [%rd3], %rd4;
-; PTX-NEXT:    ld.param.u64 %rd5, [non_kernel_function_param_0];
-; PTX-NEXT:    st.u64 [%SP], %rd5;
-; PTX-NEXT:    mov.u64 %rd6, gi;
-; PTX-NEXT:    cvta.global.u64 %rd7, %rd6;
-; PTX-NEXT:    selp.b64 %rd8, %rd2, %rd7, %p1;
-; PTX-NEXT:    add.s64 %rd9, %rd8, %rd1;
-; PTX-NEXT:    ld.u8 %r1, [%rd9];
-; PTX-NEXT:    ld.u8 %r2, [%rd9+1];
+; PTX-NEXT:    ld.param.u64 %rd2, [non_kernel_function_param_0+8];
+; PTX-NEXT:    st.u64 [%SP+8], %rd2;
+; PTX-NEXT:    ld.param.u64 %rd3, [non_kernel_function_param_0];
+; PTX-NEXT:    st.u64 [%SP], %rd3;
+; PTX-NEXT:    mov.u64 %rd4, gi;
+; PTX-NEXT:    cvta.global.u64 %rd5, %rd4;
+; PTX-NEXT:    add.u64 %rd6, %SP, 0;
+; PTX-NEXT:    selp.b64 %rd7, %rd6, %rd5, %p1;
+; PTX-NEXT:    add.s64 %rd8, %rd7, %rd1;
+; PTX-NEXT:    ld.u8 %r1, [%rd8];
+; PTX-NEXT:    ld.u8 %r2, [%rd8+1];
 ; PTX-NEXT:    shl.b32 %r3, %r2, 8;
 ; PTX-NEXT:    or.b32 %r4, %r3, %r1;
-; PTX-NEXT:    ld.u8 %r5, [%rd9+2];
+; PTX-NEXT:    ld.u8 %r5, [%rd8+2];
 ; PTX-NEXT:    shl.b32 %r6, %r5, 16;
-; PTX-NEXT:    ld.u8 %r7, [%rd9+3];
+; PTX-NEXT:    ld.u8 %r7, [%rd8+3];
 ; PTX-NEXT:    shl.b32 %r8, %r7, 24;
 ; PTX-NEXT:    or.b32 %r9, %r8, %r6;
 ; PTX-NEXT:    or.b32 %r10, %r9, %r4;
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index cb54812dea6d98..f7ed690efabcf3 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -153,7 +153,7 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<6>;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<7>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<11>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot2;
@@ -163,24 +163,20 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 7;
 ; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
 ; CHECK-PTX-NEXT:    ld.u32 %r2, [%rd3];
-; CHECK-PTX-NEXT:    or.b64 %rd4, %rd3, 4;
-; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd4];
-; CHECK-PTX-NEXT:    or.b64 %rd5, %rd3, 5;
-; CHECK-PTX-NEXT:    or.b64 %rd6, %rd3, 7;
-; CHECK-PTX-NEXT:    ld.u8 %rs1, [%rd6];
+; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd3+4];
+; CHECK-PTX-NEXT:    ld.u8 %rs1, [%rd3+7];
 ; CHECK-PTX-NEXT:    st.u8 [%SP+2], %rs1;
-; CHECK-PTX-NEXT:    ld.u8 %rs2, [%rd5];
-; CHECK-PTX-NEXT:    or.b64 %rd7, %rd3, 6;
-; CHECK-PTX-NEXT:    ld.u8 %rs3, [%rd7];
+; CHECK-PTX-NEXT:    ld.u8 %rs2, [%rd3+5];
+; CHECK-PTX-NEXT:    ld.u8 %rs3, [%rd3+6];
 ; CHECK-PTX-NEXT:    shl.b16 %rs4, %rs3, 8;
 ; CHECK-PTX-NEXT:    or.b16 %rs5, %rs4, %rs2;
 ; CHECK-PTX-NEXT:    st.u16 [%SP], %rs5;
-; CHECK-PTX-NEXT:    ld.u64 %rd8, [%rd3+8];
+; CHECK-PTX-NEXT:    ld.u64 %rd4, [%rd3+8];
 ; CHECK-PTX-NEXT:    add.s32 %r4, %r1, %r2;
 ; CHECK-PTX-NEXT:    add.s32 %r5, %r4, %r3;
-; CHECK-PTX-NEXT:    cvt.u64.u32 %rd9, %r5;
-; CHECK-PTX-NEXT:    add.s64 %rd10, %rd9, %rd8;
-; CHECK-PTX-NEXT:    cvt.u32.u64 %r6, %rd10;
+; CHECK-PTX-NEXT:    cvt.u64.u32 %rd5, %r5;
+; CHECK-PTX-NEXT:    add.s64 %rd6, %rd5, %rd4;
+; CHECK-PTX-NEXT:    cvt.u32.u64 %r6, %rd6;
 ; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-PTX-NEXT:    ret;
 entry:
@@ -219,7 +215,7 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<10>;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<4>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<8>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot3;
@@ -240,17 +236,16 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    st.u16 [%SP], %rs8;
 ; CHECK-PTX-NEXT:    mov.b32 %r1, 1;
 ; CHECK-PTX-NEXT:    st.u32 [%SP+8], %r1;
-; CHECK-PTX-NEXT:    add.u64 %rd5, %SP, 8;
-; CHECK-PTX-NEXT:    or.b64 %rd6, %rd5, 4;
 ; CHECK-PTX-NEXT:    mov.b16 %rs9, 1;
-; CHECK-PTX-NEXT:    st.u8 [%rd6], %rs9;
-; CHECK-PTX-NEXT:    mov.b64 %rd7, 1;
-; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd7;
+; CHECK-PTX-NEXT:    st.u8 [%SP+12], %rs9;
+; CHECK-PTX-NEXT:    mov.b64 %rd5, 1;
+; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd5;
+; CHECK-PTX-NEXT:    add.u64 %rd6, %SP, 8;
 ; CHECK-PTX-NEXT:    { // callseq 1, 0
 ; CHECK-PTX-NEXT:    .param .b32 param0;
 ; CHECK-PTX-NEXT:    st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT:    .param .b64 param1;
-; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd5;
+; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd6;
 ; CHECK-PTX-NEXT:    .param .b32 retval0;
 ; CHECK-PTX-NEXT:    call.uni (retval0),
 ; CHECK-PTX-NEXT:    variadics2,

@justinfargnoli justinfargnoli requested a review from kalxr January 8, 2025 19:06
Copy link
Contributor

@justinfargnoli justinfargnoli left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM :)

%a1 = ptrtoint ptr @a to i64
%a2 = or disjoint i64 %a1, 8
%a3 = inttoptr i64 %a2 to ptr
%v = load volatile i32, ptr %a3
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this load need to be volatile?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. Removed.

Copy link
Member

@Artem-B Artem-B left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice. LGTM.


@a = external global ptr align 16

define i32 @test_v2i8(i16 %a) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

v2i8?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops, copied from somewhere else. Fixed.

@AlexMaclean AlexMaclean merged commit 560b72c into llvm:main Jan 8, 2025
6 of 8 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

5 participants