Skip to content

Commit b2dcf62

Browse files
authored
[NVPTX] fix emission for i1 load and extload (#99392)
Currently, an illegal 2-byte load from a 1-byte global variable is being generated. This change instead generates a 1-byte load and zero-extends the result into an i16 register, which was always the intended behavior of the function. In addition, an i1 ext load of any kind needs to be promoted. A missing setLoadExtAction for ISD::EXTLOAD was causing an "Unhandled source type" unreachable, triggered by an illegal i1 ext load during ISelDAGToDAG (see the bug referenced below). Bug #98033.
1 parent ce8c43f commit b2dcf62

File tree

3 files changed

+70
-3
lines changed

3 files changed

+70
-3
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
629629
for (MVT VT : MVT::integer_valuetypes()) {
630630
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
631631
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
632+
setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
632633
setTruncStoreAction(VT, MVT::i1, Expand);
633634
}
634635

@@ -2920,9 +2921,10 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
29202921
assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
29212922
assert(Node->getValueType(0) == MVT::i1 &&
29222923
"Custom lowering for i1 load only");
2923-
SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2924-
LD->getPointerInfo(), LD->getAlign(),
2925-
LD->getMemOperand()->getFlags());
2924+
SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
2925+
LD->getBasePtr(), LD->getPointerInfo(),
2926+
MVT::i8, LD->getAlign(),
2927+
LD->getMemOperand()->getFlags());
29262928
SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
29272929
// The legalizer (the caller) is expecting two values from the legalized
29282930
// load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --function foo --extra_scrub --default-march nvptx64 --filter-out ".*//.*" --filter-out "[\{\}\(\)]" --version 5
2+
3+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s
4+
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_50 | %ptxas-verify %}
5+
6+
target triple = "nvptx-nvidia-cuda"
7+
8+
define void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
9+
; CHECK-LABEL: foo(
10+
; CHECK: .reg .b16 %rs<2>;
11+
; CHECK: .reg .b32 %r<4>;
12+
; CHECK: .reg .b64 %rd<5>;
13+
; CHECK-EMPTY:
14+
; CHECK: ld.param.u64 %rd1, [foo_param_0];
15+
; CHECK: ld.param.u64 %rd2, [foo_param_1];
16+
; CHECK: cvta.to.global.u64 %rd3, %rd2;
17+
; CHECK: cvta.to.global.u64 %rd4, %rd1;
18+
; CHECK: ld.global.nc.u8 %rs1, [%rd4];
19+
; CHECK: cvt.u32.u8 %r1, %rs1;
20+
; CHECK: add.s32 %r2, %r1, 1;
21+
; CHECK: and.b32 %r3, %r2, 1;
22+
; CHECK: st.global.u32 [%rd3], %r3;
23+
; CHECK: ret;
24+
%ld = load i1, ptr %ptr, align 1
25+
%zext = zext i1 %ld to i32
26+
%add = add i32 %zext, 1
27+
%and = and i32 %add, 1
28+
store i32 %and, ptr %retval
29+
ret void
30+
}
31+
32+
!nvvm.annotations = !{!0}
33+
34+
!0 = !{ptr @foo, !"kernel", i32 1}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --function foo --extra_scrub --default-march nvptx64 --filter-out ".*//.*" --filter-out "[\(\)\{\}]" --version 5
2+
3+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
4+
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
5+
6+
target triple = "nvptx-nvidia-cuda"
7+
8+
@i1g = addrspace(1) global i1 false, align 2
9+
10+
define void @foo() {
11+
; CHECK-LABEL: foo(
12+
; CHECK: .reg .pred %p<2>;
13+
; CHECK: .reg .b16 %rs<4>;
14+
; CHECK-EMPTY:
15+
; CHECK: ld.global.u8 %rs1, [i1g];
16+
; CHECK: and.b16 %rs2, %rs1, 1;
17+
; CHECK: setp.eq.b16 %p1, %rs2, 1;
18+
; CHECK: @%p1 bra $L__BB0_2;
19+
; CHECK: mov.u16 %rs3, 1;
20+
; CHECK: st.global.u8 [i1g], %rs3;
21+
; CHECK: ret;
22+
%tmp = load i1, ptr addrspace(1) @i1g, align 2
23+
br i1 %tmp, label %if.end, label %if.then
24+
25+
if.then:
26+
store i1 true, ptr addrspace(1) @i1g, align 2
27+
br label %if.end
28+
29+
if.end:
30+
ret void
31+
}

0 commit comments

Comments
 (0)