Skip to content

Commit 8ff60c4

Browse files
authored
[NVPTX] Add support for nvvm.flo.[us] intrinsics (#114489)
Add support for '`llvm.nvvm.flo.[su].*`' intrinsics which correspond to a PTX `bfind` instruction. See [PTX ISA 9.7.1.16. Integer Arithmetic Instructions: bfind](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind). The '`llvm.nvvm.flo.u`' family of intrinsics identifies the bit position of the leading one, returning either its offset from the most or least significant bit. The '`llvm.nvvm.flo.s`' family of intrinsics identifies the bit position of the leading non-sign bit, returning either its offset from the most or least significant bit.
1 parent 9a450a0 commit 8ff60c4

File tree

4 files changed

+225
-0
lines changed

4 files changed

+225
-0
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,59 @@ right, and the least significant bits are extracted to produce a result that is
378378
the same size as the original arguments. The shift amount is the minimum of the
379379
value of %n and the bit width of the integer type.
380380

381+
'``llvm.nvvm.flo.u.*``' Intrinsic
382+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
383+
384+
Syntax:
385+
"""""""
386+
387+
.. code-block:: llvm
388+
389+
declare i32 @llvm.nvvm.flo.u.i32(i32 %a, i1 %shiftamt)
390+
declare i32 @llvm.nvvm.flo.u.i64(i64 %a, i1 %shiftamt)
391+
392+
Overview:
393+
"""""""""
394+
395+
The '``llvm.nvvm.flo.u``' family of intrinsics identifies the bit position of the
396+
leading one, returning either its offset from the most or least significant bit.
397+
398+
Semantics:
399+
""""""""""
400+
401+
The '``llvm.nvvm.flo.u``' family of intrinsics returns the bit position of the
402+
most significant 1. If %shiftamt is true, the result is the shift amount needed
403+
to left-shift the found bit into the most-significant bit position, otherwise
404+
the result is the shift amount needed to right-shift the found bit into the
405+
least-significant bit position. 0xffffffff is returned if no 1 bit is found.
406+
407+
'``llvm.nvvm.flo.s.*``' Intrinsic
408+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
409+
410+
Syntax:
411+
"""""""
412+
413+
.. code-block:: llvm
414+
415+
declare i32 @llvm.nvvm.flo.s.i32(i32 %a, i1 %shiftamt)
416+
declare i32 @llvm.nvvm.flo.s.i64(i64 %a, i1 %shiftamt)
417+
418+
Overview:
419+
"""""""""
420+
421+
The '``llvm.nvvm.flo.s``' family of intrinsics identifies the bit position of the
422+
leading non-sign bit, returning either its offset from the most or least
423+
significant bit.
424+
425+
Semantics:
426+
""""""""""
427+
428+
The '``llvm.nvvm.flo.s``' family of intrinsics returns the bit position of the
429+
most significant 0 for negative inputs and the most significant 1 for
430+
non-negative inputs. If %shiftamt is true, the result is the shift amount needed
431+
to left-shift the found bit into the most-significant bit position, otherwise
432+
the result is the shift amount needed to right-shift the found bit into the
433+
least-significant bit position. 0xffffffff is returned if no non-sign bit is found.
381434

382435
Other Intrinsics
383436
----------------

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,6 +1092,14 @@ let TargetPrefix = "nvvm" in {
10921092
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
10931093
[IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
10941094

1095+
//
1096+
// FLO - Find Leading One
1097+
//
1098+
foreach sign = ["s", "u"] in
1099+
def int_nvvm_flo_ # sign :
1100+
DefaultAttrsIntrinsic<[llvm_i32_ty],
1101+
[llvm_anyint_ty, llvm_i1_ty],
1102+
[IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
10951103

10961104
//
10971105
// Convert

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,19 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
// Utility class to wrap up information about a register and DAG type for more
10+
// convenient iteration and parameterization
11+
class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm> {
12+
ValueType Ty = ty;
13+
NVPTXRegClass RC = rc;
14+
Operand Imm = imm;
15+
int Size = ty.Size;
16+
}
17+
18+
def I32RT : RegTyInfo<i32, Int32Regs, i32imm>;
19+
def I64RT : RegTyInfo<i64, Int64Regs, i64imm>;
20+
21+
922
def immFloat0 : PatLeaf<(fpimm), [{
1023
float f = (float)N->getValueAPF().convertToFloat();
1124
return (f==0.0f);
@@ -1299,6 +1312,25 @@ def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;",
12991312
def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
13001313
Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>;
13011314

1315+
//
1316+
// BFIND
1317+
//
1318+
1319+
foreach t = [I32RT, I64RT] in {
1320+
foreach sign = ["s", "u"] in {
1321+
defvar flo_intrin = !cast<Intrinsic>("int_nvvm_flo_" # sign);
1322+
def BFIND_ # sign # t.Size
1323+
: NVPTXInst<(outs Int32Regs:$dst), (ins t.RC:$src),
1324+
"bfind." # sign # t.Size # " \t$dst, $src;",
1325+
[(set (i32 Int32Regs:$dst), (flo_intrin (t.Ty t.RC:$src), 0))]>;
1326+
1327+
def BFIND_SHIFTAMT_ # sign # t.Size
1328+
: NVPTXInst<(outs Int32Regs:$dst), (ins t.RC:$src),
1329+
"bfind.shiftamt." # sign # t.Size # " \t$dst, $src;",
1330+
[(set (i32 Int32Regs:$dst), (flo_intrin (t.Ty t.RC:$src), -1))]>;
1331+
}
1332+
}
1333+
13021334
//
13031335
// Convert
13041336
//

llvm/test/CodeGen/NVPTX/flo.ll

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s | FileCheck %s
3+
; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
4+
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
define i32 @flo_1(i32 %a) {
8+
; CHECK-LABEL: flo_1(
9+
; CHECK: {
10+
; CHECK-NEXT: .reg .b32 %r<3>;
11+
; CHECK-EMPTY:
12+
; CHECK-NEXT: // %bb.0:
13+
; CHECK-NEXT: ld.param.u32 %r1, [flo_1_param_0];
14+
; CHECK-NEXT: bfind.s32 %r2, %r1;
15+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
16+
; CHECK-NEXT: ret;
17+
%r = call i32 @llvm.nvvm.flo.s.i32(i32 %a, i1 false)
18+
ret i32 %r
19+
}
20+
21+
22+
define i32 @flo_2(i32 %a) {
23+
; CHECK-LABEL: flo_2(
24+
; CHECK: {
25+
; CHECK-NEXT: .reg .b32 %r<3>;
26+
; CHECK-EMPTY:
27+
; CHECK-NEXT: // %bb.0:
28+
; CHECK-NEXT: ld.param.u32 %r1, [flo_2_param_0];
29+
; CHECK-NEXT: bfind.shiftamt.s32 %r2, %r1;
30+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
31+
; CHECK-NEXT: ret;
32+
%r = call i32 @llvm.nvvm.flo.s.i32(i32 %a, i1 true)
33+
ret i32 %r
34+
}
35+
36+
define i32 @flo_3(i32 %a) {
37+
; CHECK-LABEL: flo_3(
38+
; CHECK: {
39+
; CHECK-NEXT: .reg .b32 %r<3>;
40+
; CHECK-EMPTY:
41+
; CHECK-NEXT: // %bb.0:
42+
; CHECK-NEXT: ld.param.u32 %r1, [flo_3_param_0];
43+
; CHECK-NEXT: bfind.u32 %r2, %r1;
44+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
45+
; CHECK-NEXT: ret;
46+
%r = call i32 @llvm.nvvm.flo.u.i32(i32 %a, i1 false)
47+
ret i32 %r
48+
}
49+
50+
51+
define i32 @flo_4(i32 %a) {
52+
; CHECK-LABEL: flo_4(
53+
; CHECK: {
54+
; CHECK-NEXT: .reg .b32 %r<3>;
55+
; CHECK-EMPTY:
56+
; CHECK-NEXT: // %bb.0:
57+
; CHECK-NEXT: ld.param.u32 %r1, [flo_4_param_0];
58+
; CHECK-NEXT: bfind.shiftamt.u32 %r2, %r1;
59+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
60+
; CHECK-NEXT: ret;
61+
%r = call i32 @llvm.nvvm.flo.u.i32(i32 %a, i1 true)
62+
ret i32 %r
63+
}
64+
65+
66+
67+
define i32 @flo_5(i64 %a) {
68+
; CHECK-LABEL: flo_5(
69+
; CHECK: {
70+
; CHECK-NEXT: .reg .b32 %r<2>;
71+
; CHECK-NEXT: .reg .b64 %rd<2>;
72+
; CHECK-EMPTY:
73+
; CHECK-NEXT: // %bb.0:
74+
; CHECK-NEXT: ld.param.u64 %rd1, [flo_5_param_0];
75+
; CHECK-NEXT: bfind.s64 %r1, %rd1;
76+
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
77+
; CHECK-NEXT: ret;
78+
%r = call i32 @llvm.nvvm.flo.s.i64(i64 %a, i1 false)
79+
ret i32 %r
80+
}
81+
82+
83+
define i32 @flo_6(i64 %a) {
84+
; CHECK-LABEL: flo_6(
85+
; CHECK: {
86+
; CHECK-NEXT: .reg .b32 %r<2>;
87+
; CHECK-NEXT: .reg .b64 %rd<2>;
88+
; CHECK-EMPTY:
89+
; CHECK-NEXT: // %bb.0:
90+
; CHECK-NEXT: ld.param.u64 %rd1, [flo_6_param_0];
91+
; CHECK-NEXT: bfind.shiftamt.s64 %r1, %rd1;
92+
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
93+
; CHECK-NEXT: ret;
94+
%r = call i32 @llvm.nvvm.flo.s.i64(i64 %a, i1 true)
95+
ret i32 %r
96+
}
97+
98+
define i32 @flo_7(i64 %a) {
99+
; CHECK-LABEL: flo_7(
100+
; CHECK: {
101+
; CHECK-NEXT: .reg .b32 %r<2>;
102+
; CHECK-NEXT: .reg .b64 %rd<2>;
103+
; CHECK-EMPTY:
104+
; CHECK-NEXT: // %bb.0:
105+
; CHECK-NEXT: ld.param.u64 %rd1, [flo_7_param_0];
106+
; CHECK-NEXT: bfind.u64 %r1, %rd1;
107+
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
108+
; CHECK-NEXT: ret;
109+
%r = call i32 @llvm.nvvm.flo.u.i64(i64 %a, i1 false)
110+
ret i32 %r
111+
}
112+
113+
114+
define i32 @flo_8(i64 %a) {
115+
; CHECK-LABEL: flo_8(
116+
; CHECK: {
117+
; CHECK-NEXT: .reg .b32 %r<2>;
118+
; CHECK-NEXT: .reg .b64 %rd<2>;
119+
; CHECK-EMPTY:
120+
; CHECK-NEXT: // %bb.0:
121+
; CHECK-NEXT: ld.param.u64 %rd1, [flo_8_param_0];
122+
; CHECK-NEXT: bfind.shiftamt.u64 %r1, %rd1;
123+
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
124+
; CHECK-NEXT: ret;
125+
%r = call i32 @llvm.nvvm.flo.u.i64(i64 %a, i1 true)
126+
ret i32 %r
127+
}
128+
129+
declare i32 @llvm.nvvm.flo.s.i32(i32, i1)
130+
declare i32 @llvm.nvvm.flo.u.i32(i32, i1)
131+
declare i32 @llvm.nvvm.flo.s.i64(i64, i1)
132+
declare i32 @llvm.nvvm.flo.u.i64(i64, i1)

0 commit comments

Comments
 (0)