Skip to content

Commit e7412a5

Browse files
authored
[AMDGPU][True16][CodeGen] uaddsat/usubsat sdag for true16 format (#118708)
uaddsat and usubsat SDAG codegen patterns for the True16 format with V_ADD/SUB_NC_U16
1 parent 27eaa8a commit e7412a5

File tree

3 files changed

+235
-28
lines changed

3 files changed

+235
-28
lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1268,6 +1268,11 @@ let True16Predicate = NotHasTrue16BitInsts, SubtargetPredicate = isGFX10Plus in
12681268
>;
12691269
} // End True16Predicate = NotHasTrue16BitInsts, SubtargetPredicate = isGFX10Plus
12701270

1271+
let True16Predicate = UseRealTrue16Insts in {
1272+
def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_t16_e64>;
1273+
def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_t16_e64>;
1274+
} // End True16Predicate = UseRealTrue16Insts
1275+
12711276
let True16Predicate = UseFakeTrue16Insts in {
12721277
def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_fake16_e64>;
12731278
def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_fake16_e64>;

llvm/test/CodeGen/AMDGPU/uaddsat.ll

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
44
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
55
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
6+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
7+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
68

79
define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
810
; GFX6-LABEL: v_uaddsat_i8:
@@ -36,6 +38,28 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
3638
; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
3739
; GFX10-NEXT: v_min_u16 v0, 0xff, v0
3840
; GFX10-NEXT: s_setpc_b64 s[30:31]
41+
;
42+
; GFX11-TRUE16-LABEL: v_uaddsat_i8:
43+
; GFX11-TRUE16: ; %bb.0:
44+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
45+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
46+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
47+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
48+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
49+
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
50+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
51+
; GFX11-TRUE16-NEXT: v_min_u16 v0.l, 0xff, v0.l
52+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
53+
;
54+
; GFX11-FAKE16-LABEL: v_uaddsat_i8:
55+
; GFX11-FAKE16: ; %bb.0:
56+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57+
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
58+
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
59+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
60+
; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1
61+
; GFX11-FAKE16-NEXT: v_min_u16 v0, 0xff, v0
62+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
3963
%result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
4064
ret i8 %result
4165
}
@@ -67,6 +91,20 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
6791
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6892
; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp
6993
; GFX10-NEXT: s_setpc_b64 s[30:31]
94+
;
95+
; GFX11-TRUE16-LABEL: v_uaddsat_i16:
96+
; GFX11-TRUE16: ; %bb.0:
97+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
99+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
100+
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h clamp
101+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
102+
;
103+
; GFX11-FAKE16-LABEL: v_uaddsat_i16:
104+
; GFX11-FAKE16: ; %bb.0:
105+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106+
; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 clamp
107+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
70108
%result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
71109
ret i16 %result
72110
}
@@ -97,6 +135,12 @@ define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) {
97135
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98136
; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp
99137
; GFX10-NEXT: s_setpc_b64 s[30:31]
138+
;
139+
; GFX11-LABEL: v_uaddsat_i32:
140+
; GFX11: ; %bb.0:
141+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
142+
; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp
143+
; GFX11-NEXT: s_setpc_b64 s[30:31]
100144
%result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs)
101145
ret i32 %result
102146
}
@@ -136,6 +180,12 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
136180
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137181
; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp
138182
; GFX10-NEXT: s_setpc_b64 s[30:31]
183+
;
184+
; GFX11-LABEL: v_uaddsat_v2i16:
185+
; GFX11: ; %bb.0:
186+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
187+
; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp
188+
; GFX11-NEXT: s_setpc_b64 s[30:31]
139189
%result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
140190
ret <2 x i16> %result
141191
}
@@ -184,6 +234,13 @@ define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
184234
; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 clamp
185235
; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 clamp
186236
; GFX10-NEXT: s_setpc_b64 s[30:31]
237+
;
238+
; GFX11-LABEL: v_uaddsat_v3i16:
239+
; GFX11: ; %bb.0:
240+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
241+
; GFX11-NEXT: v_pk_add_u16 v0, v0, v2 clamp
242+
; GFX11-NEXT: v_pk_add_u16 v1, v1, v3 clamp
243+
; GFX11-NEXT: s_setpc_b64 s[30:31]
187244
%result = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
188245
ret <3 x i16> %result
189246
}
@@ -238,6 +295,13 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
238295
; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 clamp
239296
; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 clamp
240297
; GFX10-NEXT: s_setpc_b64 s[30:31]
298+
;
299+
; GFX11-LABEL: v_uaddsat_v4i16:
300+
; GFX11: ; %bb.0:
301+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
302+
; GFX11-NEXT: v_pk_add_u16 v0, v0, v2 clamp
303+
; GFX11-NEXT: v_pk_add_u16 v1, v1, v3 clamp
304+
; GFX11-NEXT: s_setpc_b64 s[30:31]
241305
%result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
242306
%cast = bitcast <4 x i16> %result to <2 x float>
243307
ret <2 x float> %cast
@@ -275,6 +339,13 @@ define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
275339
; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v2 clamp
276340
; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v3 clamp
277341
; GFX10-NEXT: s_setpc_b64 s[30:31]
342+
;
343+
; GFX11-LABEL: v_uaddsat_v2i32:
344+
; GFX11: ; %bb.0:
345+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
346+
; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v2 clamp
347+
; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v3 clamp
348+
; GFX11-NEXT: s_setpc_b64 s[30:31]
278349
%result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
279350
ret <2 x i32> %result
280351
}
@@ -317,6 +388,14 @@ define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
317388
; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v4 clamp
318389
; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v5 clamp
319390
; GFX10-NEXT: s_setpc_b64 s[30:31]
391+
;
392+
; GFX11-LABEL: v_uaddsat_v3i32:
393+
; GFX11: ; %bb.0:
394+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
395+
; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v3 clamp
396+
; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v4 clamp
397+
; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v5 clamp
398+
; GFX11-NEXT: s_setpc_b64 s[30:31]
320399
%result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
321400
ret <3 x i32> %result
322401
}
@@ -365,6 +444,15 @@ define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
365444
; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v6 clamp
366445
; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v7 clamp
367446
; GFX10-NEXT: s_setpc_b64 s[30:31]
447+
;
448+
; GFX11-LABEL: v_uaddsat_v4i32:
449+
; GFX11: ; %bb.0:
450+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
451+
; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v4 clamp
452+
; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v5 clamp
453+
; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v6 clamp
454+
; GFX11-NEXT: v_add_nc_u32_e64 v3, v3, v7 clamp
455+
; GFX11-NEXT: s_setpc_b64 s[30:31]
368456
%result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
369457
ret <4 x i32> %result
370458
}
@@ -437,6 +525,19 @@ define <8 x i32> @v_uaddsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
437525
; GFX10-NEXT: v_add_nc_u32_e64 v6, v6, v14 clamp
438526
; GFX10-NEXT: v_add_nc_u32_e64 v7, v7, v15 clamp
439527
; GFX10-NEXT: s_setpc_b64 s[30:31]
528+
;
529+
; GFX11-LABEL: v_uaddsat_v8i32:
530+
; GFX11: ; %bb.0:
531+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
532+
; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v8 clamp
533+
; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v9 clamp
534+
; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v10 clamp
535+
; GFX11-NEXT: v_add_nc_u32_e64 v3, v3, v11 clamp
536+
; GFX11-NEXT: v_add_nc_u32_e64 v4, v4, v12 clamp
537+
; GFX11-NEXT: v_add_nc_u32_e64 v5, v5, v13 clamp
538+
; GFX11-NEXT: v_add_nc_u32_e64 v6, v6, v14 clamp
539+
; GFX11-NEXT: v_add_nc_u32_e64 v7, v7, v15 clamp
540+
; GFX11-NEXT: s_setpc_b64 s[30:31]
440541
%result = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs)
441542
ret <8 x i32> %result
442543
}
@@ -565,6 +666,29 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
565666
; GFX10-NEXT: s_waitcnt vmcnt(0)
566667
; GFX10-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp
567668
; GFX10-NEXT: s_setpc_b64 s[30:31]
669+
;
670+
; GFX11-LABEL: v_uaddsat_v16i32:
671+
; GFX11: ; %bb.0:
672+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
673+
; GFX11-NEXT: scratch_load_b32 v31, off, s32
674+
; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v16 clamp
675+
; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v17 clamp
676+
; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v18 clamp
677+
; GFX11-NEXT: v_add_nc_u32_e64 v3, v3, v19 clamp
678+
; GFX11-NEXT: v_add_nc_u32_e64 v4, v4, v20 clamp
679+
; GFX11-NEXT: v_add_nc_u32_e64 v5, v5, v21 clamp
680+
; GFX11-NEXT: v_add_nc_u32_e64 v6, v6, v22 clamp
681+
; GFX11-NEXT: v_add_nc_u32_e64 v7, v7, v23 clamp
682+
; GFX11-NEXT: v_add_nc_u32_e64 v8, v8, v24 clamp
683+
; GFX11-NEXT: v_add_nc_u32_e64 v9, v9, v25 clamp
684+
; GFX11-NEXT: v_add_nc_u32_e64 v10, v10, v26 clamp
685+
; GFX11-NEXT: v_add_nc_u32_e64 v11, v11, v27 clamp
686+
; GFX11-NEXT: v_add_nc_u32_e64 v12, v12, v28 clamp
687+
; GFX11-NEXT: v_add_nc_u32_e64 v13, v13, v29 clamp
688+
; GFX11-NEXT: v_add_nc_u32_e64 v14, v14, v30 clamp
689+
; GFX11-NEXT: s_waitcnt vmcnt(0)
690+
; GFX11-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp
691+
; GFX11-NEXT: s_setpc_b64 s[30:31]
568692
%result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
569693
ret <16 x i32> %result
570694
}
@@ -610,6 +734,17 @@ define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
610734
; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo
611735
; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo
612736
; GFX10-NEXT: s_setpc_b64 s[30:31]
737+
;
738+
; GFX11-LABEL: v_uaddsat_i64:
739+
; GFX11: ; %bb.0:
740+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
741+
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
742+
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
743+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
744+
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
745+
; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo
746+
; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo
747+
; GFX11-NEXT: s_setpc_b64 s[30:31]
613748
%result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
614749
ret i64 %result
615750
}

0 commit comments

Comments
 (0)