|
3 | 3 | ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
|
4 | 4 | ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
|
5 | 5 | ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
|
| 6 | +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s |
| 7 | +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s |
6 | 8 |
|
7 | 9 | define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
|
8 | 10 | ; GFX6-LABEL: v_uaddsat_i8:
|
@@ -36,6 +38,28 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
|
36 | 38 | ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
|
37 | 39 | ; GFX10-NEXT: v_min_u16 v0, 0xff, v0
|
38 | 40 | ; GFX10-NEXT: s_setpc_b64 s[30:31]
|
| 41 | +; |
| 42 | +; GFX11-TRUE16-LABEL: v_uaddsat_i8: |
| 43 | +; GFX11-TRUE16: ; %bb.0: |
| 44 | +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| 45 | +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 |
| 46 | +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 |
| 47 | +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| 48 | +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l |
| 49 | +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h |
| 50 | +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| 51 | +; GFX11-TRUE16-NEXT: v_min_u16 v0.l, 0xff, v0.l |
| 52 | +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| 53 | +; |
| 54 | +; GFX11-FAKE16-LABEL: v_uaddsat_i8: |
| 55 | +; GFX11-FAKE16: ; %bb.0: |
| 56 | +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| 57 | +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 |
| 58 | +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 |
| 59 | +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| 60 | +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 |
| 61 | +; GFX11-FAKE16-NEXT: v_min_u16 v0, 0xff, v0 |
| 62 | +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
39 | 63 | %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
|
40 | 64 | ret i8 %result
|
41 | 65 | }
|
@@ -67,6 +91,20 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
|
67 | 91 | ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
68 | 92 | ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp
|
69 | 93 | ; GFX10-NEXT: s_setpc_b64 s[30:31]
|
| 94 | +; |
| 95 | +; GFX11-TRUE16-LABEL: v_uaddsat_i16: |
| 96 | +; GFX11-TRUE16: ; %bb.0: |
| 97 | +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| 98 | +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l |
| 99 | +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| 100 | +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h clamp |
| 101 | +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| 102 | +; |
| 103 | +; GFX11-FAKE16-LABEL: v_uaddsat_i16: |
| 104 | +; GFX11-FAKE16: ; %bb.0: |
| 105 | +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| 106 | +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 clamp |
| 107 | +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
70 | 108 | %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
|
71 | 109 | ret i16 %result
|
72 | 110 | }
|
@@ -97,6 +135,12 @@ define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) {
|
97 | 135 | ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
98 | 136 | ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp
|
99 | 137 | ; GFX10-NEXT: s_setpc_b64 s[30:31]
|
| 138 | +; |
| 139 | +; GFX11-LABEL: v_uaddsat_i32: |
| 140 | +; GFX11: ; %bb.0: |
| 141 | +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| 142 | +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp |
| 143 | +; GFX11-NEXT: s_setpc_b64 s[30:31] |
100 | 144 | %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs)
|
101 | 145 | ret i32 %result
|
102 | 146 | }
|
@@ -136,6 +180,12 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
|
136 | 180 | ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
137 | 181 | ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp
|
138 | 182 | ; GFX10-NEXT: s_setpc_b64 s[30:31]
|
| 183 | +; |
| 184 | +; GFX11-LABEL: v_uaddsat_v2i16: |
| 185 | +; GFX11: ; %bb.0: |
| 186 | +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| 187 | +; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp |
| 188 | +; GFX11-NEXT: s_setpc_b64 s[30:31] |
139 | 189 | %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
|
140 | 190 | ret <2 x i16> %result
|
141 | 191 | }
|
@@ -184,6 +234,13 @@ define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
|
184 | 234 | ; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 clamp
|
185 | 235 | ; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 clamp
|
186 | 236 | ; GFX10-NEXT: s_setpc_b64 s[30:31]
|
| 237 | +; |
| 238 | +; GFX11-LABEL: v_uaddsat_v3i16: |
| 239 | +; GFX11: ; %bb.0: |
| 240 | +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| 241 | +; GFX11-NEXT: v_pk_add_u16 v0, v0, v2 clamp |
| 242 | +; GFX11-NEXT: v_pk_add_u16 v1, v1, v3 clamp |
| 243 | +; GFX11-NEXT: s_setpc_b64 s[30:31] |
187 | 244 | %result = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
|
188 | 245 | ret <3 x i16> %result
|
189 | 246 | }
|
@@ -238,6 +295,13 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
|
238 | 295 | ; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 clamp
|
239 | 296 | ; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 clamp
|
240 | 297 | ; GFX10-NEXT: s_setpc_b64 s[30:31]
|
| 298 | +; |
| 299 | +; GFX11-LABEL: v_uaddsat_v4i16: |
| 300 | +; GFX11: ; %bb.0: |
| 301 | +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| 302 | +; GFX11-NEXT: v_pk_add_u16 v0, v0, v2 clamp |
| 303 | +; GFX11-NEXT: v_pk_add_u16 v1, v1, v3 clamp |
| 304 | +; GFX11-NEXT: s_setpc_b64 s[30:31] |
241 | 305 | %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
|
242 | 306 | %cast = bitcast <4 x i16> %result to <2 x float>
|
243 | 307 | ret <2 x float> %cast
|
@@ -275,6 +339,13 @@ define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
|
275 | 339 | ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v2 clamp
|
276 | 340 | ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v3 clamp
|
277 | 341 | ; GFX10-NEXT: s_setpc_b64 s[30:31]
|
| 342 | +; |
| 343 | +; GFX11-LABEL: v_uaddsat_v2i32: |
| 344 | +; GFX11: ; %bb.0: |
| 345 | +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| 346 | +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v2 clamp |
| 347 | +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v3 clamp |
| 348 | +; GFX11-NEXT: s_setpc_b64 s[30:31] |
278 | 349 | %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
|
279 | 350 | ret <2 x i32> %result
|
280 | 351 | }
|
@@ -317,6 +388,14 @@ define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
|
317 | 388 | ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v4 clamp
|
318 | 389 | ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v5 clamp
|
319 | 390 | ; GFX10-NEXT: s_setpc_b64 s[30:31]
|
| 391 | +; |
| 392 | +; GFX11-LABEL: v_uaddsat_v3i32: |
| 393 | +; GFX11: ; %bb.0: |
| 394 | +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| 395 | +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v3 clamp |
| 396 | +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v4 clamp |
| 397 | +; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v5 clamp |
| 398 | +; GFX11-NEXT: s_setpc_b64 s[30:31] |
320 | 399 | %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
|
321 | 400 | ret <3 x i32> %result
|
322 | 401 | }
|
@@ -365,6 +444,15 @@ define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
|
365 | 444 | ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v6 clamp
|
366 | 445 | ; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v7 clamp
|
367 | 446 | ; GFX10-NEXT: s_setpc_b64 s[30:31]
|
| 447 | +; |
| 448 | +; GFX11-LABEL: v_uaddsat_v4i32: |
| 449 | +; GFX11: ; %bb.0: |
| 450 | +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| 451 | +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v4 clamp |
| 452 | +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v5 clamp |
| 453 | +; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v6 clamp |
| 454 | +; GFX11-NEXT: v_add_nc_u32_e64 v3, v3, v7 clamp |
| 455 | +; GFX11-NEXT: s_setpc_b64 s[30:31] |
368 | 456 | %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
|
369 | 457 | ret <4 x i32> %result
|
370 | 458 | }
|
@@ -437,6 +525,19 @@ define <8 x i32> @v_uaddsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
|
437 | 525 | ; GFX10-NEXT: v_add_nc_u32_e64 v6, v6, v14 clamp
|
438 | 526 | ; GFX10-NEXT: v_add_nc_u32_e64 v7, v7, v15 clamp
|
439 | 527 | ; GFX10-NEXT: s_setpc_b64 s[30:31]
|
| 528 | +; |
| 529 | +; GFX11-LABEL: v_uaddsat_v8i32: |
| 530 | +; GFX11: ; %bb.0: |
| 531 | +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| 532 | +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v8 clamp |
| 533 | +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v9 clamp |
| 534 | +; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v10 clamp |
| 535 | +; GFX11-NEXT: v_add_nc_u32_e64 v3, v3, v11 clamp |
| 536 | +; GFX11-NEXT: v_add_nc_u32_e64 v4, v4, v12 clamp |
| 537 | +; GFX11-NEXT: v_add_nc_u32_e64 v5, v5, v13 clamp |
| 538 | +; GFX11-NEXT: v_add_nc_u32_e64 v6, v6, v14 clamp |
| 539 | +; GFX11-NEXT: v_add_nc_u32_e64 v7, v7, v15 clamp |
| 540 | +; GFX11-NEXT: s_setpc_b64 s[30:31] |
440 | 541 | %result = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs)
|
441 | 542 | ret <8 x i32> %result
|
442 | 543 | }
|
@@ -565,6 +666,29 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
|
565 | 666 | ; GFX10-NEXT: s_waitcnt vmcnt(0)
|
566 | 667 | ; GFX10-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp
|
567 | 668 | ; GFX10-NEXT: s_setpc_b64 s[30:31]
|
| 669 | +; |
| 670 | +; GFX11-LABEL: v_uaddsat_v16i32: |
| 671 | +; GFX11: ; %bb.0: |
| 672 | +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| 673 | +; GFX11-NEXT: scratch_load_b32 v31, off, s32 |
| 674 | +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v16 clamp |
| 675 | +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v17 clamp |
| 676 | +; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v18 clamp |
| 677 | +; GFX11-NEXT: v_add_nc_u32_e64 v3, v3, v19 clamp |
| 678 | +; GFX11-NEXT: v_add_nc_u32_e64 v4, v4, v20 clamp |
| 679 | +; GFX11-NEXT: v_add_nc_u32_e64 v5, v5, v21 clamp |
| 680 | +; GFX11-NEXT: v_add_nc_u32_e64 v6, v6, v22 clamp |
| 681 | +; GFX11-NEXT: v_add_nc_u32_e64 v7, v7, v23 clamp |
| 682 | +; GFX11-NEXT: v_add_nc_u32_e64 v8, v8, v24 clamp |
| 683 | +; GFX11-NEXT: v_add_nc_u32_e64 v9, v9, v25 clamp |
| 684 | +; GFX11-NEXT: v_add_nc_u32_e64 v10, v10, v26 clamp |
| 685 | +; GFX11-NEXT: v_add_nc_u32_e64 v11, v11, v27 clamp |
| 686 | +; GFX11-NEXT: v_add_nc_u32_e64 v12, v12, v28 clamp |
| 687 | +; GFX11-NEXT: v_add_nc_u32_e64 v13, v13, v29 clamp |
| 688 | +; GFX11-NEXT: v_add_nc_u32_e64 v14, v14, v30 clamp |
| 689 | +; GFX11-NEXT: s_waitcnt vmcnt(0) |
| 690 | +; GFX11-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp |
| 691 | +; GFX11-NEXT: s_setpc_b64 s[30:31] |
568 | 692 | %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
|
569 | 693 | ret <16 x i32> %result
|
570 | 694 | }
|
@@ -610,6 +734,17 @@ define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
|
610 | 734 | ; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo
|
611 | 735 | ; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo
|
612 | 736 | ; GFX10-NEXT: s_setpc_b64 s[30:31]
|
| 737 | +; |
| 738 | +; GFX11-LABEL: v_uaddsat_i64: |
| 739 | +; GFX11: ; %bb.0: |
| 740 | +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| 741 | +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 |
| 742 | +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo |
| 743 | +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| 744 | +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] |
| 745 | +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo |
| 746 | +; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo |
| 747 | +; GFX11-NEXT: s_setpc_b64 s[30:31] |
613 | 748 | %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
|
614 | 749 | ret i64 %result
|
615 | 750 | }
|
|
0 commit comments