@@ -3249,3 +3249,209 @@ define double @v_fmaximum3_f64_const1_const2(double %a) {
3249
3249
%max1 = call double @llvm.maximum.f64 (double %max0 , double 16 .0 )
3250
3250
ret double %max1
3251
3251
}
3252
+
3253
; Two chained llvm.maximum.f32 calls where the first result (%max0) has a
; second use: it is also inserted as element 0 of the returned <2 x float>.
; The GFX12 assertions expect two separate v_maximum_f32 instructions; per the
; "no_fmaximum3" name this is presumably the negative case for forming a single
; three-operand maximum (TODO confirm against the positive tests in this file).
; The GFX9 assertions expect each maximum as v_max_f32 + v_cmp_o_f32 +
; v_cndmask_b32 against the constant 0x7fc00000 (f32 quiet-NaN bit pattern).
+ define <2 x float > @v_no_fmaximum3_f32__multi_use (float %a , float %b , float %c ) {
3254
+ ; GFX12-LABEL: v_no_fmaximum3_f32__multi_use:
3255
+ ; GFX12: ; %bb.0:
3256
+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
3257
+ ; GFX12-NEXT: s_wait_expcnt 0x0
3258
+ ; GFX12-NEXT: s_wait_samplecnt 0x0
3259
+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
3260
+ ; GFX12-NEXT: s_wait_kmcnt 0x0
3261
+ ; GFX12-NEXT: v_maximum_f32 v0, v0, v1
3262
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3263
+ ; GFX12-NEXT: v_maximum_f32 v1, v0, v2
3264
+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
3265
+ ;
3266
+ ; GFX9-LABEL: v_no_fmaximum3_f32__multi_use:
3267
+ ; GFX9: ; %bb.0:
3268
+ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3269
+ ; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
3270
+ ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
3271
+ ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
3272
+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
3273
+ ; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
3274
+ ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
3275
+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
3276
+ ; GFX9-NEXT: s_setpc_b64 s[30:31]
3277
+ %max0 = call float @llvm.maximum.f32 (float %a , float %b )
3278
+ %max1 = call float @llvm.maximum.f32 (float %max0 , float %c )
3279
+ %insert.0 = insertelement <2 x float > poison, float %max0 , i32 0
3280
+ %insert.1 = insertelement <2 x float > %insert.0 , float %max1 , i32 1
3281
+ ret <2 x float > %insert.1
3282
+ }
3283
+
3284
; amdgpu_ps variant with inreg float arguments: two chained llvm.maximum.f32
; calls where %max0 is used both by %max1 and, after a bitcast to i32, by its
; own llvm.amdgcn.readfirstlane. Both readfirstlane results are returned as a
; <2 x i32>.
; The GFX12 assertions expect two separate s_maximum_f32 instructions; the GFX9
; assertions expect the v_max_f32 + v_cmp_o_f32 + v_cndmask_b32 sequence twice,
; followed by v_readfirstlane_b32 of each result.
+ define amdgpu_ps <2 x i32 > @s_no_fmaximum3_f32__multi_use (float inreg %a , float inreg %b , float inreg %c ) {
3285
+ ; GFX12-LABEL: s_no_fmaximum3_f32__multi_use:
3286
+ ; GFX12: ; %bb.0:
3287
+ ; GFX12-NEXT: s_maximum_f32 s0, s0, s1
3288
+ ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
3289
+ ; GFX12-NEXT: s_maximum_f32 s1, s0, s2
3290
+ ; GFX12-NEXT: ; return to shader part epilog
3291
+ ;
3292
+ ; GFX9-LABEL: s_no_fmaximum3_f32__multi_use:
3293
+ ; GFX9: ; %bb.0:
3294
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s1
3295
+ ; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
3296
+ ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
3297
+ ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
3298
+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
3299
+ ; GFX9-NEXT: v_max_f32_e32 v1, s2, v0
3300
+ ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0
3301
+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
3302
+ ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
3303
+ ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
3304
+ ; GFX9-NEXT: ; return to shader part epilog
3305
+ %max0 = call float @llvm.maximum.f32 (float %a , float %b )
3306
+ %max1 = call float @llvm.maximum.f32 (float %max0 , float %c )
3307
+ %cast0 = bitcast float %max0 to i32
3308
+ %cast1 = bitcast float %max1 to i32
3309
+ %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane (i32 %cast0 )
3310
+ %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane (i32 %cast1 )
3311
+ %insert.0 = insertelement <2 x i32 > poison, i32 %readfirstlane0 , i32 0
3312
+ %insert.1 = insertelement <2 x i32 > %insert.0 , i32 %readfirstlane1 , i32 1
3313
+ ret <2 x i32 > %insert.1
3314
+ }
3315
+
3316
; Half-precision version of the multi-use case: two chained llvm.maximum.f16
; calls where %max0 feeds %max1 and is also inserted as element 0 of the
; returned <2 x half>.
; The GFX12 assertions expect two separate v_maximum_f16 instructions followed
; by v_pack_b32_f16; the GFX9 assertions expect v_max_f16 + v_cmp_o_f16 +
; v_cndmask_b32 (against the constant 0x7e00) twice, then v_pack_b32_f16.
+ define <2 x half > @v_no_fmaximum3_f16__multi_use (half %a , half %b , half %c ) {
3317
+ ; GFX12-LABEL: v_no_fmaximum3_f16__multi_use:
3318
+ ; GFX12: ; %bb.0:
3319
+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
3320
+ ; GFX12-NEXT: s_wait_expcnt 0x0
3321
+ ; GFX12-NEXT: s_wait_samplecnt 0x0
3322
+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
3323
+ ; GFX12-NEXT: s_wait_kmcnt 0x0
3324
+ ; GFX12-NEXT: v_maximum_f16 v0, v0, v1
3325
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3326
+ ; GFX12-NEXT: v_maximum_f16 v1, v0, v2
3327
+ ; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1
3328
+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
3329
+ ;
3330
+ ; GFX9-LABEL: v_no_fmaximum3_f16__multi_use:
3331
+ ; GFX9: ; %bb.0:
3332
+ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3333
+ ; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
3334
+ ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
3335
+ ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
3336
+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
3337
+ ; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
3338
+ ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
3339
+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
3340
+ ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
3341
+ ; GFX9-NEXT: s_setpc_b64 s[30:31]
3342
+ %max0 = call half @llvm.maximum.f16 (half %a , half %b )
3343
+ %max1 = call half @llvm.maximum.f16 (half %max0 , half %c )
3344
+ %insert.0 = insertelement <2 x half > poison, half %max0 , i32 0
3345
+ %insert.1 = insertelement <2 x half > %insert.0 , half %max1 , i32 1
3346
+ ret <2 x half > %insert.1
3347
+ }
3348
+
3349
; amdgpu_ps variant with inreg half arguments: two chained llvm.maximum.f16
; calls where %max0 is used by %max1 and is also bitcast to i16, zero-extended
; to i32 and passed through its own llvm.amdgcn.readfirstlane. Both
; readfirstlane results are returned as a <2 x i32>.
; The GFX12 assertions expect two separate s_maximum_f16 instructions, each
; result then masked with 0xffff (matching the zext in the IR). The GFX9
; assertions expect v_max_f16 + v_cmp_o_f16 + v_cndmask_b32 twice, the same
; 0xffff masking via v_and_b32, and v_readfirstlane_b32 of each result.
+ define amdgpu_ps <2 x i32 > @s_no_fmaximum3_f16__multi_use (half inreg %a , half inreg %b , half inreg %c ) {
3350
+ ; GFX12-LABEL: s_no_fmaximum3_f16__multi_use:
3351
+ ; GFX12: ; %bb.0:
3352
+ ; GFX12-NEXT: s_maximum_f16 s0, s0, s1
3353
+ ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
3354
+ ; GFX12-NEXT: s_maximum_f16 s1, s0, s2
3355
+ ; GFX12-NEXT: s_and_b32 s0, 0xffff, s0
3356
+ ; GFX12-NEXT: s_and_b32 s1, 0xffff, s1
3357
+ ; GFX12-NEXT: ; return to shader part epilog
3358
+ ;
3359
+ ; GFX9-LABEL: s_no_fmaximum3_f16__multi_use:
3360
+ ; GFX9: ; %bb.0:
3361
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s1
3362
+ ; GFX9-NEXT: v_max_f16_e32 v1, s0, v0
3363
+ ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
3364
+ ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
3365
+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
3366
+ ; GFX9-NEXT: v_max_f16_e32 v1, s2, v0
3367
+ ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
3368
+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
3369
+ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
3370
+ ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
3371
+ ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
3372
+ ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
3373
+ ; GFX9-NEXT: ; return to shader part epilog
3374
+ %max0 = call half @llvm.maximum.f16 (half %a , half %b )
3375
+ %max1 = call half @llvm.maximum.f16 (half %max0 , half %c )
3376
+ %cast0 = bitcast half %max0 to i16
3377
+ %cast1 = bitcast half %max1 to i16
3378
+ %ext0 = zext i16 %cast0 to i32
3379
+ %ext1 = zext i16 %cast1 to i32
3380
+ %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane (i32 %ext0 )
3381
+ %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane (i32 %ext1 )
3382
+ %insert.0 = insertelement <2 x i32 > poison, i32 %readfirstlane0 , i32 0
3383
+ %insert.1 = insertelement <2 x i32 > %insert.0 , i32 %readfirstlane1 , i32 1
3384
+ ret <2 x i32 > %insert.1
3385
+ }
3386
+
3387
; Packed-half version of the multi-use case: two chained <2 x half>
; llvm.maximum calls where %max0 feeds %max1 and is also the low half of the
; returned <4 x half> (the shufflevector concatenates %max0 and %max1).
; The intrinsic is spelled with the .v2f16 overload suffix so that the name
; agrees with the <2 x half> operand and result types.
; The GFX12 assertions expect two separate v_pk_maximum_f16 instructions. The
; GFX9 assertions expect v_pk_max_f16 plus a per-half v_cmp_o_f16 /
; v_cndmask_b32 against the constant 0x7e00, with the halves recombined by
; v_perm_b32, once for each maximum.
+ define <4 x half > @v_no_fmaximum3_v2f16__multi_use (<2 x half > %a , <2 x half > %b , <2 x half > %c ) {
3388
+ ; GFX12-LABEL: v_no_fmaximum3_v2f16__multi_use:
3389
+ ; GFX12: ; %bb.0:
3390
+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
3391
+ ; GFX12-NEXT: s_wait_expcnt 0x0
3392
+ ; GFX12-NEXT: s_wait_samplecnt 0x0
3393
+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
3394
+ ; GFX12-NEXT: s_wait_kmcnt 0x0
3395
+ ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
3396
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3397
+ ; GFX12-NEXT: v_pk_maximum_f16 v1, v0, v2
3398
+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
3399
+ ;
3400
+ ; GFX9-LABEL: v_no_fmaximum3_v2f16__multi_use:
3401
+ ; GFX9: ; %bb.0:
3402
+ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3403
+ ; GFX9-NEXT: v_pk_max_f16 v3, v0, v1
3404
+ ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
3405
+ ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
3406
+ ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
3407
+ ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3408
+ ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
3409
+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
3410
+ ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
3411
+ ; GFX9-NEXT: v_perm_b32 v0, v1, v5, s4
3412
+ ; GFX9-NEXT: v_pk_max_f16 v3, v0, v2
3413
+ ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2
3414
+ ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
3415
+ ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3416
+ ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
3417
+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
3418
+ ; GFX9-NEXT: v_perm_b32 v1, v1, v5, s4
3419
+ ; GFX9-NEXT: s_setpc_b64 s[30:31]
3420
+ %max0 = call <2 x half > @llvm.maximum.v2f16 (<2 x half > %a , <2 x half > %b )
3421
+ %max1 = call <2 x half > @llvm.maximum.v2f16 (<2 x half > %max0 , <2 x half > %c )
3422
+ %concat = shufflevector <2 x half > %max0 , <2 x half > %max1 , <4 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 >
3423
+ ret <4 x half > %concat
3424
+ }
3425
+
3426
; Double-precision version of the multi-use case: two chained
; llvm.maximum.f64 calls where %max0 feeds %max1 and is also inserted as
; element 0 of the returned <2 x double>.
; The GFX12 assertions expect two separate v_maximum_f64 instructions. The
; GFX9 assertions expect, for each maximum, v_max_f64 + v_cmp_u_f64 and two
; v_cndmask_b32 selects: the high dword against the constant 0x7ff80000 and
; the low dword against 0.
+ define <2 x double > @v_no_fmaximum3_f64__multi_use (double %a , double %b , double %c ) {
3427
+ ; GFX12-LABEL: v_no_fmaximum3_f64__multi_use:
3428
+ ; GFX12: ; %bb.0:
3429
+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
3430
+ ; GFX12-NEXT: s_wait_expcnt 0x0
3431
+ ; GFX12-NEXT: s_wait_samplecnt 0x0
3432
+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
3433
+ ; GFX12-NEXT: s_wait_kmcnt 0x0
3434
+ ; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
3435
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3436
+ ; GFX12-NEXT: v_maximum_f64 v[2:3], v[0:1], v[4:5]
3437
+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
3438
+ ;
3439
+ ; GFX9-LABEL: v_no_fmaximum3_f64__multi_use:
3440
+ ; GFX9: ; %bb.0:
3441
+ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3442
+ ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
3443
+ ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
3444
+ ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
3445
+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
3446
+ ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
3447
+ ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
3448
+ ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
3449
+ ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
3450
+ ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
3451
+ ; GFX9-NEXT: s_setpc_b64 s[30:31]
3452
+ %max0 = call double @llvm.maximum.f64 (double %a , double %b )
3453
+ %max1 = call double @llvm.maximum.f64 (double %max0 , double %c )
3454
+ %insert.0 = insertelement <2 x double > poison, double %max0 , i32 0
3455
+ %insert.1 = insertelement <2 x double > %insert.0 , double %max1 , i32 1
3456
+ ret <2 x double > %insert.1
3457
+ }
0 commit comments