@@ -154,3 +154,27 @@ define float @fold_fmul_distributive(float %x, float %y) {
154154 %fmul = fmul contract float %fadd , %x
155155 ret float %fmul
156156}
157+
158+ ; Test that the 'contract' fast-math flag is not dropped, so that an fma can be generated from the following mul/add.
159+ define amdgpu_kernel void @vec_mul_scalar_add_fma (<2 x float > %a , <2 x float > %b , float %c1 , ptr addrspace (1 ) %inptr ) {
     ; Kernel computing a[0] * b[0] + c1 via a vector multiply followed by a
     ; scalar add, and storing the result to the second float slot of %inptr.
     ; Both the fmul and the fadd carry the 'contract' flag.
     ; NOTE(review): the recorded output below contains separate v_mul_f32 and
     ; v_add_f32 instructions rather than a single fused operation; presumably
     ; this captures the baseline before the fix lands -- confirm and
     ; regenerate the checks once fusion happens.
160+ ; GFX906-LABEL: vec_mul_scalar_add_fma:
161+ ; GFX906: ; %bb.0:
162+ ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
163+ ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
164+ ; GFX906-NEXT: s_load_dword s5, s[0:1], 0x34
165+ ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
166+ ; GFX906-NEXT: v_mov_b32_e32 v0, 0
167+ ; GFX906-NEXT: v_mov_b32_e32 v1, s6
168+ ; GFX906-NEXT: v_mul_f32_e32 v1, s4, v1
169+ ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
170+ ; GFX906-NEXT: v_add_f32_e32 v1, s5, v1
171+ ; GFX906-NEXT: global_store_dword v0, v1, s[2:3] offset:4
172+ ; GFX906-NEXT: s_endpgm
     ; Address of float element 1 relative to %inptr (the checks show this as
     ; a store with offset:4).
173+ %gep = getelementptr float , ptr addrspace (1 ) %inptr , i32 1
     ; All-zero shuffle mask: broadcast element 0 of %a into both lanes.
174+ %c = shufflevector <2 x float > %a , <2 x float > poison, <2 x i32 > zeroinitializer
     ; Vector multiply with 'contract'; only lane 0 of the product is used below.
175+ %mul = fmul contract <2 x float > %c , %b
176+ %elv = extractelement <2 x float > %mul , i64 0
     ; Scalar add with 'contract' on the extracted product; together with the
     ; fmul above this forms the mul/add pair the test is about.
177+ %add = fadd contract float %elv , %c1
178+ store float %add , ptr addrspace (1 ) %gep , align 4
179+ ret void
180+ }
0 commit comments